Merge branch 'core/percpu' into percpu-cpumask-x86-for-linus-2

Conflicts:
        arch/parisc/kernel/irq.c
        arch/x86/include/asm/fixmap_64.h
        arch/x86/include/asm/setup.h
        kernel/irq/handle.c

Semantic merge:
        arch/x86/include/asm/fixmap.h

Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2009-03-26 16:39:17 -0400
committer: Ingo Molnar <mingo@elte.hu> 2009-03-27 12:28:43 -0400
commit: 6e15cf04860074ad032e88c306bea656bbdd0f22 (patch)
tree: c346383bb7563e8d66b2f4a502f875b259c34870 /arch/x86/xen
parent: be0ea69674ed95e1e98cb3687a241badc756d228 (diff)
parent: 60db56422043aaa455ac7f858ce23c273220f9d9 (diff)
16 files changed, 1204 insertions, 1242 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 87b9ab166423..b83e119fbeb0 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,7 +6,7 @@ config XEN
        bool "Xen guest support"
        select PARAVIRT
        select PARAVIRT_CLOCK
-        depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
+        depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
        depends on X86_CMPXCHG && X86_TSC
        help
          This is the Linux Xen port.  Enabling this will allow the
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 6dcefba7836f..3b767d03fd6a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
 endif
 obj-y           := enlighten.o setup.o multicalls.o mmu.o irq.o \
-                        time.o xen-asm_$(BITS).o grant-table.o suspend.o
+                        time.o xen-asm.o xen-asm_$(BITS).o \
+                        grant-table.o suspend.o
 obj-$(CONFIG_SMP)               += smp.o spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)      += debugfs.o
 \ No newline at end of file
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b58e96338149..82cd39a6cbd3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 EXPORT_SYMBOL_GPL(xen_domain_type);
-/*
- * Identity map, in addition to plain kernel map.  This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
- */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
-#ifdef CONFIG_X86_64
-/* l3 pud for userspace vsyscall mapping */
-static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
-#endif /* CONFIG_X86_64 */
-/*
- * Note about cr3 (pagetable base) values:
- *
- * xen_cr3 contains the current logical cr3 value; it contains the
- * last set cr3.  This may not be the current effective cr3, because
- * its update may be being lazily deferred.  However, a vcpu looking
- * at its own cr3 can use this value knowing that it everything will
- * be self-consistent.
- *
- * xen_current_cr3 contains the actual vcpu cr3; it is set once the
- * hypercall to set the vcpu cr3 is complete (so it may be a little
- * out of date, but it will never be set early).  If one vcpu is
- * looking at another vcpu's cr3 value, it should use this variable.
- */
-DEFINE_PER_CPU(unsigned long, xen_cr3);  /* cr3 stored as physaddr */
-DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 struct shared_info xen_dummy_shared_info;
+void *xen_initial_gdt;
 /*
 * Point at some empty memory to start with. We map the real shared_info
 * page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
 *
 * 0: not available, 1: available
 */
-static int have_vcpu_info_placement =
+static int have_vcpu_info_placement = 1;
-#ifdef CONFIG_X86_32
-        1
-#else
-        0
-#endif
-        ;
 static void xen_vcpu_setup(int cpu)
 {
@@ -137,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
        vcpup = &per_cpu(xen_vcpu_info, cpu);
-        info.mfn = virt_to_mfn(vcpup);
+        info.mfn = arbitrary_virt_to_mfn(vcpup);
        info.offset = offset_in_page(vcpup);
        printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
        return HYPERVISOR_get_debugreg(reg);
 }
-static void xen_leave_lazy(void)
+void xen_leave_lazy(void)
 {
        paravirt_leave_lazy(paravirt_get_lazy_mode());
        xen_mc_flush();
@@ -335,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
        frames = mcs.args;
        for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-                frames[f] = virt_to_mfn(va);
+                frames[f] = arbitrary_virt_to_mfn((void *)va);
                make_lowmem_page_readonly((void *)va);
+                make_lowmem_page_readonly(mfn_to_virt(frames[f]));
        }
        MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -348,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
                                unsigned int cpu, unsigned int i)
 {
        struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-        xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+        xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
        struct multicall_space mc = __xen_mc_entry(0);
        MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -357,13 +325,14 @@ static void load_TLS_descriptor(struct thread_struct *t,
 static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 {
        /*
-         * XXX sleazy hack: If we're being called in a lazy-cpu zone,
+         * XXX sleazy hack: If we're being called in a lazy-cpu zone
-         * it means we're in a context switch, and %gs has just been
+         * and lazy gs handling is enabled, it means we're in a
-         * saved.  This means we can zero it out to prevent faults on
+         * context switch, and %gs has just been saved.  This means we
-         * exit from the hypervisor if the next process has no %gs.
+         * can zero it out to prevent faults on exit from the
-         * Either way, it has been saved, and the new value will get
+         * hypervisor if the next process has no %gs.  Either way, it
-         * loaded properly.  This will go away as soon as Xen has been
+         * has been saved, and the new value will get loaded properly.
-         * modified to not save/restore %gs for normal hypercalls.
+         * This will go away as soon as Xen has been modified to not
+         * save/restore %gs for normal hypercalls.
         *
         * On x86_64, this hack is not used for %gs, because gs points
         * to KERNEL_GS_BASE (and uses it for PDA references), so we
@@ -375,7 +344,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
         */
        if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
 #ifdef CONFIG_X86_32
-                loadsegment(gs, 0);
+                lazy_load_gs(0);
 #else
                loadsegment(fs, 0);
 #endif
@@ -521,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
                break;
        default: {
-                xmaddr_t maddr = virt_to_machine(&dt[entry]);
+                xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
                xen_mc_flush();
                if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
@@ -587,94 +556,18 @@ static u32 xen_safe_apic_wait_icr_idle(void)
        return 0;
 }
-static struct apic_ops xen_basic_apic_ops = {
+static void set_xen_basic_apic_ops(void)
-        .read = xen_apic_read,
-        .write = xen_apic_write,
-        .icr_read = xen_apic_icr_read,
-        .icr_write = xen_apic_icr_write,
-        .wait_icr_idle = xen_apic_wait_icr_idle,
-        .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
-};
-#endif
-static void xen_flush_tlb(void)
-{
-        struct mmuext_op *op;
-        struct multicall_space mcs;
-        preempt_disable();
-        mcs = xen_mc_entry(sizeof(*op));
-        op = mcs.args;
-        op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
-        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-        xen_mc_issue(PARAVIRT_LAZY_MMU);
-        preempt_enable();
-}
-static void xen_flush_tlb_single(unsigned long addr)
 {
-        struct mmuext_op *op;
+        apic->read = xen_apic_read;
-        struct multicall_space mcs;
+        apic->write = xen_apic_write;
+        apic->icr_read = xen_apic_icr_read;
-        preempt_disable();
+        apic->icr_write = xen_apic_icr_write;
+        apic->wait_icr_idle = xen_apic_wait_icr_idle;
-        mcs = xen_mc_entry(sizeof(*op));
+        apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
-        op = mcs.args;
-        op->cmd = MMUEXT_INVLPG_LOCAL;
-        op->arg1.linear_addr = addr & PAGE_MASK;
-        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-        xen_mc_issue(PARAVIRT_LAZY_MMU);
-        preempt_enable();
 }
-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
+#endif
-                                 unsigned long va)
-{
-        struct {
-                struct mmuext_op op;
-                cpumask_t mask;
-        } *args;
-        cpumask_t cpumask = *cpus;
-        struct multicall_space mcs;
-        /*
-         * A couple of (to be removed) sanity checks:
-         *
-         * - current CPU must not be in mask
-         * - mask must exist :)
-         */
-        BUG_ON(cpus_empty(cpumask));
-        BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-        BUG_ON(!mm);
-        /* If a CPU which we ran on has gone down, OK. */
-        cpus_and(cpumask, cpumask, cpu_online_map);
-        if (cpus_empty(cpumask))
-                return;
-        mcs = xen_mc_entry(sizeof(*args));
-        args = mcs.args;
-        args->mask = cpumask;
-        args->op.arg2.vcpumask = &args->mask;
-        if (va == TLB_FLUSH_ALL) {
-                args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-        } else {
-                args->op.cmd = MMUEXT_INVLPG_MULTI;
-                args->op.arg1.linear_addr = va;
-        }
-        MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
-        xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
 static void xen_clts(void)
 {
@@ -700,21 +593,6 @@ static void xen_write_cr0(unsigned long cr0)
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
-static void xen_write_cr2(unsigned long cr2)
-{
-        x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
-}
-static unsigned long xen_read_cr2(void)
-{
-        return x86_read_percpu(xen_vcpu)->arch.cr2;
-}
-static unsigned long xen_read_cr2_direct(void)
-{
-        return x86_read_percpu(xen_vcpu_info.arch.cr2);
-}
 static void xen_write_cr4(unsigned long cr4)
 {
        cr4 &= ~X86_CR4_PGE;
@@ -723,71 +601,6 @@ static void xen_write_cr4(unsigned long cr4)
        native_write_cr4(cr4);
 }
-static unsigned long xen_read_cr3(void)
-{
-        return x86_read_percpu(xen_cr3);
-}
-static void set_current_cr3(void *v)
-{
-        x86_write_percpu(xen_current_cr3, (unsigned long)v);
-}
-static void __xen_write_cr3(bool kernel, unsigned long cr3)
-{
-        struct mmuext_op *op;
-        struct multicall_space mcs;
-        unsigned long mfn;
-        if (cr3)
-                mfn = pfn_to_mfn(PFN_DOWN(cr3));
-        else
-                mfn = 0;
-        WARN_ON(mfn == 0 && kernel);
-        mcs = __xen_mc_entry(sizeof(*op));
-        op = mcs.args;
-        op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
-        op->arg1.mfn = mfn;
-        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-        if (kernel) {
-                x86_write_percpu(xen_cr3, cr3);
-                /* Update xen_current_cr3 once the batch has actually
-                   been submitted. */
-                xen_mc_callback(set_current_cr3, (void *)cr3);
-        }
-}
-static void xen_write_cr3(unsigned long cr3)
-{
-        BUG_ON(preemptible());
-        xen_mc_batch();  /* disables interrupts */
-        /* Update while interrupts are disabled, so its atomic with
-           respect to ipis */
-        x86_write_percpu(xen_cr3, cr3);
-        __xen_write_cr3(true, cr3);
-#ifdef CONFIG_X86_64
-        {
-                pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
-                if (user_pgd)
-                        __xen_write_cr3(false, __pa(user_pgd));
-                else
-                        __xen_write_cr3(false, 0);
-        }
-#endif
-        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
-}
 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 {
        int ret;
@@ -829,185 +642,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
        return ret;
 }
-/* Early in boot, while setting up the initial pagetable, assume
-   everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
-{
-#ifdef CONFIG_FLATMEM
-        BUG_ON(mem_map);        /* should only be used early */
-#endif
-        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
-/* Early release_pte assumes that all pts are pinned, since there's
-   only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
-{
-        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-}
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
-        struct mmuext_op op;
-        op.cmd = cmd;
-        op.arg1.mfn = pfn_to_mfn(pfn);
-        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-                BUG();
-}
-/* This needs to make sure the new pte page is pinned iff its being
-   attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
-{
-        struct page *page = pfn_to_page(pfn);
-        if (PagePinned(virt_to_page(mm->pgd))) {
-                SetPagePinned(page);
-                vm_unmap_aliases();
-                if (!PageHighMem(page)) {
-                        make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
-                        if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-                                pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
-                } else {
-                        /* make sure there are no stray mappings of
-                           this page */
-                        kmap_flush_unused();
-                }
-        }
-}
-static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
-{
-        xen_alloc_ptpage(mm, pfn, PT_PTE);
-}
-static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
-{
-        xen_alloc_ptpage(mm, pfn, PT_PMD);
-}
-static int xen_pgd_alloc(struct mm_struct *mm)
-{
-        pgd_t *pgd = mm->pgd;
-        int ret = 0;
-        BUG_ON(PagePinned(virt_to_page(pgd)));
-#ifdef CONFIG_X86_64
-        {
-                struct page *page = virt_to_page(pgd);
-                pgd_t *user_pgd;
-                BUG_ON(page->private != 0);
-                ret = -ENOMEM;
-                user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-                page->private = (unsigned long)user_pgd;
-                if (user_pgd != NULL) {
-                        user_pgd[pgd_index(VSYSCALL_START)] =
-                                __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
-                        ret = 0;
-                }
-                BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
-        }
-#endif
-        return ret;
-}
-static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-#ifdef CONFIG_X86_64
-        pgd_t *user_pgd = xen_get_user_pgd(pgd);
-        if (user_pgd)
-                free_page((unsigned long)user_pgd);
-#endif
-}
-/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
-{
-        struct page *page = pfn_to_page(pfn);
-        if (PagePinned(page)) {
-                if (!PageHighMem(page)) {
-                        if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-                                pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
-                        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-                }
-                ClearPagePinned(page);
-        }
-}
-static void xen_release_pte(unsigned long pfn)
-{
-        xen_release_ptpage(pfn, PT_PTE);
-}
-static void xen_release_pmd(unsigned long pfn)
-{
-        xen_release_ptpage(pfn, PT_PMD);
-}
-#if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
-{
-        xen_alloc_ptpage(mm, pfn, PT_PUD);
-}
-static void xen_release_pud(unsigned long pfn)
-{
-        xen_release_ptpage(pfn, PT_PUD);
-}
-#endif
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
-        pgprot_t prot = PAGE_KERNEL;
-        if (PagePinned(page))
-                prot = PAGE_KERNEL_RO;
-        if (0 && PageHighMem(page))
-                printk("mapping highpte %lx type %d prot %s\n",
-                       page_to_pfn(page), type,
-                       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-        return kmap_atomic_prot(page, type, prot);
-}
-#endif
-#ifdef CONFIG_X86_32
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
-{
-        /* If there's an existing pte, then don't allow _PAGE_RW to be set */
-        if (pte_val_ma(*ptep) & _PAGE_PRESENT)
-                pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
-                               pte_val_ma(pte));
-        return pte;
-}
-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
-        pte = mask_rw_pte(ptep, pte);
-        xen_set_pte(ptep, pte);
-}
-#endif
-static __init void xen_pagetable_setup_start(pgd_t *base)
-{
-}
 void xen_setup_shared_info(void)
 {
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1028,37 +662,6 @@ void xen_setup_shared_info(void)
        xen_setup_mfn_list_list();
 }
-static __init void xen_pagetable_setup_done(pgd_t *base)
-{
-        xen_setup_shared_info();
-}
-static __init void xen_post_allocator_init(void)
-{
-        pv_mmu_ops.set_pte = xen_set_pte;
-        pv_mmu_ops.set_pmd = xen_set_pmd;
-        pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
-        pv_mmu_ops.set_pgd = xen_set_pgd;
-#endif
-        /* This will work as long as patching hasn't happened yet
-           (which it hasn't) */
-        pv_mmu_ops.alloc_pte = xen_alloc_pte;
-        pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
-        pv_mmu_ops.release_pte = xen_release_pte;
-        pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
-        pv_mmu_ops.alloc_pud = xen_alloc_pud;
-        pv_mmu_ops.release_pud = xen_release_pud;
-#endif
-#ifdef CONFIG_X86_64
-        SetPagePinned(virt_to_page(level3_user_vsyscall));
-#endif
-        xen_mark_init_mm_pinned();
-}
 /* This is called once we have the cpu_possible_map */
 void xen_setup_vcpu_info_placement(void)
 {
@@ -1072,10 +675,10 @@ void xen_setup_vcpu_info_placement(void)
        if (have_vcpu_info_placement) {
                printk(KERN_INFO "Xen: using vcpu_info placement\n");
-                pv_irq_ops.save_fl = xen_save_fl_direct;
+                pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
-                pv_irq_ops.restore_fl = xen_restore_fl_direct;
+                pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
-                pv_irq_ops.irq_disable = xen_irq_disable_direct;
+                pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
-                pv_irq_ops.irq_enable = xen_irq_enable_direct;
+                pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
                pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
        }
 }
@@ -1133,49 +736,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
        return ret;
 }
-static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
-{
-        pte_t pte;
-        phys >>= PAGE_SHIFT;
-        switch (idx) {
-        case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-#ifdef CONFIG_X86_F00F_BUG
-        case FIX_F00F_IDT:
-#endif
-#ifdef CONFIG_X86_32
-        case FIX_WP_TEST:
-        case FIX_VDSO:
-# ifdef CONFIG_HIGHMEM
-        case FIX_KMAP_BEGIN ... FIX_KMAP_END:
-# endif
-#else
-        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-        case FIX_APIC_BASE:     /* maps dummy local APIC */
-#endif
-                pte = pfn_pte(phys, prot);
-                break;
-        default:
-                pte = mfn_pte(phys, prot);
-                break;
-        }
-        __native_set_fixmap(idx, pte);
-#ifdef CONFIG_X86_64
-        /* Replicate changes to map the vsyscall page into the user
-           pagetable vsyscall mapping. */
-        if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
-                unsigned long vaddr = __fix_to_virt(idx);
-                set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
-        }
-#endif
-}
 static const struct pv_info xen_info __initdata = {
        .paravirt_enabled = 1,
        .shared_kernel_pmd = 0,
@@ -1271,87 +831,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
 #endif
 };
-static const struct pv_mmu_ops xen_mmu_ops __initdata = {
-        .pagetable_setup_start = xen_pagetable_setup_start,
-        .pagetable_setup_done = xen_pagetable_setup_done,
-        .read_cr2 = xen_read_cr2,
-        .write_cr2 = xen_write_cr2,
-        .read_cr3 = xen_read_cr3,
-        .write_cr3 = xen_write_cr3,
-        .flush_tlb_user = xen_flush_tlb,
-        .flush_tlb_kernel = xen_flush_tlb,
-        .flush_tlb_single = xen_flush_tlb_single,
-        .flush_tlb_others = xen_flush_tlb_others,
-        .pte_update = paravirt_nop,
-        .pte_update_defer = paravirt_nop,
-        .pgd_alloc = xen_pgd_alloc,
-        .pgd_free = xen_pgd_free,
-        .alloc_pte = xen_alloc_pte_init,
-        .release_pte = xen_release_pte_init,
-        .alloc_pmd = xen_alloc_pte_init,
-        .alloc_pmd_clone = paravirt_nop,
-        .release_pmd = xen_release_pte_init,
-#ifdef CONFIG_HIGHPTE
-        .kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-#ifdef CONFIG_X86_64
-        .set_pte = xen_set_pte,
-#else
-        .set_pte = xen_set_pte_init,
-#endif
-        .set_pte_at = xen_set_pte_at,
-        .set_pmd = xen_set_pmd_hyper,
-        .ptep_modify_prot_start = __ptep_modify_prot_start,
-        .ptep_modify_prot_commit = __ptep_modify_prot_commit,
-        .pte_val = xen_pte_val,
-        .pte_flags = native_pte_flags,
-        .pgd_val = xen_pgd_val,
-        .make_pte = xen_make_pte,
-        .make_pgd = xen_make_pgd,
-#ifdef CONFIG_X86_PAE
-        .set_pte_atomic = xen_set_pte_atomic,
-        .set_pte_present = xen_set_pte_at,
-        .pte_clear = xen_pte_clear,
-        .pmd_clear = xen_pmd_clear,
-#endif  /* CONFIG_X86_PAE */
-        .set_pud = xen_set_pud_hyper,
-        .make_pmd = xen_make_pmd,
-        .pmd_val = xen_pmd_val,
-#if PAGETABLE_LEVELS == 4
-        .pud_val = xen_pud_val,
-        .make_pud = xen_make_pud,
-        .set_pgd = xen_set_pgd_hyper,
-        .alloc_pud = xen_alloc_pte_init,
-        .release_pud = xen_release_pte_init,
-#endif  /* PAGETABLE_LEVELS == 4 */
-        .activate_mm = xen_activate_mm,
-        .dup_mmap = xen_dup_mmap,
-        .exit_mmap = xen_exit_mmap,
-        .lazy_mode = {
-                .enter = paravirt_enter_lazy_mmu,
-                .leave = xen_leave_lazy,
-        },
-        .set_fixmap = xen_set_fixmap,
-};
 static void xen_reboot(int reason)
 {
        struct sched_shutdown r = { .reason = reason };
@@ -1394,223 +873,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
 };
-static void __init xen_reserve_top(void)
-{
-#ifdef CONFIG_X86_32
-        unsigned long top = HYPERVISOR_VIRT_START;
-        struct xen_platform_parameters pp;
-        if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
-                top = pp.virt_start;
-        reserve_top_address(-top);
-#endif  /* CONFIG_X86_32 */
-}
-/*
- * Like __va(), but returns address in the kernel mapping (which is
- * all we have until the physical memory mapping has been set up.
- */
-static void *__ka(phys_addr_t paddr)
-{
-#ifdef CONFIG_X86_64
-        return (void *)(paddr + __START_KERNEL_map);
-#else
-        return __va(paddr);
-#endif
-}
-/* Convert a machine address to physical address */
-static unsigned long m2p(phys_addr_t maddr)
-{
-        phys_addr_t paddr;
-        maddr &= PTE_PFN_MASK;
-        paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
-        return paddr;
-}
-/* Convert a machine address to kernel virtual */
-static void *m2v(phys_addr_t maddr)
-{
-        return __ka(m2p(maddr));
-}
-static void set_page_prot(void *addr, pgprot_t prot)
-{
-        unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
-        pte_t pte = pfn_pte(pfn, prot);
-        if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
-                BUG();
-}
-static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
-{
-        unsigned pmdidx, pteidx;
-        unsigned ident_pte;
-        unsigned long pfn;
-        ident_pte = 0;
-        pfn = 0;
-        for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
-                pte_t *pte_page;
-                /* Reuse or allocate a page of ptes */
-                if (pmd_present(pmd[pmdidx]))
-                        pte_page = m2v(pmd[pmdidx].pmd);
-                else {
-                        /* Check for free pte pages */
-                        if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
-                                break;
-                        pte_page = &level1_ident_pgt[ident_pte];
-                        ident_pte += PTRS_PER_PTE;
-                        pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
-                }
-                /* Install mappings */
-                for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
-                        pte_t pte;
-                        if (pfn > max_pfn_mapped)
-                                max_pfn_mapped = pfn;
-                        if (!pte_none(pte_page[pteidx]))
-                                continue;
-                        pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
-                        pte_page[pteidx] = pte;
-                }
-        }
-        for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
-                set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
-        set_page_prot(pmd, PAGE_KERNEL_RO);
-}
-#ifdef CONFIG_X86_64
-static void convert_pfn_mfn(void *v)
-{
-        pte_t *pte = v;
-        int i;
-        /* All levels are converted the same way, so just treat them
-           as ptes. */
-        for (i = 0; i < PTRS_PER_PTE; i++)
-                pte[i] = xen_make_pte(pte[i].pte);
-}
-/*
- * Set up the inital kernel pagetable.
- *
- * We can construct this by grafting the Xen provided pagetable into
- * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
- * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
- * means that only the kernel has a physical mapping to start with -
- * but that's enough to get __va working.  We need to fill in the rest
- * of the physical mapping once some sort of allocator has been set
- * up.
- */
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
-                                                unsigned long max_pfn)
-{
-        pud_t *l3;
-        pmd_t *l2;
-        /* Zap identity mapping */
-        init_level4_pgt[0] = __pgd(0);
-        /* Pre-constructed entries are in pfn, so convert to mfn */
-        convert_pfn_mfn(init_level4_pgt);
-        convert_pfn_mfn(level3_ident_pgt);
-        convert_pfn_mfn(level3_kernel_pgt);
-        l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
-        l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
-        memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-        memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-        l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
-        l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
-        memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-        /* Set up identity map */
-        xen_map_identity_early(level2_ident_pgt, max_pfn);
-        /* Make pagetable pieces RO */
-        set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
-        set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
-        set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
-        set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
-        set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-        set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
-        /* Pin down new L4 */
-        pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
-                          PFN_DOWN(__pa_symbol(init_level4_pgt)));
-        /* Unpin Xen-provided one */
-        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-        /* Switch over */
-        pgd = init_level4_pgt;
-        /*
-         * At this stage there can be no user pgd, and no page
-         * structure to attach it to, so make sure we just set kernel
-         * pgd.
-         */
-        xen_mc_batch();
-        __xen_write_cr3(true, __pa(pgd));
-        xen_mc_issue(PARAVIRT_LAZY_CPU);
-        reserve_early(__pa(xen_start_info->pt_base),
-                      __pa(xen_start_info->pt_base +
-                           xen_start_info->nr_pt_frames * PAGE_SIZE),
-                      "XEN PAGETABLES");
-        return pgd;
-}
-#else   /* !CONFIG_X86_64 */
-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
-                                                unsigned long max_pfn)
-{
-        pmd_t *kernel_pmd;
-        init_pg_tables_start = __pa(pgd);
-        init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-        max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
-        kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
-        memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
-        xen_map_identity_early(level2_kernel_pgt, max_pfn);
-        memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
-        set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
-                        __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
-        set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-        set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
-        set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
-        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-        xen_write_cr3(__pa(swapper_pg_dir));
-        pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
-        return swapper_pg_dir;
-}
-#endif  /* CONFIG_X86_64 */
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -1639,7 +901,7 @@ asmlinkage void __init xen_start_kernel(void)
        /*
         * set up the basic apic ops.
         */
-        apic_ops = &xen_basic_apic_ops;
+        set_xen_basic_apic_ops();
 #endif
        if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1650,10 +912,18 @@ asmlinkage void __init xen_start_kernel(void)
        machine_ops = xen_machine_ops;
 #ifdef CONFIG_X86_64
-        /* Disable until direct per-cpu data access. */
+        /*
-        have_vcpu_info_placement = 0;
+         * Setup percpu state.  We only need to do this for 64-bit
-        x86_64_init_pda();
+         * because 32-bit already has %fs set properly.
+         */
+        load_percpu_segment(0);
 #endif
+        /*
+         * The only reliable way to retain the initial address of the
+         * percpu gdt_page is to remember it here, so we can go and
+         * mark it RW later, when the initial percpu area is freed.
+         */
+        xen_initial_gdt = &per_cpu(gdt_page, 0);
        xen_smp_init();
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c602..cfd17799bd6d 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -19,27 +19,12 @@ void xen_force_evtchn_callback(void)
        (void)HYPERVISOR_xen_version(0, NULL);
 }
-static void __init __xen_init_IRQ(void)
-{
-        int i;
-        /* Create identity vector->irq map */
-        for(i = 0; i < NR_VECTORS; i++) {
-                int cpu;
-                for_each_possible_cpu(cpu)
-                        per_cpu(vector_irq, cpu)[i] = i;
-        }
-        xen_init_IRQ();
-}
 static unsigned long xen_save_fl(void)
 {
        struct vcpu_info *vcpu;
        unsigned long flags;
-        vcpu = x86_read_percpu(xen_vcpu);
+        vcpu = percpu_read(xen_vcpu);
        /* flag has opposite sense of mask */
        flags = !vcpu->evtchn_upcall_mask;
@@ -50,6 +35,7 @@ static unsigned long xen_save_fl(void)
        */
        return (-flags) & X86_EFLAGS_IF;
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
 static void xen_restore_fl(unsigned long flags)
 {
@@ -62,7 +48,7 @@ static void xen_restore_fl(unsigned long flags)
           make sure we're don't switch CPUs between getting the vcpu
           pointer and updating the mask. */
        preempt_disable();
-        vcpu = x86_read_percpu(xen_vcpu);
+        vcpu = percpu_read(xen_vcpu);
        vcpu->evtchn_upcall_mask = flags;
        preempt_enable_no_resched();
@@ -76,6 +62,7 @@ static void xen_restore_fl(unsigned long flags)
                        xen_force_evtchn_callback();
        }
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
 static void xen_irq_disable(void)
 {
@@ -83,9 +70,10 @@ static void xen_irq_disable(void)
           make sure we're don't switch CPUs between getting the vcpu
           pointer and updating the mask. */
        preempt_disable();
-        x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+        percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
        preempt_enable_no_resched();
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
 static void xen_irq_enable(void)
 {
@@ -96,7 +84,7 @@ static void xen_irq_enable(void)
           the caller is confused and is trying to re-enable interrupts
           on an indeterminate processor. */
-        vcpu = x86_read_percpu(xen_vcpu);
+        vcpu = percpu_read(xen_vcpu);
        vcpu->evtchn_upcall_mask = 0;
        /* Doesn't matter if we get preempted here, because any
@@ -106,6 +94,7 @@ static void xen_irq_enable(void)
        if (unlikely(vcpu->evtchn_upcall_pending))
                xen_force_evtchn_callback();
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
 static void xen_safe_halt(void)
 {
@@ -123,11 +112,13 @@ static void xen_halt(void)
 }
 static const struct pv_irq_ops xen_irq_ops __initdata = {
-        .init_IRQ = __xen_init_IRQ,
+        .init_IRQ = xen_init_IRQ,
-        .save_fl = xen_save_fl,
-        .restore_fl = xen_restore_fl,
+        .save_fl = PV_CALLEE_SAVE(xen_save_fl),
-        .irq_disable = xen_irq_disable,
+        .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
-        .irq_enable = xen_irq_enable,
+        .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
+        .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
        .safe_halt = xen_safe_halt,
        .halt = xen_halt,
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c7..cb6afa4ec95c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
 #include <asm/mmu_context.h>
+#include <asm/setup.h>
 #include <asm/paravirt.h>
 #include <asm/linkage.h>
@@ -55,6 +56,8 @@
 #include <xen/page.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/hvc-console.h>
 #include "multicalls.h"
 #include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
 #endif /* CONFIG_XEN_DEBUG_FS */
+/*
+ * Identity map, in addition to plain kernel map.  This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3.  This may not be the current effective cr3, because
+ * its update may be being lazily deferred.  However, a vcpu looking
+ * at its own cr3 can use this value knowing that everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early).  If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3);  /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
 /*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
@@ -242,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
        p2m_top[topidx][idx] = mfn;
 }
+/* Return the machine frame number backing the virtual address @vaddr. */
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
+{
+        /* Look up the machine address, then keep only its frame number. */
+        return PFN_DOWN(arbitrary_virt_to_machine(vaddr).maddr);
+}
 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
        unsigned long address = (unsigned long)vaddr;
@@ -458,28 +499,33 @@ pteval_t xen_pte_val(pte_t pte)
 {
        return pte_mfn_to_pfn(pte.pte);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 pgdval_t xen_pgd_val(pgd_t pgd)
 {
        return pte_mfn_to_pfn(pgd.pgd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 pte_t xen_make_pte(pteval_t pte)
 {
        pte = pte_pfn_to_mfn(pte);
        return native_make_pte(pte);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 pgd_t xen_make_pgd(pgdval_t pgd)
 {
        pgd = pte_pfn_to_mfn(pgd);
        return native_make_pgd(pgd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 pmdval_t xen_pmd_val(pmd_t pmd)
 {
        return pte_mfn_to_pfn(pmd.pmd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 {
@@ -556,12 +602,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
        pmd = pte_pfn_to_mfn(pmd);
        return native_make_pmd(pmd);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 #if PAGETABLE_LEVELS == 4
 pudval_t xen_pud_val(pud_t pud)
 {
        return pte_mfn_to_pfn(pud.pud);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 pud_t xen_make_pud(pudval_t pud)
 {
@@ -569,6 +617,7 @@ pud_t xen_make_pud(pudval_t pud)
        return native_make_pud(pud);
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 pgd_t *xen_get_user_pgd(pgd_t *pgd)
 {
@@ -1063,18 +1112,14 @@ static void drop_other_mm_ref(void *info)
        struct mm_struct *mm = info;
        struct mm_struct *active_mm;
-#ifdef CONFIG_X86_64
+        active_mm = percpu_read(cpu_tlbstate.active_mm);
-        active_mm = read_pda(active_mm);
-#else
-        active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
-#endif
        if (active_mm == mm)
                leave_mm(smp_processor_id());
        /* If this cpu still has a stale cr3 reference, then make sure
           it has been flushed. */
-        if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+        if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
                load_cr3(swapper_pg_dir);
                arch_flush_lazy_cpu_mode();
        }
@@ -1156,6 +1201,706 @@ void xen_exit_mmap(struct mm_struct *mm)
        spin_unlock(&mm->page_table_lock);
 }
+/* pagetable_setup_start hook (see xen_mmu_ops): does nothing; @base is unused. */
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+}
+/* pagetable_setup_done hook (see xen_mmu_ops): calls
+   xen_setup_shared_info(); @base is unused. */
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+        xen_setup_shared_info();
+}
+/* Store @cr2 in the arch.cr2 field of this cpu's vcpu_info. */
+static void xen_write_cr2(unsigned long cr2)
+{
+        struct vcpu_info *vcpu = percpu_read(xen_vcpu);
+        vcpu->arch.cr2 = cr2;
+}
+/* Return the arch.cr2 field of this cpu's vcpu_info. */
+static unsigned long xen_read_cr2(void)
+{
+        struct vcpu_info *vcpu = percpu_read(xen_vcpu);
+        return vcpu->arch.cr2;
+}
+/* Like xen_read_cr2(), but reads arch.cr2 straight from the per-cpu
+   xen_vcpu_info structure rather than going through the xen_vcpu
+   pointer. */
+unsigned long xen_read_cr2_direct(void)
+{
+        return percpu_read(xen_vcpu_info.arch.cr2);
+}
+/* Queue an MMUEXT_TLB_FLUSH_LOCAL operation and issue it (subject to
+   PARAVIRT_LAZY_MMU batching), with preemption disabled throughout. */
+static void xen_flush_tlb(void)
+{
+        struct multicall_space batch;
+        struct mmuext_op *flush;
+        preempt_disable();
+        batch = xen_mc_entry(sizeof(*flush));
+        flush = batch.args;
+        flush->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+        MULTI_mmuext_op(batch.mc, flush, 1, NULL, DOMID_SELF);
+        xen_mc_issue(PARAVIRT_LAZY_MMU);
+        preempt_enable();
+}
+/* Queue an MMUEXT_INVLPG_LOCAL for the page containing @addr. */
+static void xen_flush_tlb_single(unsigned long addr)
+{
+        struct multicall_space batch;
+        struct mmuext_op *invlpg;
+        preempt_disable();
+        batch = xen_mc_entry(sizeof(*invlpg));
+        invlpg = batch.args;
+        invlpg->cmd = MMUEXT_INVLPG_LOCAL;
+        /* The operation is given the page-aligned linear address. */
+        invlpg->arg1.linear_addr = addr & PAGE_MASK;
+        MULTI_mmuext_op(batch.mc, invlpg, 1, NULL, DOMID_SELF);
+        xen_mc_issue(PARAVIRT_LAZY_MMU);
+        preempt_enable();
+}
+/*
+ * Queue a TLB flush for the cpus in @cpus, excluding the current cpu
+ * and any cpu that is not online.  A @va of TLB_FLUSH_ALL requests a
+ * full flush; any other value invalidates that one linear address.
+ * BUGs if @cpus is empty or @mm is NULL.
+ */
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+                                 struct mm_struct *mm, unsigned long va)
+{
+        struct {
+                struct mmuext_op op;
+                DECLARE_BITMAP(mask, NR_CPUS);
+        } *req;
+        struct multicall_space batch;
+        struct cpumask *targets;
+        BUG_ON(cpumask_empty(cpus));
+        BUG_ON(!mm);
+        batch = xen_mc_entry(sizeof(*req));
+        req = batch.args;
+        targets = to_cpumask(req->mask);
+        req->op.arg2.vcpumask = targets;
+        /* Remove us, and any offline CPUS. */
+        cpumask_and(targets, cpus, cpu_online_mask);
+        cpumask_clear_cpu(smp_processor_id(), targets);
+        if (va != TLB_FLUSH_ALL) {
+                req->op.cmd = MMUEXT_INVLPG_MULTI;
+                req->op.arg1.linear_addr = va;
+        } else {
+                req->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+        }
+        MULTI_mmuext_op(batch.mc, &req->op, 1, NULL, DOMID_SELF);
+        xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+/* Return this cpu's xen_cr3, i.e. the last cr3 value that was set. */
+static unsigned long xen_read_cr3(void)
+{
+        unsigned long cr3 = percpu_read(xen_cr3);
+        return cr3;
+}
+/* Multicall callback: record @v (a cr3 value passed as a pointer) in
+   this cpu's xen_current_cr3. */
+static void set_current_cr3(void *v)
+{
+        unsigned long cr3 = (unsigned long)v;
+        percpu_write(xen_current_cr3, cr3);
+}
+/*
+ * Queue a pagetable base switch on the current multicall batch.
+ * @kernel selects MMUEXT_NEW_BASEPTR (true) or MMUEXT_NEW_USER_BASEPTR
+ * (false).  A @cr3 of 0 is passed on as mfn 0, which triggers a
+ * warning when @kernel is set.
+ */
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
+{
+        struct multicall_space batch;
+        struct mmuext_op *op;
+        unsigned long mfn = cr3 ? pfn_to_mfn(PFN_DOWN(cr3)) : 0;
+        WARN_ON(mfn == 0 && kernel);
+        batch = __xen_mc_entry(sizeof(*op));
+        op = batch.args;
+        if (kernel)
+                op->cmd = MMUEXT_NEW_BASEPTR;
+        else
+                op->cmd = MMUEXT_NEW_USER_BASEPTR;
+        op->arg1.mfn = mfn;
+        MULTI_mmuext_op(batch.mc, op, 1, NULL, DOMID_SELF);
+        if (!kernel)
+                return;
+        percpu_write(xen_cr3, cr3);
+        /* Update xen_current_cr3 once the batch has actually
+           been submitted. */
+        xen_mc_callback(set_current_cr3, (void *)cr3);
+}
+/*
+ * write_cr3 hook: switch the kernel pagetable base to @cr3 and, on
+ * 64-bit, the user base to the matching user pgd (0 if there is none).
+ * BUGs if called while preemptible.
+ */
+static void xen_write_cr3(unsigned long cr3)
+{
+        BUG_ON(preemptible());
+        xen_mc_batch();  /* disables interrupts */
+        /* Update while interrupts are disabled, so its atomic with
+           respect to ipis */
+        percpu_write(xen_cr3, cr3);
+        __xen_write_cr3(true, cr3);
+#ifdef CONFIG_X86_64
+        {
+                pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+                unsigned long user_cr3 = user_pgd ? __pa(user_pgd) : 0;
+                __xen_write_cr3(false, user_cr3);
+        }
+#endif
+        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
+}
+/*
+ * pgd_alloc hook: set up Xen-specific state for a new mm's pgd.
+ *
+ * On 64-bit a separate zeroed page is allocated for the user-mode pgd,
+ * recorded in page->private of the kernel pgd's struct page, and given
+ * the level3_user_vsyscall entry.
+ *
+ * Returns 0 on success, or -ENOMEM if the user pgd page cannot be
+ * allocated.  BUGs if the pgd (or the new user pgd) is already pinned.
+ */
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+        pgd_t *pgd = mm->pgd;
+        int ret = 0;
+        BUG_ON(PagePinned(virt_to_page(pgd)));
+#ifdef CONFIG_X86_64
+        {
+                struct page *page = virt_to_page(pgd);
+                pgd_t *user_pgd;
+                BUG_ON(page->private != 0);
+                ret = -ENOMEM;
+                user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+                page->private = (unsigned long)user_pgd;
+                if (user_pgd != NULL) {
+                        user_pgd[pgd_index(VSYSCALL_START)] =
+                                __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+                        /*
+                         * Only check the user pgd when one was actually
+                         * allocated: on failure there is no page to
+                         * look up, and virt_to_page() on a NULL result
+                         * would inspect a bogus struct page.
+                         */
+                        BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+                        ret = 0;
+                }
+        }
+#endif
+        return ret;
+}
+/* pgd_free hook: on 64-bit, free the user-mode pgd page attached to
+   @pgd if there is one.  Does nothing on 32-bit. */
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+        pgd_t *user_pgd = xen_get_user_pgd(pgd);
+        if (user_pgd != NULL)
+                free_page((unsigned long)user_pgd);
+#endif
+}
+#ifdef CONFIG_HIGHPTE
+/* Atomically map a pte page: pinned pages are mapped PAGE_KERNEL_RO,
+   everything else PAGE_KERNEL. */
+static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+{
+        pgprot_t prot = PagePinned(page) ? PAGE_KERNEL_RO : PAGE_KERNEL;
+        /* Compiled-out debug trace (the condition is constant false). */
+        if (0 && PageHighMem(page))
+                printk("mapping highpte %lx type %d prot %s\n",
+                       page_to_pfn(page), type,
+                       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
+        return kmap_atomic_prot(page, type, prot);
+}
+#endif
+#ifdef CONFIG_X86_32
+/* Return @pte, with _PAGE_RW stripped if *@ptep already holds a
+   present entry that is not writable. */
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+        pteval_t existing = pte_val_ma(*ptep);
+        /* No existing pte: nothing to preserve. */
+        if (!(existing & _PAGE_PRESENT))
+                return pte;
+        /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+        return __pte_ma(((existing & _PAGE_RW) | ~_PAGE_RW) &
+                        pte_val_ma(pte));
+}
+/* Init-time set_pte used while the initial pagetables are being
+   constructed; it does not allow RO pagetable pages to be remapped RW */
+static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+        xen_set_pte(ptep, mask_rw_pte(ptep, pte));
+}
+#endif
+/* Early-boot alloc hook: while the initial pagetable is being set up,
+   everything is assumed pinned, so the new page is made read-only. */
+static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+{
+        void *ptpage = __va(PFN_PHYS(pfn));
+#ifdef CONFIG_FLATMEM
+        BUG_ON(mem_map);        /* should only be used early */
+#endif
+        make_lowmem_page_readonly(ptpage);
+}
+/* Early release hook: all pagetable pages are assumed pinned (only
+   init_mm exists, and anything attached to it is pinned), so the page
+   is simply made writable again. */
+static void xen_release_pte_init(unsigned long pfn)
+{
+        void *ptpage = __va(PFN_PHYS(pfn));
+        make_lowmem_page_readwrite(ptpage);
+}
+/* Issue the mmuext operation @cmd (a pin or unpin command) on the
+   machine frame backing @pfn; BUGs if the hypercall fails. */
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+        struct mmuext_op op;
+        int rc;
+        op.cmd = cmd;
+        op.arg1.mfn = pfn_to_mfn(pfn);
+        rc = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
+        if (rc != 0)
+                BUG();
+}
+/* The new pagetable page must be pinned iff it is being attached to a
+   pinned pagetable. */
+static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+{
+        struct page *page = pfn_to_page(pfn);
+        /* Unpinned pagetable: the new page needs no treatment. */
+        if (!PagePinned(virt_to_page(mm->pgd)))
+                return;
+        SetPagePinned(page);
+        vm_unmap_aliases();
+        if (PageHighMem(page)) {
+                /* make sure there are no stray mappings of
+                   this page */
+                kmap_flush_unused();
+                return;
+        }
+        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+        if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+                pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+}
+/* alloc_pte hook: xen_alloc_ptpage() at level PT_PTE. */
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+        xen_alloc_ptpage(mm, pfn, PT_PTE);
+}
+/* alloc_pmd hook: xen_alloc_ptpage() at level PT_PMD. */
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+        xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+/* This should never happen until we're OK to use struct page */
+static void xen_release_ptpage(unsigned long pfn, unsigned level)
+{
+        struct page *page = pfn_to_page(pfn);
+        /* Only pinned pages need undoing. */
+        if (!PagePinned(page))
+                return;
+        if (!PageHighMem(page)) {
+                /* Unpin first (if it was pinned individually), then
+                   make the page writable again. */
+                if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+                        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+                make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+        }
+        ClearPagePinned(page);
+}
+/* release_pte hook: xen_release_ptpage() at level PT_PTE. */
+static void xen_release_pte(unsigned long pfn)
+{
+        xen_release_ptpage(pfn, PT_PTE);
+}
+/* release_pmd hook: xen_release_ptpage() at level PT_PMD. */
+static void xen_release_pmd(unsigned long pfn)
+{
+        xen_release_ptpage(pfn, PT_PMD);
+}
+#if PAGETABLE_LEVELS == 4
+/* alloc_pud hook (4-level pagetables only): xen_alloc_ptpage() at
+   level PT_PUD. */
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+        xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+/* release_pud hook (4-level pagetables only): xen_release_ptpage() at
+   level PT_PUD. */
+static void xen_release_pud(unsigned long pfn)
+{
+        xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+/*
+ * On 32-bit, reserve the top of the address space from the
+ * hypervisor's start address upwards.  The start address is taken from
+ * XENVER_platform_parameters when that query succeeds, and defaults to
+ * HYPERVISOR_VIRT_START otherwise.  Does nothing on 64-bit.
+ */
+void __init xen_reserve_top(void)
+{
+#ifdef CONFIG_X86_32
+        struct xen_platform_parameters pp;
+        unsigned long top;
+        if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) != 0)
+                top = HYPERVISOR_VIRT_START;
+        else
+                top = pp.virt_start;
+        reserve_top_address(-top);
+#endif  /* CONFIG_X86_32 */
+}
+/*
+ * Like __va(), but returns an address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up).
+ * On 32-bit this is simply __va().
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+        return (void *)(__START_KERNEL_map + paddr);
+#else
+        return __va(paddr);
+#endif
+}
+/* Convert a machine address to a physical address.  Bits outside
+   PTE_PFN_MASK are dropped, so the result is page-aligned. */
+static unsigned long m2p(phys_addr_t maddr)
+{
+        phys_addr_t frame = maddr & PTE_PFN_MASK;
+        phys_addr_t paddr = mfn_to_pfn(frame >> PAGE_SHIFT) << PAGE_SHIFT;
+        return paddr;
+}
+/* Convert a machine address to a kernel virtual address: machine to
+   physical via m2p(), then into the kernel mapping via __ka(). */
+static void *m2v(phys_addr_t maddr)
+{
+        unsigned long paddr = m2p(maddr);
+        return __ka(paddr);
+}
+/* Remap the page at kernel address @addr with protection @prot; BUGs
+   if the update_va_mapping hypercall fails. */
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+        pte_t pte = pfn_pte(PFN_DOWN(__pa(addr)), prot);
+        int rc = HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0);
+        if (rc != 0)
+                BUG();
+}
+/*
+ * Fill in early identity mappings under @pmd for pfns below @max_pfn.
+ *
+ * Present pmd entries have their existing pte page reused; missing
+ * ones are backed by the next page of level1_ident_pgt, and the walk
+ * stops when that array is used up.  Only empty pte slots are
+ * written.  max_pfn_mapped is raised to the highest pfn visited.
+ * Finally the level1_ident_pgt pages that were used, and the pmd page
+ * itself, are remapped read-only.
+ */
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+        unsigned pmdidx, pteidx;
+        unsigned ident_pte;     /* next unused index in level1_ident_pgt */
+        unsigned long pfn;
+        ident_pte = 0;
+        pfn = 0;
+        for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+                pte_t *pte_page;
+                /* Reuse or allocate a page of ptes */
+                if (pmd_present(pmd[pmdidx]))
+                        pte_page = m2v(pmd[pmdidx].pmd);
+                else {
+                        /* Check for free pte pages */
+                        if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+                                break;
+                        pte_page = &level1_ident_pgt[ident_pte];
+                        ident_pte += PTRS_PER_PTE;
+                        pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+                }
+                /* Install mappings */
+                for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+                        pte_t pte;
+                        if (pfn > max_pfn_mapped)
+                                max_pfn_mapped = pfn;
+                        /* Leave entries that are already populated alone */
+                        if (!pte_none(pte_page[pteidx]))
+                                continue;
+                        pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+                        pte_page[pteidx] = pte;
+                }
+        }
+        /* Write-protect every pte page taken from level1_ident_pgt */
+        for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+                set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+        set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+#ifdef CONFIG_X86_64
+/* Pass every entry of the pagetable page @v through xen_make_pte().
+   All levels are converted the same way, so they are just treated
+   as ptes. */
+static void convert_pfn_mfn(void *v)
+{
+        pte_t *entry = v;
+        pte_t *end = entry + PTRS_PER_PTE;
+        for (; entry < end; entry++)
+                *entry = xen_make_pte(entry->pte);
+}
+/*
+ * Set up the initial kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working.  We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ *
+ * @pgd is the Xen-provided top-level pagetable; it is unpinned here.
+ * Returns init_level4_pgt, which has been pinned and switched to.
+ */
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+                                         unsigned long max_pfn)
+{
+        pud_t *l3;
+        pmd_t *l2;
+        /* Zap identity mapping */
+        init_level4_pgt[0] = __pgd(0);
+        /* Pre-constructed entries are in pfn, so convert to mfn */
+        convert_pfn_mfn(init_level4_pgt);
+        convert_pfn_mfn(level3_ident_pgt);
+        convert_pfn_mfn(level3_kernel_pgt);
+        /* Locate Xen's L2 for the kernel mapping and copy it */
+        l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+        l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+        memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+        memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+        l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+        l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+        memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+        /* Set up identity map */
+        xen_map_identity_early(level2_ident_pgt, max_pfn);
+        /* Make pagetable pieces RO */
+        set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+        set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+        set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+        set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+        set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+        set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+        /* Pin down new L4 */
+        pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+                          PFN_DOWN(__pa_symbol(init_level4_pgt)));
+        /* Unpin Xen-provided one */
+        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+        /* Switch over */
+        pgd = init_level4_pgt;
+        /*
+         * At this stage there can be no user pgd, and no page
+         * structure to attach it to, so make sure we just set kernel
+         * pgd.
+         */
+        xen_mc_batch();
+        __xen_write_cr3(true, __pa(pgd));
+        xen_mc_issue(PARAVIRT_LAZY_CPU);
+        /* Reserve the range covered by the Xen-provided pagetable frames */
+        reserve_early(__pa(xen_start_info->pt_base),
+                      __pa(xen_start_info->pt_base +
+                           xen_start_info->nr_pt_frames * PAGE_SIZE),
+                      "XEN PAGETABLES");
+        return pgd;
+}
+#else   /* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+/*
+ * 32-bit variant: set up the initial kernel pagetable.
+ *
+ * Xen's kernel pmd is copied into level2_kernel_pgt and extended with
+ * the early identity map; the Xen-provided @pgd is copied into
+ * swapper_pg_dir with the kernel slot pointed at level2_kernel_pgt.
+ * The old pgd is then unpinned, swapper_pg_dir is loaded and pinned.
+ * Returns swapper_pg_dir.
+ */
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+                                         unsigned long max_pfn)
+{
+        pmd_t *kernel_pmd;
+        /* Record where the Xen-provided pagetable frames live */
+        init_pg_tables_start = __pa(pgd);
+        init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+        max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+        kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+        memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+        xen_map_identity_early(level2_kernel_pgt, max_pfn);
+        memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+        set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+                        __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+        /* Pagetable pages are made read-only before being put to use */
+        set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+        set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+        set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+        xen_write_cr3(__pa(swapper_pg_dir));
+        pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+        return swapper_pg_dir;
+}
+#endif  /* CONFIG_X86_64 */
+/*
+ * set_fixmap hook: install a fixmap entry for slot @idx.
+ *
+ * @phys is converted to a frame number.  For the slots listed in the
+ * switch it is treated as a pfn (pfn_pte); for every other slot it is
+ * treated as a machine frame (mfn_pte).  On 64-bit, vsyscall slots are
+ * additionally written into level3_user_vsyscall.
+ */
+static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+{
+        pte_t pte;
+        phys >>= PAGE_SHIFT;
+        switch (idx) {
+        case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+#ifdef CONFIG_X86_F00F_BUG
+        case FIX_F00F_IDT:
+#endif
+#ifdef CONFIG_X86_32
+        case FIX_WP_TEST:
+        case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+        case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+        case FIX_APIC_BASE:     /* maps dummy local APIC */
+#endif
+                pte = pfn_pte(phys, prot);
+                break;
+        default:
+                pte = mfn_pte(phys, prot);
+                break;
+        }
+        __native_set_fixmap(idx, pte);
+#ifdef CONFIG_X86_64
+        /* Replicate changes to map the vsyscall page into the user
+           pagetable vsyscall mapping. */
+        if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+                unsigned long vaddr = __fix_to_virt(idx);
+                set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+        }
+#endif
+}
+/*
+ * Point the pv_mmu_ops set_* and alloc_*/release_* hooks at
+ * xen_set_pte/pmd/pud(/pgd) and xen_alloc_*/xen_release_*, mark
+ * level3_user_vsyscall pinned on 64-bit, then call
+ * xen_mark_init_mm_pinned().
+ * NOTE(review): presumably these replace the init-time entries
+ * (*_init, *_hyper) listed in xen_mmu_ops - confirm against the code
+ * that installs that table.
+ */
+__init void xen_post_allocator_init(void)
+{
+        pv_mmu_ops.set_pte = xen_set_pte;
+        pv_mmu_ops.set_pmd = xen_set_pmd;
+        pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+        pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+        /* This will work as long as patching hasn't happened yet
+           (which it hasn't) */
+        pv_mmu_ops.alloc_pte = xen_alloc_pte;
+        pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+        pv_mmu_ops.release_pte = xen_release_pte;
+        pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+        pv_mmu_ops.alloc_pud = xen_alloc_pud;
+        pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+#ifdef CONFIG_X86_64
+        SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+        xen_mark_init_mm_pinned();
+}
+/*
+ * Xen's table of pv_mmu_ops hooks.
+ *
+ * The alloc_*/release_* slots start out as xen_alloc_pte_init and
+ * xen_release_pte_init, and set_pmd/set_pud/set_pgd as the *_hyper
+ * variants; xen_post_allocator_init() assigns different handlers to
+ * the same-named pv_mmu_ops fields.  The pte/pmd/pud/pgd value
+ * conversion hooks are registered through PV_CALLEE_SAVE().
+ */
+const struct pv_mmu_ops xen_mmu_ops __initdata = {
+        .pagetable_setup_start = xen_pagetable_setup_start,
+        .pagetable_setup_done = xen_pagetable_setup_done,
+        .read_cr2 = xen_read_cr2,
+        .write_cr2 = xen_write_cr2,
+        .read_cr3 = xen_read_cr3,
+        .write_cr3 = xen_write_cr3,
+        .flush_tlb_user = xen_flush_tlb,
+        .flush_tlb_kernel = xen_flush_tlb,
+        .flush_tlb_single = xen_flush_tlb_single,
+        .flush_tlb_others = xen_flush_tlb_others,
+        .pte_update = paravirt_nop,
+        .pte_update_defer = paravirt_nop,
+        .pgd_alloc = xen_pgd_alloc,
+        .pgd_free = xen_pgd_free,
+        .alloc_pte = xen_alloc_pte_init,
+        .release_pte = xen_release_pte_init,
+        .alloc_pmd = xen_alloc_pte_init,
+        .alloc_pmd_clone = paravirt_nop,
+        .release_pmd = xen_release_pte_init,
+#ifdef CONFIG_HIGHPTE
+        .kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+#ifdef CONFIG_X86_64
+        .set_pte = xen_set_pte,
+#else
+        .set_pte = xen_set_pte_init,
+#endif
+        .set_pte_at = xen_set_pte_at,
+        .set_pmd = xen_set_pmd_hyper,
+        .ptep_modify_prot_start = __ptep_modify_prot_start,
+        .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+        .pte_val = PV_CALLEE_SAVE(xen_pte_val),
+        .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+        .make_pte = PV_CALLEE_SAVE(xen_make_pte),
+        .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+#ifdef CONFIG_X86_PAE
+        .set_pte_atomic = xen_set_pte_atomic,
+        .set_pte_present = xen_set_pte_at,
+        .pte_clear = xen_pte_clear,
+        .pmd_clear = xen_pmd_clear,
+#endif  /* CONFIG_X86_PAE */
+        .set_pud = xen_set_pud_hyper,
+        .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+        .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+#if PAGETABLE_LEVELS == 4
+        .pud_val = PV_CALLEE_SAVE(xen_pud_val),
+        .make_pud = PV_CALLEE_SAVE(xen_make_pud),
+        .set_pgd = xen_set_pgd_hyper,
+        .alloc_pud = xen_alloc_pte_init,
+        .release_pud = xen_release_pte_init,
+#endif  /* PAGETABLE_LEVELS == 4 */
+        .activate_mm = xen_activate_mm,
+        .dup_mmap = xen_dup_mmap,
+        .exit_mmap = xen_exit_mmap,
+        .lazy_mode = {
+                .enter = paravirt_enter_lazy_mmu,
+                .leave = xen_leave_lazy,
+        },
+        .set_fixmap = xen_set_fixmap,
+};
 #ifdef CONFIG_XEN_DEBUG_FS
 static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5a..24d1b44a337d 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
 void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
                                  pte_t *ptep, pte_t pte);
+unsigned long xen_read_cr2_direct(void);
+extern const struct pv_mmu_ops xen_mmu_ops;
 #endif  /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c738644b5435..8bff7e7c290b 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -39,6 +39,7 @@ struct mc_buffer {
        struct multicall_entry entries[MC_BATCH];
 #if MC_DEBUG
        struct multicall_entry debug[MC_BATCH];
+        void *caller[MC_BATCH];
 #endif
        unsigned char args[MC_ARGS];
        struct callback {
@@ -154,11 +155,12 @@ void xen_mc_flush(void)
                               ret, smp_processor_id());
                        dump_stack();
                        for (i = 0; i < b->mcidx; i++) {
-                                printk(KERN_DEBUG "  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+                                printk(KERN_DEBUG "  call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
                                       i+1, b->mcidx,
                                       b->debug[i].op,
                                       b->debug[i].args[0],
-                                       b->entries[i].result);
+                                       b->entries[i].result,
+                                       b->caller[i]);
                        }
                }
 #endif
@@ -168,8 +170,6 @@ void xen_mc_flush(void)
        } else
                BUG_ON(b->argidx != 0);
-        local_irq_restore(flags);
        for (i = 0; i < b->cbidx; i++) {
                struct callback *cb = &b->callbacks[i];
@@ -177,7 +177,9 @@ void xen_mc_flush(void)
        }
        b->cbidx = 0;
-        BUG_ON(ret);
+        local_irq_restore(flags);
+        WARN_ON(ret);
 }
 struct multicall_space __xen_mc_entry(size_t args)
@@ -197,6 +199,9 @@ struct multicall_space __xen_mc_entry(size_t args)
        }
        ret.mc = &b->entries[b->mcidx];
+#if MC_DEBUG
+        /* Remember who queued this entry; caller[] only exists in
+           struct mc_buffer when MC_DEBUG is non-zero, so the guard
+           must test its value rather than whether it is defined. */
+        b->caller[b->mcidx] = __builtin_return_address(0);
+#endif
        b->mcidx++;
        ret.args = &b->args[argidx];
        b->argidx = argidx + args;
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index fa3e10725d98..9e565da5d1f7 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -41,7 +41,7 @@ static inline void xen_mc_issue(unsigned mode)
                xen_mc_flush();
        /* restore flags saved in xen_mc_batch */
-        local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+        local_irq_restore(percpu_read(xen_mc_irq_flags));
 }
 /* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c44e2069c7c7..8d470562ffc9 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
 */
 static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
 {
-#ifdef CONFIG_X86_32
+        inc_irq_stat(irq_resched_count);
-        __get_cpu_var(irq_stat).irq_resched_count++;
-#else
-        add_pda(irq_resched_count, 1);
-#endif
        return IRQ_HANDLED;
 }
@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
        xen_setup_cpu_clockevents();
        cpu_set(cpu, cpu_online_map);
-        x86_write_percpu(cpu_state, CPU_ONLINE);
+        percpu_write(cpu_state, CPU_ONLINE);
        wmb();
        /* We can take interrupts now: we're officially "up". */
@@ -174,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
        /* We've switched to the "real" per-cpu gdt, so make sure the
           old memory can be recycled */
-        make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
+        make_lowmem_page_readwrite(xen_initial_gdt);
        xen_setup_vcpu_info_placement();
 }
@@ -223,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
        struct vcpu_guest_context *ctxt;
        struct desc_struct *gdt;
+        unsigned long gdt_mfn;
        if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
                return 0;
@@ -239,6 +236,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        ctxt->user_regs.ss = __KERNEL_DS;
 #ifdef CONFIG_X86_32
        ctxt->user_regs.fs = __KERNEL_PERCPU;
+#else
+        ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
        ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
        ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -250,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        ctxt->ldt_ents = 0;
        BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+        gdt_mfn = arbitrary_virt_to_mfn(gdt);
        make_lowmem_page_readonly(gdt);
+        make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
-        ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+        ctxt->gdt_frames[0] = gdt_mfn;
        ctxt->gdt_ents      = GDT_ENTRIES;
        ctxt->user_regs.cs = __KERNEL_CS;
@@ -283,23 +285,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
        struct task_struct *idle = idle_task(cpu);
        int rc;
-#ifdef CONFIG_X86_64
-        /* Allocate node local memory for AP pdas */
-        WARN_ON(cpu == 0);
-        if (cpu > 0) {
-                rc = get_local_pda(cpu);
-                if (rc)
-                        return rc;
-        }
-#endif
-#ifdef CONFIG_X86_32
-        init_gdt(cpu);
        per_cpu(current_task, cpu) = idle;
+#ifdef CONFIG_X86_32
        irq_ctx_init(cpu);
 #else
-        cpu_pda(cpu)->pcurrent = idle;
        clear_tsk_thread_flag(idle, TIF_FORK);
+        per_cpu(kernel_stack, cpu) =
+                (unsigned long)task_stack_page(idle) -
+                KERNEL_STACK_OFFSET + THREAD_SIZE;
 #endif
        xen_setup_timer(cpu);
        xen_init_lock_cpu(cpu);
@@ -445,11 +438,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
 {
        irq_enter();
        generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
+        inc_irq_stat(irq_call_count);
-        __get_cpu_var(irq_stat).irq_call_count++;
-#else
-        add_pda(irq_call_count, 1);
-#endif
        irq_exit();
        return IRQ_HANDLED;
@@ -459,11 +448,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 {
        irq_enter();
        generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
+        inc_irq_stat(irq_call_count);
-        __get_cpu_var(irq_stat).irq_call_count++;
-#else
-        add_pda(irq_call_count, 1);
-#endif
        irq_exit();
        return IRQ_HANDLED;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 212ffe012b76..95be7b434724 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -6,6 +6,7 @@
 #include <asm/xen/hypercall.h>
 #include <asm/xen/page.h>
+#include <asm/fixmap.h>
 #include "xen-ops.h"
 #include "mmu.h"
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..79d7362ad6d1
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,142 @@
+/*
+ * Asm versions of Xen pv-ops, suitable for either direct use or
+ * inlining.  The inline versions are the same as the direct-use
+ * versions, with the pre- and post-amble chopped off.
+ *
+ * This code is encoded for size rather than absolute efficiency, with
+ * a view to being able to inline as much as possible.
+ *
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
+ * operations here; the indirect forms are better handled in C, since
+ * they're generally too large to inline anyway.
+ */
+#include <asm/asm-offsets.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include "xen-asm.h"
+/*
+ * Enable events.  This clears the event mask and tests the pending
+ * event status with one AND operation.  If there are pending
+ * then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+        /* Unmask events */
+        movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+        /*
+         * Preempt here doesn't matter because that will deal with any
+         * pending interrupts.  The pending check may end up being run
+         * on the wrong CPU, but that doesn't hurt.
+         */
+        /* Test for pending */
+        testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+        jz 1f
+2:      call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+        ret
+        ENDPROC(xen_irq_enable_direct)
+        RELOC(xen_irq_enable_direct, 2b+1)
+/*
+ * Disabling events is simply a matter of making the event mask
+ * non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+        movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+        ret
+        ENDPROC(xen_irq_disable_direct)
+        RELOC(xen_irq_disable_direct, 0)
+/*
+ * (xen_)save_fl is used to get the current interrupt enable status.
+ * Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ * may be set in the return value.  We take advantage of this by
+ * making sure that X86_EFLAGS_IF has the right value (and other bits
+ * in that byte are 0), but other bits in the return value are
+ * undefined.  We need to toggle the state of the bit, because Xen and
+ * x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+        testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+        setz %ah
+        addb %ah, %ah
+ENDPATCH(xen_save_fl_direct)
+        ret
+        ENDPROC(xen_save_fl_direct)
+        RELOC(xen_save_fl_direct, 0)
+/*
+ * In principle the caller should be passing us a value returned from
+ * xen_save_fl_direct, but for robustness' sake we test only the
+ * X86_EFLAGS_IF flag rather than the whole byte. After setting the
+ * interrupt mask state, it checks for unmasked pending events and
+ * enters the hypervisor to get them delivered if so.
+ */
+ENTRY(xen_restore_fl_direct)
+#ifdef CONFIG_X86_64
+        testw $X86_EFLAGS_IF, %di
+#else
+        testb $X86_EFLAGS_IF>>8, %ah
+#endif
+        setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+        /*
+         * Preempt here doesn't matter because that will deal with any
+         * pending interrupts.  The pending check may end up being run
+         * on the wrong CPU, but that doesn't hurt.
+         */
+        /* check for unmasked and pending */
+        cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+        jz 1f
+2:      call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+        ret
+        ENDPROC(xen_restore_fl_direct)
+        RELOC(xen_restore_fl_direct, 2b+1)
+/*
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
+ */
+check_events:
+#ifdef CONFIG_X86_32
+        push %eax
+        push %ecx
+        push %edx
+        call xen_force_evtchn_callback
+        pop %edx
+        pop %ecx
+        pop %eax
+#else
+        push %rax
+        push %rcx
+        push %rdx
+        push %rsi
+        push %rdi
+        push %r8
+        push %r9
+        push %r10
+        push %r11
+        call xen_force_evtchn_callback
+        pop %r11
+        pop %r10
+        pop %r9
+        pop %r8
+        pop %rdi
+        pop %rsi
+        pop %rdx
+        pop %rcx
+        pop %rax
+#endif
+        ret
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 000000000000..465276467a47
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
+#ifndef _XEN_XEN_ASM_H
+#define _XEN_XEN_ASM_H
+#include <linux/linkage.h>
+#define RELOC(x, v)     .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)     .globl x##_end; x##_end=.
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI  0x80000000
+#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c0..88e15deb8b82 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,117 +1,43 @@
 /*
-        Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
-        The inline versions are the same as the direct-use versions, with the
+ * inlining.  The inline versions are the same as the direct-use
-        pre- and post-amble chopped off.
+ * versions, with the pre- and post-amble chopped off.
+ *
-        This code is encoded for size rather than absolute efficiency,
+ * This code is encoded for size rather than absolute efficiency, with
-        with a view to being able to inline as much as possible.
+ * a view to being able to inline as much as possible.
+ *
-        We only bother with direct forms (ie, vcpu in pda) of the operations
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
-        here; the indirect forms are better handled in C, since they're
+ * operations here; the indirect forms are better handled in C, since
-        generally too large to inline anyway.
+ * they're generally too large to inline anyway.
 */
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
 #include <xen/interface/xen.h>
-#define RELOC(x, v)     .globl x##_reloc; x##_reloc=v
+#include "xen-asm.h"
-#define ENDPATCH(x)     .globl x##_end; x##_end=.
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI  0x80000000
-/*
-        Enable events.  This clears the event mask and tests the pending
-        event status with one and operation.  If there are pending
-        events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
-        /* Unmask events */
-        movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-        /* Preempt here doesn't matter because that will deal with
-           any pending interrupts.  The pending check may end up being
-           run on the wrong CPU, but that doesn't hurt. */
-        /* Test for pending */
-        testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-        jz 1f
-2:      call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
-        ret
-        ENDPROC(xen_irq_enable_direct)
-        RELOC(xen_irq_enable_direct, 2b+1)
-/*
-        Disabling events is simply a matter of making the event mask
-        non-zero.
- */
-ENTRY(xen_irq_disable_direct)
-        movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
-        ret
-        ENDPROC(xen_irq_disable_direct)
-        RELOC(xen_irq_disable_direct, 0)
 /*
-        (xen_)save_fl is used to get the current interrupt enable status.
+ * Force an event check by making a hypercall, but preserve regs
-        Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ * before making the call.
-        may be set in the return value.  We take advantage of this by
-        making sure that X86_EFLAGS_IF has the right value (and other bits
-        in that byte are 0), but other bits in the return value are
-        undefined.  We need to toggle the state of the bit, because
-        Xen and x86 use opposite senses (mask vs enable).
 */
-ENTRY(xen_save_fl_direct)
+check_events:
-        testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+        push %eax
-        setz %ah
+        push %ecx
-        addb %ah,%ah
+        push %edx
-ENDPATCH(xen_save_fl_direct)
+        call xen_force_evtchn_callback
-        ret
+        pop %edx
-        ENDPROC(xen_save_fl_direct)
+        pop %ecx
-        RELOC(xen_save_fl_direct, 0)
+        pop %eax
-/*
-        In principle the caller should be passing us a value return
-        from xen_save_fl_direct, but for robustness sake we test only
-        the X86_EFLAGS_IF flag rather than the whole byte. After
-        setting the interrupt mask state, it checks for unmasked
-        pending events and enters the hypervisor to get them delivered
-        if so.
- */
-ENTRY(xen_restore_fl_direct)
-        testb $X86_EFLAGS_IF>>8, %ah
-        setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-        /* Preempt here doesn't matter because that will deal with
-           any pending interrupts.  The pending check may end up being
-           run on the wrong CPU, but that doesn't hurt. */
-        /* check for unmasked and pending */
-        cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
-        jz 1f
-2:      call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
        ret
-        ENDPROC(xen_restore_fl_direct)
-        RELOC(xen_restore_fl_direct, 2b+1)
 /*
-        We can't use sysexit directly, because we're not running in ring0.
+ * We can't use sysexit directly, because we're not running in ring0.
-        But we can easily fake it up using iret.  Assuming xen_sysexit
+ * But we can easily fake it up using iret.  Assuming xen_sysexit is
-        is jumped to with a standard stack frame, we can just strip it
+ * jumped to with a standard stack frame, we can just strip it back to
-        back to a standard iret frame and use iret.
+ * a standard iret frame and use iret.
 */
 ENTRY(xen_sysexit)
        movl PT_EAX(%esp), %eax                 /* Shouldn't be necessary? */
@@ -122,33 +48,31 @@ ENTRY(xen_sysexit)
 ENDPROC(xen_sysexit)
 /*
-        This is run where a normal iret would be run, with the same stack setup:
+ * This is run where a normal iret would be run, with the same stack setup:
-              8: eflags
+ *      8: eflags
-              4: cs
+ *      4: cs
-        esp-> 0: eip
+ *      esp-> 0: eip
+ *
-        This attempts to make sure that any pending events are dealt
+ * This attempts to make sure that any pending events are dealt with
-        with on return to usermode, but there is a small window in
+ * on return to usermode, but there is a small window in which an
-        which an event can happen just before entering usermode.  If
+ * event can happen just before entering usermode.  If the nested
-        the nested interrupt ends up setting one of the TIF_WORK_MASK
+ * interrupt ends up setting one of the TIF_WORK_MASK pending work
-        pending work flags, they will not be tested again before
+ * flags, they will not be tested again before returning to
-        returning to usermode. This means that a process can end up
+ * usermode. This means that a process can end up with pending work,
-        with pending work, which will be unprocessed until the process
+ * which will be unprocessed until the process enters and leaves the
-        enters and leaves the kernel again, which could be an
+ * kernel again, which could be an unbounded amount of time.  This
-        unbounded amount of time.  This means that a pending signal or
+ * means that a pending signal or reschedule event could be
-        reschedule event could be indefinitely delayed.
+ * indefinitely delayed.
+ *
-        The fix is to notice a nested interrupt in the critical
+ * The fix is to notice a nested interrupt in the critical window, and
-        window, and if one occurs, then fold the nested interrupt into
+ * if one occurs, then fold the nested interrupt into the current
-        the current interrupt stack frame, and re-process it
+ * interrupt stack frame, and re-process it iteratively rather than
-        iteratively rather than recursively.  This means that it will
+ * recursively.  This means that it will exit via the normal path, and
-        exit via the normal path, and all pending work will be dealt
+ * all pending work will be dealt with appropriately.
-        with appropriately.
+ *
+ * Because the nested interrupt handler needs to deal with the current
-        Because the nested interrupt handler needs to deal with the
+ * stack state in whatever form it's in, we keep things simple by only
-        current stack state in whatever form its in, we keep things
+ * using a single register which is pushed/popped on the stack.
-        simple by only using a single register which is pushed/popped
-        on the stack.
 */
 ENTRY(xen_iret)
        /* test eflags for special cases */
@@ -158,13 +82,15 @@ ENTRY(xen_iret)
        push %eax
        ESP_OFFSET=4    # bytes pushed onto stack
-        /* Store vcpu_info pointer for easy access.  Do it this
+        /*
-           way to avoid having to reload %fs */
+         * Store vcpu_info pointer for easy access.  Do it this way to
+         * avoid having to reload %fs
+         */
 #ifdef CONFIG_SMP
        GET_THREAD_INFO(%eax)
-        movl TI_cpu(%eax),%eax
+        movl TI_cpu(%eax), %eax
-        movl __per_cpu_offset(,%eax,4),%eax
+        movl __per_cpu_offset(,%eax,4), %eax
-        mov per_cpu__xen_vcpu(%eax),%eax
+        mov per_cpu__xen_vcpu(%eax), %eax
 #else
        movl per_cpu__xen_vcpu, %eax
 #endif
@@ -172,37 +98,46 @@ ENTRY(xen_iret)
        /* check IF state we're restoring */
        testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
-        /* Maybe enable events.  Once this happens we could get a
+        /*
-           recursive event, so the critical region starts immediately
+         * Maybe enable events.  Once this happens we could get a
-           afterwards.  However, if that happens we don't end up
+         * recursive event, so the critical region starts immediately
-           resuming the code, so we don't have to be worried about
+         * afterwards.  However, if that happens we don't end up
-           being preempted to another CPU. */
+         * resuming the code, so we don't have to be worried about
+         * being preempted to another CPU.
+         */
        setz XEN_vcpu_info_mask(%eax)
 xen_iret_start_crit:
        /* check for unmasked and pending */
        cmpw $0x0001, XEN_vcpu_info_pending(%eax)
-        /* If there's something pending, mask events again so we
+        /*
-           can jump back into xen_hypervisor_callback */
+         * If there's something pending, mask events again so we can
+         * jump back into xen_hypervisor_callback
+         */
        sete XEN_vcpu_info_mask(%eax)
        popl %eax
-        /* From this point on the registers are restored and the stack
+        /*
-           updated, so we don't need to worry about it if we're preempted */
+         * From this point on the registers are restored and the stack
+         * updated, so we don't need to worry about it if we're
+         * preempted
+         */
 iret_restore_end:
-        /* Jump to hypervisor_callback after fixing up the stack.
+        /*
-           Events are masked, so jumping out of the critical
+         * Jump to hypervisor_callback after fixing up the stack.
-           region is OK. */
+         * Events are masked, so jumping out of the critical region is
+         * OK.
+         */
        je xen_hypervisor_callback
 1:      iret
 xen_iret_end_crit:
-.section __ex_table,"a"
+.section __ex_table, "a"
        .align 4
-        .long 1b,iret_exc
+        .long 1b, iret_exc
 .previous
 hyper_iret:
@@ -212,55 +147,55 @@ hyper_iret:
        .globl xen_iret_start_crit, xen_iret_end_crit
 /*
-   This is called by xen_hypervisor_callback in entry.S when it sees
+ * This is called by xen_hypervisor_callback in entry.S when it sees
-   that the EIP at the time of interrupt was between xen_iret_start_crit
+ * that the EIP at the time of interrupt was between
-   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
+ * xen_iret_start_crit and xen_iret_end_crit.  We're passed the EIP in
-   a more refined determination of what to do.
+ * %eax so we can do a more refined determination of what to do.
+ *
-   The stack format at this point is:
+ * The stack format at this point is:
-        ----------------
+ *      ----------------
-         ss             : (ss/esp may be present if we came from usermode)
+ *       ss             : (ss/esp may be present if we came from usermode)
-         esp            :
+ *       esp            :
-         eflags         }  outer exception info
+ *       eflags         }  outer exception info
-         cs             }
+ *       cs             }
-         eip            }
+ *       eip            }
-        ---------------- <- edi (copy dest)
+ *      ---------------- <- edi (copy dest)
-         eax            :  outer eax if it hasn't been restored
+ *       eax            :  outer eax if it hasn't been restored
-        ----------------
+ *      ----------------
-         eflags         }  nested exception info
+ *       eflags         }  nested exception info
-         cs             }   (no ss/esp because we're nested
+ *       cs             }   (no ss/esp because we're nested
-         eip            }    from the same ring)
+ *       eip            }    from the same ring)
-         orig_eax       }<- esi (copy src)
+ *       orig_eax       }<- esi (copy src)
-         - - - - - - - -
+ *       - - - - - - - -
-         fs             }
+ *       fs             }
-         es             }
+ *       es             }
-         ds             }  SAVE_ALL state
+ *       ds             }  SAVE_ALL state
-         eax            }
+ *       eax            }
-          :             :
+ *        :             :
-         ebx            }<- esp
+ *       ebx            }<- esp
-        ----------------
+ *      ----------------
+ *
-   In order to deliver the nested exception properly, we need to shift
+ * In order to deliver the nested exception properly, we need to shift
-   everything from the return addr up to the error code so it
+ * everything from the return addr up to the error code so it sits
-   sits just under the outer exception info.  This means that when we
+ * just under the outer exception info.  This means that when we
-   handle the exception, we do it in the context of the outer exception
+ * handle the exception, we do it in the context of the outer
-   rather than starting a new one.
+ * exception rather than starting a new one.
+ *
-   The only caveat is that if the outer eax hasn't been
+ * The only caveat is that if the outer eax hasn't been restored yet
-   restored yet (ie, it's still on stack), we need to insert
+ * (ie, it's still on stack), we need to insert its value into the
-   its value into the SAVE_ALL state before going on, since
+ * SAVE_ALL state before going on, since it's usermode state which we
-   it's usermode state which we eventually need to restore.
+ * eventually need to restore.
 */
 ENTRY(xen_iret_crit_fixup)
        /*
-           Paranoia: Make sure we're really coming from kernel space.
+         * Paranoia: Make sure we're really coming from kernel space.
-           One could imagine a case where userspace jumps into the
+         * One could imagine a case where userspace jumps into the
-           critical range address, but just before the CPU delivers a GP,
+         * critical range address, but just before the CPU delivers a
-           it decides to deliver an interrupt instead.  Unlikely?
+         * GP, it decides to deliver an interrupt instead.  Unlikely?
-           Definitely.  Easy to avoid?  Yes.  The Intel documents
+         * Definitely.  Easy to avoid?  Yes.  The Intel documents
-           explicitly say that the reported EIP for a bad jump is the
+         * explicitly say that the reported EIP for a bad jump is the
-           jump instruction itself, not the destination, but some virtual
+         * jump instruction itself, not the destination, but some
-           environments get this wrong.
+         * virtual environments get this wrong.
         */
        movl PT_CS(%esp), %ecx
        andl $SEGMENT_RPL_MASK, %ecx
@@ -270,15 +205,17 @@ ENTRY(xen_iret_crit_fixup)
        lea PT_ORIG_EAX(%esp), %esi
        lea PT_EFLAGS(%esp), %edi
-        /* If eip is before iret_restore_end then stack
+        /*
-           hasn't been restored yet. */
+         * If eip is before iret_restore_end then stack
+         * hasn't been restored yet.
+         */
        cmp $iret_restore_end, %eax
        jae 1f
-        movl 0+4(%edi),%eax             /* copy EAX (just above top of frame) */
+        movl 0+4(%edi), %eax            /* copy EAX (just above top of frame) */
        movl %eax, PT_EAX(%esp)
-        lea ESP_OFFSET(%edi),%edi       /* move dest up over saved regs */
+        lea ESP_OFFSET(%edi), %edi      /* move dest up over saved regs */
        /* set up the copy */
 1:      std
@@ -286,20 +223,6 @@ ENTRY(xen_iret_crit_fixup)
        rep movsl
        cld
-        lea 4(%edi),%esp                /* point esp to new frame */
+        lea 4(%edi), %esp               /* point esp to new frame */
 2:      jmp xen_do_upcall
-/*
-        Force an event check by making a hypercall,
-        but preserve regs before making the call.
- */
-check_events:
-        push %eax
-        push %ecx
-        push %edx
-        call xen_force_evtchn_callback
-        pop %edx
-        pop %ecx
-        pop %eax
-        ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 05794c566e87..02f496a8dbaa 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,174 +1,45 @@
 /*
-        Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+ * Asm versions of Xen pv-ops, suitable for either direct use or
-        The inline versions are the same as the direct-use versions, with the
+ * inlining.  The inline versions are the same as the direct-use
-        pre- and post-amble chopped off.
+ * versions, with the pre- and post-amble chopped off.
+ *
-        This code is encoded for size rather than absolute efficiency,
+ * This code is encoded for size rather than absolute efficiency, with
-        with a view to being able to inline as much as possible.
+ * a view to being able to inline as much as possible.
+ *
-        We only bother with direct forms (ie, vcpu in pda) of the operations
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
-        here; the indirect forms are better handled in C, since they're
+ * operations here; the indirect forms are better handled in C, since
-        generally too large to inline anyway.
+ * they're generally too large to inline anyway.
 */
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-#include <asm/processor-flags.h>
 #include <asm/errno.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
 #include <asm/segment.h>
 #include <xen/interface/xen.h>
-#define RELOC(x, v)     .globl x##_reloc; x##_reloc=v
+#include "xen-asm.h"
-#define ENDPATCH(x)     .globl x##_end; x##_end=.
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI  0x80000000
-#if 1
-/*
-        x86-64 does not yet support direct access to percpu variables
-        via a segment override, so we just need to make sure this code
-        never gets used
- */
-#define BUG                     ud2a
-#define PER_CPU_VAR(var, off)   0xdeadbeef
-#endif
-/*
-        Enable events.  This clears the event mask and tests the pending
-        event status with one and operation.  If there are pending
-        events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
-        BUG
-        /* Unmask events */
-        movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-        /* Preempt here doesn't matter because that will deal with
-           any pending interrupts.  The pending check may end up being
-           run on the wrong CPU, but that doesn't hurt. */
-        /* Test for pending */
-        testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
-        jz 1f
-2:      call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
-        ret
-        ENDPROC(xen_irq_enable_direct)
-        RELOC(xen_irq_enable_direct, 2b+1)
-/*
-        Disabling events is simply a matter of making the event mask
-        non-zero.
- */
-ENTRY(xen_irq_disable_direct)
-        BUG
-        movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-ENDPATCH(xen_irq_disable_direct)
-        ret
-        ENDPROC(xen_irq_disable_direct)
-        RELOC(xen_irq_disable_direct, 0)
-/*
-        (xen_)save_fl is used to get the current interrupt enable status.
-        Callers expect the status to be in X86_EFLAGS_IF, and other bits
-        may be set in the return value.  We take advantage of this by
-        making sure that X86_EFLAGS_IF has the right value (and other bits
-        in that byte are 0), but other bits in the return value are
-        undefined.  We need to toggle the state of the bit, because
-        Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
-        BUG
-        testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-        setz %ah
-        addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
-        ret
-        ENDPROC(xen_save_fl_direct)
-        RELOC(xen_save_fl_direct, 0)
-/*
-        In principle the caller should be passing us a value return
-        from xen_save_fl_direct, but for robustness sake we test only
-        the X86_EFLAGS_IF flag rather than the whole byte. After
-        setting the interrupt mask state, it checks for unmasked
-        pending events and enters the hypervisor to get them delivered
-        if so.
- */
-ENTRY(xen_restore_fl_direct)
-        BUG
-        testb $X86_EFLAGS_IF>>8, %ah
-        setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
-        /* Preempt here doesn't matter because that will deal with
-           any pending interrupts.  The pending check may end up being
-           run on the wrong CPU, but that doesn't hurt. */
-        /* check for unmasked and pending */
-        cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
-        jz 1f
-2:      call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
-        ret
-        ENDPROC(xen_restore_fl_direct)
-        RELOC(xen_restore_fl_direct, 2b+1)
-/*
-        Force an event check by making a hypercall,
-        but preserve regs before making the call.
- */
-check_events:
-        push %rax
-        push %rcx
-        push %rdx
-        push %rsi
-        push %rdi
-        push %r8
-        push %r9
-        push %r10
-        push %r11
-        call xen_force_evtchn_callback
-        pop %r11
-        pop %r10
-        pop %r9
-        pop %r8
-        pop %rdi
-        pop %rsi
-        pop %rdx
-        pop %rcx
-        pop %rax
-        ret
 ENTRY(xen_adjust_exception_frame)
-        mov 8+0(%rsp),%rcx
+        mov 8+0(%rsp), %rcx
-        mov 8+8(%rsp),%r11
+        mov 8+8(%rsp), %r11
        ret $16
 hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
 /*
-        Xen64 iret frame:
+ * Xen64 iret frame:
+ *
-        ss
+ *      ss
-        rsp
+ *      rsp
-        rflags
+ *      rflags
-        cs
+ *      cs
-        rip             <-- standard iret frame
+ *      rip             <-- standard iret frame
+ *
-        flags
+ *      flags
+ *
-        rcx             }
+ *      rcx             }
-        r11             }<-- pushed by hypercall page
+ *      r11             }<-- pushed by hypercall page
-rsp ->  rax             }
+ * rsp->rax             }
 */
 ENTRY(xen_iret)
        pushq $0
@@ -177,8 +48,8 @@ ENDPATCH(xen_iret)
 RELOC(xen_iret, 1b+1)
 /*
-        sysexit is not used for 64-bit processes, so it's
+ * sysexit is not used for 64-bit processes, so it's only ever used to
-        only ever used to return to 32-bit compat userspace.
+ * return to 32-bit compat userspace.
 */
 ENTRY(xen_sysexit)
        pushq $__USER32_DS
@@ -193,13 +64,15 @@ ENDPATCH(xen_sysexit)
 RELOC(xen_sysexit, 1b+1)
 ENTRY(xen_sysret64)
-        /* We're already on the usermode stack at this point, but still
+        /*
-           with the kernel gs, so we can easily switch back */
+         * We're already on the usermode stack at this point, but
-        movq %rsp, %gs:pda_oldrsp
+         * still with the kernel gs, so we can easily switch back
-        movq %gs:pda_kernelstack,%rsp
+         */
+        movq %rsp, PER_CPU_VAR(old_rsp)
+        movq PER_CPU_VAR(kernel_stack), %rsp
        pushq $__USER_DS
-        pushq %gs:pda_oldrsp
+        pushq PER_CPU_VAR(old_rsp)
        pushq %r11
        pushq $__USER_CS
        pushq %rcx
@@ -210,13 +83,15 @@ ENDPATCH(xen_sysret64)
 RELOC(xen_sysret64, 1b+1)
 ENTRY(xen_sysret32)
-        /* We're already on the usermode stack at this point, but still
+        /*
-           with the kernel gs, so we can easily switch back */
+         * We're already on the usermode stack at this point, but
-        movq %rsp, %gs:pda_oldrsp
+         * still with the kernel gs, so we can easily switch back
-        movq %gs:pda_kernelstack, %rsp
+         */
+        movq %rsp, PER_CPU_VAR(old_rsp)
+        movq PER_CPU_VAR(kernel_stack), %rsp
        pushq $__USER32_DS
-        pushq %gs:pda_oldrsp
+        pushq PER_CPU_VAR(old_rsp)
        pushq %r11
        pushq $__USER32_CS
        pushq %rcx
@@ -227,28 +102,27 @@ ENDPATCH(xen_sysret32)
 RELOC(xen_sysret32, 1b+1)
 /*
-        Xen handles syscall callbacks much like ordinary exceptions,
+ * Xen handles syscall callbacks much like ordinary exceptions, which
-        which means we have:
+ * means we have:
-         - kernel gs
+ * - kernel gs
-         - kernel rsp
+ * - kernel rsp
-         - an iret-like stack frame on the stack (including rcx and r11):
+ * - an iret-like stack frame on the stack (including rcx and r11):
-                ss
+ *      ss
-                rsp
+ *      rsp
-                rflags
+ *      rflags
-                cs
+ *      cs
-                rip
+ *      rip
-                r11
+ *      r11
-        rsp->   rcx
+ * rsp->rcx
+ *
-        In all the entrypoints, we undo all that to make it look
+ * In all the entrypoints, we undo all that to make it look like a
-        like a CPU-generated syscall/sysenter and jump to the normal
+ * CPU-generated syscall/sysenter and jump to the normal entrypoint.
-        entrypoint.
 */
 .macro undo_xen_syscall
-        mov 0*8(%rsp),%rcx
+        mov 0*8(%rsp), %rcx
-        mov 1*8(%rsp),%r11
+        mov 1*8(%rsp), %r11
-        mov 5*8(%rsp),%rsp
+        mov 5*8(%rsp), %rsp
 .endm
 /* Normal 64-bit system call target */
@@ -275,7 +149,7 @@ ENDPROC(xen_sysenter_target)
 ENTRY(xen_syscall32_target)
 ENTRY(xen_sysenter_target)
-        lea 16(%rsp), %rsp      /* strip %rcx,%r11 */
+        lea 16(%rsp), %rsp      /* strip %rcx, %r11 */
        mov $-ENOSYS, %rax
        pushq $VGCF_in_syscall
        jmp hypercall_iret
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 63d49a523ed3..1a5ff24e29c0 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -8,7 +8,7 @@
 #include <asm/boot.h>
 #include <asm/asm.h>
-#include <asm/page.h>
+#include <asm/page_types.h>
 #include <xen/interface/elfnote.h>
 #include <asm/xen/interface.h>
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c1f8faf0a2c5..2f5ef2632ea2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];
+extern void *xen_initial_gdt;
 struct trap_info;
 void xen_copy_trap_info(struct trap_info *traps);
+DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
 DECLARE_PER_CPU(unsigned long, xen_current_cr3);
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
 void xen_setup_mfn_list_list(void);
 void xen_setup_shared_info(void);
+void xen_setup_machphys_mapping(void);
+pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+void xen_ident_map_ISA(void);
+void xen_reserve_top(void);
+void xen_leave_lazy(void);
+void xen_post_allocator_init(void);
 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
author	Ingo Molnar <mingo@elte.hu>	2009-03-26 16:39:17 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-03-27 12:28:43 -0400
commit	6e15cf04860074ad032e88c306bea656bbdd0f22 (patch)
tree	c346383bb7563e8d66b2f4a502f875b259c34870 /arch/x86/xen
parent	be0ea69674ed95e1e98cb3687a241badc756d228 (diff)
parent	60db56422043aaa455ac7f858ce23c273220f9d9 (diff)