Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-xen-next

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-xen-next: (52 commits)
  xen: add balloon driver
  xen: allow compilation with non-flat memory
  xen: fold xen_sysexit into xen_iret
  xen: allow set_pte_at on init_mm to be lockless
  xen: disable preemption during tlb flush
  xen pvfb: Para-virtual framebuffer, keyboard and pointer driver
  xen: Add compatibility aliases for frontend drivers
  xen: Module autoprobing support for frontend drivers
  xen blkfront: Delay wait for block devices until after the disk is added
  xen/blkfront: use bdget_disk
  xen: Make xen-blkfront write its protocol ABI to xenstore
  xen: import arch generic part of xencomm
  xen: make grant table arch portable
  xen: replace callers of alloc_vm_area()/free_vm_area() with xen_ prefixed one
  xen: make include/xen/page.h portable moving those definitions under asm dir
  xen: add resend_irq_on_evtchn() definition into events.c
  Xen: make events.c portable for ia64/xen support
  xen: move events.c to drivers/xen for IA64/Xen support
  xen: move features.c from arch/x86/xen/features.c to drivers/xen
  xen: add missing definitions in include/xen/interface/vcpu.h which ia64/xen needs
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2008-04-25 15:32:10 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-04-25 15:32:10 -0400
commit: 4b7227ca321ccf447cdc04538687c895db8b77f5 (patch)
tree: 72712127fc56aa2579e8a1508998bcabf6bd6c60 /arch/x86
parent: 5dae61b80564a5583ff4b56e357bdbc733fddb76 (diff)
parent: 1775826ceec51187aa868406585799b7e76ffa7d (diff)
23 files changed, 572 insertions, 987 deletions
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0f8934fc303..2a609dc3271c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -409,7 +409,7 @@ restore_nocheck_notrace:
 irq_return:
        INTERRUPT_RETURN
 .section .fixup,"ax"
-iret_exc:
+ENTRY(iret_exc)
        pushl $0                        # no error code
        pushl $do_iret_error
        jmp error_code
@@ -1017,6 +1017,13 @@ ENTRY(kernel_thread_helper)
 ENDPROC(kernel_thread_helper)
 #ifdef CONFIG_XEN
+/* Xen doesn't set %esp to be precisely what the normal sysenter
+   entrypoint expects, so fix it up before using the normal path. */
+ENTRY(xen_sysenter_target)
+        RING0_INT_FRAME
+        addl $5*4, %esp         /* remove xen-provided frame */
+        jmp sysenter_past_esp
 ENTRY(xen_hypervisor_callback)
        CFI_STARTPROC
        pushl $0
@@ -1035,8 +1042,9 @@ ENTRY(xen_hypervisor_callback)
        cmpl $xen_iret_end_crit,%eax
        jae  1f
-        call xen_iret_crit_fixup
+        jmp  xen_iret_crit_fixup
+ENTRY(xen_do_upcall)
 1:      mov %esp, %eax
        call xen_evtchn_do_upcall
        jmp  ret_from_intr
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3733412d1357..74f0c5ea2a03 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -366,11 +366,13 @@ struct pv_mmu_ops pv_mmu_ops = {
        .flush_tlb_single = native_flush_tlb_single,
        .flush_tlb_others = native_flush_tlb_others,
-        .alloc_pt = paravirt_nop,
+        .alloc_pte = paravirt_nop,
-        .alloc_pd = paravirt_nop,
+        .alloc_pmd = paravirt_nop,
-        .alloc_pd_clone = paravirt_nop,
+        .alloc_pmd_clone = paravirt_nop,
-        .release_pt = paravirt_nop,
+        .alloc_pud = paravirt_nop,
-        .release_pd = paravirt_nop,
+        .release_pte = paravirt_nop,
+        .release_pmd = paravirt_nop,
+        .release_pud = paravirt_nop,
        .set_pte = native_set_pte,
        .set_pte_at = native_set_pte_at,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 19c9386ac118..1791a751a772 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -8,6 +8,7 @@
 #include <asm/apic.h>
 #include <asm/desc.h>
 #include <asm/hpet.h>
+#include <asm/pgtable.h>
 #include <asm/reboot_fixups.h>
 #include <asm/reboot.h>
@@ -15,7 +16,6 @@
 # include <linux/dmi.h>
 # include <linux/ctype.h>
 # include <linux/mc146818rtc.h>
-# include <asm/pgtable.h>
 #else
 # include <asm/iommu.h>
 #endif
@@ -275,7 +275,7 @@ void machine_real_restart(unsigned char *code, int length)
        /* Remap the kernel at virtual address zero, as well as offset zero
           from the kernel segment.  This assumes the kernel segment starts at
           virtual address PAGE_OFFSET. */
-        memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+        memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
        /*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ade371f9663a..eef79e84145f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1039,8 +1039,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 #ifdef CONFIG_X86_32
        /* init low mem mapping */
-        clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+        clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
-                        min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
+                        min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
        flush_tlb_all();
 #endif
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 12affe1f9bce..956f38927aa7 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -320,7 +320,7 @@ static void check_zeroed_page(u32 pfn, int type, struct page *page)
         * pdes need to be zeroed.
         */
        if (type & VMI_PAGE_CLONE)
-                limit = USER_PTRS_PER_PGD;
+                limit = KERNEL_PGD_BOUNDARY;
        for (i = 0; i < limit; i++)
                BUG_ON(ptr[i]);
 }
@@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
 }
 #endif
-static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
 {
        vmi_set_page_type(pfn, VMI_PAGE_L1);
        vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
 }
-static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
 {
        /*
         * This call comes in very early, before mem_map is setup.
@@ -409,20 +409,20 @@ static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
        vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
 }
-static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
 {
        vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
        vmi_check_page_type(clonepfn, VMI_PAGE_L2);
        vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
 }
-static void vmi_release_pt(u32 pfn)
+static void vmi_release_pte(u32 pfn)
 {
        vmi_ops.release_page(pfn, VMI_PAGE_L1);
        vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
 }
-static void vmi_release_pd(u32 pfn)
+static void vmi_release_pmd(u32 pfn)
 {
        vmi_ops.release_page(pfn, VMI_PAGE_L2);
        vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -871,15 +871,15 @@ static inline int __init activate_vmi(void)
        vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
        if (vmi_ops.allocate_page) {
-                pv_mmu_ops.alloc_pt = vmi_allocate_pt;
+                pv_mmu_ops.alloc_pte = vmi_allocate_pte;
-                pv_mmu_ops.alloc_pd = vmi_allocate_pd;
+                pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
-                pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+                pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
        }
        vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
        if (vmi_ops.release_page) {
-                pv_mmu_ops.release_pt = vmi_release_pt;
+                pv_mmu_ops.release_pte = vmi_release_pte;
-                pv_mmu_ops.release_pd = vmi_release_pd;
+                pv_mmu_ops.release_pmd = vmi_release_pmd;
        }
        /* Set linear is needed in all cases */
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index d05722121d24..6e2c4efce0ef 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -543,8 +543,8 @@ static void __init do_boot_cpu(__u8 cpu)
                hijack_source.idt.Offset, stack_start.sp));
        /* init lowmem identity mapping */
-        clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+        clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
-                        min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
+                        min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
        flush_tlb_all();
        if (quad_boot) {
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 20941d2954e2..b7b3e4c7cfc9 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
 obj-y   :=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-            pat.o
+            pat.o pgtable.o
 obj-$(CONFIG_X86_32)            += pgtable_32.o
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9ec62da85fd7..08aa1878fad4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -71,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
        if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
                pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-                paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+                paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
                set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
                pud = pud_offset(pgd, 0);
                BUG_ON(pmd_table != pmd_offset(pud, 0));
@@ -100,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
                                (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
                }
-                paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+                paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
                set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
                BUG_ON(page_table != pte_offset_kernel(pmd, 0));
        }
@@ -365,7 +365,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
                pte_clear(NULL, va, pte);
        }
-        paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
+        paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
 }
 void __init native_pagetable_setup_done(pgd_t *base)
@@ -457,7 +457,7 @@ void zap_low_mappings(void)
         * Note that "pgd_clear()" doesn't do it for
         * us, because pgd_clear() is a no-op on i386.
         */
-        for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+        for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
 #ifdef CONFIG_X86_PAE
                set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
 #else
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 3a4baf95e24d..36a3f7ded626 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -407,7 +407,7 @@ void __init early_ioremap_clear(void)
        pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
        pmd_clear(pmd);
-        paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT);
+        paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
        __flush_tlb_all();
 }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index c29ebd037254..bd5e05c654dc 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -483,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
                goto out_unlock;
        pbase = (pte_t *)page_address(base);
-#ifdef CONFIG_X86_32
+        paravirt_alloc_pte(&init_mm, page_to_pfn(base));
-        paravirt_alloc_pt(&init_mm, page_to_pfn(base));
-#endif
        ref_prot = pte_pgprot(pte_clrhuge(*kpte));
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
index 000000000000..50159764f694
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,276 @@
+#include <linux/mm.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+        return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+}
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+        struct page *pte;
+#ifdef CONFIG_HIGHPTE
+        pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+        pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+        if (pte)
+                pgtable_page_ctor(pte);
+        return pte;
+}
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+        pgtable_page_dtor(pte);
+        paravirt_release_pte(page_to_pfn(pte));
+        tlb_remove_page(tlb, pte);
+}
+#if PAGETABLE_LEVELS > 2
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+        tlb_remove_page(tlb, virt_to_page(pmd));
+}
+#if PAGETABLE_LEVELS > 3
+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+        tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif  /* PAGETABLE_LEVELS > 3 */
+#endif  /* PAGETABLE_LEVELS > 2 */
+static inline void pgd_list_add(pgd_t *pgd)
+{
+        struct page *page = virt_to_page(pgd);
+        list_add(&page->lru, &pgd_list);
+}
+static inline void pgd_list_del(pgd_t *pgd)
+{
+        struct page *page = virt_to_page(pgd);
+        list_del(&page->lru);
+}
+#define UNSHARED_PTRS_PER_PGD                           \
+        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+static void pgd_ctor(void *p)
+{
+        pgd_t *pgd = p;
+        unsigned long flags;
+        /* Clear usermode parts of PGD */
+        memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
+        spin_lock_irqsave(&pgd_lock, flags);
+        /* If the pgd points to a shared pagetable level (either the
+           ptes in non-PAE, or shared PMD in PAE), then just copy the
+           references from swapper_pg_dir. */
+        if (PAGETABLE_LEVELS == 2 ||
+            (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+            PAGETABLE_LEVELS == 4) {
+                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+                                KERNEL_PGD_PTRS);
+                paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
+                                         __pa(swapper_pg_dir) >> PAGE_SHIFT,
+                                         KERNEL_PGD_BOUNDARY,
+                                         KERNEL_PGD_PTRS);
+        }
+        /* list required to sync kernel mapping updates */
+        if (!SHARED_KERNEL_PMD)
+                pgd_list_add(pgd);
+        spin_unlock_irqrestore(&pgd_lock, flags);
+}
+static void pgd_dtor(void *pgd)
+{
+        unsigned long flags; /* can be called from interrupt context */
+        if (SHARED_KERNEL_PMD)
+                return;
+        spin_lock_irqsave(&pgd_lock, flags);
+        pgd_list_del(pgd);
+        spin_unlock_irqrestore(&pgd_lock, flags);
+}
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+        int i;
+        for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+                pgd_t pgd = pgdp[i];
+                if (pgd_val(pgd) != 0) {
+                        pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+                        pgdp[i] = native_make_pgd(0);
+                        paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+                        pmd_free(mm, pmd);
+                }
+        }
+}
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update.  Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+        pud_t *pud;
+        unsigned long addr;
+        int i;
+        pud = pud_offset(pgd, 0);
+        for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+             i++, pud++, addr += PUD_SIZE) {
+                pmd_t *pmd = pmd_alloc_one(mm, addr);
+                if (!pmd) {
+                        pgd_mop_up_pmds(mm, pgd);
+                        return 0;
+                }
+                if (i >= KERNEL_PGD_BOUNDARY)
+                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+                               sizeof(pmd_t) * PTRS_PER_PMD);
+                pud_populate(mm, pud, pmd);
+        }
+        return 1;
+}
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+        /* Note: almost everything apart from _PAGE_PRESENT is
+           reserved at the pmd (PDPT) level. */
+        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+        /*
+         * According to Intel App note "TLBs, Paging-Structure Caches,
+         * and Their Invalidation", April 2007, document 317080-001,
+         * section 8.1: in PAE mode we explicitly have to flush the
+         * TLB via cr3 if the top-level pgd is changed...
+         */
+        if (mm == current->active_mm)
+                write_cr3(read_cr3());
+}
+#else  /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+        return 1;
+}
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+{
+}
+#endif  /* CONFIG_X86_PAE */
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+        pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+        /* so that alloc_pmd can use it */
+        mm->pgd = pgd;
+        if (pgd)
+                pgd_ctor(pgd);
+        if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+                pgd_dtor(pgd);
+                free_page((unsigned long)pgd);
+                pgd = NULL;
+        }
+        return pgd;
+}
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+        pgd_mop_up_pmds(mm, pgd);
+        pgd_dtor(pgd);
+        free_page((unsigned long)pgd);
+}
+int ptep_set_access_flags(struct vm_area_struct *vma,
+                          unsigned long address, pte_t *ptep,
+                          pte_t entry, int dirty)
+{
+        int changed = !pte_same(*ptep, entry);
+        if (changed && dirty) {
+                *ptep = entry;
+                pte_update_defer(vma->vm_mm, address, ptep);
+                flush_tlb_page(vma, address);
+        }
+        return changed;
+}
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+                              unsigned long addr, pte_t *ptep)
+{
+        int ret = 0;
+        if (pte_young(*ptep))
+                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+                                         &ptep->pte);
+        if (ret)
+                pte_update(vma->vm_mm, addr, ptep);
+        return ret;
+}
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+                           unsigned long address, pte_t *ptep)
+{
+        int young;
+        young = ptep_test_and_clear_young(vma, address, ptep);
+        if (young)
+                flush_tlb_page(vma, address);
+        return young;
+}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 6fb9e7c6893f..9ee007be9142 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -173,210 +173,6 @@ void reserve_top_address(unsigned long reserve)
        __VMALLOC_RESERVE += reserve;
 }
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
-        return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-}
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
-        struct page *pte;
-#ifdef CONFIG_HIGHPTE
-        pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
-#else
-        pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-#endif
-        if (pte)
-                pgtable_page_ctor(pte);
-        return pte;
-}
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * -- wli
- */
-static inline void pgd_list_add(pgd_t *pgd)
-{
-        struct page *page = virt_to_page(pgd);
-        list_add(&page->lru, &pgd_list);
-}
-static inline void pgd_list_del(pgd_t *pgd)
-{
-        struct page *page = virt_to_page(pgd);
-        list_del(&page->lru);
-}
-#define UNSHARED_PTRS_PER_PGD                           \
-        (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-static void pgd_ctor(void *p)
-{
-        pgd_t *pgd = p;
-        unsigned long flags;
-        /* Clear usermode parts of PGD */
-        memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-        spin_lock_irqsave(&pgd_lock, flags);
-        /* If the pgd points to a shared pagetable level (either the
-           ptes in non-PAE, or shared PMD in PAE), then just copy the
-           references from swapper_pg_dir. */
-        if (PAGETABLE_LEVELS == 2 ||
-            (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
-                clone_pgd_range(pgd + USER_PTRS_PER_PGD,
-                                swapper_pg_dir + USER_PTRS_PER_PGD,
-                                KERNEL_PGD_PTRS);
-                paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
-                                        __pa(swapper_pg_dir) >> PAGE_SHIFT,
-                                        USER_PTRS_PER_PGD,
-                                        KERNEL_PGD_PTRS);
-        }
-        /* list required to sync kernel mapping updates */
-        if (!SHARED_KERNEL_PMD)
-                pgd_list_add(pgd);
-        spin_unlock_irqrestore(&pgd_lock, flags);
-}
-static void pgd_dtor(void *pgd)
-{
-        unsigned long flags; /* can be called from interrupt context */
-        if (SHARED_KERNEL_PMD)
-                return;
-        spin_lock_irqsave(&pgd_lock, flags);
-        pgd_list_del(pgd);
-        spin_unlock_irqrestore(&pgd_lock, flags);
-}
-#ifdef CONFIG_X86_PAE
-/*
- * Mop up any pmd pages which may still be attached to the pgd.
- * Normally they will be freed by munmap/exit_mmap, but any pmd we
- * preallocate which never got a corresponding vma will need to be
- * freed manually.
- */
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-        int i;
-        for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
-                pgd_t pgd = pgdp[i];
-                if (pgd_val(pgd) != 0) {
-                        pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-                        pgdp[i] = native_make_pgd(0);
-                        paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
-                        pmd_free(mm, pmd);
-                }
-        }
-}
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update.  Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-        pud_t *pud;
-        unsigned long addr;
-        int i;
-        pud = pud_offset(pgd, 0);
-        for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-             i++, pud++, addr += PUD_SIZE) {
-                pmd_t *pmd = pmd_alloc_one(mm, addr);
-                if (!pmd) {
-                        pgd_mop_up_pmds(mm, pgd);
-                        return 0;
-                }
-                if (i >= USER_PTRS_PER_PGD)
-                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
-                               sizeof(pmd_t) * PTRS_PER_PMD);
-                pud_populate(mm, pud, pmd);
-        }
-        return 1;
-}
-#else  /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-        return 1;
-}
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-}
-#endif  /* CONFIG_X86_PAE */
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-        pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-        /* so that alloc_pd can use it */
-        mm->pgd = pgd;
-        if (pgd)
-                pgd_ctor(pgd);
-        if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
-                pgd_dtor(pgd);
-                free_page((unsigned long)pgd);
-                pgd = NULL;
-        }
-        return pgd;
-}
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-        pgd_mop_up_pmds(mm, pgd);
-        pgd_dtor(pgd);
-        free_page((unsigned long)pgd);
-}
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-{
-        pgtable_page_dtor(pte);
-        paravirt_release_pt(page_to_pfn(pte));
-        tlb_remove_page(tlb, pte);
-}
-#ifdef CONFIG_X86_PAE
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-{
-        paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-        tlb_remove_page(tlb, virt_to_page(pmd));
-}
-#endif
 int pmd_bad(pmd_t pmd)
 {
        WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 4d5f2649bee4..2e641be2737e 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,7 +6,7 @@ config XEN
        bool "Xen guest support"
        select PARAVIRT
        depends on X86_32
-        depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
+        depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER)
        help
          This is the Linux Xen port.  Enabling this will allow the
          kernel to boot in a paravirtualized environment under the
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 343df246bd3e..3d8df981d5fd 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
-obj-y           := enlighten.o setup.o features.o multicalls.o mmu.o \
+obj-y           := enlighten.o setup.o multicalls.o mmu.o \
-                        events.o time.o manage.o xen-asm.o
+                        time.o manage.o xen-asm.o grant-table.o
 obj-$(CONFIG_SMP)       += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c0388220cf97..c8a56e457d61 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -155,7 +155,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
        if (*ax == 1)
                maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
                            (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
-                            (1 << X86_FEATURE_SEP)  |  /* disable SEP */
+                            (1 << X86_FEATURE_MCE)  |  /* disable MCE */
+                            (1 << X86_FEATURE_MCA)  |  /* disable MCA */
                            (1 << X86_FEATURE_ACC));   /* thermal monitoring */
        asm(XEN_EMULATE_PREFIX "cpuid"
@@ -531,26 +532,37 @@ static void xen_apic_write(unsigned long reg, u32 val)
 static void xen_flush_tlb(void)
 {
        struct mmuext_op *op;
-        struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+        struct multicall_space mcs;
+        preempt_disable();
+        mcs = xen_mc_entry(sizeof(*op));
        op = mcs.args;
        op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
        xen_mc_issue(PARAVIRT_LAZY_MMU);
+        preempt_enable();
 }
 static void xen_flush_tlb_single(unsigned long addr)
 {
        struct mmuext_op *op;
-        struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+        struct multicall_space mcs;
+        preempt_disable();
+        mcs = xen_mc_entry(sizeof(*op));
        op = mcs.args;
        op->cmd = MMUEXT_INVLPG_LOCAL;
        op->arg1.linear_addr = addr & PAGE_MASK;
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
        xen_mc_issue(PARAVIRT_LAZY_MMU);
+        preempt_enable();
 }
 static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
@@ -655,15 +667,17 @@ static void xen_write_cr3(unsigned long cr3)
 /* Early in boot, while setting up the initial pagetable, assume
   everything is pinned. */
-static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
+static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
 {
+#ifdef CONFIG_FLATMEM
        BUG_ON(mem_map);        /* should only be used early */
+#endif
        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
-/* Early release_pt assumes that all pts are pinned, since there's
+/* Early release_pte assumes that all pts are pinned, since there's
   only init_mm and anything attached to that is pinned. */
-static void xen_release_pt_init(u32 pfn)
+static void xen_release_pte_init(u32 pfn)
 {
        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
@@ -697,12 +711,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
        }
 }
-static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pte(struct mm_struct *mm, u32 pfn)
 {
        xen_alloc_ptpage(mm, pfn, PT_PTE);
 }
-static void xen_alloc_pd(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
 {
        xen_alloc_ptpage(mm, pfn, PT_PMD);
 }
@@ -722,12 +736,12 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
        }
 }
-static void xen_release_pt(u32 pfn)
+static void xen_release_pte(u32 pfn)
 {
        xen_release_ptpage(pfn, PT_PTE);
 }
-static void xen_release_pd(u32 pfn)
+static void xen_release_pmd(u32 pfn)
 {
        xen_release_ptpage(pfn, PT_PMD);
 }
@@ -849,10 +863,10 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 {
        /* This will work as long as patching hasn't happened yet
           (which it hasn't) */
-        pv_mmu_ops.alloc_pt = xen_alloc_pt;
+        pv_mmu_ops.alloc_pte = xen_alloc_pte;
-        pv_mmu_ops.alloc_pd = xen_alloc_pd;
+        pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
-        pv_mmu_ops.release_pt = xen_release_pt;
+        pv_mmu_ops.release_pte = xen_release_pte;
-        pv_mmu_ops.release_pd = xen_release_pd;
+        pv_mmu_ops.release_pmd = xen_release_pmd;
        pv_mmu_ops.set_pte = xen_set_pte;
        setup_shared_info();
@@ -994,7 +1008,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .read_pmc = native_read_pmc,
        .iret = xen_iret,
-        .irq_enable_syscall_ret = NULL,  /* never called */
+        .irq_enable_syscall_ret = xen_sysexit,
        .load_tr_desc = paravirt_nop,
        .set_ldt = xen_set_ldt,
@@ -1059,11 +1073,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .pte_update = paravirt_nop,
        .pte_update_defer = paravirt_nop,
-        .alloc_pt = xen_alloc_pt_init,
+        .alloc_pte = xen_alloc_pte_init,
-        .release_pt = xen_release_pt_init,
+        .release_pte = xen_release_pte_init,
-        .alloc_pd = xen_alloc_pt_init,
+        .alloc_pmd = xen_alloc_pte_init,
-        .alloc_pd_clone = paravirt_nop,
+        .alloc_pmd_clone = paravirt_nop,
-        .release_pd = xen_release_pt_init,
+        .release_pmd = xen_release_pte_init,
 #ifdef CONFIG_HIGHPTE
        .kmap_atomic_pte = xen_kmap_atomic_pte,
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
deleted file mode 100644
index dcf613e17581..000000000000
--- a/arch/x86/xen/events.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Xen event channels
- *
- * Xen models interrupts with abstract event channels.  Because each
- * domain gets 1024 event channels, but NR_IRQ is not that large, we
- * must dynamically map irqs<->event channels.  The event channels
- * interface with the rest of the kernel by defining a xen interrupt
- * chip.  When an event is received, it is mapped to an irq and sent
- * through the normal interrupt processing path.
- *
- * There are four kinds of events which can be mapped to an event
- * channel:
- *
- * 1. Inter-domain notifications.  This includes all the virtual
- *    device events, since they're driven by front-ends in another domain
- *    (typically dom0).
- * 2. VIRQs, typically used for timers.  These are per-cpu events.
- * 3. IPIs.
- * 4. Hardware interrupts. Not supported at present.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-#include <linux/linkage.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <asm/ptrace.h>
-#include <asm/irq.h>
-#include <asm/sync_bitops.h>
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-#include <xen/events.h>
-#include <xen/interface/xen.h>
-#include <xen/interface/event_channel.h>
-#include "xen-ops.h"
-/*
- * This lock protects updates to the following mapping and reference-count
- * arrays. The lock does not need to be acquired to read the mapping tables.
- */
-static DEFINE_SPINLOCK(irq_mapping_update_lock);
-/* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
-/* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
-/* Packed IRQ information: binding type, sub-type index, and event channel. */
-struct packed_irq
-{
-        unsigned short evtchn;
-        unsigned char index;
-        unsigned char type;
-};
-static struct packed_irq irq_info[NR_IRQS];
-/* Binding types. */
-enum {
-        IRQT_UNBOUND,
-        IRQT_PIRQ,
-        IRQT_VIRQ,
-        IRQT_IPI,
-        IRQT_EVTCHN
-};
-/* Convenient shorthand for packed representation of an unbound IRQ. */
-#define IRQ_UNBOUND     mk_irq_info(IRQT_UNBOUND, 0, 0)
-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
-        [0 ... NR_EVENT_CHANNELS-1] = -1
-};
-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
-static u8 cpu_evtchn[NR_EVENT_CHANNELS];
-/* Reference counts for bindings to IRQs. */
-static int irq_bindcount[NR_IRQS];
-/* Xen will never allocate port zero for any purpose. */
-#define VALID_EVTCHN(chn)       ((chn) != 0)
-/*
- * Force a proper event-channel callback from Xen after clearing the
- * callback mask. We do this in a very simple manner, by making a call
- * down into Xen. The pending flag will be checked by Xen on return.
- */
-void force_evtchn_callback(void)
-{
-        (void)HYPERVISOR_xen_version(0, NULL);
-}
-EXPORT_SYMBOL_GPL(force_evtchn_callback);
-static struct irq_chip xen_dynamic_chip;
-/* Constructor for packed IRQ information. */
-static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
-{
-        return (struct packed_irq) { evtchn, index, type };
-}
-/*
- * Accessors for packed IRQ information.
- */
-static inline unsigned int evtchn_from_irq(int irq)
-{
-        return irq_info[irq].evtchn;
-}
-static inline unsigned int index_from_irq(int irq)
-{
-        return irq_info[irq].index;
-}
-static inline unsigned int type_from_irq(int irq)
-{
-        return irq_info[irq].type;
-}
-static inline unsigned long active_evtchns(unsigned int cpu,
-                                           struct shared_info *sh,
-                                           unsigned int idx)
-{
-        return (sh->evtchn_pending[idx] &
-                cpu_evtchn_mask[cpu][idx] &
-                ~sh->evtchn_mask[idx]);
-}
-static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
-{
-        int irq = evtchn_to_irq[chn];
-        BUG_ON(irq == -1);
-#ifdef CONFIG_SMP
-        irq_desc[irq].affinity = cpumask_of_cpu(cpu);
-#endif
-        __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
-        __set_bit(chn, cpu_evtchn_mask[cpu]);
-        cpu_evtchn[chn] = cpu;
-}
-static void init_evtchn_cpu_bindings(void)
-{
-#ifdef CONFIG_SMP
-        int i;
-        /* By default all event channels notify CPU#0. */
-        for (i = 0; i < NR_IRQS; i++)
-                irq_desc[i].affinity = cpumask_of_cpu(0);
-#endif
-        memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
-        memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
-}
-static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
-{
-        return cpu_evtchn[evtchn];
-}
-static inline void clear_evtchn(int port)
-{
-        struct shared_info *s = HYPERVISOR_shared_info;
-        sync_clear_bit(port, &s->evtchn_pending[0]);
-}
-static inline void set_evtchn(int port)
-{
-        struct shared_info *s = HYPERVISOR_shared_info;
-        sync_set_bit(port, &s->evtchn_pending[0]);
-}
-/**
- * notify_remote_via_irq - send event to remote end of event channel via irq
- * @irq: irq of event channel to send event to
- *
- * Unlike notify_remote_via_evtchn(), this is safe to use across
- * save/restore. Notifications on a broken connection are silently
- * dropped.
- */
-void notify_remote_via_irq(int irq)
-{
-        int evtchn = evtchn_from_irq(irq);
-        if (VALID_EVTCHN(evtchn))
-                notify_remote_via_evtchn(evtchn);
-}
-EXPORT_SYMBOL_GPL(notify_remote_via_irq);
-static void mask_evtchn(int port)
-{
-        struct shared_info *s = HYPERVISOR_shared_info;
-        sync_set_bit(port, &s->evtchn_mask[0]);
-}
-static void unmask_evtchn(int port)
-{
-        struct shared_info *s = HYPERVISOR_shared_info;
-        unsigned int cpu = get_cpu();
-        BUG_ON(!irqs_disabled());
-        /* Slow path (hypercall) if this is a non-local port. */
-        if (unlikely(cpu != cpu_from_evtchn(port))) {
-                struct evtchn_unmask unmask = { .port = port };
-                (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
-        } else {
-                struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
-                sync_clear_bit(port, &s->evtchn_mask[0]);
-                /*
-                 * The following is basically the equivalent of
-                 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
-                 * the interrupt edge' if the channel is masked.
-                 */
-                if (sync_test_bit(port, &s->evtchn_pending[0]) &&
-                    !sync_test_and_set_bit(port / BITS_PER_LONG,
-                                           &vcpu_info->evtchn_pending_sel))
-                        vcpu_info->evtchn_upcall_pending = 1;
-        }
-        put_cpu();
-}
-static int find_unbound_irq(void)
-{
-        int irq;
-        /* Only allocate from dynirq range */
-        for (irq = 0; irq < NR_IRQS; irq++)
-                if (irq_bindcount[irq] == 0)
-                        break;
-        if (irq == NR_IRQS)
-                panic("No available IRQ to bind to: increase NR_IRQS!\n");
-        return irq;
-}
-int bind_evtchn_to_irq(unsigned int evtchn)
-{
-        int irq;
-        spin_lock(&irq_mapping_update_lock);
-        irq = evtchn_to_irq[evtchn];
-        if (irq == -1) {
-                irq = find_unbound_irq();
-                dynamic_irq_init(irq);
-                set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-                                              handle_level_irq, "event");
-                evtchn_to_irq[evtchn] = irq;
-                irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
-        }
-        irq_bindcount[irq]++;
-        spin_unlock(&irq_mapping_update_lock);
-        return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
-static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
-{
-        struct evtchn_bind_ipi bind_ipi;
-        int evtchn, irq;
-        spin_lock(&irq_mapping_update_lock);
-        irq = per_cpu(ipi_to_irq, cpu)[ipi];
-        if (irq == -1) {
-                irq = find_unbound_irq();
-                if (irq < 0)
-                        goto out;
-                dynamic_irq_init(irq);
-                set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-                                              handle_level_irq, "ipi");
-                bind_ipi.vcpu = cpu;
-                if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
-                                                &bind_ipi) != 0)
-                        BUG();
-                evtchn = bind_ipi.port;
-                evtchn_to_irq[evtchn] = irq;
-                irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
-                per_cpu(ipi_to_irq, cpu)[ipi] = irq;
-                bind_evtchn_to_cpu(evtchn, cpu);
-        }
-        irq_bindcount[irq]++;
- out:
-        spin_unlock(&irq_mapping_update_lock);
-        return irq;
-}
-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
-{
-        struct evtchn_bind_virq bind_virq;
-        int evtchn, irq;
-        spin_lock(&irq_mapping_update_lock);
-        irq = per_cpu(virq_to_irq, cpu)[virq];
-        if (irq == -1) {
-                bind_virq.virq = virq;
-                bind_virq.vcpu = cpu;
-                if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
-                                                &bind_virq) != 0)
-                        BUG();
-                evtchn = bind_virq.port;
-                irq = find_unbound_irq();
-                dynamic_irq_init(irq);
-                set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
-                                              handle_level_irq, "virq");
-                evtchn_to_irq[evtchn] = irq;
-                irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
-                per_cpu(virq_to_irq, cpu)[virq] = irq;
-                bind_evtchn_to_cpu(evtchn, cpu);
-        }
-        irq_bindcount[irq]++;
-        spin_unlock(&irq_mapping_update_lock);
-        return irq;
-}
-static void unbind_from_irq(unsigned int irq)
-{
-        struct evtchn_close close;
-        int evtchn = evtchn_from_irq(irq);
-        spin_lock(&irq_mapping_update_lock);
-        if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
-                close.port = evtchn;
-                if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
-                        BUG();
-                switch (type_from_irq(irq)) {
-                case IRQT_VIRQ:
-                        per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
-                                [index_from_irq(irq)] = -1;
-                        break;
-                default:
-                        break;
-                }
-                /* Closed ports are implicitly re-bound to VCPU0. */
-                bind_evtchn_to_cpu(evtchn, 0);
-                evtchn_to_irq[evtchn] = -1;
-                irq_info[irq] = IRQ_UNBOUND;
-                dynamic_irq_init(irq);
-        }
-        spin_unlock(&irq_mapping_update_lock);
-}
-int bind_evtchn_to_irqhandler(unsigned int evtchn,
-                              irq_handler_t handler,
-                              unsigned long irqflags,
-                              const char *devname, void *dev_id)
-{
-        unsigned int irq;
-        int retval;
-        irq = bind_evtchn_to_irq(evtchn);
-        retval = request_irq(irq, handler, irqflags, devname, dev_id);
-        if (retval != 0) {
-                unbind_from_irq(irq);
-                return retval;
-        }
-        return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
-                            irq_handler_t handler,
-                            unsigned long irqflags, const char *devname, void *dev_id)
-{
-        unsigned int irq;
-        int retval;
-        irq = bind_virq_to_irq(virq, cpu);
-        retval = request_irq(irq, handler, irqflags, devname, dev_id);
-        if (retval != 0) {
-                unbind_from_irq(irq);
-                return retval;
-        }
-        return irq;
-}
-EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
-int bind_ipi_to_irqhandler(enum ipi_vector ipi,
-                           unsigned int cpu,
-                           irq_handler_t handler,
-                           unsigned long irqflags,
-                           const char *devname,
-                           void *dev_id)
-{
-        int irq, retval;
-        irq = bind_ipi_to_irq(ipi, cpu);
-        if (irq < 0)
-                return irq;
-        retval = request_irq(irq, handler, irqflags, devname, dev_id);
-        if (retval != 0) {
-                unbind_from_irq(irq);
-                return retval;
-        }
-        return irq;
-}
-void unbind_from_irqhandler(unsigned int irq, void *dev_id)
-{
-        free_irq(irq, dev_id);
-        unbind_from_irq(irq);
-}
-EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
-void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
-{
-        int irq = per_cpu(ipi_to_irq, cpu)[vector];
-        BUG_ON(irq < 0);
-        notify_remote_via_irq(irq);
-}
-/*
- * Search the CPUs pending events bitmasks.  For each one found, map
- * the event number to an irq, and feed it into do_IRQ() for
- * handling.
- *
- * Xen uses a two-level bitmap to speed searching.  The first level is
- * a bitset of words which contain pending event bits.  The second
- * level is a bitset of pending events themselves.
- */
-void xen_evtchn_do_upcall(struct pt_regs *regs)
-{
-        int cpu = get_cpu();
-        struct shared_info *s = HYPERVISOR_shared_info;
-        struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
-        unsigned long pending_words;
-        vcpu_info->evtchn_upcall_pending = 0;
-        /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
-        pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
-        while (pending_words != 0) {
-                unsigned long pending_bits;
-                int word_idx = __ffs(pending_words);
-                pending_words &= ~(1UL << word_idx);
-                while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
-                        int bit_idx = __ffs(pending_bits);
-                        int port = (word_idx * BITS_PER_LONG) + bit_idx;
-                        int irq = evtchn_to_irq[port];
-                        if (irq != -1) {
-                                regs->orig_ax = ~irq;
-                                do_IRQ(regs);
-                        }
-                }
-        }
-        put_cpu();
-}
-/* Rebind an evtchn so that it gets delivered to a specific cpu */
-static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
-{
-        struct evtchn_bind_vcpu bind_vcpu;
-        int evtchn = evtchn_from_irq(irq);
-        if (!VALID_EVTCHN(evtchn))
-                return;
-        /* Send future instances of this interrupt to other vcpu. */
-        bind_vcpu.port = evtchn;
-        bind_vcpu.vcpu = tcpu;
-        /*
-         * If this fails, it usually just indicates that we're dealing with a
-         * virq or IPI channel, which don't actually need to be rebound. Ignore
-         * it, but don't do the xenlinux-level rebind in that case.
-         */
-        if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
-                bind_evtchn_to_cpu(evtchn, tcpu);
-}
-static void set_affinity_irq(unsigned irq, cpumask_t dest)
-{
-        unsigned tcpu = first_cpu(dest);
-        rebind_irq_to_cpu(irq, tcpu);
-}
-static void enable_dynirq(unsigned int irq)
-{
-        int evtchn = evtchn_from_irq(irq);
-        if (VALID_EVTCHN(evtchn))
-                unmask_evtchn(evtchn);
-}
-static void disable_dynirq(unsigned int irq)
-{
-        int evtchn = evtchn_from_irq(irq);
-        if (VALID_EVTCHN(evtchn))
-                mask_evtchn(evtchn);
-}
-static void ack_dynirq(unsigned int irq)
-{
-        int evtchn = evtchn_from_irq(irq);
-        move_native_irq(irq);
-        if (VALID_EVTCHN(evtchn))
-                clear_evtchn(evtchn);
-}
-static int retrigger_dynirq(unsigned int irq)
-{
-        int evtchn = evtchn_from_irq(irq);
-        int ret = 0;
-        if (VALID_EVTCHN(evtchn)) {
-                set_evtchn(evtchn);
-                ret = 1;
-        }
-        return ret;
-}
-static struct irq_chip xen_dynamic_chip __read_mostly = {
-        .name           = "xen-dyn",
-        .mask           = disable_dynirq,
-        .unmask         = enable_dynirq,
-        .ack            = ack_dynirq,
-        .set_affinity   = set_affinity_irq,
-        .retrigger      = retrigger_dynirq,
-};
-void __init xen_init_IRQ(void)
-{
-        int i;
-        init_evtchn_cpu_bindings();
-        /* No event channels are 'live' right now. */
-        for (i = 0; i < NR_EVENT_CHANNELS; i++)
-                mask_evtchn(i);
-        /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-        for (i = 0; i < NR_IRQS; i++)
-                irq_bindcount[i] = 0;
-        irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c
deleted file mode 100644
index 0707714e40d6..000000000000
--- a/arch/x86/xen/features.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/******************************************************************************
- * features.c
- *
- * Xen feature flags.
- *
- * Copyright (c) 2006, Ian Campbell, XenSource Inc.
- */
-#include <linux/types.h>
-#include <linux/cache.h>
-#include <linux/module.h>
-#include <asm/xen/hypervisor.h>
-#include <xen/features.h>
-u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
-EXPORT_SYMBOL_GPL(xen_features);
-void xen_setup_features(void)
-{
-        struct xen_feature_info fi;
-        int i, j;
-        for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
-                fi.submap_idx = i;
-                if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
-                        break;
-                for (j = 0; j < 32; j++)
-                        xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
-        }
-}
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
new file mode 100644
index 000000000000..49ba9b5224d1
--- /dev/null
+++ b/arch/x86/xen/grant-table.c
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * grant-table.c
+ * x86 specific part
+ *
+ * Granting foreign access to our memory reservation.
+ *
+ * Copyright (c) 2005-2006, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan. Split out x86 specific part.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <xen/interface/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <asm/pgtable.h>
+static int map_pte_fn(pte_t *pte, struct page *pmd_page,
+                      unsigned long addr, void *data)
+{
+        unsigned long **frames = (unsigned long **)data;
+        set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+        (*frames)++;
+        return 0;
+}
+static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
+                        unsigned long addr, void *data)
+{
+        set_pte_at(&init_mm, addr, pte, __pte(0));
+        return 0;
+}
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+                           unsigned long max_nr_gframes,
+                           struct grant_entry **__shared)
+{
+        int rc;
+        struct grant_entry *shared = *__shared;
+        if (shared == NULL) {
+                struct vm_struct *area =
+                        xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes);
+                BUG_ON(area == NULL);
+                shared = area->addr;
+                *__shared = shared;
+        }
+        rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+                                 PAGE_SIZE * nr_gframes,
+                                 map_pte_fn, &frames);
+        return rc;
+}
+void arch_gnttab_unmap_shared(struct grant_entry *shared,
+                              unsigned long nr_gframes)
+{
+        apply_to_page_range(&init_mm, (unsigned long)shared,
+                            PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
+}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 2a054ef2a3da..6cbcf65609ad 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -156,6 +156,10 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval)
 {
+        /* updates to init_mm may be done without lock */
+        if (mm == &init_mm)
+                preempt_disable();
        if (mm == current->mm || mm == &init_mm) {
                if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
                        struct multicall_space mcs;
@@ -163,14 +167,61 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                        MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
                        xen_mc_issue(PARAVIRT_LAZY_MMU);
-                        return;
+                        goto out;
                } else
                        if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
-                                return;
+                                goto out;
        }
        xen_set_pte(ptep, pteval);
+out:
+        if (mm == &init_mm)
+                preempt_enable();
+}
+pteval_t xen_pte_val(pte_t pte)
+{
+        pteval_t ret = pte.pte;
+        if (ret & _PAGE_PRESENT)
+                ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+        return ret;
+}
+pgdval_t xen_pgd_val(pgd_t pgd)
+{
+        pgdval_t ret = pgd.pgd;
+        if (ret & _PAGE_PRESENT)
+                ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+        return ret;
+}
+pte_t xen_make_pte(pteval_t pte)
+{
+        if (pte & _PAGE_PRESENT) {
+                pte = phys_to_machine(XPADDR(pte)).maddr;
+                pte &= ~(_PAGE_PCD | _PAGE_PWT);
+        }
+        return (pte_t){ .pte = pte };
 }
+pgd_t xen_make_pgd(pgdval_t pgd)
+{
+        if (pgd & _PAGE_PRESENT)
+                pgd = phys_to_machine(XPADDR(pgd)).maddr;
+        return (pgd_t){ pgd };
+}
+pmdval_t xen_pmd_val(pmd_t pmd)
+{
+        pmdval_t ret = native_pmd_val(pmd);
+        if (ret & _PAGE_PRESENT)
+                ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+        return ret;
+}
 #ifdef CONFIG_X86_PAE
 void xen_set_pud(pud_t *ptr, pud_t val)
 {
@@ -214,100 +265,18 @@ void xen_pmd_clear(pmd_t *pmdp)
        xen_set_pmd(pmdp, __pmd(0));
 }
-unsigned long long xen_pte_val(pte_t pte)
+pmd_t xen_make_pmd(pmdval_t pmd)
 {
-        unsigned long long ret = 0;
+        if (pmd & _PAGE_PRESENT)
-        if (pte.pte_low) {
-                ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
-                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
-        }
-        return ret;
-}
-unsigned long long xen_pmd_val(pmd_t pmd)
-{
-        unsigned long long ret = pmd.pmd;
-        if (ret)
-                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
-        return ret;
-}
-unsigned long long xen_pgd_val(pgd_t pgd)
-{
-        unsigned long long ret = pgd.pgd;
-        if (ret)
-                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
-        return ret;
-}
-pte_t xen_make_pte(unsigned long long pte)
-{
-        if (pte & _PAGE_PRESENT) {
-                pte = phys_to_machine(XPADDR(pte)).maddr;
-                pte &= ~(_PAGE_PCD | _PAGE_PWT);
-        }
-        return (pte_t){ .pte = pte };
-}
-pmd_t xen_make_pmd(unsigned long long pmd)
-{
-        if (pmd & 1)
                pmd = phys_to_machine(XPADDR(pmd)).maddr;
-        return (pmd_t){ pmd };
+        return native_make_pmd(pmd);
-}
-pgd_t xen_make_pgd(unsigned long long pgd)
-{
-        if (pgd & _PAGE_PRESENT)
-                pgd = phys_to_machine(XPADDR(pgd)).maddr;
-        return (pgd_t){ pgd };
 }
 #else  /* !PAE */
 void xen_set_pte(pte_t *ptep, pte_t pte)
 {
        *ptep = pte;
 }
-unsigned long xen_pte_val(pte_t pte)
-{
-        unsigned long ret = pte.pte_low;
-        if (ret & _PAGE_PRESENT)
-                ret = machine_to_phys(XMADDR(ret)).paddr;
-        return ret;
-}
-unsigned long xen_pgd_val(pgd_t pgd)
-{
-        unsigned long ret = pgd.pgd;
-        if (ret)
-                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
-        return ret;
-}
-pte_t xen_make_pte(unsigned long pte)
-{
-        if (pte & _PAGE_PRESENT) {
-                pte = phys_to_machine(XPADDR(pte)).maddr;
-                pte &= ~(_PAGE_PCD | _PAGE_PWT);
-        }
-        return (pte_t){ pte };
-}
-pgd_t xen_make_pgd(unsigned long pgd)
-{
-        if (pgd & _PAGE_PRESENT)
-                pgd = phys_to_machine(XPADDR(pgd)).maddr;
-        return (pgd_t){ pgd };
-}
 #endif  /* CONFIG_X86_PAE */
 /*
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 2341492bf7a0..82517e4a752a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -16,6 +16,7 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+#include <xen/interface/callback.h>
 #include <xen/interface/physdev.h>
 #include <xen/features.h>
@@ -68,6 +69,24 @@ static void __init fiddle_vdso(void)
        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
 }
+void xen_enable_sysenter(void)
+{
+        int cpu = smp_processor_id();
+        extern void xen_sysenter_target(void);
+        /* Mask events on entry, even though they get enabled immediately */
+        static struct callback_register sysenter = {
+                .type = CALLBACKTYPE_sysenter,
+                .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
+                .flags = CALLBACKF_mask_events,
+        };
+        if (!boot_cpu_has(X86_FEATURE_SEP) ||
+            HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
+                clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
+                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
+        }
+}
 void __init xen_arch_setup(void)
 {
        struct physdev_set_iopl set_iopl;
@@ -82,6 +101,8 @@ void __init xen_arch_setup(void)
        HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
                                 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+        xen_enable_sysenter();
        set_iopl.iopl = 1;
        rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
        if (rc != 0)
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index e340ff92f6b6..92dd3dbf3ffb 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -36,8 +36,9 @@
 #include "mmu.h"
 static cpumask_t xen_cpu_initialized_map;
-static DEFINE_PER_CPU(int, resched_irq);
+static DEFINE_PER_CPU(int, resched_irq) = -1;
-static DEFINE_PER_CPU(int, callfunc_irq);
+static DEFINE_PER_CPU(int, callfunc_irq) = -1;
+static DEFINE_PER_CPU(int, debug_irq) = -1;
 /*
 * Structure and data for smp_call_function(). This is designed to minimise
@@ -72,6 +73,7 @@ static __cpuinit void cpu_bringup_and_idle(void)
        int cpu = smp_processor_id();
        cpu_init();
+        xen_enable_sysenter();
        preempt_disable();
        per_cpu(cpu_state, cpu) = CPU_ONLINE;
@@ -88,9 +90,7 @@ static __cpuinit void cpu_bringup_and_idle(void)
 static int xen_smp_intr_init(unsigned int cpu)
 {
        int rc;
-        const char *resched_name, *callfunc_name;
+        const char *resched_name, *callfunc_name, *debug_name;
-        per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
        resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
        rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -114,6 +114,14 @@ static int xen_smp_intr_init(unsigned int cpu)
                goto fail;
        per_cpu(callfunc_irq, cpu) = rc;
+        debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+        rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
+                                     IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
+                                     debug_name, NULL);
+        if (rc < 0)
+                goto fail;
+        per_cpu(debug_irq, cpu) = rc;
        return 0;
 fail:
@@ -121,6 +129,8 @@ static int xen_smp_intr_init(unsigned int cpu)
                unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
        if (per_cpu(callfunc_irq, cpu) >= 0)
                unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+        if (per_cpu(debug_irq, cpu) >= 0)
+                unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
        return rc;
 }
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index fe161ed4b01e..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -108,6 +108,20 @@ ENDPATCH(xen_restore_fl_direct)
        RELOC(xen_restore_fl_direct, 2b+1)
 /*
+        We can't use sysexit directly, because we're not running in ring0.
+        But we can easily fake it up using iret.  Assuming xen_sysexit
+        is jumped to with a standard stack frame, we can just strip it
+        back to a standard iret frame and use iret.
+ */
+ENTRY(xen_sysexit)
+        movl PT_EAX(%esp), %eax                 /* Shouldn't be necessary? */
+        orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
+        lea PT_EIP(%esp), %esp
+        jmp xen_iret
+ENDPROC(xen_sysexit)
+/*
        This is run where a normal iret would be run, with the same stack setup:
              8: eflags
              4: cs
@@ -184,8 +198,12 @@ iret_restore_end:
           region is OK. */
        je xen_hypervisor_callback
-        iret
+1:      iret
 xen_iret_end_crit:
+.section __ex_table,"a"
+        .align 4
+        .long 1b,iret_exc
+.previous
 hyper_iret:
        /* put this out of line since its very rarely used */
@@ -219,9 +237,7 @@ hyper_iret:
         ds             }  SAVE_ALL state
         eax            }
          :             :
-         ebx            }
+         ebx            }<- esp
-        ----------------
-         return addr     <- esp
        ----------------
   In order to deliver the nested exception properly, we need to shift
@@ -236,10 +252,8 @@ hyper_iret:
   it's usermode state which we eventually need to restore.
 */
 ENTRY(xen_iret_crit_fixup)
-        /* offsets +4 for return address */
        /*
-           Paranoia: Make sure we're really coming from userspace.
+           Paranoia: Make sure we're really coming from kernel space.
           One could imagine a case where userspace jumps into the
           critical range address, but just before the CPU delivers a GP,
           it decides to deliver an interrupt instead.  Unlikely?
@@ -248,32 +262,32 @@ ENTRY(xen_iret_crit_fixup)
           jump instruction itself, not the destination, but some virtual
           environments get this wrong.
         */
-        movl PT_CS+4(%esp), %ecx
+        movl PT_CS(%esp), %ecx
        andl $SEGMENT_RPL_MASK, %ecx
        cmpl $USER_RPL, %ecx
        je 2f
-        lea PT_ORIG_EAX+4(%esp), %esi
+        lea PT_ORIG_EAX(%esp), %esi
-        lea PT_EFLAGS+4(%esp), %edi
+        lea PT_EFLAGS(%esp), %edi
        /* If eip is before iret_restore_end then stack
           hasn't been restored yet. */
        cmp $iret_restore_end, %eax
        jae 1f
-        movl 0+4(%edi),%eax             /* copy EAX */
+        movl 0+4(%edi),%eax             /* copy EAX (just above top of frame) */
-        movl %eax, PT_EAX+4(%esp)
+        movl %eax, PT_EAX(%esp)
        lea ESP_OFFSET(%edi),%edi       /* move dest up over saved regs */
        /* set up the copy */
 1:      std
-        mov $(PT_EIP+4) / 4, %ecx       /* copy ret+saved regs up to orig_eax */
+        mov $PT_EIP / 4, %ecx           /* saved regs up to orig_eax */
        rep movsl
        cld
        lea 4(%edi),%esp                /* point esp to new frame */
-2:      ret
+2:      jmp xen_do_upcall
 /*
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 956a491ea998..f1063ae08037 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,8 @@
 #define XEN_OPS_H
 #include <linux/init.h>
+#include <linux/irqreturn.h>
+#include <xen/xen-ops.h>
 /* These are code, but not functions.  Defined in entry.S */
 extern const char xen_hypervisor_callback[];
@@ -9,7 +11,6 @@ extern const char xen_failsafe_callback[];
 void xen_copy_trap_info(struct trap_info *traps);
-DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
 DECLARE_PER_CPU(unsigned long, xen_current_cr3);
@@ -19,6 +20,7 @@ extern struct shared_info *HYPERVISOR_shared_info;
 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);
+void xen_enable_sysenter(void);
 void xen_setup_timer(int cpu);
 void xen_setup_cpu_clockevents(void);
@@ -28,6 +30,8 @@ unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 bool xen_vcpu_stolen(int vcpu);
 void xen_mark_init_mm_pinned(void);
@@ -64,4 +68,6 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 void xen_iret(void);
+void xen_sysexit(void);
 #endif /* XEN_OPS_H */
author	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-25 15:32:10 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-25 15:32:10 -0400
commit	4b7227ca321ccf447cdc04538687c895db8b77f5 (patch)
tree	72712127fc56aa2579e8a1508998bcabf6bd6c60 /arch/x86
parent	5dae61b80564a5583ff4b56e357bdbc733fddb76 (diff)
parent	1775826ceec51187aa868406585799b7e76ffa7d (diff)