-rw-r--r--  arch/i386/Kconfig | 2
-rw-r--r--  arch/i386/Makefile | 3
-rw-r--r--  arch/i386/boot/compressed/relocs.c | 2
-rw-r--r--  arch/i386/kernel/asm-offsets.c | 9
-rw-r--r--  arch/i386/kernel/entry.S | 85
-rw-r--r--  arch/i386/kernel/head.S | 5
-rw-r--r--  arch/i386/kernel/paravirt.c | 37
-rw-r--r--  arch/i386/kernel/setup.c | 2
-rw-r--r--  arch/i386/kernel/smp.c | 5
-rw-r--r--  arch/i386/kernel/smpboot.c | 8
-rw-r--r--  arch/i386/kernel/tsc.c | 23
-rw-r--r--  arch/i386/kernel/vmi.c | 4
-rw-r--r--  arch/i386/kernel/vmiclock.c | 6
-rw-r--r--  arch/i386/kernel/vmlinux.lds.S | 1
-rw-r--r--  arch/i386/kernel/vsyscall-note.S | 49
-rw-r--r--  arch/i386/mach-voyager/voyager_thread.c | 2
-rw-r--r--  arch/i386/mm/init.c | 3
-rw-r--r--  arch/i386/mm/pageattr.c | 2
-rw-r--r--  arch/i386/xen/Kconfig | 11
-rw-r--r--  arch/i386/xen/Makefile | 4
-rw-r--r--  arch/i386/xen/enlighten.c | 1144
-rw-r--r--  arch/i386/xen/events.c | 590
-rw-r--r--  arch/i386/xen/features.c | 29
-rw-r--r--  arch/i386/xen/manage.c | 143
-rw-r--r--  arch/i386/xen/mmu.c | 564
-rw-r--r--  arch/i386/xen/mmu.h | 60
-rw-r--r--  arch/i386/xen/multicalls.c | 90
-rw-r--r--  arch/i386/xen/multicalls.h | 45
-rw-r--r--  arch/i386/xen/setup.c | 96
-rw-r--r--  arch/i386/xen/smp.c | 404
-rw-r--r--  arch/i386/xen/time.c | 590
-rw-r--r--  arch/i386/xen/xen-asm.S | 291
-rw-r--r--  arch/i386/xen/xen-head.S | 36
-rw-r--r--  arch/i386/xen/xen-ops.h | 71
-rw-r--r--  arch/x86_64/kernel/early_printk.c | 5
-rw-r--r--  arch/x86_64/kernel/mce.c | 2
-rw-r--r--  drivers/Makefile | 2
-rw-r--r--  drivers/acpi/thermal.c | 24
-rw-r--r--  drivers/block/Kconfig | 9
-rw-r--r--  drivers/block/Makefile | 1
-rw-r--r--  drivers/block/xen-blkfront.c | 988
-rw-r--r--  drivers/char/Kconfig | 8
-rw-r--r--  drivers/char/Makefile | 1
-rw-r--r--  drivers/char/hvc_xen.c | 159
-rw-r--r--  drivers/macintosh/therm_pm72.c | 3
-rw-r--r--  drivers/macintosh/windfarm_core.c | 3
-rw-r--r--  drivers/net/Kconfig | 12
-rw-r--r--  drivers/net/Makefile | 2
-rw-r--r--  drivers/net/hamradio/baycom_epp.c | 2
-rw-r--r--  drivers/net/xen-netfront.c | 1863
-rw-r--r--  drivers/pnp/pnpbios/core.c | 2
-rw-r--r--  drivers/sbus/char/bbc_envctrl.c | 5
-rw-r--r--  drivers/sbus/char/envctrl.c | 7
-rw-r--r--  drivers/xen/Makefile | 2
-rw-r--r--  drivers/xen/grant-table.c | 582
-rw-r--r--  drivers/xen/xenbus/Makefile | 7
-rw-r--r--  drivers/xen/xenbus/xenbus_client.c | 569
-rw-r--r--  drivers/xen/xenbus/xenbus_comms.c | 233
-rw-r--r--  drivers/xen/xenbus/xenbus_comms.h | 46
-rw-r--r--  drivers/xen/xenbus/xenbus_probe.c | 935
-rw-r--r--  drivers/xen/xenbus/xenbus_probe.h | 74
-rw-r--r--  drivers/xen/xenbus/xenbus_xs.c | 861
-rw-r--r--  fs/ocfs2/heartbeat.c | 2
-rw-r--r--  include/asm-i386/irq.h | 1
-rw-r--r--  include/asm-i386/mach-default/irq_vectors_limits.h | 2
-rw-r--r--  include/asm-i386/mmu_context.h | 2
-rw-r--r--  include/asm-i386/paravirt.h | 22
-rw-r--r--  include/asm-i386/pgalloc.h | 6
-rw-r--r--  include/asm-i386/setup.h | 4
-rw-r--r--  include/asm-i386/smp.h | 5
-rw-r--r--  include/asm-i386/timer.h | 32
-rw-r--r--  include/asm-i386/vmi_time.h | 2
-rw-r--r--  include/asm-i386/xen/hypercall.h | 413
-rw-r--r--  include/asm-i386/xen/hypervisor.h | 73
-rw-r--r--  include/asm-i386/xen/interface.h | 188
-rw-r--r--  include/linux/elfnote.h | 22
-rw-r--r--  include/linux/kmod.h | 52
-rw-r--r--  include/linux/major.h | 2
-rw-r--r--  include/linux/page-flags.h | 5
-rw-r--r--  include/linux/reboot.h | 5
-rw-r--r--  include/linux/string.h | 4
-rw-r--r--  include/linux/vmalloc.h | 4
-rw-r--r--  include/xen/events.h | 48
-rw-r--r--  include/xen/features.h | 23
-rw-r--r--  include/xen/grant_table.h | 107
-rw-r--r--  include/xen/hvc-console.h | 6
-rw-r--r--  include/xen/interface/elfnote.h | 133
-rw-r--r--  include/xen/interface/event_channel.h | 195
-rw-r--r--  include/xen/interface/features.h | 43
-rw-r--r--  include/xen/interface/grant_table.h | 375
-rw-r--r--  include/xen/interface/io/blkif.h | 94
-rw-r--r--  include/xen/interface/io/console.h | 23
-rw-r--r--  include/xen/interface/io/netif.h | 158
-rw-r--r--  include/xen/interface/io/ring.h | 260
-rw-r--r--  include/xen/interface/io/xenbus.h | 44
-rw-r--r--  include/xen/interface/io/xs_wire.h | 87
-rw-r--r--  include/xen/interface/memory.h | 145
-rw-r--r--  include/xen/interface/physdev.h | 145
-rw-r--r--  include/xen/interface/sched.h | 77
-rw-r--r--  include/xen/interface/vcpu.h | 167
-rw-r--r--  include/xen/interface/version.h | 60
-rw-r--r--  include/xen/interface/xen.h | 447
-rw-r--r--  include/xen/page.h | 179
-rw-r--r--  include/xen/xenbus.h | 234
-rw-r--r--  kernel/cpuset.c | 2
-rw-r--r--  kernel/kmod.c | 216
-rw-r--r--  kernel/sys.c | 58
-rw-r--r--  kernel/sysctl.c | 10
-rw-r--r--  lib/Makefile | 2
-rw-r--r--  lib/argv_split.c | 105
-rw-r--r--  lib/kobject_uevent.c | 2
-rw-r--r--  mm/util.c | 26
-rw-r--r--  mm/vmalloc.c | 53
-rw-r--r--  net/bridge/br_stp_if.c | 2
-rw-r--r--  net/irda/irias_object.c | 43
-rw-r--r--  security/keys/request_key.c | 3
116 files changed, 15031 insertions, 210 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index c7c9c2a15fab..7a11b905ef49 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -222,6 +222,8 @@ config PARAVIRT
 	  However, when run without a hypervisor the kernel is
 	  theoretically slower.  If in doubt, say N.
 
+source "arch/i386/xen/Kconfig"
+
 config VMI
 	bool "VMI Paravirt-ops support"
 	depends on PARAVIRT
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 181cc29a7c4f..01f0ff0daaf4 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000
 mcore-$(CONFIG_X86_ES7000)	:= mach-default
 core-$(CONFIG_X86_ES7000)	:= arch/i386/mach-es7000/
 
+# Xen paravirtualization support
+core-$(CONFIG_XEN)		+= arch/i386/xen/
+
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c
index ce4fda261aaf..b0e21c3cee5c 100644
--- a/arch/i386/boot/compressed/relocs.c
+++ b/arch/i386/boot/compressed/relocs.c
@@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = {
31 "__kernel_rt_sigreturn", 31 "__kernel_rt_sigreturn",
32 "__kernel_sigreturn", 32 "__kernel_sigreturn",
33 "SYSENTER_RETURN", 33 "SYSENTER_RETURN",
34 "xen_irq_disable_direct_reloc",
35 "xen_save_fl_direct_reloc",
34}; 36};
35 37
36static int is_safe_abs_reloc(const char* sym_name) 38static int is_safe_abs_reloc(const char* sym_name)
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 27a776c9044d..25f7eb513928 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -17,6 +17,8 @@
 #include <asm/thread_info.h>
 #include <asm/elf.h>
 
+#include <xen/interface/xen.h>
+
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
@@ -59,6 +61,7 @@ void foo(void)
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
 	OFFSET(TI_restart_block, thread_info, restart_block);
 	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+	OFFSET(TI_cpu, thread_info, cpu);
 	BLANK();
 
 	OFFSET(GDS_size, Xgt_desc_struct, size);
@@ -115,4 +118,10 @@ void foo(void)
 	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
 	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
 #endif
+
+#ifdef CONFIG_XEN
+	BLANK();
+	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
 }
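(Side note, not part of the patch: the DEFINE()/OFFSET() macros above work by emitting "->SYM value" marker lines into the compiler's assembly output, which kbuild's sed script then turns into #defines that entry.S can use. A minimal standalone sketch of the same technique, using a stand-in struct rather than the real vcpu_info; compile with "gcc -S" and inspect the generated .s file.)

/* asm-offsets-demo.c: illustration only.  The struct is a stand-in for
 * vcpu_info; the DEFINE/OFFSET macros mirror the ones in asm-offsets.c. */
#include <stddef.h>

struct demo_vcpu_info {
	unsigned char evtchn_upcall_pending;
	unsigned char evtchn_upcall_mask;
};

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

void foo(void)
{
	/* "gcc -S" emits "->XEN_vcpu_info_mask $1 ..." into the .s output,
	 * which the build turns into an assembler-visible #define. */
	OFFSET(XEN_vcpu_info_mask, demo_vcpu_info, evtchn_upcall_mask);
	OFFSET(XEN_vcpu_info_pending, demo_vcpu_info, evtchn_upcall_pending);
}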
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3c3c220488c9..32980b834935 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1023,6 +1023,91 @@ ENTRY(kernel_thread_helper)
 	CFI_ENDPROC
 ENDPROC(kernel_thread_helper)
 
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+	CFI_STARTPROC
+	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	TRACE_IRQS_OFF
+
+	/* Check to see if we got the event in the critical
+	   region in xen_iret_direct, after we've reenabled
+	   events and checked for pending events.  This simulates
+	   iret instruction's behaviour where it delivers a
+	   pending interrupt when enabling interrupts. */
+	movl PT_EIP(%esp),%eax
+	cmpl $xen_iret_start_crit,%eax
+	jb   1f
+	cmpl $xen_iret_end_crit,%eax
+	jae  1f
+
+	call xen_iret_crit_fixup
+
+1:	mov %esp, %eax
+	call xen_evtchn_do_upcall
+	jmp  ret_from_intr
+	CFI_ENDPROC
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+	CFI_STARTPROC
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	movl $1,%eax
+1:	mov 4(%esp),%ds
+2:	mov 8(%esp),%es
+3:	mov 12(%esp),%fs
+4:	mov 16(%esp),%gs
+	testl %eax,%eax
+	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
+	lea 16(%esp),%esp
+	CFI_ADJUST_CFA_OFFSET -16
+	jz 5f
+	addl $16,%esp
+	jmp iret_exc		# EAX != 0 => Category 2 (Bad IRET)
+5:	pushl $0		# EAX == 0 => Category 1 (Bad segment)
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	jmp ret_from_exception
+	CFI_ENDPROC
+
+.section .fixup,"ax"
+6:	xorl %eax,%eax
+	movl %eax,4(%esp)
+	jmp 1b
+7:	xorl %eax,%eax
+	movl %eax,8(%esp)
+	jmp 2b
+8:	xorl %eax,%eax
+	movl %eax,12(%esp)
+	jmp 3b
+9:	xorl %eax,%eax
+	movl %eax,16(%esp)
+	jmp 4b
+.previous
+.section __ex_table,"a"
+	.align 4
+	.long 1b,6b
+	.long 2b,7b
+	.long 3b,8b
+	.long 4b,9b
+.previous
+ENDPROC(xen_failsafe_callback)
+
+#endif	/* CONFIG_XEN */
+
 .section .rodata,"a"
 #include "syscall_table.S"
 
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 82714668d43b..7c52b222207e 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -510,7 +510,8 @@ ENTRY(_stext)
 /*
  * BSS section
  */
-.section ".bss.page_aligned","w"
+.section ".bss.page_aligned","wa"
+	.align PAGE_SIZE_asm
 ENTRY(swapper_pg_dir)
 	.fill 1024,4,0
 ENTRY(swapper_pg_pmd)
@@ -538,6 +539,8 @@ fault_msg:
 	.ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
 	.asciz "Stack: %p %p %p %p %p %p %p %p\n"
 
+#include "../xen/xen-head.S"
+
 /*
  * The IDT and GDT 'descriptors' are a strange 48-bit object
  * only used by the lidt and lgdt instructions. They are not
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index faab09abca5e..53f07a8275e3 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -228,6 +228,41 @@ static int __init print_banner(void)
 }
 core_initcall(print_banner);
 
+static struct resource reserve_ioports = {
+	.start = 0,
+	.end = IO_SPACE_LIMIT,
+	.name = "paravirt-ioport",
+	.flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+static struct resource reserve_iomem = {
+	.start = 0,
+	.end = -1,
+	.name = "paravirt-iomem",
+	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware.  This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ */
+int paravirt_disable_iospace(void)
+{
+	int ret;
+
+	ret = request_resource(&ioport_resource, &reserve_ioports);
+	if (ret == 0) {
+		ret = request_resource(&iomem_resource, &reserve_iomem);
+		if (ret)
+			release_resource(&reserve_ioports);
+	}
+
+	return ret;
+}
+
 struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
@@ -267,7 +302,7 @@ struct paravirt_ops paravirt_ops = {
 	.write_msr = native_write_msr_safe,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.get_scheduled_cycles = native_read_tsc,
+	.sched_clock = native_sched_clock,
 	.get_cpu_khz = native_calculate_cpu_khz,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 2d61e65eeb50..74871d066c2b 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p)
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
 
+	paravirt_post_allocator_init();
+
 	dmi_scan_machine();
 
 #ifdef CONFIG_X86_GENERICARCH
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 6299c080f6e2..2d35d8502029 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -22,6 +22,7 @@
 
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
 #include <mach_apic.h>
 
 /*
@@ -249,13 +250,13 @@ static unsigned long flush_va;
 static DEFINE_SPINLOCK(tlbstate_lock);
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  *
  * We need to reload %cr3 since the page tables may be going
  * away from under us..
  */
-static inline void leave_mm (unsigned long cpu)
+void leave_mm(unsigned long cpu)
 {
 	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
 		BUG();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 0b2954534b8e..5910d3fac561 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -148,7 +148,7 @@ void __init smp_alloc_memory(void)
  * a given CPU
  */
 
-static void __cpuinit smp_store_cpu_info(int id)
+void __cpuinit smp_store_cpu_info(int id)
 {
 	struct cpuinfo_x86 *c = cpu_data + id;
 
@@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu)
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
-static inline void
-set_cpu_sibling_map(int cpu)
+void set_cpu_sibling_map(int cpu)
 {
 	int i;
 	struct cpuinfo_x86 *c = cpu_data;
@@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void
-remove_siblinginfo(int cpu)
+void remove_siblinginfo(int cpu)
 {
 	int sibling;
 	struct cpuinfo_x86 *c = cpu_data;
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index ea63a30ca3e8..252f9010f283 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void)
  *
  * -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
-static unsigned long cyc2ns_scale __read_mostly;
+unsigned long cyc2ns_scale __read_mostly;
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
@@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz)
 	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
 }
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
-unsigned long long sched_clock(void)
+unsigned long long native_sched_clock(void)
 {
 	unsigned long long this_offset;
 
@@ -118,12 +113,24 @@ unsigned long long sched_clock(void)
 	return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
 
 	/* read the Time Stamp Counter: */
-	get_scheduled_cycles(this_offset);
+	rdtscll(this_offset);
 
 	/* return the value in ns */
 	return cycles_2_ns(this_offset);
 }
 
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+	return paravirt_sched_clock();
+}
+#else
+unsigned long long sched_clock(void)
+	__attribute__((alias("native_sched_clock")));
+#endif
+
 unsigned long native_calculate_cpu_khz(void)
 {
 	unsigned long long start, end;
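(Illustration only, not part of the patch: the cycles-to-nanoseconds conversion that native_sched_clock() still relies on is plain fixed-point arithmetic, mirroring set_cyc2ns_scale() and cycles_2_ns() above. The cpu_khz value below is an arbitrary example.)

/* Standalone sketch of the cyc2ns fixed-point math; the cpu_khz value
 * is an arbitrary example, not taken from the patch. */
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10	/* 2^10, as in tsc.c */

int main(void)
{
	unsigned long cpu_khz = 2400000;	/* assume a 2.4 GHz TSC */
	unsigned long cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / cpu_khz;
	unsigned long long cyc = 2400000000ULL;	/* one second's worth of cycles */

	/* same shift-based scaling as cycles_2_ns() */
	printf("%llu cycles ~= %llu ns\n",
	       cyc, (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR);
	return 0;
}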
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index c12720d7cbc5..72042bb7ec94 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
 }
 #endif
 
-static void vmi_allocate_pt(u32 pfn)
+static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
 {
 	vmi_set_page_type(pfn, VMI_PAGE_L1);
 	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -891,7 +891,7 @@ static inline int __init activate_vmi(void)
 	paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
 	paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
 #endif
-	paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
+	paravirt_ops.sched_clock = vmi_sched_clock;
 	paravirt_ops.get_cpu_khz = vmi_cpu_khz;
 
 	/* We have true wallclock functions; disable CMOS clock sync */
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
index 26a37f8a8762..f9b845f4e692 100644
--- a/arch/i386/kernel/vmiclock.c
+++ b/arch/i386/kernel/vmiclock.c
@@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now)
 	return 0;
 }
 
-/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
-unsigned long long vmi_get_sched_cycles(void)
+/* paravirt_ops.sched_clock = vmi_sched_clock */
+unsigned long long vmi_sched_clock(void)
 {
-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
 }
 
 /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index aa87b06c7c82..00f1bc47d3a2 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -88,6 +88,7 @@ SECTIONS
 
 	. = ALIGN(4096);
 	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+		*(.data.page_aligned)
 		*(.data.idt)
 	}
 
diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S
index d4b5be4f3d5f..271f16a8ca01 100644
--- a/arch/i386/kernel/vsyscall-note.S
+++ b/arch/i386/kernel/vsyscall-note.S
@@ -3,23 +3,40 @@
  * Here we can supply some information useful to userland.
  */
 
-#include <linux/uts.h>
 #include <linux/version.h>
+#include <linux/elfnote.h>
 
-#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \
-	.section name, flags; \
-	.balign 4; \
-	.long 1f - 0f;	/* name length */ \
-	.long 3f - 2f;	/* data length */ \
-	.long type;	/* note type */ \
-0:	.asciz vendor;	/* vendor name */ \
-1:	.balign 4; \
-2:
+/* Ideally this would use UTS_NAME, but using a quoted string here
+   doesn't work.  Remember to change this when changing the
+   kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+	.long LINUX_VERSION_CODE
+ELFNOTE_END
 
-#define ASM_ELF_NOTE_END \
-3:	.balign 4;	/* pad out section */ \
-	.previous
+#ifdef CONFIG_XEN
 
-	ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
-	.long LINUX_VERSION_CODE
-	ASM_ELF_NOTE_END
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently.  This makes it possible to
+ * install libraries optimized to avoid those access patterns in someplace
+ * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ *	hwcap 1 nosegneg
+ * to match the mapping of bit to name that we give here.
+ */
+
+/* Bit used for the pseudo-hwcap for non-negative segments.  We use
+   bit 1 to avoid bugs in some versions of glibc when bit 0 is
+   used; the choice is otherwise arbitrary. */
+#define VDSO_NOTE_NONEGSEG_BIT	1
+
+ELFNOTE_START(GNU, 2, "a")
+	.long 1, 1<<VDSO_NOTE_NONEGSEG_BIT	/* ncaps, mask */
+	.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg"	/* bit, name */
+ELFNOTE_END
+#endif
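(For reference, not part of the patch: both the hand-rolled ASM_ELF_NOTE_BEGIN/END pair being removed here and the new ELFNOTE_START/ELFNOTE_END macros emit the standard ELF note record. A rough sketch of that layout follows; the struct and field names are illustrative, not taken from elfnote.h.)

/* Sketch of an ELF note record; names are illustrative.  Each note is a
 * 4-byte-aligned header followed by the vendor name and the descriptor,
 * each padded out to a 4-byte boundary. */
#include <stdint.h>

struct demo_elf_note {
	uint32_t namesz;	/* length of the vendor name, e.g. sizeof("Linux") */
	uint32_t descsz;	/* length of the descriptor, e.g. 4 for a .long */
	uint32_t type;		/* note type: 0 for the version note, 2 for the GNU hwcap note */
	/* followed by: name bytes (padded to 4), then desc bytes (padded to 4) */
};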
diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c
index b4b24e0e45e1..f9d595338159 100644
--- a/arch/i386/mach-voyager/voyager_thread.c
+++ b/arch/i386/mach-voyager/voyager_thread.c
@@ -52,7 +52,7 @@ execute(const char *string)
 		NULL,
 	};
 
-	if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) {
+	if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
 		printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
 		       string, ret);
 	}
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 7135946d3663..6a68b1ae061c 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
 
-		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
+		paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
 		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
 	}
@@ -473,6 +473,7 @@ void zap_low_mappings (void)
 
 static int disable_nx __initdata = 0;
 u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 /*
  * noexec = on|off
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 2eb14a73be9c..37992ffb1633 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK;
 	pbase = (pte_t *)page_address(base);
-	paravirt_alloc_pt(page_to_pfn(base));
+	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
 		set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
 					   addr == address ? prot : ref_prot));
diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
new file mode 100644
index 000000000000..9df99e1885a4
--- /dev/null
+++ b/arch/i386/xen/Kconfig
@@ -0,0 +1,11 @@
+#
+# This Kconfig describes xen options
+#
+
+config XEN
+	bool "Enable support for Xen hypervisor"
+	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
+	help
+	  This is the Linux Xen port.  Enabling this will allow the
+	  kernel to boot in a paravirtualized environment under the
+	  Xen hypervisor.
diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile
new file mode 100644
index 000000000000..343df246bd3e
--- /dev/null
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1,4 @@
+obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
+			events.o time.o manage.o xen-asm.o
+
+obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
new file mode 100644
index 000000000000..9a8c1181c001
--- /dev/null
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,1144 @@
1/*
2 * Core of Xen paravirt_ops implementation.
3 *
4 * This file contains the xen_paravirt_ops structure itself, and the
5 * implementations for:
6 * - privileged instructions
7 * - interrupt flags
8 * - segment operations
9 * - booting and setup
10 *
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */
13
14#include <linux/kernel.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17#include <linux/preempt.h>
18#include <linux/hardirq.h>
19#include <linux/percpu.h>
20#include <linux/delay.h>
21#include <linux/start_kernel.h>
22#include <linux/sched.h>
23#include <linux/bootmem.h>
24#include <linux/module.h>
25#include <linux/mm.h>
26#include <linux/page-flags.h>
27#include <linux/highmem.h>
28#include <linux/smp.h>
29
30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h>
35#include <xen/page.h>
36
37#include <asm/paravirt.h>
38#include <asm/page.h>
39#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h>
42#include <asm/processor.h>
43#include <asm/setup.h>
44#include <asm/desc.h>
45#include <asm/pgtable.h>
46#include <asm/tlbflush.h>
47#include <asm/reboot.h>
48
49#include "xen-ops.h"
50#include "mmu.h"
51#include "multicalls.h"
52
53EXPORT_SYMBOL_GPL(hypercall_page);
54
55DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
56
57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
59DEFINE_PER_CPU(unsigned long, xen_cr3);
60
61struct start_info *xen_start_info;
62EXPORT_SYMBOL_GPL(xen_start_info);
63
64static /* __initdata */ struct shared_info dummy_shared_info;
65
66/*
67 * Point at some empty memory to start with. We map the real shared_info
68 * page as soon as fixmap is up and running.
69 */
70struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
71
72/*
73 * Flag to determine whether vcpu info placement is available on all
74 * VCPUs. We assume it is to start with, and then set it to zero on
75 * the first failure. This is because it can succeed on some VCPUs
76 * and not others, since it can involve hypervisor memory allocation,
77 * or because the guest failed to guarantee all the appropriate
78 * constraints on all VCPUs (ie buffer can't cross a page boundary).
79 *
80 * Note that any particular CPU may be using a placed vcpu structure,
81 * but we can only optimise if the all are.
82 *
83 * 0: not available, 1: available
84 */
85static int have_vcpu_info_placement = 1;
86
87static void __init xen_vcpu_setup(int cpu)
88{
89 struct vcpu_register_vcpu_info info;
90 int err;
91 struct vcpu_info *vcpup;
92
93 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
94
95 if (!have_vcpu_info_placement)
96 return; /* already tested, not available */
97
98 vcpup = &per_cpu(xen_vcpu_info, cpu);
99
100 info.mfn = virt_to_mfn(vcpup);
101 info.offset = offset_in_page(vcpup);
102
103 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
104 cpu, vcpup, info.mfn, info.offset);
105
106 /* Check to see if the hypervisor will put the vcpu_info
107 structure where we want it, which allows direct access via
108 a percpu-variable. */
109 err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
110
111 if (err) {
112 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
113 have_vcpu_info_placement = 0;
114 } else {
115 /* This cpu is using the registered vcpu info, even if
116 later ones fail to. */
117 per_cpu(xen_vcpu, cpu) = vcpup;
118
119 printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
120 cpu, vcpup);
121 }
122}
123
124static void __init xen_banner(void)
125{
126 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
127 paravirt_ops.name);
128 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
129}
130
131static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
132 unsigned int *ecx, unsigned int *edx)
133{
134 unsigned maskedx = ~0;
135
136 /*
137 * Mask out inconvenient features, to try and disable as many
138 * unsupported kernel subsystems as possible.
139 */
140 if (*eax == 1)
141 maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
142 (1 << X86_FEATURE_ACPI) | /* disable ACPI */
143 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
144
145 asm(XEN_EMULATE_PREFIX "cpuid"
146 : "=a" (*eax),
147 "=b" (*ebx),
148 "=c" (*ecx),
149 "=d" (*edx)
150 : "0" (*eax), "2" (*ecx));
151 *edx &= maskedx;
152}
153
154static void xen_set_debugreg(int reg, unsigned long val)
155{
156 HYPERVISOR_set_debugreg(reg, val);
157}
158
159static unsigned long xen_get_debugreg(int reg)
160{
161 return HYPERVISOR_get_debugreg(reg);
162}
163
164static unsigned long xen_save_fl(void)
165{
166 struct vcpu_info *vcpu;
167 unsigned long flags;
168
169 vcpu = x86_read_percpu(xen_vcpu);
170
171 /* flag has opposite sense of mask */
172 flags = !vcpu->evtchn_upcall_mask;
173
174 /* convert to IF type flag
175 -0 -> 0x00000000
176 -1 -> 0xffffffff
177 */
178 return (-flags) & X86_EFLAGS_IF;
179}
180
181static void xen_restore_fl(unsigned long flags)
182{
183 struct vcpu_info *vcpu;
184
185 /* convert from IF type flag */
186 flags = !(flags & X86_EFLAGS_IF);
187
188 /* There's a one instruction preempt window here. We need to
189 make sure we're don't switch CPUs between getting the vcpu
190 pointer and updating the mask. */
191 preempt_disable();
192 vcpu = x86_read_percpu(xen_vcpu);
193 vcpu->evtchn_upcall_mask = flags;
194 preempt_enable_no_resched();
195
196 /* Doesn't matter if we get preempted here, because any
197 pending event will get dealt with anyway. */
198
199 if (flags == 0) {
200 preempt_check_resched();
201 barrier(); /* unmask then check (avoid races) */
202 if (unlikely(vcpu->evtchn_upcall_pending))
203 force_evtchn_callback();
204 }
205}
206
207static void xen_irq_disable(void)
208{
209 /* There's a one instruction preempt window here. We need to
210 make sure we're don't switch CPUs between getting the vcpu
211 pointer and updating the mask. */
212 preempt_disable();
213 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
214 preempt_enable_no_resched();
215}
216
217static void xen_irq_enable(void)
218{
219 struct vcpu_info *vcpu;
220
221 /* There's a one instruction preempt window here. We need to
222 make sure we're don't switch CPUs between getting the vcpu
223 pointer and updating the mask. */
224 preempt_disable();
225 vcpu = x86_read_percpu(xen_vcpu);
226 vcpu->evtchn_upcall_mask = 0;
227 preempt_enable_no_resched();
228
229 /* Doesn't matter if we get preempted here, because any
230 pending event will get dealt with anyway. */
231
232 barrier(); /* unmask then check (avoid races) */
233 if (unlikely(vcpu->evtchn_upcall_pending))
234 force_evtchn_callback();
235}
236
237static void xen_safe_halt(void)
238{
239 /* Blocking includes an implicit local_irq_enable(). */
240 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
241 BUG();
242}
243
244static void xen_halt(void)
245{
246 if (irqs_disabled())
247 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
248 else
249 xen_safe_halt();
250}
251
252static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
253{
254 BUG_ON(preemptible());
255
256 switch (mode) {
257 case PARAVIRT_LAZY_NONE:
258 BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
259 break;
260
261 case PARAVIRT_LAZY_MMU:
262 case PARAVIRT_LAZY_CPU:
263 BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
264 break;
265
266 case PARAVIRT_LAZY_FLUSH:
267 /* flush if necessary, but don't change state */
268 if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
269 xen_mc_flush();
270 return;
271 }
272
273 xen_mc_flush();
274 x86_write_percpu(xen_lazy_mode, mode);
275}
276
277static unsigned long xen_store_tr(void)
278{
279 return 0;
280}
281
282static void xen_set_ldt(const void *addr, unsigned entries)
283{
284 unsigned long linear_addr = (unsigned long)addr;
285 struct mmuext_op *op;
286 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
287
288 op = mcs.args;
289 op->cmd = MMUEXT_SET_LDT;
290 if (linear_addr) {
291 /* ldt my be vmalloced, use arbitrary_virt_to_machine */
292 xmaddr_t maddr;
293 maddr = arbitrary_virt_to_machine((unsigned long)addr);
294 linear_addr = (unsigned long)maddr.maddr;
295 }
296 op->arg1.linear_addr = linear_addr;
297 op->arg2.nr_ents = entries;
298
299 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
300
301 xen_mc_issue(PARAVIRT_LAZY_CPU);
302}
303
304static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
305{
306 unsigned long *frames;
307 unsigned long va = dtr->address;
308 unsigned int size = dtr->size + 1;
309 unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
310 int f;
311 struct multicall_space mcs;
312
313 /* A GDT can be up to 64k in size, which corresponds to 8192
314 8-byte entries, or 16 4k pages.. */
315
316 BUG_ON(size > 65536);
317 BUG_ON(va & ~PAGE_MASK);
318
319 mcs = xen_mc_entry(sizeof(*frames) * pages);
320 frames = mcs.args;
321
322 for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
323 frames[f] = virt_to_mfn(va);
324 make_lowmem_page_readonly((void *)va);
325 }
326
327 MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
328
329 xen_mc_issue(PARAVIRT_LAZY_CPU);
330}
331
332static void load_TLS_descriptor(struct thread_struct *t,
333 unsigned int cpu, unsigned int i)
334{
335 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
336 xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
337 struct multicall_space mc = __xen_mc_entry(0);
338
339 MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
340}
341
342static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
343{
344 xen_mc_batch();
345
346 load_TLS_descriptor(t, cpu, 0);
347 load_TLS_descriptor(t, cpu, 1);
348 load_TLS_descriptor(t, cpu, 2);
349
350 xen_mc_issue(PARAVIRT_LAZY_CPU);
351
352 /*
353 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
354 * it means we're in a context switch, and %gs has just been
355 * saved. This means we can zero it out to prevent faults on
356 * exit from the hypervisor if the next process has no %gs.
357 * Either way, it has been saved, and the new value will get
358 * loaded properly. This will go away as soon as Xen has been
359 * modified to not save/restore %gs for normal hypercalls.
360 */
361 if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
362 loadsegment(gs, 0);
363}
364
365static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
366 u32 low, u32 high)
367{
368 unsigned long lp = (unsigned long)&dt[entrynum];
369 xmaddr_t mach_lp = virt_to_machine(lp);
370 u64 entry = (u64)high << 32 | low;
371
372 preempt_disable();
373
374 xen_mc_flush();
375 if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
376 BUG();
377
378 preempt_enable();
379}
380
381static int cvt_gate_to_trap(int vector, u32 low, u32 high,
382 struct trap_info *info)
383{
384 u8 type, dpl;
385
386 type = (high >> 8) & 0x1f;
387 dpl = (high >> 13) & 3;
388
389 if (type != 0xf && type != 0xe)
390 return 0;
391
392 info->vector = vector;
393 info->address = (high & 0xffff0000) | (low & 0x0000ffff);
394 info->cs = low >> 16;
395 info->flags = dpl;
396 /* interrupt gates clear IF */
397 if (type == 0xe)
398 info->flags |= 4;
399
400 return 1;
401}
402
403/* Locations of each CPU's IDT */
404static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
405
406/* Set an IDT entry. If the entry is part of the current IDT, then
407 also update Xen. */
408static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
409 u32 low, u32 high)
410{
411 unsigned long p = (unsigned long)&dt[entrynum];
412 unsigned long start, end;
413
414 preempt_disable();
415
416 start = __get_cpu_var(idt_desc).address;
417 end = start + __get_cpu_var(idt_desc).size + 1;
418
419 xen_mc_flush();
420
421 write_dt_entry(dt, entrynum, low, high);
422
423 if (p >= start && (p + 8) <= end) {
424 struct trap_info info[2];
425
426 info[1].address = 0;
427
428 if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
429 if (HYPERVISOR_set_trap_table(info))
430 BUG();
431 }
432
433 preempt_enable();
434}
435
436static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
437 struct trap_info *traps)
438{
439 unsigned in, out, count;
440
441 count = (desc->size+1) / 8;
442 BUG_ON(count > 256);
443
444 for (in = out = 0; in < count; in++) {
445 const u32 *entry = (u32 *)(desc->address + in * 8);
446
447 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
448 out++;
449 }
450 traps[out].address = 0;
451}
452
453void xen_copy_trap_info(struct trap_info *traps)
454{
455 const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
456
457 xen_convert_trap_info(desc, traps);
458}
459
460/* Load a new IDT into Xen. In principle this can be per-CPU, so we
461 hold a spinlock to protect the static traps[] array (static because
462 it avoids allocation, and saves stack space). */
463static void xen_load_idt(const struct Xgt_desc_struct *desc)
464{
465 static DEFINE_SPINLOCK(lock);
466 static struct trap_info traps[257];
467
468 spin_lock(&lock);
469
470 __get_cpu_var(idt_desc) = *desc;
471
472 xen_convert_trap_info(desc, traps);
473
474 xen_mc_flush();
475 if (HYPERVISOR_set_trap_table(traps))
476 BUG();
477
478 spin_unlock(&lock);
479}
480
481/* Write a GDT descriptor entry. Ignore LDT descriptors, since
482 they're handled differently. */
483static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
484 u32 low, u32 high)
485{
486 preempt_disable();
487
488 switch ((high >> 8) & 0xff) {
489 case DESCTYPE_LDT:
490 case DESCTYPE_TSS:
491 /* ignore */
492 break;
493
494 default: {
495 xmaddr_t maddr = virt_to_machine(&dt[entry]);
496 u64 desc = (u64)high << 32 | low;
497
498 xen_mc_flush();
499 if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
500 BUG();
501 }
502
503 }
504
505 preempt_enable();
506}
507
508static void xen_load_esp0(struct tss_struct *tss,
509 struct thread_struct *thread)
510{
511 struct multicall_space mcs = xen_mc_entry(0);
512 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
513 xen_mc_issue(PARAVIRT_LAZY_CPU);
514}
515
516static void xen_set_iopl_mask(unsigned mask)
517{
518 struct physdev_set_iopl set_iopl;
519
520 /* Force the change at ring 0. */
521 set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
522 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
523}
524
525static void xen_io_delay(void)
526{
527}
528
529#ifdef CONFIG_X86_LOCAL_APIC
530static unsigned long xen_apic_read(unsigned long reg)
531{
532 return 0;
533}
534
535static void xen_apic_write(unsigned long reg, unsigned long val)
536{
537 /* Warn to see if there's any stray references */
538 WARN_ON(1);
539}
540#endif
541
542static void xen_flush_tlb(void)
543{
544 struct mmuext_op *op;
545 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
546
547 op = mcs.args;
548 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
549 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
550
551 xen_mc_issue(PARAVIRT_LAZY_MMU);
552}
553
554static void xen_flush_tlb_single(unsigned long addr)
555{
556 struct mmuext_op *op;
557 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
558
559 op = mcs.args;
560 op->cmd = MMUEXT_INVLPG_LOCAL;
561 op->arg1.linear_addr = addr & PAGE_MASK;
562 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
563
564 xen_mc_issue(PARAVIRT_LAZY_MMU);
565}
566
567static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
568 unsigned long va)
569{
570 struct {
571 struct mmuext_op op;
572 cpumask_t mask;
573 } *args;
574 cpumask_t cpumask = *cpus;
575 struct multicall_space mcs;
576
577 /*
578 * A couple of (to be removed) sanity checks:
579 *
580 * - current CPU must not be in mask
581 * - mask must exist :)
582 */
583 BUG_ON(cpus_empty(cpumask));
584 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
585 BUG_ON(!mm);
586
587 /* If a CPU which we ran on has gone down, OK. */
588 cpus_and(cpumask, cpumask, cpu_online_map);
589 if (cpus_empty(cpumask))
590 return;
591
592 mcs = xen_mc_entry(sizeof(*args));
593 args = mcs.args;
594 args->mask = cpumask;
595 args->op.arg2.vcpumask = &args->mask;
596
597 if (va == TLB_FLUSH_ALL) {
598 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
599 } else {
600 args->op.cmd = MMUEXT_INVLPG_MULTI;
601 args->op.arg1.linear_addr = va;
602 }
603
604 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
605
606 xen_mc_issue(PARAVIRT_LAZY_MMU);
607}
608
609static void xen_write_cr2(unsigned long cr2)
610{
611 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
612}
613
614static unsigned long xen_read_cr2(void)
615{
616 return x86_read_percpu(xen_vcpu)->arch.cr2;
617}
618
619static unsigned long xen_read_cr2_direct(void)
620{
621 return x86_read_percpu(xen_vcpu_info.arch.cr2);
622}
623
624static void xen_write_cr4(unsigned long cr4)
625{
626 /* never allow TSC to be disabled */
627 native_write_cr4(cr4 & ~X86_CR4_TSD);
628}
629
630static unsigned long xen_read_cr3(void)
631{
632 return x86_read_percpu(xen_cr3);
633}
634
635static void xen_write_cr3(unsigned long cr3)
636{
637 BUG_ON(preemptible());
638
639 if (cr3 == x86_read_percpu(xen_cr3)) {
640 /* just a simple tlb flush */
641 xen_flush_tlb();
642 return;
643 }
644
645 x86_write_percpu(xen_cr3, cr3);
646
647
648 {
649 struct mmuext_op *op;
650 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
651 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
652
653 op = mcs.args;
654 op->cmd = MMUEXT_NEW_BASEPTR;
655 op->arg1.mfn = mfn;
656
657 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
658
659 xen_mc_issue(PARAVIRT_LAZY_CPU);
660 }
661}
662
663/* Early in boot, while setting up the initial pagetable, assume
664 everything is pinned. */
665static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
666{
667 BUG_ON(mem_map); /* should only be used early */
668 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
669}
670
671/* This needs to make sure the new pte page is pinned iff its being
672 attached to a pinned pagetable. */
673static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
674{
675 struct page *page = pfn_to_page(pfn);
676
677 if (PagePinned(virt_to_page(mm->pgd))) {
678 SetPagePinned(page);
679
680 if (!PageHighMem(page))
681 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
682 else
683 /* make sure there are no stray mappings of
684 this page */
685 kmap_flush_unused();
686 }
687}
688
689/* This should never happen until we're OK to use struct page */
690static void xen_release_pt(u32 pfn)
691{
692 struct page *page = pfn_to_page(pfn);
693
694 if (PagePinned(page)) {
695 if (!PageHighMem(page))
696 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
697 }
698}
699
700#ifdef CONFIG_HIGHPTE
701static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
702{
703 pgprot_t prot = PAGE_KERNEL;
704
705 if (PagePinned(page))
706 prot = PAGE_KERNEL_RO;
707
708 if (0 && PageHighMem(page))
709 printk("mapping highpte %lx type %d prot %s\n",
710 page_to_pfn(page), type,
711 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
712
713 return kmap_atomic_prot(page, type, prot);
714}
715#endif
716
717static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
718{
719 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
720 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
721 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
722 pte_val_ma(pte));
723
724 return pte;
725}
726
727/* Init-time set_pte while constructing initial pagetables, which
728 doesn't allow RO pagetable pages to be remapped RW */
729static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
730{
731 pte = mask_rw_pte(ptep, pte);
732
733 xen_set_pte(ptep, pte);
734}
735
736static __init void xen_pagetable_setup_start(pgd_t *base)
737{
738 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
739
740 /* special set_pte for pagetable initialization */
741 paravirt_ops.set_pte = xen_set_pte_init;
742
743 init_mm.pgd = base;
744 /*
745 * copy top-level of Xen-supplied pagetable into place. For
746 * !PAE we can use this as-is, but for PAE it is a stand-in
747 * while we copy the pmd pages.
748 */
749 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
750
751 if (PTRS_PER_PMD > 1) {
752 int i;
753 /*
754 * For PAE, need to allocate new pmds, rather than
755 * share Xen's, since Xen doesn't like pmd's being
756 * shared between address spaces.
757 */
758 for (i = 0; i < PTRS_PER_PGD; i++) {
759 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
760 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
761
762 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
763 PAGE_SIZE);
764
765 make_lowmem_page_readonly(pmd);
766
767 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
768 } else
769 pgd_clear(&base[i]);
770 }
771 }
772
773 /* make sure zero_page is mapped RO so we can use it in pagetables */
774 make_lowmem_page_readonly(empty_zero_page);
775 make_lowmem_page_readonly(base);
776 /*
777 * Switch to new pagetable. This is done before
778 * pagetable_init has done anything so that the new pages
779 * added to the table can be prepared properly for Xen.
780 */
781 xen_write_cr3(__pa(base));
782}
783
784static __init void xen_pagetable_setup_done(pgd_t *base)
785{
786 /* This will work as long as patching hasn't happened yet
787 (which it hasn't) */
788 paravirt_ops.alloc_pt = xen_alloc_pt;
789 paravirt_ops.set_pte = xen_set_pte;
790
791 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
792 /*
793 * Create a mapping for the shared info page.
794 * Should be set_fixmap(), but shared_info is a machine
795 * address with no corresponding pseudo-phys address.
796 */
797 set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
798 PFN_DOWN(xen_start_info->shared_info),
799 PAGE_KERNEL);
800
801 HYPERVISOR_shared_info =
802 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
803
804 } else
805 HYPERVISOR_shared_info =
806 (struct shared_info *)__va(xen_start_info->shared_info);
807
808 /* Actually pin the pagetable down, but we can't set PG_pinned
809 yet because the page structures don't exist yet. */
810 {
811 struct mmuext_op op;
812#ifdef CONFIG_X86_PAE
813 op.cmd = MMUEXT_PIN_L3_TABLE;
814#else
815 op.cmd = MMUEXT_PIN_L3_TABLE;
816#endif
817 op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
818 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
819 BUG();
820 }
821}
822
823/* This is called once we have the cpu_possible_map */
824void __init xen_setup_vcpu_info_placement(void)
825{
826 int cpu;
827
828 for_each_possible_cpu(cpu)
829 xen_vcpu_setup(cpu);
830
831 /* xen_vcpu_setup managed to place the vcpu_info within the
832 percpu area for all cpus, so make use of it */
833 if (have_vcpu_info_placement) {
834 printk(KERN_INFO "Xen: using vcpu_info placement\n");
835
836 paravirt_ops.save_fl = xen_save_fl_direct;
837 paravirt_ops.restore_fl = xen_restore_fl_direct;
838 paravirt_ops.irq_disable = xen_irq_disable_direct;
839 paravirt_ops.irq_enable = xen_irq_enable_direct;
840 paravirt_ops.read_cr2 = xen_read_cr2_direct;
841 paravirt_ops.iret = xen_iret_direct;
842 }
843}
844
845static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len)
846{
847 char *start, *end, *reloc;
848 unsigned ret;
849
850 start = end = reloc = NULL;
851
852#define SITE(x) \
853 case PARAVIRT_PATCH(x): \
854 if (have_vcpu_info_placement) { \
855 start = (char *)xen_##x##_direct; \
856 end = xen_##x##_direct_end; \
857 reloc = xen_##x##_direct_reloc; \
858 } \
859 goto patch_site
860
861 switch (type) {
862 SITE(irq_enable);
863 SITE(irq_disable);
864 SITE(save_fl);
865 SITE(restore_fl);
866#undef SITE
867
868 patch_site:
869 if (start == NULL || (end-start) > len)
870 goto default_patch;
871
872 ret = paravirt_patch_insns(insns, len, start, end);
873
874 /* Note: because reloc is assigned from something that
875 appears to be an array, gcc assumes it's non-null,
876 but doesn't know its relationship with start and
877 end. */
878 if (reloc > start && reloc < end) {
879 int reloc_off = reloc - start;
880 long *relocp = (long *)(insns + reloc_off);
881 long delta = start - (char *)insns;
882
883 *relocp += delta;
884 }
885 break;
886
887 default_patch:
888 default:
889 ret = paravirt_patch_default(type, clobbers, insns, len);
890 break;
891 }
892
893 return ret;
894}
895
896static const struct paravirt_ops xen_paravirt_ops __initdata = {
897 .paravirt_enabled = 1,
898 .shared_kernel_pmd = 0,
899
900 .name = "Xen",
901 .banner = xen_banner,
902
903 .patch = xen_patch,
904
905 .memory_setup = xen_memory_setup,
906 .arch_setup = xen_arch_setup,
907 .init_IRQ = xen_init_IRQ,
908 .post_allocator_init = xen_mark_init_mm_pinned,
909
910 .time_init = xen_time_init,
911 .set_wallclock = xen_set_wallclock,
912 .get_wallclock = xen_get_wallclock,
913 .get_cpu_khz = xen_cpu_khz,
914 .sched_clock = xen_sched_clock,
915
916 .cpuid = xen_cpuid,
917
918 .set_debugreg = xen_set_debugreg,
919 .get_debugreg = xen_get_debugreg,
920
921 .clts = native_clts,
922
923 .read_cr0 = native_read_cr0,
924 .write_cr0 = native_write_cr0,
925
926 .read_cr2 = xen_read_cr2,
927 .write_cr2 = xen_write_cr2,
928
929 .read_cr3 = xen_read_cr3,
930 .write_cr3 = xen_write_cr3,
931
932 .read_cr4 = native_read_cr4,
933 .read_cr4_safe = native_read_cr4_safe,
934 .write_cr4 = xen_write_cr4,
935
936 .save_fl = xen_save_fl,
937 .restore_fl = xen_restore_fl,
938 .irq_disable = xen_irq_disable,
939 .irq_enable = xen_irq_enable,
940 .safe_halt = xen_safe_halt,
941 .halt = xen_halt,
942 .wbinvd = native_wbinvd,
943
944 .read_msr = native_read_msr_safe,
945 .write_msr = native_write_msr_safe,
946 .read_tsc = native_read_tsc,
947 .read_pmc = native_read_pmc,
948
949 .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
950 .irq_enable_sysexit = NULL, /* never called */
951
952 .load_tr_desc = paravirt_nop,
953 .set_ldt = xen_set_ldt,
954 .load_gdt = xen_load_gdt,
955 .load_idt = xen_load_idt,
956 .load_tls = xen_load_tls,
957
958 .store_gdt = native_store_gdt,
959 .store_idt = native_store_idt,
960 .store_tr = xen_store_tr,
961
962 .write_ldt_entry = xen_write_ldt_entry,
963 .write_gdt_entry = xen_write_gdt_entry,
964 .write_idt_entry = xen_write_idt_entry,
965 .load_esp0 = xen_load_esp0,
966
967 .set_iopl_mask = xen_set_iopl_mask,
968 .io_delay = xen_io_delay,
969
970#ifdef CONFIG_X86_LOCAL_APIC
971 .apic_write = xen_apic_write,
972 .apic_write_atomic = xen_apic_write,
973 .apic_read = xen_apic_read,
974 .setup_boot_clock = paravirt_nop,
975 .setup_secondary_clock = paravirt_nop,
976 .startup_ipi_hook = paravirt_nop,
977#endif
978
979 .flush_tlb_user = xen_flush_tlb,
980 .flush_tlb_kernel = xen_flush_tlb,
981 .flush_tlb_single = xen_flush_tlb_single,
982 .flush_tlb_others = xen_flush_tlb_others,
983
984 .pte_update = paravirt_nop,
985 .pte_update_defer = paravirt_nop,
986
987 .pagetable_setup_start = xen_pagetable_setup_start,
988 .pagetable_setup_done = xen_pagetable_setup_done,
989
990 .alloc_pt = xen_alloc_pt_init,
991 .release_pt = xen_release_pt,
992 .alloc_pd = paravirt_nop,
993 .alloc_pd_clone = paravirt_nop,
994 .release_pd = paravirt_nop,
995
996#ifdef CONFIG_HIGHPTE
997 .kmap_atomic_pte = xen_kmap_atomic_pte,
998#endif
999
1000 .set_pte = NULL, /* see xen_pagetable_setup_* */
1001 .set_pte_at = xen_set_pte_at,
1002 .set_pmd = xen_set_pmd,
1003
1004 .pte_val = xen_pte_val,
1005 .pgd_val = xen_pgd_val,
1006
1007 .make_pte = xen_make_pte,
1008 .make_pgd = xen_make_pgd,
1009
1010#ifdef CONFIG_X86_PAE
1011 .set_pte_atomic = xen_set_pte_atomic,
1012 .set_pte_present = xen_set_pte_at,
1013 .set_pud = xen_set_pud,
1014 .pte_clear = xen_pte_clear,
1015 .pmd_clear = xen_pmd_clear,
1016
1017 .make_pmd = xen_make_pmd,
1018 .pmd_val = xen_pmd_val,
1019#endif /* PAE */
1020
1021 .activate_mm = xen_activate_mm,
1022 .dup_mmap = xen_dup_mmap,
1023 .exit_mmap = xen_exit_mmap,
1024
1025 .set_lazy_mode = xen_set_lazy_mode,
1026};
1027
1028#ifdef CONFIG_SMP
1029static const struct smp_ops xen_smp_ops __initdata = {
1030 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1031 .smp_prepare_cpus = xen_smp_prepare_cpus,
1032 .cpu_up = xen_cpu_up,
1033 .smp_cpus_done = xen_smp_cpus_done,
1034
1035 .smp_send_stop = xen_smp_send_stop,
1036 .smp_send_reschedule = xen_smp_send_reschedule,
1037 .smp_call_function_mask = xen_smp_call_function_mask,
1038};
1039#endif /* CONFIG_SMP */
1040
1041static void xen_reboot(int reason)
1042{
1043#ifdef CONFIG_SMP
1044 smp_send_stop();
1045#endif
1046
1047 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
1048 BUG();
1049}
1050
1051static void xen_restart(char *msg)
1052{
1053 xen_reboot(SHUTDOWN_reboot);
1054}
1055
1056static void xen_emergency_restart(void)
1057{
1058 xen_reboot(SHUTDOWN_reboot);
1059}
1060
1061static void xen_machine_halt(void)
1062{
1063 xen_reboot(SHUTDOWN_poweroff);
1064}
1065
1066static void xen_crash_shutdown(struct pt_regs *regs)
1067{
1068 xen_reboot(SHUTDOWN_crash);
1069}
1070
1071static const struct machine_ops __initdata xen_machine_ops = {
1072 .restart = xen_restart,
1073 .halt = xen_machine_halt,
1074 .power_off = xen_machine_halt,
1075 .shutdown = xen_machine_halt,
1076 .crash_shutdown = xen_crash_shutdown,
1077 .emergency_restart = xen_emergency_restart,
1078};
1079
1080
1081/* First C function to be called on Xen boot */
1082asmlinkage void __init xen_start_kernel(void)
1083{
1084 pgd_t *pgd;
1085
1086 if (!xen_start_info)
1087 return;
1088
1089 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
1090
1091 /* Install Xen paravirt ops */
1092 paravirt_ops = xen_paravirt_ops;
1093 machine_ops = xen_machine_ops;
1094
1095#ifdef CONFIG_SMP
1096 smp_ops = xen_smp_ops;
1097#endif
1098
1099 xen_setup_features();
1100
1101 /* Get mfn list */
1102 if (!xen_feature(XENFEAT_auto_translated_physmap))
1103 phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
1104
1105 pgd = (pgd_t *)xen_start_info->pt_base;
1106
1107 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1108
1109 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1110
1111 /* keep using Xen gdt for now; no urgent need to change it */
1112
1113 x86_write_percpu(xen_cr3, __pa(pgd));
1114
1115#ifdef CONFIG_SMP
1116 /* Don't do the full vcpu_info placement stuff until we have a
1117 possible map. */
1118 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1119#else
1120 /* May as well do it now, since there's no good time to call
1121 it later on UP. */
1122 xen_setup_vcpu_info_placement();
1123#endif
1124
1125 paravirt_ops.kernel_rpl = 1;
1126 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1127 paravirt_ops.kernel_rpl = 0;
1128
1129 /* set the limit of our address space */
1130 reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
1131
1132 /* set up basic CPUID stuff */
1133 cpu_detect(&new_cpu_data);
1134 new_cpu_data.hard_math = 1;
1135 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1136
1137 /* Poke various useful things into boot_params */
1138 LOADER_TYPE = (9 << 4) | 0;
1139 INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
1140 INITRD_SIZE = xen_start_info->mod_len;
1141
1142 /* Start the world */
1143 start_kernel();
1144}
diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c
new file mode 100644
index 000000000000..8904acc20f8c
--- /dev/null
+++ b/arch/i386/xen/events.c
@@ -0,0 +1,590 @@
1/*
2 * Xen event channels
3 *
4 * Xen models interrupts with abstract event channels. Because each
5 * domain gets 1024 event channels, but NR_IRQS is not that large, we
6 * must dynamically map irqs<->event channels. The event channels
7 * interface with the rest of the kernel by defining a xen interrupt
8 * chip. When an event is received, it is mapped to an irq and sent
9 * through the normal interrupt processing path.
10 *
11 * There are four kinds of events which can be mapped to an event
12 * channel:
13 *
14 * 1. Inter-domain notifications. This includes all the virtual
15 * device events, since they're driven by front-ends in another domain
16 * (typically dom0).
17 * 2. VIRQs, typically used for timers. These are per-cpu events.
18 * 3. IPIs.
19 * 4. Hardware interrupts. Not supported at present.
20 *
21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
22 */
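/*
 * A minimal usage sketch of the binding interface defined below, for a
 * hypothetical frontend driver that has learned its event-channel port
 * from the backend.  The port, handler and device names are illustrative
 * assumptions, not symbols defined in this patch:
 *
 *	static irqreturn_t foo_interrupt(int irq, void *dev_id)
 *	{
 *		struct foo_device *foo = dev_id;
 *		// handle the notification from the remote domain
 *		return IRQ_HANDLED;
 *	}
 *
 *	// returns the bound irq on success, or a negative errno
 *	irq = bind_evtchn_to_irqhandler(evtchn, foo_interrupt, 0, "foo", foo);
 *	...
 *	// tear down: frees the irq and closes the event channel
 *	unbind_from_irqhandler(irq, foo);
 */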
23
24#include <linux/linkage.h>
25#include <linux/interrupt.h>
26#include <linux/irq.h>
27#include <linux/module.h>
28#include <linux/string.h>
29
30#include <asm/ptrace.h>
31#include <asm/irq.h>
32#include <asm/sync_bitops.h>
33#include <asm/xen/hypercall.h>
34
35#include <xen/events.h>
36#include <xen/interface/xen.h>
37#include <xen/interface/event_channel.h>
38
39#include "xen-ops.h"
40
41/*
42 * This lock protects updates to the following mapping and reference-count
43 * arrays. The lock does not need to be acquired to read the mapping tables.
44 */
45static DEFINE_SPINLOCK(irq_mapping_update_lock);
46
47/* IRQ <-> VIRQ mapping. */
48static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
49
50/* IRQ <-> IPI mapping */
51static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
52
53/* Packed IRQ information: binding type, sub-type index, and event channel. */
54struct packed_irq
55{
56 unsigned short evtchn;
57 unsigned char index;
58 unsigned char type;
59};
60
61static struct packed_irq irq_info[NR_IRQS];
62
63/* Binding types. */
64enum {
65 IRQT_UNBOUND,
66 IRQT_PIRQ,
67 IRQT_VIRQ,
68 IRQT_IPI,
69 IRQT_EVTCHN
70};
71
72/* Convenient shorthand for packed representation of an unbound IRQ. */
73#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
74
75static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
76 [0 ... NR_EVENT_CHANNELS-1] = -1
77};
78static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
79static u8 cpu_evtchn[NR_EVENT_CHANNELS];
80
81/* Reference counts for bindings to IRQs. */
82static int irq_bindcount[NR_IRQS];
83
84/* Xen will never allocate port zero for any purpose. */
85#define VALID_EVTCHN(chn) ((chn) != 0)
86
87/*
88 * Force a proper event-channel callback from Xen after clearing the
89 * callback mask. We do this in a very simple manner, by making a call
90 * down into Xen. The pending flag will be checked by Xen on return.
91 */
92void force_evtchn_callback(void)
93{
94 (void)HYPERVISOR_xen_version(0, NULL);
95}
96EXPORT_SYMBOL_GPL(force_evtchn_callback);
97
98static struct irq_chip xen_dynamic_chip;
99
100/* Constructor for packed IRQ information. */
101static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
102{
103 return (struct packed_irq) { evtchn, index, type };
104}
105
106/*
107 * Accessors for packed IRQ information.
108 */
109static inline unsigned int evtchn_from_irq(int irq)
110{
111 return irq_info[irq].evtchn;
112}
113
114static inline unsigned int index_from_irq(int irq)
115{
116 return irq_info[irq].index;
117}
118
119static inline unsigned int type_from_irq(int irq)
120{
121 return irq_info[irq].type;
122}
123
124static inline unsigned long active_evtchns(unsigned int cpu,
125 struct shared_info *sh,
126 unsigned int idx)
127{
128 return (sh->evtchn_pending[idx] &
129 cpu_evtchn_mask[cpu][idx] &
130 ~sh->evtchn_mask[idx]);
131}
132
133static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
134{
135 int irq = evtchn_to_irq[chn];
136
137 BUG_ON(irq == -1);
138#ifdef CONFIG_SMP
139 irq_desc[irq].affinity = cpumask_of_cpu(cpu);
140#endif
141
142 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
143 __set_bit(chn, cpu_evtchn_mask[cpu]);
144
145 cpu_evtchn[chn] = cpu;
146}
147
148static void init_evtchn_cpu_bindings(void)
149{
150#ifdef CONFIG_SMP
151 int i;
152 /* By default all event channels notify CPU#0. */
153 for (i = 0; i < NR_IRQS; i++)
154 irq_desc[i].affinity = cpumask_of_cpu(0);
155#endif
156
157 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
158 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
159}
160
161static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
162{
163 return cpu_evtchn[evtchn];
164}
165
166static inline void clear_evtchn(int port)
167{
168 struct shared_info *s = HYPERVISOR_shared_info;
169 sync_clear_bit(port, &s->evtchn_pending[0]);
170}
171
172static inline void set_evtchn(int port)
173{
174 struct shared_info *s = HYPERVISOR_shared_info;
175 sync_set_bit(port, &s->evtchn_pending[0]);
176}
177
178
179/**
180 * notify_remote_via_irq - send event to remote end of event channel via irq
181 * @irq: irq of event channel to send event to
182 *
183 * Unlike notify_remote_via_evtchn(), this is safe to use across
184 * save/restore. Notifications on a broken connection are silently
185 * dropped.
186 */
187void notify_remote_via_irq(int irq)
188{
189 int evtchn = evtchn_from_irq(irq);
190
191 if (VALID_EVTCHN(evtchn))
192 notify_remote_via_evtchn(evtchn);
193}
194EXPORT_SYMBOL_GPL(notify_remote_via_irq);
195
196static void mask_evtchn(int port)
197{
198 struct shared_info *s = HYPERVISOR_shared_info;
199 sync_set_bit(port, &s->evtchn_mask[0]);
200}
201
202static void unmask_evtchn(int port)
203{
204 struct shared_info *s = HYPERVISOR_shared_info;
205 unsigned int cpu = get_cpu();
206
207 BUG_ON(!irqs_disabled());
208
209 /* Slow path (hypercall) if this is a non-local port. */
210 if (unlikely(cpu != cpu_from_evtchn(port))) {
211 struct evtchn_unmask unmask = { .port = port };
212 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
213 } else {
214 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
215
216 sync_clear_bit(port, &s->evtchn_mask[0]);
217
218 /*
219 * The following is basically the equivalent of
220 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
221 * the interrupt edge' if the channel is masked.
222 */
223 if (sync_test_bit(port, &s->evtchn_pending[0]) &&
224 !sync_test_and_set_bit(port / BITS_PER_LONG,
225 &vcpu_info->evtchn_pending_sel))
226 vcpu_info->evtchn_upcall_pending = 1;
227 }
228
229 put_cpu();
230}
231
232static int find_unbound_irq(void)
233{
234 int irq;
235
236 /* Only allocate from dynirq range */
237 for (irq = 0; irq < NR_IRQS; irq++)
238 if (irq_bindcount[irq] == 0)
239 break;
240
241 if (irq == NR_IRQS)
242 panic("No available IRQ to bind to: increase NR_IRQS!\n");
243
244 return irq;
245}
246
247int bind_evtchn_to_irq(unsigned int evtchn)
248{
249 int irq;
250
251 spin_lock(&irq_mapping_update_lock);
252
253 irq = evtchn_to_irq[evtchn];
254
255 if (irq == -1) {
256 irq = find_unbound_irq();
257
258 dynamic_irq_init(irq);
259 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
260 handle_level_irq, "event");
261
262 evtchn_to_irq[evtchn] = irq;
263 irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
264 }
265
266 irq_bindcount[irq]++;
267
268 spin_unlock(&irq_mapping_update_lock);
269
270 return irq;
271}
272EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
273
274static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
275{
276 struct evtchn_bind_ipi bind_ipi;
277 int evtchn, irq;
278
279 spin_lock(&irq_mapping_update_lock);
280
281 irq = per_cpu(ipi_to_irq, cpu)[ipi];
282 if (irq == -1) {
283 irq = find_unbound_irq();
284 if (irq < 0)
285 goto out;
286
287 dynamic_irq_init(irq);
288 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
289 handle_level_irq, "ipi");
290
291 bind_ipi.vcpu = cpu;
292 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
293 &bind_ipi) != 0)
294 BUG();
295 evtchn = bind_ipi.port;
296
297 evtchn_to_irq[evtchn] = irq;
298 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
299
300 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
301
302 bind_evtchn_to_cpu(evtchn, cpu);
303 }
304
305 irq_bindcount[irq]++;
306
307 out:
308 spin_unlock(&irq_mapping_update_lock);
309 return irq;
310}
311
312
313static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
314{
315 struct evtchn_bind_virq bind_virq;
316 int evtchn, irq;
317
318 spin_lock(&irq_mapping_update_lock);
319
320 irq = per_cpu(virq_to_irq, cpu)[virq];
321
322 if (irq == -1) {
323 bind_virq.virq = virq;
324 bind_virq.vcpu = cpu;
325 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
326 &bind_virq) != 0)
327 BUG();
328 evtchn = bind_virq.port;
329
330 irq = find_unbound_irq();
331
332 dynamic_irq_init(irq);
333 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
334 handle_level_irq, "virq");
335
336 evtchn_to_irq[evtchn] = irq;
337 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
338
339 per_cpu(virq_to_irq, cpu)[virq] = irq;
340
341 bind_evtchn_to_cpu(evtchn, cpu);
342 }
343
344 irq_bindcount[irq]++;
345
346 spin_unlock(&irq_mapping_update_lock);
347
348 return irq;
349}
350
351static void unbind_from_irq(unsigned int irq)
352{
353 struct evtchn_close close;
354 int evtchn = evtchn_from_irq(irq);
355
356 spin_lock(&irq_mapping_update_lock);
357
358 if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
359 close.port = evtchn;
360 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
361 BUG();
362
363 switch (type_from_irq(irq)) {
364 case IRQT_VIRQ:
365 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
366 [index_from_irq(irq)] = -1;
367 break;
368 default:
369 break;
370 }
371
372 /* Closed ports are implicitly re-bound to VCPU0. */
373 bind_evtchn_to_cpu(evtchn, 0);
374
375 evtchn_to_irq[evtchn] = -1;
376 irq_info[irq] = IRQ_UNBOUND;
377
378 dynamic_irq_init(irq);
379 }
380
381 spin_unlock(&irq_mapping_update_lock);
382}
383
384int bind_evtchn_to_irqhandler(unsigned int evtchn,
385 irqreturn_t (*handler)(int, void *),
386 unsigned long irqflags,
387 const char *devname, void *dev_id)
388{
389 unsigned int irq;
390 int retval;
391
392 irq = bind_evtchn_to_irq(evtchn);
393 retval = request_irq(irq, handler, irqflags, devname, dev_id);
394 if (retval != 0) {
395 unbind_from_irq(irq);
396 return retval;
397 }
398
399 return irq;
400}
401EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
402
403int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
404 irqreturn_t (*handler)(int, void *),
405 unsigned long irqflags, const char *devname, void *dev_id)
406{
407 unsigned int irq;
408 int retval;
409
410 irq = bind_virq_to_irq(virq, cpu);
411 retval = request_irq(irq, handler, irqflags, devname, dev_id);
412 if (retval != 0) {
413 unbind_from_irq(irq);
414 return retval;
415 }
416
417 return irq;
418}
419EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
420
421int bind_ipi_to_irqhandler(enum ipi_vector ipi,
422 unsigned int cpu,
423 irq_handler_t handler,
424 unsigned long irqflags,
425 const char *devname,
426 void *dev_id)
427{
428 int irq, retval;
429
430 irq = bind_ipi_to_irq(ipi, cpu);
431 if (irq < 0)
432 return irq;
433
434 retval = request_irq(irq, handler, irqflags, devname, dev_id);
435 if (retval != 0) {
436 unbind_from_irq(irq);
437 return retval;
438 }
439
440 return irq;
441}
442
443void unbind_from_irqhandler(unsigned int irq, void *dev_id)
444{
445 free_irq(irq, dev_id);
446 unbind_from_irq(irq);
447}
448EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
449
450void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
451{
452 int irq = per_cpu(ipi_to_irq, cpu)[vector];
453 BUG_ON(irq < 0);
454 notify_remote_via_irq(irq);
455}
456
457
458/*
459 * Search the CPU's pending-events bitmasks. For each event found, map
460 * the event number to an irq, and feed it into do_IRQ() for
461 * handling.
462 *
463 * Xen uses a two-level bitmap to speed searching. The first level is
464 * a bitset of words which contain pending event bits. The second
465 * level is a bitset of pending events themselves.
466 */
467fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
468{
469 int cpu = get_cpu();
470 struct shared_info *s = HYPERVISOR_shared_info;
471 struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
472 unsigned long pending_words;
473
474 vcpu_info->evtchn_upcall_pending = 0;
475
476 /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
477 pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
478 while (pending_words != 0) {
479 unsigned long pending_bits;
480 int word_idx = __ffs(pending_words);
481 pending_words &= ~(1UL << word_idx);
482
483 while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
484 int bit_idx = __ffs(pending_bits);
485 int port = (word_idx * BITS_PER_LONG) + bit_idx;
486 int irq = evtchn_to_irq[port];
487
488 if (irq != -1) {
489 regs->orig_eax = ~irq;
490 do_IRQ(regs);
491 }
492 }
493 }
494
495 put_cpu();
496}
497
498/* Rebind an evtchn so that it gets delivered to a specific cpu */
499static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
500{
501 struct evtchn_bind_vcpu bind_vcpu;
502 int evtchn = evtchn_from_irq(irq);
503
504 if (!VALID_EVTCHN(evtchn))
505 return;
506
507 /* Send future instances of this interrupt to other vcpu. */
508 bind_vcpu.port = evtchn;
509 bind_vcpu.vcpu = tcpu;
510
511 /*
512 * If this fails, it usually just indicates that we're dealing with a
513 * virq or IPI channel, which don't actually need to be rebound. Ignore
514 * it, but don't do the xenlinux-level rebind in that case.
515 */
516 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
517 bind_evtchn_to_cpu(evtchn, tcpu);
518}
519
520
521static void set_affinity_irq(unsigned irq, cpumask_t dest)
522{
523 unsigned tcpu = first_cpu(dest);
524 rebind_irq_to_cpu(irq, tcpu);
525}
526
527static void enable_dynirq(unsigned int irq)
528{
529 int evtchn = evtchn_from_irq(irq);
530
531 if (VALID_EVTCHN(evtchn))
532 unmask_evtchn(evtchn);
533}
534
535static void disable_dynirq(unsigned int irq)
536{
537 int evtchn = evtchn_from_irq(irq);
538
539 if (VALID_EVTCHN(evtchn))
540 mask_evtchn(evtchn);
541}
542
543static void ack_dynirq(unsigned int irq)
544{
545 int evtchn = evtchn_from_irq(irq);
546
547 move_native_irq(irq);
548
549 if (VALID_EVTCHN(evtchn))
550 clear_evtchn(evtchn);
551}
552
553static int retrigger_dynirq(unsigned int irq)
554{
555 int evtchn = evtchn_from_irq(irq);
556 int ret = 0;
557
558 if (VALID_EVTCHN(evtchn)) {
559 set_evtchn(evtchn);
560 ret = 1;
561 }
562
563 return ret;
564}
565
566static struct irq_chip xen_dynamic_chip __read_mostly = {
567 .name = "xen-dyn",
568 .mask = disable_dynirq,
569 .unmask = enable_dynirq,
570 .ack = ack_dynirq,
571 .set_affinity = set_affinity_irq,
572 .retrigger = retrigger_dynirq,
573};
574
575void __init xen_init_IRQ(void)
576{
577 int i;
578
579 init_evtchn_cpu_bindings();
580
581 /* No event channels are 'live' right now. */
582 for (i = 0; i < NR_EVENT_CHANNELS; i++)
583 mask_evtchn(i);
584
585 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
586 for (i = 0; i < NR_IRQS; i++)
587 irq_bindcount[i] = 0;
588
589 irq_ctx_init(smp_processor_id());
590}
diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/arch/i386/xen/features.c
@@ -0,0 +1,29 @@
1/******************************************************************************
2 * features.c
3 *
4 * Xen feature flags.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Inc.
7 */
8#include <linux/types.h>
9#include <linux/cache.h>
10#include <linux/module.h>
11#include <asm/xen/hypervisor.h>
12#include <xen/features.h>
13
14u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
15EXPORT_SYMBOL_GPL(xen_features);
16
17void xen_setup_features(void)
18{
19 struct xen_feature_info fi;
20 int i, j;
21
22 for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
23 fi.submap_idx = i;
24 if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
25 break;
26 for (j = 0; j < 32; j++)
27 xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
28 }
29}
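/*
 * A minimal sketch of how callers query the map filled in above, via the
 * xen_feature() accessor (used throughout this patch; declared in
 * <xen/features.h>):
 *
 *	if (xen_feature(XENFEAT_auto_translated_physmap))
 *		// Xen translates pfns for us; no p2m handling needed
 *		return;
 */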
diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c
new file mode 100644
index 000000000000..aa7af9e6abc0
--- /dev/null
+++ b/arch/i386/xen/manage.c
@@ -0,0 +1,143 @@
1/*
2 * Handle external requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
new file mode 100644
index 000000000000..4ae038aa6c24
--- /dev/null
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,564 @@
1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfns and the machine's mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
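/*
 * A conceptual sketch of the pfn<->mfn conversion described above, for
 * the usual non-auto-translated case where the p2m table handed to us by
 * Xen (phys_to_machine_mapping, taken from the start info's mfn_list) is
 * a flat array indexed by pfn:
 *
 *	unsigned long mfn = pfn_to_mfn(pfn);	// roughly phys_to_machine_mapping[pfn]
 *	pte_t pte = mfn_pte(mfn, PAGE_KERNEL);	// the mfn goes straight into the pte
 *
 * Reading an entry back with xen_pte_val() and friends performs the
 * reverse machine-to-physical lookup, as the helpers below show.
 */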
41#include <linux/sched.h>
42#include <linux/highmem.h>
43#include <linux/bug.h>
44#include <linux/sched.h>
45
46#include <asm/pgtable.h>
47#include <asm/tlbflush.h>
48#include <asm/mmu_context.h>
49#include <asm/paravirt.h>
50
51#include <asm/xen/hypercall.h>
52#include <asm/xen/hypervisor.h>
53
54#include <xen/page.h>
55#include <xen/interface/xen.h>
56
57#include "multicalls.h"
58#include "mmu.h"
59
60xmaddr_t arbitrary_virt_to_machine(unsigned long address)
61{
62 pte_t *pte = lookup_address(address);
63	unsigned offset = address & ~PAGE_MASK;	/* offset within the page */
64
65 BUG_ON(pte == NULL);
66
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
68}
69
70void make_lowmem_page_readonly(void *vaddr)
71{
72 pte_t *pte, ptev;
73 unsigned long address = (unsigned long)vaddr;
74
75 pte = lookup_address(address);
76 BUG_ON(pte == NULL);
77
78 ptev = pte_wrprotect(*pte);
79
80 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
81 BUG();
82}
83
84void make_lowmem_page_readwrite(void *vaddr)
85{
86 pte_t *pte, ptev;
87 unsigned long address = (unsigned long)vaddr;
88
89 pte = lookup_address(address);
90 BUG_ON(pte == NULL);
91
92 ptev = pte_mkwrite(*pte);
93
94 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
95 BUG();
96}
97
98
99void xen_set_pmd(pmd_t *ptr, pmd_t val)
100{
101 struct multicall_space mcs;
102 struct mmu_update *u;
103
104 preempt_disable();
105
106 mcs = xen_mc_entry(sizeof(*u));
107 u = mcs.args;
108 u->ptr = virt_to_machine(ptr).maddr;
109 u->val = pmd_val_ma(val);
110 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
111
112 xen_mc_issue(PARAVIRT_LAZY_MMU);
113
114 preempt_enable();
115}
116
117/*
118 * Associate a virtual page frame with a given machine page frame
119 * and protection flags for that frame.
120 */
121void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = swapper_pg_dir + pgd_index(vaddr);
129 if (pgd_none(*pgd)) {
130 BUG();
131 return;
132 }
133 pud = pud_offset(pgd, vaddr);
134 if (pud_none(*pud)) {
135 BUG();
136 return;
137 }
138 pmd = pmd_offset(pud, vaddr);
139 if (pmd_none(*pmd)) {
140 BUG();
141 return;
142 }
143 pte = pte_offset_kernel(pmd, vaddr);
144 /* <mfn,flags> stored as-is, to permit clearing entries */
145 xen_set_pte(pte, mfn_pte(mfn, flags));
146
147 /*
148 * It's enough to flush this one mapping.
149 * (PGE mappings get flushed as well)
150 */
151 __flush_tlb_one(vaddr);
152}
153
154void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
155 pte_t *ptep, pte_t pteval)
156{
157 if (mm == current->mm || mm == &init_mm) {
158 if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
159 struct multicall_space mcs;
160 mcs = xen_mc_entry(0);
161
162 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
163 xen_mc_issue(PARAVIRT_LAZY_MMU);
164 return;
165 } else
166 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
167 return;
168 }
169 xen_set_pte(ptep, pteval);
170}
171
172#ifdef CONFIG_X86_PAE
173void xen_set_pud(pud_t *ptr, pud_t val)
174{
175 struct multicall_space mcs;
176 struct mmu_update *u;
177
178 preempt_disable();
179
180 mcs = xen_mc_entry(sizeof(*u));
181 u = mcs.args;
182 u->ptr = virt_to_machine(ptr).maddr;
183 u->val = pud_val_ma(val);
184 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
185
186 xen_mc_issue(PARAVIRT_LAZY_MMU);
187
188 preempt_enable();
189}
190
191void xen_set_pte(pte_t *ptep, pte_t pte)
192{
193 ptep->pte_high = pte.pte_high;
194 smp_wmb();
195 ptep->pte_low = pte.pte_low;
196}
197
198void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
199{
200 set_64bit((u64 *)ptep, pte_val_ma(pte));
201}
202
203void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
204{
205 ptep->pte_low = 0;
206 smp_wmb(); /* make sure low gets written first */
207 ptep->pte_high = 0;
208}
209
210void xen_pmd_clear(pmd_t *pmdp)
211{
212 xen_set_pmd(pmdp, __pmd(0));
213}
214
215unsigned long long xen_pte_val(pte_t pte)
216{
217 unsigned long long ret = 0;
218
219 if (pte.pte_low) {
220 ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
221 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
222 }
223
224 return ret;
225}
226
227unsigned long long xen_pmd_val(pmd_t pmd)
228{
229 unsigned long long ret = pmd.pmd;
230 if (ret)
231 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
232 return ret;
233}
234
235unsigned long long xen_pgd_val(pgd_t pgd)
236{
237 unsigned long long ret = pgd.pgd;
238 if (ret)
239 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
240 return ret;
241}
242
243pte_t xen_make_pte(unsigned long long pte)
244{
245 if (pte & 1)
246 pte = phys_to_machine(XPADDR(pte)).maddr;
247
248 return (pte_t){ pte, pte >> 32 };
249}
250
251pmd_t xen_make_pmd(unsigned long long pmd)
252{
253 if (pmd & 1)
254 pmd = phys_to_machine(XPADDR(pmd)).maddr;
255
256 return (pmd_t){ pmd };
257}
258
259pgd_t xen_make_pgd(unsigned long long pgd)
260{
261 if (pgd & _PAGE_PRESENT)
262 pgd = phys_to_machine(XPADDR(pgd)).maddr;
263
264 return (pgd_t){ pgd };
265}
266#else /* !PAE */
267void xen_set_pte(pte_t *ptep, pte_t pte)
268{
269 *ptep = pte;
270}
271
272unsigned long xen_pte_val(pte_t pte)
273{
274 unsigned long ret = pte.pte_low;
275
276 if (ret & _PAGE_PRESENT)
277 ret = machine_to_phys(XMADDR(ret)).paddr;
278
279 return ret;
280}
281
282unsigned long xen_pgd_val(pgd_t pgd)
283{
284 unsigned long ret = pgd.pgd;
285 if (ret)
286 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
287 return ret;
288}
289
290pte_t xen_make_pte(unsigned long pte)
291{
292 if (pte & _PAGE_PRESENT)
293 pte = phys_to_machine(XPADDR(pte)).maddr;
294
295 return (pte_t){ pte };
296}
297
298pgd_t xen_make_pgd(unsigned long pgd)
299{
300 if (pgd & _PAGE_PRESENT)
301 pgd = phys_to_machine(XPADDR(pgd)).maddr;
302
303 return (pgd_t){ pgd };
304}
305#endif /* CONFIG_X86_PAE */
306
307
308
309/*
310 (Yet another) pagetable walker. This one is intended for pinning a
311 pagetable. This means that it walks a pagetable and calls the
312 callback function on each page it finds making up the page table,
313 at every level. It walks the entire pagetable, but it only bothers
314 pinning pte pages which are below pte_limit. In the normal case
315 this will be TASK_SIZE, but at boot we need to pin up to
316 FIXADDR_TOP. But the important bit is that we don't pin beyond
317 there, because then we start getting into Xen's ptes.
318*/
319static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
320 unsigned long limit)
321{
322 pgd_t *pgd = pgd_base;
323 int flush = 0;
324 unsigned long addr = 0;
325 unsigned long pgd_next;
326
327 BUG_ON(limit > FIXADDR_TOP);
328
329 if (xen_feature(XENFEAT_auto_translated_physmap))
330 return 0;
331
332 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
333 pud_t *pud;
334 unsigned long pud_limit, pud_next;
335
336 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
337
338 if (!pgd_val(*pgd))
339 continue;
340
341 pud = pud_offset(pgd, 0);
342
343 if (PTRS_PER_PUD > 1) /* not folded */
344 flush |= (*func)(virt_to_page(pud), 0);
345
346 for (; addr != pud_limit; pud++, addr = pud_next) {
347 pmd_t *pmd;
348 unsigned long pmd_limit;
349
350 pud_next = pud_addr_end(addr, pud_limit);
351
352 if (pud_next < limit)
353 pmd_limit = pud_next;
354 else
355 pmd_limit = limit;
356
357 if (pud_none(*pud))
358 continue;
359
360 pmd = pmd_offset(pud, 0);
361
362 if (PTRS_PER_PMD > 1) /* not folded */
363 flush |= (*func)(virt_to_page(pmd), 0);
364
365 for (; addr != pmd_limit; pmd++) {
366 addr += (PAGE_SIZE * PTRS_PER_PTE);
367 if ((pmd_limit-1) < (addr-1)) {
368 addr = pmd_limit;
369 break;
370 }
371
372 if (pmd_none(*pmd))
373 continue;
374
375 flush |= (*func)(pmd_page(*pmd), 0);
376 }
377 }
378 }
379
380 flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
381
382 return flush;
383}
384
385static int pin_page(struct page *page, unsigned flags)
386{
387 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
388 int flush;
389
390 if (pgfl)
391 flush = 0; /* already pinned */
392 else if (PageHighMem(page))
393 /* kmaps need flushing if we found an unpinned
394 highpage */
395 flush = 1;
396 else {
397 void *pt = lowmem_page_address(page);
398 unsigned long pfn = page_to_pfn(page);
399 struct multicall_space mcs = __xen_mc_entry(0);
400
401 flush = 0;
402
403 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
404 pfn_pte(pfn, PAGE_KERNEL_RO),
405 flags);
406 }
407
408 return flush;
409}
410
411/* This is called just after a mm has been created, but it has not
412 been used yet. We need to make sure that its pagetable is all
413 read-only, and can be pinned. */
414void xen_pgd_pin(pgd_t *pgd)
415{
416 struct multicall_space mcs;
417 struct mmuext_op *op;
418
419 xen_mc_batch();
420
421 if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
422 /* re-enable interrupts for kmap_flush_unused */
423 xen_mc_issue(0);
424 kmap_flush_unused();
425 xen_mc_batch();
426 }
427
428 mcs = __xen_mc_entry(sizeof(*op));
429 op = mcs.args;
430
431#ifdef CONFIG_X86_PAE
432 op->cmd = MMUEXT_PIN_L3_TABLE;
433#else
434 op->cmd = MMUEXT_PIN_L2_TABLE;
435#endif
436 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
437 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
438
439 xen_mc_issue(0);
440}
441
442/* The init_mm pagetable is really pinned as soon as it's created, but
443 that's before we have page structures to store the bits. So do all
444 the book-keeping now. */
445static __init int mark_pinned(struct page *page, unsigned flags)
446{
447 SetPagePinned(page);
448 return 0;
449}
450
451void __init xen_mark_init_mm_pinned(void)
452{
453 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
454}
455
456static int unpin_page(struct page *page, unsigned flags)
457{
458 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
459
460 if (pgfl && !PageHighMem(page)) {
461 void *pt = lowmem_page_address(page);
462 unsigned long pfn = page_to_pfn(page);
463 struct multicall_space mcs = __xen_mc_entry(0);
464
465 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
466 pfn_pte(pfn, PAGE_KERNEL),
467 flags);
468 }
469
470 return 0; /* never need to flush on unpin */
471}
472
473/* Release a pagetable's pages back as normal RW */
474static void xen_pgd_unpin(pgd_t *pgd)
475{
476 struct mmuext_op *op;
477 struct multicall_space mcs;
478
479 xen_mc_batch();
480
481 mcs = __xen_mc_entry(sizeof(*op));
482
483 op = mcs.args;
484 op->cmd = MMUEXT_UNPIN_TABLE;
485 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
486
487 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
488
489 pgd_walk(pgd, unpin_page, TASK_SIZE);
490
491 xen_mc_issue(0);
492}
493
494void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
495{
496 spin_lock(&next->page_table_lock);
497 xen_pgd_pin(next->pgd);
498 spin_unlock(&next->page_table_lock);
499}
500
501void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
502{
503 spin_lock(&mm->page_table_lock);
504 xen_pgd_pin(mm->pgd);
505 spin_unlock(&mm->page_table_lock);
506}
507
508
509#ifdef CONFIG_SMP
510/* Another CPU may still have its %cr3 pointing at the pagetable, so
511 we need to repoint it somewhere else before we can unpin it. */
512static void drop_other_mm_ref(void *info)
513{
514 struct mm_struct *mm = info;
515
516 if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
517 leave_mm(smp_processor_id());
518}
519
520static void drop_mm_ref(struct mm_struct *mm)
521{
522 if (current->active_mm == mm) {
523 if (current->mm == mm)
524 load_cr3(swapper_pg_dir);
525 else
526 leave_mm(smp_processor_id());
527 }
528
529 if (!cpus_empty(mm->cpu_vm_mask))
530 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
531 mm, 1);
532}
533#else
534static void drop_mm_ref(struct mm_struct *mm)
535{
536 if (current->active_mm == mm)
537 load_cr3(swapper_pg_dir);
538}
539#endif
540
541/*
542 * While a process runs, Xen pins its pagetable, which means that the
543 * hypervisor forces it to be read-only and controls all updates
544 * to it. This means that all pagetable updates have to go via the
545 * hypervisor, which is moderately expensive.
546 *
547 * Since we're pulling the pagetable down, we switch to init_mm,
548 * unpin the old process pagetable and mark it all read-write, which
549 * allows further operations on it to be simple memory accesses.
550 *
551 * The only subtle point is that another CPU may still be using the
552 * pagetable because of lazy tlb flushing. This means we need to
553 * switch all CPUs off this pagetable before we can unpin it.
554 */
555void xen_exit_mmap(struct mm_struct *mm)
556{
557 get_cpu(); /* make sure we don't move around */
558 drop_mm_ref(mm);
559 put_cpu();
560
561 spin_lock(&mm->page_table_lock);
562 xen_pgd_unpin(mm->pgd);
563 spin_unlock(&mm->page_table_lock);
564}
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h
new file mode 100644
index 000000000000..c9ff27f3ac3a
--- /dev/null
+++ b/arch/i386/xen/mmu.h
@@ -0,0 +1,60 @@
1#ifndef _XEN_MMU_H
2
3#include <linux/linkage.h>
4#include <asm/page.h>
5
6/*
7 * Page-directory addresses above 4GB do not fit into architectural %cr3.
8 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
9 * must use the following accessor macros to pack/unpack valid MFNs.
10 *
11 * Note that Xen is using the fact that the pagetable base is always
12 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
13 * of cr3.
14 */
15#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
16#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
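/*
 * Worked example (32-bit arithmetic): a pagetable base in machine frame
 * 0x180000 -- machine address 6GB, which does not fit in %cr3 directly --
 * packs as
 *
 *	xen_pfn_to_cr3(0x180000) == (0x180000 << 12) | (0x180000 >> 20)
 *				 == 0x80000000 | 0x1 == 0x80000001
 *
 * and xen_cr3_to_pfn(0x80000001) recovers 0x180000.
 */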
17
18
19void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags);
20
21void xen_set_pte(pte_t *ptep, pte_t pteval);
22void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23 pte_t *ptep, pte_t pteval);
24void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
25
26void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
27void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
28void xen_exit_mmap(struct mm_struct *mm);
29
30void xen_pgd_pin(pgd_t *pgd);
31//void xen_pgd_unpin(pgd_t *pgd);
32
33#ifdef CONFIG_X86_PAE
34unsigned long long xen_pte_val(pte_t);
35unsigned long long xen_pmd_val(pmd_t);
36unsigned long long xen_pgd_val(pgd_t);
37
38pte_t xen_make_pte(unsigned long long);
39pmd_t xen_make_pmd(unsigned long long);
40pgd_t xen_make_pgd(unsigned long long);
41
42void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
43 pte_t *ptep, pte_t pteval);
44void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
45void xen_set_pud(pud_t *ptr, pud_t val);
46void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
47void xen_pmd_clear(pmd_t *pmdp);
48
49
50#else
51unsigned long xen_pte_val(pte_t);
52unsigned long xen_pmd_val(pmd_t);
53unsigned long xen_pgd_val(pgd_t);
54
55pte_t xen_make_pte(unsigned long);
56pmd_t xen_make_pmd(unsigned long);
57pgd_t xen_make_pgd(unsigned long);
58#endif
59
60#endif /* _XEN_MMU_H */
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
new file mode 100644
index 000000000000..c837e8e463db
--- /dev/null
+++ b/arch/i386/xen/multicalls.c
@@ -0,0 +1,90 @@
1/*
2 * Xen hypercall batching.
3 *
4 * Xen allows multiple hypercalls to be issued at once, using the
5 * multicall interface. This allows the cost of trapping into the
6 * hypervisor to be amortized over several calls.
7 *
8 * This file implements a simple interface for multicalls. There's a
9 * per-cpu buffer of outstanding multicalls. When you want to queue a
10 * multicall for issuing, you can allocate a multicall slot for the
11 * call and its arguments, along with storage for space which is
12 * pointed to by the arguments (for passing pointers to structures,
13 * etc). When the multicall is actually issued, all the space for the
14 * commands and allocated memory is freed for reuse.
15 *
16 * Multicalls are flushed whenever any of the buffers get full, or
17 * when explicitly requested. There's no way to get per-multicall
18 * return results back. It will BUG if any of the multicalls fail.
19 *
20 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
21 */
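/*
 * The typical single-call pattern, mirroring xen_set_pmd() in mmu.c in
 * this patch (preemption must be disabled around it, since the buffer is
 * per-cpu):
 *
 *	struct multicall_space mcs;
 *	struct mmu_update *u;
 *
 *	mcs = xen_mc_entry(sizeof(*u));	// open a batch, claim a slot + arg space
 *	u = mcs.args;
 *	u->ptr = ...;
 *	u->val = ...;
 *	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
 *	xen_mc_issue(PARAVIRT_LAZY_MMU);	// flush now, unless in lazy-MMU mode
 */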
22#include <linux/percpu.h>
23#include <linux/hardirq.h>
24
25#include <asm/xen/hypercall.h>
26
27#include "multicalls.h"
28
29#define MC_BATCH 32
30#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
31
32struct mc_buffer {
33 struct multicall_entry entries[MC_BATCH];
34 u64 args[MC_ARGS];
35 unsigned mcidx, argidx;
36};
37
38static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
39DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
40
41void xen_mc_flush(void)
42{
43 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
44 int ret = 0;
45 unsigned long flags;
46
47 BUG_ON(preemptible());
48
49 /* Disable interrupts in case someone comes in and queues
50 something in the middle */
51 local_irq_save(flags);
52
53 if (b->mcidx) {
54 int i;
55
56 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
57 BUG();
58 for (i = 0; i < b->mcidx; i++)
59 if (b->entries[i].result < 0)
60 ret++;
61 b->mcidx = 0;
62 b->argidx = 0;
63 } else
64 BUG_ON(b->argidx != 0);
65
66 local_irq_restore(flags);
67
68 BUG_ON(ret);
69}
70
71struct multicall_space __xen_mc_entry(size_t args)
72{
73 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
74 struct multicall_space ret;
75 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
76
77 BUG_ON(preemptible());
78 BUG_ON(argspace > MC_ARGS);
79
80 if (b->mcidx == MC_BATCH ||
81 (b->argidx + argspace) > MC_ARGS)
82 xen_mc_flush();
83
84 ret.mc = &b->entries[b->mcidx];
85 b->mcidx++;
86 ret.args = &b->args[b->argidx];
87 b->argidx += argspace;
88
89 return ret;
90}
diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h
new file mode 100644
index 000000000000..e6f7530b156c
--- /dev/null
+++ b/arch/i386/xen/multicalls.h
@@ -0,0 +1,45 @@
1#ifndef _XEN_MULTICALLS_H
2#define _XEN_MULTICALLS_H
3
4#include "xen-ops.h"
5
6/* Multicalls */
7struct multicall_space
8{
9 struct multicall_entry *mc;
10 void *args;
11};
12
13/* Allocate room for a multicall and its args */
14struct multicall_space __xen_mc_entry(size_t args);
15
16DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
17
18/* Call to start a batch of multiple __xen_mc_entry()s. Must be
19 paired with xen_mc_issue() */
20static inline void xen_mc_batch(void)
21{
22 /* need to disable interrupts until this entry is complete */
23 local_irq_save(__get_cpu_var(xen_mc_irq_flags));
24}
25
26static inline struct multicall_space xen_mc_entry(size_t args)
27{
28 xen_mc_batch();
29 return __xen_mc_entry(args);
30}
31
32/* Flush all pending multicalls */
33void xen_mc_flush(void);
34
35/* Issue a multicall if we're not in a lazy mode */
36static inline void xen_mc_issue(unsigned mode)
37{
38 if ((xen_get_lazy_mode() & mode) == 0)
39 xen_mc_flush();
40
41 /* restore flags saved in xen_mc_batch */
42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
43}
44
45#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c
new file mode 100644
index 000000000000..2fe6eac510f0
--- /dev/null
+++ b/arch/i386/xen/setup.c
@@ -0,0 +1,96 @@
1/*
2 * Machine specific setup for xen
3 *
4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
5 */
6
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/pm.h>
11
12#include <asm/elf.h>
13#include <asm/e820.h>
14#include <asm/setup.h>
15#include <asm/xen/hypervisor.h>
16#include <asm/xen/hypercall.h>
17
18#include <xen/interface/physdev.h>
19#include <xen/features.h>
20
21#include "xen-ops.h"
22
23/* These are code, but not functions. Defined in entry.S */
24extern const char xen_hypervisor_callback[];
25extern const char xen_failsafe_callback[];
26
27unsigned long *phys_to_machine_mapping;
28EXPORT_SYMBOL(phys_to_machine_mapping);
29
30/**
31 * machine_specific_memory_setup - Hook for machine specific memory setup.
32 **/
33
34char * __init xen_memory_setup(void)
35{
36 unsigned long max_pfn = xen_start_info->nr_pages;
37
38 e820.nr_map = 0;
39 add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
40
41 return "Xen";
42}
43
44static void xen_idle(void)
45{
46 local_irq_disable();
47
48 if (need_resched())
49 local_irq_enable();
50 else {
51 current_thread_info()->status &= ~TS_POLLING;
52 smp_mb__after_clear_bit();
53 safe_halt();
54 current_thread_info()->status |= TS_POLLING;
55 }
56}
57
58void __init xen_arch_setup(void)
59{
60 struct physdev_set_iopl set_iopl;
61 int rc;
62
63 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
64 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
65
66 if (!xen_feature(XENFEAT_auto_translated_physmap))
67 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
68
69 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
70 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
71
72 set_iopl.iopl = 1;
73 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
74 if (rc != 0)
75 printk(KERN_INFO "physdev_op failed %d\n", rc);
76
77#ifdef CONFIG_ACPI
78 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
79 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
80 disable_acpi();
81 }
82#endif
83
84 memcpy(boot_command_line, xen_start_info->cmd_line,
85 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
86 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
87
88 pm_idle = xen_idle;
89
90#ifdef CONFIG_SMP
91 /* fill cpus_possible with all available cpus */
92 xen_fill_possible_map();
93#endif
94
95 paravirt_disable_iospace();
96}
diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c
new file mode 100644
index 000000000000..557b8e24706a
--- /dev/null
+++ b/arch/i386/xen/smp.c
@@ -0,0 +1,404 @@
1/*
2 * Xen SMP support
3 *
4 * This file implements the Xen versions of smp_ops. SMP under Xen is
5 * very straightforward. Bringing a CPU up is simply a matter of
6 * loading its initial context and setting it running.
7 *
8 * IPIs are handled through the Xen event mechanism.
9 *
10 * Because virtual CPUs can be scheduled onto any real CPU, there's no
11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */
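/*
 * Sketch of the cross-CPU call path implemented below: the initiator
 * fills in the shared call_data and kicks each target CPU through its
 * per-cpu call-function event channel; each target runs the function in
 * interrupt context and acknowledges via the counters.
 *
 *	xen_smp_call_function_mask(mask, func, info, wait)
 *	  -> xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR)
 *	       -> xen_send_IPI_one(cpu, vector)	   // notify_remote_via_irq()
 *	... on each target cpu ...
 *	xen_call_function_interrupt()
 *	  -> atomic_inc(&call_data->started); func(info);
 *	     if (wait) atomic_inc(&call_data->finished);
 */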
17#include <linux/sched.h>
18#include <linux/err.h>
19#include <linux/smp.h>
20
21#include <asm/paravirt.h>
22#include <asm/desc.h>
23#include <asm/pgtable.h>
24#include <asm/cpu.h>
25
26#include <xen/interface/xen.h>
27#include <xen/interface/vcpu.h>
28
29#include <asm/xen/interface.h>
30#include <asm/xen/hypercall.h>
31
32#include <xen/page.h>
33#include <xen/events.h>
34
35#include "xen-ops.h"
36#include "mmu.h"
37
38static cpumask_t cpu_initialized_map;
39static DEFINE_PER_CPU(int, resched_irq);
40static DEFINE_PER_CPU(int, callfunc_irq);
41
42/*
43 * Structure and data for smp_call_function(). This is designed to minimise
44 * static memory requirements. It also looks cleaner.
45 */
46static DEFINE_SPINLOCK(call_lock);
47
48struct call_data_struct {
49 void (*func) (void *info);
50 void *info;
51 atomic_t started;
52 atomic_t finished;
53 int wait;
54};
55
56static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
57
58static struct call_data_struct *call_data;
59
60/*
61 * Reschedule call back. Nothing to do,
62 * all the work is done automatically when
63 * we return from the interrupt.
64 */
65static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
66{
67 return IRQ_HANDLED;
68}
69
70static __cpuinit void cpu_bringup_and_idle(void)
71{
72 int cpu = smp_processor_id();
73
74 cpu_init();
75
76 preempt_disable();
77 per_cpu(cpu_state, cpu) = CPU_ONLINE;
78
79 xen_setup_cpu_clockevents();
80
81 /* We can take interrupts now: we're officially "up". */
82 local_irq_enable();
83
84 wmb(); /* make sure everything is out */
85 cpu_idle();
86}
87
88static int xen_smp_intr_init(unsigned int cpu)
89{
90 int rc;
91 const char *resched_name, *callfunc_name;
92
93 per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
94
95 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
96 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
97 cpu,
98 xen_reschedule_interrupt,
99 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
100 resched_name,
101 NULL);
102 if (rc < 0)
103 goto fail;
104 per_cpu(resched_irq, cpu) = rc;
105
106 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
107 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
108 cpu,
109 xen_call_function_interrupt,
110 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
111 callfunc_name,
112 NULL);
113 if (rc < 0)
114 goto fail;
115 per_cpu(callfunc_irq, cpu) = rc;
116
117 return 0;
118
119 fail:
120 if (per_cpu(resched_irq, cpu) >= 0)
121 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
122 if (per_cpu(callfunc_irq, cpu) >= 0)
123 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
124 return rc;
125}
126
127void __init xen_fill_possible_map(void)
128{
129 int i, rc;
130
131 for (i = 0; i < NR_CPUS; i++) {
132 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
133 if (rc >= 0)
134 cpu_set(i, cpu_possible_map);
135 }
136}
137
138void __init xen_smp_prepare_boot_cpu(void)
139{
140 int cpu;
141
142 BUG_ON(smp_processor_id() != 0);
143 native_smp_prepare_boot_cpu();
144
145 /* We've switched to the "real" per-cpu gdt, so make sure the
146 old memory can be recycled */
147 make_lowmem_page_readwrite(&per_cpu__gdt_page);
148
149 for (cpu = 0; cpu < NR_CPUS; cpu++) {
150 cpus_clear(cpu_sibling_map[cpu]);
151 cpus_clear(cpu_core_map[cpu]);
152 }
153
154 xen_setup_vcpu_info_placement();
155}
156
157void __init xen_smp_prepare_cpus(unsigned int max_cpus)
158{
159 unsigned cpu;
160
161 for (cpu = 0; cpu < NR_CPUS; cpu++) {
162 cpus_clear(cpu_sibling_map[cpu]);
163 cpus_clear(cpu_core_map[cpu]);
164 }
165
166 smp_store_cpu_info(0);
167 set_cpu_sibling_map(0);
168
169 if (xen_smp_intr_init(0))
170 BUG();
171
172 cpu_initialized_map = cpumask_of_cpu(0);
173
174 /* Restrict the possible_map according to max_cpus. */
175 while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
176 for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
177 continue;
178 cpu_clear(cpu, cpu_possible_map);
179 }
180
181 for_each_possible_cpu (cpu) {
182 struct task_struct *idle;
183
184 if (cpu == 0)
185 continue;
186
187 idle = fork_idle(cpu);
188 if (IS_ERR(idle))
189 panic("failed fork for CPU %d", cpu);
190
191 cpu_set(cpu, cpu_present_map);
192 }
193
194 //init_xenbus_allowed_cpumask();
195}
196
197static __cpuinit int
198cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
199{
200 struct vcpu_guest_context *ctxt;
201 struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
202
203 if (cpu_test_and_set(cpu, cpu_initialized_map))
204 return 0;
205
206 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
207 if (ctxt == NULL)
208 return -ENOMEM;
209
210 ctxt->flags = VGCF_IN_KERNEL;
211 ctxt->user_regs.ds = __USER_DS;
212 ctxt->user_regs.es = __USER_DS;
213 ctxt->user_regs.fs = __KERNEL_PERCPU;
214 ctxt->user_regs.gs = 0;
215 ctxt->user_regs.ss = __KERNEL_DS;
216 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
217 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
218
219 memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
220
221 xen_copy_trap_info(ctxt->trap_ctxt);
222
223 ctxt->ldt_ents = 0;
224
225 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
226 make_lowmem_page_readonly(gdt->gdt);
227
228 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
229 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
230
231 ctxt->user_regs.cs = __KERNEL_CS;
232 ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
233
234 ctxt->kernel_ss = __KERNEL_DS;
235 ctxt->kernel_sp = idle->thread.esp0;
236
237 ctxt->event_callback_cs = __KERNEL_CS;
238 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
239 ctxt->failsafe_callback_cs = __KERNEL_CS;
240 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
241
242 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
243 ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
244
245 if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
246 BUG();
247
248 kfree(ctxt);
249 return 0;
250}
251
252int __cpuinit xen_cpu_up(unsigned int cpu)
253{
254 struct task_struct *idle = idle_task(cpu);
255 int rc;
256
257#if 0
258 rc = cpu_up_check(cpu);
259 if (rc)
260 return rc;
261#endif
262
263 init_gdt(cpu);
264 per_cpu(current_task, cpu) = idle;
265 irq_ctx_init(cpu);
266 xen_setup_timer(cpu);
267
268 /* make sure interrupts start blocked */
269 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
270
271 rc = cpu_initialize_context(cpu, idle);
272 if (rc)
273 return rc;
274
275 if (num_online_cpus() == 1)
276 alternatives_smp_switch(1);
277
278 rc = xen_smp_intr_init(cpu);
279 if (rc)
280 return rc;
281
282 smp_store_cpu_info(cpu);
283 set_cpu_sibling_map(cpu);
284 /* This must be done before setting cpu_online_map */
285 wmb();
286
287 cpu_set(cpu, cpu_online_map);
288
289 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
290 BUG_ON(rc);
291
292 return 0;
293}
294
295void xen_smp_cpus_done(unsigned int max_cpus)
296{
297}
298
299static void stop_self(void *v)
300{
301 int cpu = smp_processor_id();
302
303 /* make sure we're not pinning something down */
304 load_cr3(swapper_pg_dir);
305 /* should set up a minimal gdt */
306
307 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
308 BUG();
309}
310
311void xen_smp_send_stop(void)
312{
313 smp_call_function(stop_self, NULL, 0, 0);
314}
315
316void xen_smp_send_reschedule(int cpu)
317{
318 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
319}
320
321
322static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
323{
324 unsigned cpu;
325
326 cpus_and(mask, mask, cpu_online_map);
327
328 for_each_cpu_mask(cpu, mask)
329 xen_send_IPI_one(cpu, vector);
330}
331
332static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
333{
334 void (*func) (void *info) = call_data->func;
335 void *info = call_data->info;
336 int wait = call_data->wait;
337
338 /*
339 * Notify initiating CPU that I've grabbed the data and am
340 * about to execute the function
341 */
342 mb();
343 atomic_inc(&call_data->started);
344 /*
345 * At this point the info structure may be out of scope unless wait==1
346 */
347 irq_enter();
348 (*func)(info);
349 irq_exit();
350
351 if (wait) {
352 mb(); /* commit everything before setting finished */
353 atomic_inc(&call_data->finished);
354 }
355
356 return IRQ_HANDLED;
357}
358
359int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
360 void *info, int wait)
361{
362 struct call_data_struct data;
363 int cpus;
364
365 /* Holding any lock stops cpus from going down. */
366 spin_lock(&call_lock);
367
368 cpu_clear(smp_processor_id(), mask);
369
370 cpus = cpus_weight(mask);
371 if (!cpus) {
372 spin_unlock(&call_lock);
373 return 0;
374 }
375
376 /* Can deadlock when called with interrupts disabled */
377 WARN_ON(irqs_disabled());
378
379 data.func = func;
380 data.info = info;
381 atomic_set(&data.started, 0);
382 data.wait = wait;
383 if (wait)
384 atomic_set(&data.finished, 0);
385
386 call_data = &data;
387 mb(); /* write everything before IPI */
388
389 /* Send a message to other CPUs and wait for them to respond */
390 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
391
392 /* Make sure other vcpus get a chance to run.
393	   XXX too severe? Maybe we should check the other CPUs' states? */
394 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
395
396 /* Wait for response */
397 while (atomic_read(&data.started) != cpus ||
398 (wait && atomic_read(&data.finished) != cpus))
399 cpu_relax();
400
401 spin_unlock(&call_lock);
402
403 return 0;
404}
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
new file mode 100644
index 000000000000..51fdabf1fd4d
--- /dev/null
+++ b/arch/i386/xen/time.c
@@ -0,0 +1,590 @@
1/*
2 * Xen time implementation.
3 *
4 * This is implemented in terms of a clocksource driver which uses
5 * the hypervisor clock as a nanosecond timebase, and a clockevent
6 * driver which uses the hypervisor's timer mechanism.
7 *
8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
9 */
10#include <linux/kernel.h>
11#include <linux/interrupt.h>
12#include <linux/clocksource.h>
13#include <linux/clockchips.h>
14#include <linux/kernel_stat.h>
15
16#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h>
18
19#include <xen/events.h>
20#include <xen/interface/xen.h>
21#include <xen/interface/vcpu.h>
22
23#include "xen-ops.h"
24
25#define XEN_SHIFT 22
26
27/* Xen may fire a timer up to this many ns early */
28#define TIMER_SLOP 100000
29#define NS_PER_TICK (1000000000LL / HZ)
30
31static cycle_t xen_clocksource_read(void);
32
33/* These are periodically updated in shared_info, and then copied here. */
34struct shadow_time_info {
35 u64 tsc_timestamp; /* TSC at last update of time vals. */
36 u64 system_timestamp; /* Time, in nanosecs, since boot. */
37 u32 tsc_to_nsec_mul;
38 int tsc_shift;
39 u32 version;
40};
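/*
 * A simplified sketch of how such a record is conventionally consumed,
 * assuming tsc_to_nsec_mul is a 32.32 fixed-point multiplier applied
 * after shifting by tsc_shift (a real conversion needs a full
 * 64x32->96-bit multiply to avoid overflow; this form is illustrative
 * only):
 *
 *	u64 tsc, delta;
 *	rdtscll(tsc);
 *	delta = tsc - shadow->tsc_timestamp;
 *	if (shadow->tsc_shift >= 0)
 *		delta <<= shadow->tsc_shift;
 *	else
 *		delta >>= -shadow->tsc_shift;
 *	now_ns = shadow->system_timestamp + ((delta * shadow->tsc_to_nsec_mul) >> 32);
 */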
41
42static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
43
44/* runstate info updated by Xen */
45static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
46
47/* snapshots of runstate info */
48static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
49
50/* unused ns of stolen and blocked time */
51static DEFINE_PER_CPU(u64, residual_stolen);
52static DEFINE_PER_CPU(u64, residual_blocked);
53
54/* return a consistent snapshot of a 64-bit time/counter value */
55static u64 get64(const u64 *p)
56{
57 u64 ret;
58
59 if (BITS_PER_LONG < 64) {
60 u32 *p32 = (u32 *)p;
61 u32 h, l;
62
63 /*
64 * Read high then low, and then make sure high is
65 * still the same; this will only loop if low wraps
66 * and carries into high.
67 * XXX some clean way to make this endian-proof?
68 */
69 do {
70 h = p32[1];
71 barrier();
72 l = p32[0];
73 barrier();
74 } while (p32[1] != h);
75
76 ret = (((u64)h) << 32) | l;
77 } else
78 ret = *p;
79
80 return ret;
81}
82
83/*
84 * Runstate accounting
85 */
86static void get_runstate_snapshot(struct vcpu_runstate_info *res)
87{
88 u64 state_time;
89 struct vcpu_runstate_info *state;
90
91 BUG_ON(preemptible());
92
93 state = &__get_cpu_var(runstate);
94
95 /*
96 * The runstate info is always updated by the hypervisor on
97 * the current CPU, so there's no need to use anything
98 * stronger than a compiler barrier when fetching it.
99 */
100 do {
101 state_time = get64(&state->state_entry_time);
102 barrier();
103 *res = *state;
104 barrier();
105 } while (get64(&state->state_entry_time) != state_time);
106}
107
108static void setup_runstate_info(int cpu)
109{
110 struct vcpu_register_runstate_memory_area area;
111
112 area.addr.v = &per_cpu(runstate, cpu);
113
114 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
115 cpu, &area))
116 BUG();
117}
118
119static void do_stolen_accounting(void)
120{
121 struct vcpu_runstate_info state;
122 struct vcpu_runstate_info *snap;
123 s64 blocked, runnable, offline, stolen;
124 cputime_t ticks;
125
126 get_runstate_snapshot(&state);
127
128 WARN_ON(state.state != RUNSTATE_running);
129
130 snap = &__get_cpu_var(runstate_snapshot);
131
132 /* work out how much time the VCPU has not been runn*ing* */
133 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
134 runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
135 offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
136
137 *snap = state;
138
139 /* Add the appropriate number of ticks of stolen time,
140 including any left-overs from last time. Passing NULL to
141 account_steal_time accounts the time as stolen. */
142 stolen = runnable + offline + __get_cpu_var(residual_stolen);
143
144 if (stolen < 0)
145 stolen = 0;
146
147 ticks = 0;
148 while (stolen >= NS_PER_TICK) {
149 ticks++;
150 stolen -= NS_PER_TICK;
151 }
152 __get_cpu_var(residual_stolen) = stolen;
153 account_steal_time(NULL, ticks);
154
155 /* Add the appropriate number of ticks of blocked time,
156 including any left-overs from last time. Passing idle to
157 account_steal_time accounts the time as idle/wait. */
158 blocked += __get_cpu_var(residual_blocked);
159
160 if (blocked < 0)
161 blocked = 0;
162
163 ticks = 0;
164 while (blocked >= NS_PER_TICK) {
165 ticks++;
166 blocked -= NS_PER_TICK;
167 }
168 __get_cpu_var(residual_blocked) = blocked;
169 account_steal_time(idle_task(smp_processor_id()), ticks);
170}
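Worked example (assuming HZ = 250 for illustration, so NS_PER_TICK = 4,000,000): if 9,500,000 ns of runnable+offline time have accumulated since the last snapshot, the loop above accounts 2 stolen ticks and carries the remaining 1,500,000 ns forward in residual_stolen for the next timer interrupt; blocked time is handled the same way via residual_blocked.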
171
172/*
173 * Xen sched_clock implementation. Returns the number of unstolen
174 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
175 * states.
176 */
177unsigned long long xen_sched_clock(void)
178{
179 struct vcpu_runstate_info state;
180 cycle_t now;
181 u64 ret;
182 s64 offset;
183
184 /*
185 * Ideally sched_clock should be called on a per-cpu basis
186 * anyway, so preempt should already be disabled, but that's
187	 * not currently the practice.
188 */
189 preempt_disable();
190
191 now = xen_clocksource_read();
192
193 get_runstate_snapshot(&state);
194
195 WARN_ON(state.state != RUNSTATE_running);
196
197 offset = now - state.state_entry_time;
198 if (offset < 0)
199 offset = 0;
200
201 ret = state.time[RUNSTATE_blocked] +
202 state.time[RUNSTATE_running] +
203 offset;
204
205 preempt_enable();
206
207 return ret;
208}
209
210
211/* Get the CPU speed from Xen */
212unsigned long xen_cpu_khz(void)
213{
214 u64 cpu_khz = 1000000ULL << 32;
215 const struct vcpu_time_info *info =
216 &HYPERVISOR_shared_info->vcpu_info[0].time;
217
218 do_div(cpu_khz, info->tsc_to_system_mul);
219 if (info->tsc_shift < 0)
220 cpu_khz <<= -info->tsc_shift;
221 else
222 cpu_khz >>= info->tsc_shift;
223
224 return cpu_khz;
225}
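For illustration: tsc_to_system_mul is defined such that nanoseconds = (tsc_delta * mul) >> 32 after applying tsc_shift, so a 2 GHz TSC (0.5 ns per cycle, tsc_shift = 0) has mul of roughly 2^31, and the computation above yields 10^6 * 2^32 / 2^31 = 2,000,000 kHz, i.e. 2 GHz.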
226
227/*
228 * Reads a consistent set of time-base values from Xen, into a shadow data
229 * area.
230 */
231static unsigned get_time_values_from_xen(void)
232{
233 struct vcpu_time_info *src;
234 struct shadow_time_info *dst;
235
236 /* src is shared memory with the hypervisor, so we need to
237 make sure we get a consistent snapshot, even in the face of
238 being preempted. */
239 src = &__get_cpu_var(xen_vcpu)->time;
240 dst = &__get_cpu_var(shadow_time);
241
242 do {
243 dst->version = src->version;
244 rmb(); /* fetch version before data */
245 dst->tsc_timestamp = src->tsc_timestamp;
246 dst->system_timestamp = src->system_time;
247 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
248 dst->tsc_shift = src->tsc_shift;
249 rmb(); /* test version after fetching data */
250 } while ((src->version & 1) | (dst->version ^ src->version));
251
252 return dst->version;
253}
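The retry condition encodes Xen's update protocol: the hypervisor bumps version to an odd value before rewriting the time fields and to an even value afterwards, so the copy is retried while an update is in flight (odd version) or if the version changed under us. xen_read_wallclock() below applies the same check to the wc_version/wc_sec/wc_nsec fields.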
254
255/*
256 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
257 * yielding a 64-bit result.
258 */
259static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
260{
261 u64 product;
262#ifdef __i386__
263 u32 tmp1, tmp2;
264#endif
265
266 if (shift < 0)
267 delta >>= -shift;
268 else
269 delta <<= shift;
270
271#ifdef __i386__
272 __asm__ (
273 "mul %5 ; "
274 "mov %4,%%eax ; "
275 "mov %%edx,%4 ; "
276 "mul %5 ; "
277 "xor %5,%5 ; "
278 "add %4,%%eax ; "
279 "adc %5,%%edx ; "
280 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
281 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
282#elif __x86_64__
283 __asm__ (
284 "mul %%rdx ; shrd $32,%%rdx,%%rax"
285 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
286#else
287#error implement me!
288#endif
289
290 return product;
291}
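For reference, a portable C sketch of the same computation (illustrative only, assuming a compiler that provides unsigned __int128; the patch itself relies on the inline assembly above):

static inline u64 scale_delta_portable(u64 delta, u32 mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	/* 64x32-bit multiply, keeping bits 32..95 of the full product */
	return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
}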
292
293static u64 get_nsec_offset(struct shadow_time_info *shadow)
294{
295 u64 now, delta;
296 now = native_read_tsc();
297 delta = now - shadow->tsc_timestamp;
298 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
299}
300
301static cycle_t xen_clocksource_read(void)
302{
303 struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
304 cycle_t ret;
305 unsigned version;
306
307 do {
308 version = get_time_values_from_xen();
309 barrier();
310 ret = shadow->system_timestamp + get_nsec_offset(shadow);
311 barrier();
312 } while (version != __get_cpu_var(xen_vcpu)->time.version);
313
314 put_cpu_var(shadow_time);
315
316 return ret;
317}
318
319static void xen_read_wallclock(struct timespec *ts)
320{
321 const struct shared_info *s = HYPERVISOR_shared_info;
322 u32 version;
323 u64 delta;
324 struct timespec now;
325
326 /* get wallclock at system boot */
327 do {
328 version = s->wc_version;
329 rmb(); /* fetch version before time */
330 now.tv_sec = s->wc_sec;
331 now.tv_nsec = s->wc_nsec;
332 rmb(); /* fetch time before checking version */
333 } while ((s->wc_version & 1) | (version ^ s->wc_version));
334
335 delta = xen_clocksource_read(); /* time since system boot */
336 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
337
338 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
339 now.tv_sec = delta;
340
341 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
342}
343
344unsigned long xen_get_wallclock(void)
345{
346 struct timespec ts;
347
348 xen_read_wallclock(&ts);
349
350 return ts.tv_sec;
351}
352
353int xen_set_wallclock(unsigned long now)
354{
355 /* do nothing for domU */
356 return -1;
357}
358
359static struct clocksource xen_clocksource __read_mostly = {
360 .name = "xen",
361 .rating = 400,
362 .read = xen_clocksource_read,
363 .mask = ~0,
364 .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
365 .shift = XEN_SHIFT,
366 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
367};
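Because mult is 1 << XEN_SHIFT and shift is XEN_SHIFT, the generic clocksource conversion ns = (cycles * mult) >> shift reduces to ns = cycles: xen_clocksource_read() already returns nanoseconds, so the scaling here is an identity.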
368
369/*
370 Xen clockevent implementation
371
372 Xen has two clockevent implementations:
373
374 The old timer_op one works with all released versions of Xen prior
375 to version 3.0.4. This version of the hypervisor provides a
376  single-shot timer with nanosecond resolution.  However, a 100Hz
377  tick sharing the same event channel is delivered while the
378  vcpu is running.  We don't care about or use this tick, but it will
379 cause the core time code to think the timer fired too soon, and
380 will end up resetting it each time. It could be filtered, but
381 doing so has complications when the ktime clocksource is not yet
382 the xen clocksource (ie, at boot time).
383
384 The new vcpu_op-based timer interface allows the tick timer period
385 to be changed or turned off. The tick timer is not useful as a
386 periodic timer because events are only delivered to running vcpus.
387 The one-shot timer can report when a timeout is in the past, so
388 set_next_event is capable of returning -ETIME when appropriate.
389 This interface is used when available.
390*/
391
392
393/*
394 Get a hypervisor absolute time. In theory we could maintain an
395 offset between the kernel's time and the hypervisor's time, and
396  apply that to the kernel's absolute timeout.  Unfortunately the
397 hypervisor and kernel times can drift even if the kernel is using
398 the Xen clocksource, because ntp can warp the kernel's clocksource.
399*/
400static s64 get_abs_timeout(unsigned long delta)
401{
402 return xen_clocksource_read() + delta;
403}
404
405static void xen_timerop_set_mode(enum clock_event_mode mode,
406 struct clock_event_device *evt)
407{
408 switch (mode) {
409 case CLOCK_EVT_MODE_PERIODIC:
410 /* unsupported */
411 WARN_ON(1);
412 break;
413
414 case CLOCK_EVT_MODE_ONESHOT:
415 break;
416
417 case CLOCK_EVT_MODE_UNUSED:
418 case CLOCK_EVT_MODE_SHUTDOWN:
419 HYPERVISOR_set_timer_op(0); /* cancel timeout */
420 break;
421 }
422}
423
424static int xen_timerop_set_next_event(unsigned long delta,
425 struct clock_event_device *evt)
426{
427 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
428
429 if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
430 BUG();
431
432 /* We may have missed the deadline, but there's no real way of
433 knowing for sure. If the event was in the past, then we'll
434 get an immediate interrupt. */
435
436 return 0;
437}
438
439static const struct clock_event_device xen_timerop_clockevent = {
440 .name = "xen",
441 .features = CLOCK_EVT_FEAT_ONESHOT,
442
443 .max_delta_ns = 0xffffffff,
444 .min_delta_ns = TIMER_SLOP,
445
446 .mult = 1,
447 .shift = 0,
448 .rating = 500,
449
450 .set_mode = xen_timerop_set_mode,
451 .set_next_event = xen_timerop_set_next_event,
452};
453
454
455
456static void xen_vcpuop_set_mode(enum clock_event_mode mode,
457 struct clock_event_device *evt)
458{
459 int cpu = smp_processor_id();
460
461 switch (mode) {
462 case CLOCK_EVT_MODE_PERIODIC:
463 WARN_ON(1); /* unsupported */
464 break;
465
466 case CLOCK_EVT_MODE_ONESHOT:
467 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
468 BUG();
469 break;
470
471 case CLOCK_EVT_MODE_UNUSED:
472 case CLOCK_EVT_MODE_SHUTDOWN:
473 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
474 HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
475 BUG();
476 break;
477 }
478}
479
480static int xen_vcpuop_set_next_event(unsigned long delta,
481 struct clock_event_device *evt)
482{
483 int cpu = smp_processor_id();
484 struct vcpu_set_singleshot_timer single;
485 int ret;
486
487 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
488
489 single.timeout_abs_ns = get_abs_timeout(delta);
490 single.flags = VCPU_SSHOTTMR_future;
491
492 ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
493
494 BUG_ON(ret != 0 && ret != -ETIME);
495
496 return ret;
497}
498
499static const struct clock_event_device xen_vcpuop_clockevent = {
500 .name = "xen",
501 .features = CLOCK_EVT_FEAT_ONESHOT,
502
503 .max_delta_ns = 0xffffffff,
504 .min_delta_ns = TIMER_SLOP,
505
506 .mult = 1,
507 .shift = 0,
508 .rating = 500,
509
510 .set_mode = xen_vcpuop_set_mode,
511 .set_next_event = xen_vcpuop_set_next_event,
512};
513
514static const struct clock_event_device *xen_clockevent =
515 &xen_timerop_clockevent;
516static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
517
518static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
519{
520 struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
521 irqreturn_t ret;
522
523 ret = IRQ_NONE;
524 if (evt->event_handler) {
525 evt->event_handler(evt);
526 ret = IRQ_HANDLED;
527 }
528
529 do_stolen_accounting();
530
531 return ret;
532}
533
534void xen_setup_timer(int cpu)
535{
536 const char *name;
537 struct clock_event_device *evt;
538 int irq;
539
540 printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
541
542 name = kasprintf(GFP_KERNEL, "timer%d", cpu);
543 if (!name)
544 name = "<timer kasprintf failed>";
545
546 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
547 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
548 name, NULL);
549
550 evt = &per_cpu(xen_clock_events, cpu);
551 memcpy(evt, xen_clockevent, sizeof(*evt));
552
553 evt->cpumask = cpumask_of_cpu(cpu);
554 evt->irq = irq;
555
556 setup_runstate_info(cpu);
557}
558
559void xen_setup_cpu_clockevents(void)
560{
561 BUG_ON(preemptible());
562
563 clockevents_register_device(&__get_cpu_var(xen_clock_events));
564}
565
566__init void xen_time_init(void)
567{
568 int cpu = smp_processor_id();
569
570 get_time_values_from_xen();
571
572 clocksource_register(&xen_clocksource);
573
574 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
575 /* Successfully turned off 100Hz tick, so we have the
576 vcpuop-based timer interface */
577 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
578 xen_clockevent = &xen_vcpuop_clockevent;
579 }
580
581 /* Set initial system time with full resolution */
582 xen_read_wallclock(&xtime);
583 set_normalized_timespec(&wall_to_monotonic,
584 -xtime.tv_sec, -xtime.tv_nsec);
585
586 tsc_disable = 0;
587
588 xen_setup_timer(cpu);
589 xen_setup_cpu_clockevents();
590}
diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S
new file mode 100644
index 000000000000..1a43b60c0c62
--- /dev/null
+++ b/arch/i386/xen/xen-asm.S
@@ -0,0 +1,291 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h>
20#include <asm/segment.h>
21
22#include <xen/interface/xen.h>
23
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32   event status with a single 'and' operation.  If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Clear mask and test pending */
37 andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
38 /* Preempt here doesn't matter because that will deal with
39 any pending interrupts. The pending check may end up being
40 run on the wrong CPU, but that doesn't hurt. */
41 jz 1f
422: call check_events
431:
44ENDPATCH(xen_irq_enable_direct)
45 ret
46 ENDPROC(xen_irq_enable_direct)
47 RELOC(xen_irq_enable_direct, 2b+1)
48
49
50/*
51 Disabling events is simply a matter of making the event mask
52 non-zero.
53 */
54ENTRY(xen_irq_disable_direct)
55 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
56ENDPATCH(xen_irq_disable_direct)
57 ret
58 ENDPROC(xen_irq_disable_direct)
59 RELOC(xen_irq_disable_direct, 0)
60
61/*
62 (xen_)save_fl is used to get the current interrupt enable status.
63 Callers expect the status to be in X86_EFLAGS_IF, and other bits
64 may be set in the return value. We take advantage of this by
65 making sure that X86_EFLAGS_IF has the right value (and other bits
66 in that byte are 0), but other bits in the return value are
67 undefined. We need to toggle the state of the bit, because
68 Xen and x86 use opposite senses (mask vs enable).
69 */
70ENTRY(xen_save_fl_direct)
71 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
72 setz %ah
73 addb %ah,%ah
74ENDPATCH(xen_save_fl_direct)
75 ret
76 ENDPROC(xen_save_fl_direct)
77 RELOC(xen_save_fl_direct, 0)
78
79
80/*
81   In principle the caller should be passing us a value returned
82   from xen_save_fl_direct, but for robustness' sake we test only
83 the X86_EFLAGS_IF flag rather than the whole byte. After
84 setting the interrupt mask state, it checks for unmasked
85 pending events and enters the hypervisor to get them delivered
86 if so.
87 */
88ENTRY(xen_restore_fl_direct)
89 testb $X86_EFLAGS_IF>>8, %ah
90 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
91 /* Preempt here doesn't matter because that will deal with
92 any pending interrupts. The pending check may end up being
93 run on the wrong CPU, but that doesn't hurt. */
94
95 /* check for unmasked and pending */
96 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
97 jz 1f
982: call check_events
991:
100ENDPATCH(xen_restore_fl_direct)
101 ret
102 ENDPROC(xen_restore_fl_direct)
103 RELOC(xen_restore_fl_direct, 2b+1)
104
105/*
106 This is run where a normal iret would be run, with the same stack setup:
107 8: eflags
108 4: cs
109 esp-> 0: eip
110
111 This attempts to make sure that any pending events are dealt
112 with on return to usermode, but there is a small window in
113 which an event can happen just before entering usermode. If
114 the nested interrupt ends up setting one of the TIF_WORK_MASK
115 pending work flags, they will not be tested again before
116 returning to usermode. This means that a process can end up
117 with pending work, which will be unprocessed until the process
118 enters and leaves the kernel again, which could be an
119 unbounded amount of time. This means that a pending signal or
120 reschedule event could be indefinitely delayed.
121
122 The fix is to notice a nested interrupt in the critical
123 window, and if one occurs, then fold the nested interrupt into
124 the current interrupt stack frame, and re-process it
125 iteratively rather than recursively. This means that it will
126 exit via the normal path, and all pending work will be dealt
127 with appropriately.
128
129 Because the nested interrupt handler needs to deal with the
130  current stack state in whatever form it's in, we keep things
131 simple by only using a single register which is pushed/popped
132 on the stack.
133
134 Non-direct iret could be done in the same way, but it would
135 require an annoying amount of code duplication. We'll assume
136 that direct mode will be the common case once the hypervisor
137 support becomes commonplace.
138 */
139ENTRY(xen_iret_direct)
140 /* test eflags for special cases */
141 testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
142 jnz hyper_iret
143
144 push %eax
145 ESP_OFFSET=4 # bytes pushed onto stack
146
147 /* Store vcpu_info pointer for easy access. Do it this
148 way to avoid having to reload %fs */
149#ifdef CONFIG_SMP
150 GET_THREAD_INFO(%eax)
151 movl TI_cpu(%eax),%eax
152 movl __per_cpu_offset(,%eax,4),%eax
153 lea per_cpu__xen_vcpu_info(%eax),%eax
154#else
155 movl $per_cpu__xen_vcpu_info, %eax
156#endif
157
158 /* check IF state we're restoring */
159 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
160
161 /* Maybe enable events. Once this happens we could get a
162 recursive event, so the critical region starts immediately
163 afterwards. However, if that happens we don't end up
164 resuming the code, so we don't have to be worried about
165 being preempted to another CPU. */
166 setz XEN_vcpu_info_mask(%eax)
167xen_iret_start_crit:
168
169 /* check for unmasked and pending */
170 cmpw $0x0001, XEN_vcpu_info_pending(%eax)
171
172 /* If there's something pending, mask events again so we
173 can jump back into xen_hypervisor_callback */
174 sete XEN_vcpu_info_mask(%eax)
175
176 popl %eax
177
178 /* From this point on the registers are restored and the stack
179 updated, so we don't need to worry about it if we're preempted */
180iret_restore_end:
181
182 /* Jump to hypervisor_callback after fixing up the stack.
183 Events are masked, so jumping out of the critical
184 region is OK. */
185 je xen_hypervisor_callback
186
187 iret
188xen_iret_end_crit:
189
190hyper_iret:
191	/* put this out of line since it's very rarely used */
192 jmp hypercall_page + __HYPERVISOR_iret * 32
193
194 .globl xen_iret_start_crit, xen_iret_end_crit
195
196/*
197 This is called by xen_hypervisor_callback in entry.S when it sees
198 that the EIP at the time of interrupt was between xen_iret_start_crit
199 and xen_iret_end_crit. We're passed the EIP in %eax so we can do
200 a more refined determination of what to do.
201
202 The stack format at this point is:
203 ----------------
204 ss : (ss/esp may be present if we came from usermode)
205 esp :
206 eflags } outer exception info
207 cs }
208 eip }
209 ---------------- <- edi (copy dest)
210 eax : outer eax if it hasn't been restored
211 ----------------
212 eflags } nested exception info
213 cs } (no ss/esp because we're nested
214 eip } from the same ring)
215 orig_eax }<- esi (copy src)
216 - - - - - - - -
217 fs }
218 es }
219 ds } SAVE_ALL state
220 eax }
221 : :
222 ebx }
223 ----------------
224 return addr <- esp
225 ----------------
226
227 In order to deliver the nested exception properly, we need to shift
228 everything from the return addr up to the error code so it
229 sits just under the outer exception info. This means that when we
230 handle the exception, we do it in the context of the outer exception
231 rather than starting a new one.
232
233 The only caveat is that if the outer eax hasn't been
234 restored yet (ie, it's still on stack), we need to insert
235 its value into the SAVE_ALL state before going on, since
236 it's usermode state which we eventually need to restore.
237 */
238ENTRY(xen_iret_crit_fixup)
239 /* offsets +4 for return address */
240
241 /*
242	  Paranoia: Make sure we're really coming from kernel space.
243 One could imagine a case where userspace jumps into the
244 critical range address, but just before the CPU delivers a GP,
245 it decides to deliver an interrupt instead. Unlikely?
246 Definitely. Easy to avoid? Yes. The Intel documents
247 explicitly say that the reported EIP for a bad jump is the
248 jump instruction itself, not the destination, but some virtual
249 environments get this wrong.
250 */
251 movl PT_CS+4(%esp), %ecx
252 andl $SEGMENT_RPL_MASK, %ecx
253 cmpl $USER_RPL, %ecx
254 je 2f
255
256 lea PT_ORIG_EAX+4(%esp), %esi
257 lea PT_EFLAGS+4(%esp), %edi
258
259 /* If eip is before iret_restore_end then stack
260 hasn't been restored yet. */
261 cmp $iret_restore_end, %eax
262 jae 1f
263
264 movl 0+4(%edi),%eax /* copy EAX */
265 movl %eax, PT_EAX+4(%esp)
266
267 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
268
269 /* set up the copy */
2701: std
271 mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
272 rep movsl
273 cld
274
275 lea 4(%edi),%esp /* point esp to new frame */
2762: ret
277
278
279/*
280 Force an event check by making a hypercall,
281 but preserve regs before making the call.
282 */
283check_events:
284 push %eax
285 push %ecx
286 push %edx
287 call force_evtchn_callback
288 pop %edx
289 pop %ecx
290 pop %eax
291 ret
diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S
new file mode 100644
index 000000000000..2998d55a0017
--- /dev/null
+++ b/arch/i386/xen/xen-head.S
@@ -0,0 +1,36 @@
1/* Xen-specific pieces of head.S, intended to be included in the right
2 place in head.S */
3
4#ifdef CONFIG_XEN
5
6#include <linux/elfnote.h>
7#include <asm/boot.h>
8#include <xen/interface/elfnote.h>
9
10ENTRY(startup_xen)
11 movl %esi,xen_start_info
12 cld
13 movl $(init_thread_union+THREAD_SIZE),%esp
14 jmp xen_start_kernel
15
16.pushsection ".bss.page_aligned"
17 .align PAGE_SIZE_asm
18ENTRY(hypercall_page)
19 .skip 0x1000
20.popsection
21
22 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
23 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
24 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
25 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
26 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
27 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
28 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
29#ifdef CONFIG_X86_PAE
30 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
31#else
32 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
33#endif
34 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
35
36#endif /*CONFIG_XEN */
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h
new file mode 100644
index 000000000000..b9aaea45f07f
--- /dev/null
+++ b/arch/i386/xen/xen-ops.h
@@ -0,0 +1,71 @@
1#ifndef XEN_OPS_H
2#define XEN_OPS_H
3
4#include <linux/init.h>
5
6/* These are code, but not functions. Defined in entry.S */
7extern const char xen_hypervisor_callback[];
8extern const char xen_failsafe_callback[];
9
10void xen_copy_trap_info(struct trap_info *traps);
11
12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
13DECLARE_PER_CPU(unsigned long, xen_cr3);
14
15extern struct start_info *xen_start_info;
16extern struct shared_info *HYPERVISOR_shared_info;
17
18char * __init xen_memory_setup(void);
19void __init xen_arch_setup(void);
20void __init xen_init_IRQ(void);
21
22void xen_setup_timer(int cpu);
23void xen_setup_cpu_clockevents(void);
24unsigned long xen_cpu_khz(void);
25void __init xen_time_init(void);
26unsigned long xen_get_wallclock(void);
27int xen_set_wallclock(unsigned long time);
28unsigned long long xen_sched_clock(void);
29
30void xen_mark_init_mm_pinned(void);
31
32DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
33
34static inline unsigned xen_get_lazy_mode(void)
35{
36 return x86_read_percpu(xen_lazy_mode);
37}
38
39void __init xen_fill_possible_map(void);
40
41void __init xen_setup_vcpu_info_placement(void);
42void xen_smp_prepare_boot_cpu(void);
43void xen_smp_prepare_cpus(unsigned int max_cpus);
44int xen_cpu_up(unsigned int cpu);
45void xen_smp_cpus_done(unsigned int max_cpus);
46
47void xen_smp_send_stop(void);
48void xen_smp_send_reschedule(int cpu);
49int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
50 int wait);
51int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
52 int nonatomic, int wait);
53
54int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
55 void *info, int wait);
56
57
58/* Declare an asm function, along with symbols needed to make it
59 inlineable */
60#define DECL_ASM(ret, name, ...) \
61 ret name(__VA_ARGS__); \
62 extern char name##_end[]; \
63 extern char name##_reloc[] \
64
65DECL_ASM(void, xen_irq_enable_direct, void);
66DECL_ASM(void, xen_irq_disable_direct, void);
67DECL_ASM(unsigned long, xen_save_fl_direct, void);
68DECL_ASM(void, xen_restore_fl_direct, unsigned long);
69
70void xen_iret_direct(void);
71#endif /* XEN_OPS_H */
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 296d2b0c5d88..fd9aff3f3890 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -6,6 +6,7 @@
6#include <asm/io.h> 6#include <asm/io.h>
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/fcntl.h> 8#include <asm/fcntl.h>
9#include <xen/hvc-console.h>
9 10
10/* Simple VGA output */ 11/* Simple VGA output */
11 12
@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf)
242 simnow_init(buf + 6); 243 simnow_init(buf + 6);
243 early_console = &simnow_console; 244 early_console = &simnow_console;
244 keep_early = 1; 245 keep_early = 1;
246#ifdef CONFIG_HVC_XEN
247 } else if (!strncmp(buf, "xen", 3)) {
248 early_console = &xenboot_console;
249#endif
245 } 250 }
246 251
247 if (keep_early) 252 if (keep_early)
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index aa1d15991794..f3fb8174559e 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -174,7 +174,7 @@ static void do_mce_trigger(void)
174 if (events != atomic_read(&mce_logged) && trigger[0]) { 174 if (events != atomic_read(&mce_logged) && trigger[0]) {
175 /* Small race window, but should be harmless. */ 175 /* Small race window, but should be harmless. */
176 atomic_set(&mce_logged, events); 176 atomic_set(&mce_logged, events);
177 call_usermodehelper(trigger, trigger_argv, NULL, -1); 177 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
178 } 178 }
179} 179}
180 180
diff --git a/drivers/Makefile b/drivers/Makefile
index 503d82569449..6d9d7fab77f5 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -15,6 +15,8 @@ obj-$(CONFIG_ACPI) += acpi/
15obj-$(CONFIG_PNP) += pnp/ 15obj-$(CONFIG_PNP) += pnp/
16obj-$(CONFIG_ARM_AMBA) += amba/ 16obj-$(CONFIG_ARM_AMBA) += amba/
17 17
18obj-$(CONFIG_XEN) += xen/
19
18# char/ comes before serial/ etc so that the VT console is the boot-time 20# char/ comes before serial/ etc so that the VT console is the boot-time
19# default. 21# default.
20obj-y += char/ 22obj-y += char/
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 88a6fc7fd271..58f1338981bc 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -40,6 +40,7 @@
40#include <linux/jiffies.h> 40#include <linux/jiffies.h>
41#include <linux/kmod.h> 41#include <linux/kmod.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/reboot.h>
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
45#include <acpi/acpi_bus.h> 46#include <acpi/acpi_bus.h>
@@ -59,7 +60,6 @@
59#define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0 60#define ACPI_THERMAL_NOTIFY_CRITICAL 0xF0
60#define ACPI_THERMAL_NOTIFY_HOT 0xF1 61#define ACPI_THERMAL_NOTIFY_HOT 0xF1
61#define ACPI_THERMAL_MODE_ACTIVE 0x00 62#define ACPI_THERMAL_MODE_ACTIVE 0x00
62#define ACPI_THERMAL_PATH_POWEROFF "/sbin/poweroff"
63 63
64#define ACPI_THERMAL_MAX_ACTIVE 10 64#define ACPI_THERMAL_MAX_ACTIVE 10
65#define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65 65#define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65
@@ -419,26 +419,6 @@ static int acpi_thermal_get_devices(struct acpi_thermal *tz)
419 return 0; 419 return 0;
420} 420}
421 421
422static int acpi_thermal_call_usermode(char *path)
423{
424 char *argv[2] = { NULL, NULL };
425 char *envp[3] = { NULL, NULL, NULL };
426
427
428 if (!path)
429 return -EINVAL;
430
431 argv[0] = path;
432
433 /* minimal command environment */
434 envp[0] = "HOME=/";
435 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
436
437 call_usermodehelper(argv[0], argv, envp, 0);
438
439 return 0;
440}
441
442static int acpi_thermal_critical(struct acpi_thermal *tz) 422static int acpi_thermal_critical(struct acpi_thermal *tz)
443{ 423{
444 if (!tz || !tz->trips.critical.flags.valid) 424 if (!tz || !tz->trips.critical.flags.valid)
@@ -456,7 +436,7 @@ static int acpi_thermal_critical(struct acpi_thermal *tz)
456 acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, 436 acpi_bus_generate_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL,
457 tz->trips.critical.flags.enabled); 437 tz->trips.critical.flags.enabled);
458 438
459 acpi_thermal_call_usermode(ACPI_THERMAL_PATH_POWEROFF); 439 orderly_poweroff(true);
460 440
461 return 0; 441 return 0;
462} 442}
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 8f65b88cf711..a4a311992408 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -427,4 +427,13 @@ config XILINX_SYSACE
427 help 427 help
428 Include support for the Xilinx SystemACE CompactFlash interface 428 Include support for the Xilinx SystemACE CompactFlash interface
429 429
430config XEN_BLKDEV_FRONTEND
431 tristate "Xen virtual block device support"
432 depends on XEN
433 default y
434 help
435 This driver implements the front-end of the Xen virtual
436 block device driver. It communicates with a back-end driver
437 in another domain which drives the actual block device.
438
430endif # BLK_DEV 439endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 9ee08ab4ffa8..3e31532df0ed 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o
29obj-$(CONFIG_BLK_DEV_SX8) += sx8.o 29obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
30obj-$(CONFIG_BLK_DEV_UB) += ub.o 30obj-$(CONFIG_BLK_DEV_UB) += ub.o
31 31
32obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
new file mode 100644
index 000000000000..6746c29181f8
--- /dev/null
+++ b/drivers/block/xen-blkfront.c
@@ -0,0 +1,988 @@
1/*
2 * blkfront.c
3 *
4 * XenLinux virtual block device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 * Copyright (c) 2005, XenSource Ltd
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation; or, when distributed
16 * separately from the Linux kernel or incorporated into other
17 * software packages, subject to the following license:
18 *
19 * Permission is hereby granted, free of charge, to any person obtaining a copy
20 * of this source file (the "Software"), to deal in the Software without
21 * restriction, including without limitation the rights to use, copy, modify,
22 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23 * and to permit persons to whom the Software is furnished to do so, subject to
24 * the following conditions:
25 *
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35 * IN THE SOFTWARE.
36 */
37
38#include <linux/interrupt.h>
39#include <linux/blkdev.h>
40#include <linux/module.h>
41
42#include <xen/xenbus.h>
43#include <xen/grant_table.h>
44#include <xen/events.h>
45#include <xen/page.h>
46
47#include <xen/interface/grant_table.h>
48#include <xen/interface/io/blkif.h>
49
50#include <asm/xen/hypervisor.h>
51
52enum blkif_state {
53 BLKIF_STATE_DISCONNECTED,
54 BLKIF_STATE_CONNECTED,
55 BLKIF_STATE_SUSPENDED,
56};
57
58struct blk_shadow {
59 struct blkif_request req;
60 unsigned long request;
61 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
62};
63
64static struct block_device_operations xlvbd_block_fops;
65
66#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
67
68/*
69 * We have one of these per vbd, whether ide, scsi or 'other'. They
70 * hang in private_data off the gendisk structure. We may end up
71 * putting all kinds of interesting stuff here :-)
72 */
73struct blkfront_info
74{
75 struct xenbus_device *xbdev;
76 dev_t dev;
77 struct gendisk *gd;
78 int vdevice;
79 blkif_vdev_t handle;
80 enum blkif_state connected;
81 int ring_ref;
82 struct blkif_front_ring ring;
83 unsigned int evtchn, irq;
84 struct request_queue *rq;
85 struct work_struct work;
86 struct gnttab_free_callback callback;
87 struct blk_shadow shadow[BLK_RING_SIZE];
88 unsigned long shadow_free;
89 int feature_barrier;
90
91 /**
92 * The number of people holding this device open. We won't allow a
93 * hot-unplug unless this is 0.
94 */
95 int users;
96};
97
98static DEFINE_SPINLOCK(blkif_io_lock);
99
100#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
101 (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
102#define GRANT_INVALID_REF 0
103
104#define PARTS_PER_DISK 16
105
106#define BLKIF_MAJOR(dev) ((dev)>>8)
107#define BLKIF_MINOR(dev) ((dev) & 0xff)
108
109#define DEV_NAME "xvd" /* name in /dev */
110
111/* Information about our VBDs. */
112#define MAX_VBDS 64
113static LIST_HEAD(vbds_list);
114
115static int get_id_from_freelist(struct blkfront_info *info)
116{
117 unsigned long free = info->shadow_free;
118 BUG_ON(free > BLK_RING_SIZE);
119 info->shadow_free = info->shadow[free].req.id;
120 info->shadow[free].req.id = 0x0fffffee; /* debug */
121 return free;
122}
123
124static void add_id_to_freelist(struct blkfront_info *info,
125 unsigned long id)
126{
127 info->shadow[id].req.id = info->shadow_free;
128 info->shadow[id].request = 0;
129 info->shadow_free = id;
130}
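Note that the free list is threaded through the shadow entries themselves: for a free slot, req.id holds the index of the next free slot (blkfront_probe() below initialises the chain as 1, 2, ..., terminated by 0x0fffffff), and 0x0fffffee is just a poison value that makes misuse of a still-allocated id easier to spot.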
131
132static void blkif_restart_queue_callback(void *arg)
133{
134 struct blkfront_info *info = (struct blkfront_info *)arg;
135 schedule_work(&info->work);
136}
137
138/*
139 * blkif_queue_request
140 *
141 * request block io
142 *
143 * id: for guest use only.
144 * operation: BLKIF_OP_{READ,WRITE,PROBE}
145 * buffer: buffer to read/write into. this should be a
146 * virtual address in the guest os.
147 */
148static int blkif_queue_request(struct request *req)
149{
150 struct blkfront_info *info = req->rq_disk->private_data;
151 unsigned long buffer_mfn;
152 struct blkif_request *ring_req;
153 struct bio *bio;
154 struct bio_vec *bvec;
155 int idx;
156 unsigned long id;
157 unsigned int fsect, lsect;
158 int ref;
159 grant_ref_t gref_head;
160
161 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
162 return 1;
163
164 if (gnttab_alloc_grant_references(
165 BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
166 gnttab_request_free_callback(
167 &info->callback,
168 blkif_restart_queue_callback,
169 info,
170 BLKIF_MAX_SEGMENTS_PER_REQUEST);
171 return 1;
172 }
173
174 /* Fill out a communications ring structure. */
175 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
176 id = get_id_from_freelist(info);
177 info->shadow[id].request = (unsigned long)req;
178
179 ring_req->id = id;
180 ring_req->sector_number = (blkif_sector_t)req->sector;
181 ring_req->handle = info->handle;
182
183 ring_req->operation = rq_data_dir(req) ?
184 BLKIF_OP_WRITE : BLKIF_OP_READ;
185 if (blk_barrier_rq(req))
186 ring_req->operation = BLKIF_OP_WRITE_BARRIER;
187
188 ring_req->nr_segments = 0;
189 rq_for_each_bio (bio, req) {
190 bio_for_each_segment (bvec, bio, idx) {
191 BUG_ON(ring_req->nr_segments
192 == BLKIF_MAX_SEGMENTS_PER_REQUEST);
193 buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page));
194 fsect = bvec->bv_offset >> 9;
195 lsect = fsect + (bvec->bv_len >> 9) - 1;
196 /* install a grant reference. */
197 ref = gnttab_claim_grant_reference(&gref_head);
198 BUG_ON(ref == -ENOSPC);
199
200 gnttab_grant_foreign_access_ref(
201 ref,
202 info->xbdev->otherend_id,
203 buffer_mfn,
204 rq_data_dir(req) );
205
206 info->shadow[id].frame[ring_req->nr_segments] =
207 mfn_to_pfn(buffer_mfn);
208
209 ring_req->seg[ring_req->nr_segments] =
210 (struct blkif_request_segment) {
211 .gref = ref,
212 .first_sect = fsect,
213 .last_sect = lsect };
214
215 ring_req->nr_segments++;
216 }
217 }
218
219 info->ring.req_prod_pvt++;
220
221 /* Keep a private copy so we can reissue requests when recovering. */
222 info->shadow[id].req = *ring_req;
223
224 gnttab_free_grant_references(gref_head);
225
226 return 0;
227}
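Worked example: a bio_vec covering a whole page (bv_offset = 0, bv_len = 4096) yields fsect = 0 and lsect = 7, i.e. all eight 512-byte sectors of the granted frame, while a 1 KB vec starting at offset 512 yields fsect = 1 and lsect = 2.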
228
229
230static inline void flush_requests(struct blkfront_info *info)
231{
232 int notify;
233
234 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
235
236 if (notify)
237 notify_remote_via_irq(info->irq);
238}
239
240/*
241 * do_blkif_request
242 * read a block; request is in a request queue
243 */
244static void do_blkif_request(request_queue_t *rq)
245{
246 struct blkfront_info *info = NULL;
247 struct request *req;
248 int queued;
249
250 pr_debug("Entered do_blkif_request\n");
251
252 queued = 0;
253
254 while ((req = elv_next_request(rq)) != NULL) {
255 info = req->rq_disk->private_data;
256 if (!blk_fs_request(req)) {
257 end_request(req, 0);
258 continue;
259 }
260
261 if (RING_FULL(&info->ring))
262 goto wait;
263
264 pr_debug("do_blk_req %p: cmd %p, sec %lx, "
265 "(%u/%li) buffer:%p [%s]\n",
266 req, req->cmd, (unsigned long)req->sector,
267 req->current_nr_sectors,
268 req->nr_sectors, req->buffer,
269 rq_data_dir(req) ? "write" : "read");
270
271
272 blkdev_dequeue_request(req);
273 if (blkif_queue_request(req)) {
274 blk_requeue_request(rq, req);
275wait:
276 /* Avoid pointless unplugs. */
277 blk_stop_queue(rq);
278 break;
279 }
280
281 queued++;
282 }
283
284 if (queued != 0)
285 flush_requests(info);
286}
287
288static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
289{
290 request_queue_t *rq;
291
292 rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
293 if (rq == NULL)
294 return -1;
295
296 elevator_init(rq, "noop");
297
298 /* Hard sector size and max sectors impersonate the equiv. hardware. */
299 blk_queue_hardsect_size(rq, sector_size);
300 blk_queue_max_sectors(rq, 512);
301
302 /* Each segment in a request is up to an aligned page in size. */
303 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
304 blk_queue_max_segment_size(rq, PAGE_SIZE);
305
306 /* Ensure a merged request will fit in a single I/O ring slot. */
307 blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
308 blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
309
310 /* Make sure buffer addresses are sector-aligned. */
311 blk_queue_dma_alignment(rq, 511);
312
313 gd->queue = rq;
314
315 return 0;
316}
317
318
319static int xlvbd_barrier(struct blkfront_info *info)
320{
321 int err;
322
323 err = blk_queue_ordered(info->rq,
324 info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
325 NULL);
326
327 if (err)
328 return err;
329
330 printk(KERN_INFO "blkfront: %s: barriers %s\n",
331 info->gd->disk_name,
332 info->feature_barrier ? "enabled" : "disabled");
333 return 0;
334}
335
336
337static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity,
338 int vdevice, u16 vdisk_info, u16 sector_size,
339 struct blkfront_info *info)
340{
341 struct gendisk *gd;
342 int nr_minors = 1;
343 int err = -ENODEV;
344
345 BUG_ON(info->gd != NULL);
346 BUG_ON(info->rq != NULL);
347
348 if ((minor % PARTS_PER_DISK) == 0)
349 nr_minors = PARTS_PER_DISK;
350
351 gd = alloc_disk(nr_minors);
352 if (gd == NULL)
353 goto out;
354
355 if (nr_minors > 1)
356 sprintf(gd->disk_name, "%s%c", DEV_NAME,
357 'a' + minor / PARTS_PER_DISK);
358 else
359 sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
360 'a' + minor / PARTS_PER_DISK,
361 minor % PARTS_PER_DISK);
362
363 gd->major = XENVBD_MAJOR;
364 gd->first_minor = minor;
365 gd->fops = &xlvbd_block_fops;
366 gd->private_data = info;
367 gd->driverfs_dev = &(info->xbdev->dev);
368 set_capacity(gd, capacity);
369
370 if (xlvbd_init_blk_queue(gd, sector_size)) {
371 del_gendisk(gd);
372 goto out;
373 }
374
375 info->rq = gd->queue;
376 info->gd = gd;
377
378 if (info->feature_barrier)
379 xlvbd_barrier(info);
380
381 if (vdisk_info & VDISK_READONLY)
382 set_disk_ro(gd, 1);
383
384 if (vdisk_info & VDISK_REMOVABLE)
385 gd->flags |= GENHD_FL_REMOVABLE;
386
387 if (vdisk_info & VDISK_CDROM)
388 gd->flags |= GENHD_FL_CD;
389
390 return 0;
391
392 out:
393 return err;
394}
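For illustration, with PARTS_PER_DISK = 16: minor 0 allocates a 16-minor gendisk named xvda, minor 17 gets the single partition node xvdb1, and minor 16 is again a whole disk, xvdb.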
395
396static void kick_pending_request_queues(struct blkfront_info *info)
397{
398 if (!RING_FULL(&info->ring)) {
399 /* Re-enable calldowns. */
400 blk_start_queue(info->rq);
401 /* Kick things off immediately. */
402 do_blkif_request(info->rq);
403 }
404}
405
406static void blkif_restart_queue(struct work_struct *work)
407{
408 struct blkfront_info *info = container_of(work, struct blkfront_info, work);
409
410 spin_lock_irq(&blkif_io_lock);
411 if (info->connected == BLKIF_STATE_CONNECTED)
412 kick_pending_request_queues(info);
413 spin_unlock_irq(&blkif_io_lock);
414}
415
416static void blkif_free(struct blkfront_info *info, int suspend)
417{
418 /* Prevent new requests being issued until we fix things up. */
419 spin_lock_irq(&blkif_io_lock);
420 info->connected = suspend ?
421 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
422 /* No more blkif_request(). */
423 if (info->rq)
424 blk_stop_queue(info->rq);
425 /* No more gnttab callback work. */
426 gnttab_cancel_free_callback(&info->callback);
427 spin_unlock_irq(&blkif_io_lock);
428
429 /* Flush gnttab callback work. Must be done with no locks held. */
430 flush_scheduled_work();
431
432 /* Free resources associated with old device channel. */
433 if (info->ring_ref != GRANT_INVALID_REF) {
434 gnttab_end_foreign_access(info->ring_ref, 0,
435 (unsigned long)info->ring.sring);
436 info->ring_ref = GRANT_INVALID_REF;
437 info->ring.sring = NULL;
438 }
439 if (info->irq)
440 unbind_from_irqhandler(info->irq, info);
441 info->evtchn = info->irq = 0;
442
443}
444
445static void blkif_completion(struct blk_shadow *s)
446{
447 int i;
448 for (i = 0; i < s->req.nr_segments; i++)
449 gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
450}
451
452static irqreturn_t blkif_interrupt(int irq, void *dev_id)
453{
454 struct request *req;
455 struct blkif_response *bret;
456 RING_IDX i, rp;
457 unsigned long flags;
458 struct blkfront_info *info = (struct blkfront_info *)dev_id;
459 int uptodate;
460
461 spin_lock_irqsave(&blkif_io_lock, flags);
462
463 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
464 spin_unlock_irqrestore(&blkif_io_lock, flags);
465 return IRQ_HANDLED;
466 }
467
468 again:
469 rp = info->ring.sring->rsp_prod;
470 rmb(); /* Ensure we see queued responses up to 'rp'. */
471
472 for (i = info->ring.rsp_cons; i != rp; i++) {
473 unsigned long id;
474 int ret;
475
476 bret = RING_GET_RESPONSE(&info->ring, i);
477 id = bret->id;
478 req = (struct request *)info->shadow[id].request;
479
480 blkif_completion(&info->shadow[id]);
481
482 add_id_to_freelist(info, id);
483
484 uptodate = (bret->status == BLKIF_RSP_OKAY);
485 switch (bret->operation) {
486 case BLKIF_OP_WRITE_BARRIER:
487 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
488 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
489 info->gd->disk_name);
490 uptodate = -EOPNOTSUPP;
491 info->feature_barrier = 0;
492 xlvbd_barrier(info);
493 }
494 /* fall through */
495 case BLKIF_OP_READ:
496 case BLKIF_OP_WRITE:
497 if (unlikely(bret->status != BLKIF_RSP_OKAY))
498 dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
499 "request: %x\n", bret->status);
500
501 ret = end_that_request_first(req, uptodate,
502 req->hard_nr_sectors);
503 BUG_ON(ret);
504 end_that_request_last(req, uptodate);
505 break;
506 default:
507 BUG();
508 }
509 }
510
511 info->ring.rsp_cons = i;
512
513 if (i != info->ring.req_prod_pvt) {
514 int more_to_do;
515 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
516 if (more_to_do)
517 goto again;
518 } else
519 info->ring.sring->rsp_event = i + 1;
520
521 kick_pending_request_queues(info);
522
523 spin_unlock_irqrestore(&blkif_io_lock, flags);
524
525 return IRQ_HANDLED;
526}
527
528
529static int setup_blkring(struct xenbus_device *dev,
530 struct blkfront_info *info)
531{
532 struct blkif_sring *sring;
533 int err;
534
535 info->ring_ref = GRANT_INVALID_REF;
536
537 sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL);
538 if (!sring) {
539 xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
540 return -ENOMEM;
541 }
542 SHARED_RING_INIT(sring);
543 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
544
545 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
546 if (err < 0) {
547 free_page((unsigned long)sring);
548 info->ring.sring = NULL;
549 goto fail;
550 }
551 info->ring_ref = err;
552
553 err = xenbus_alloc_evtchn(dev, &info->evtchn);
554 if (err)
555 goto fail;
556
557 err = bind_evtchn_to_irqhandler(info->evtchn,
558 blkif_interrupt,
559 IRQF_SAMPLE_RANDOM, "blkif", info);
560 if (err <= 0) {
561 xenbus_dev_fatal(dev, err,
562 "bind_evtchn_to_irqhandler failed");
563 goto fail;
564 }
565 info->irq = err;
566
567 return 0;
568fail:
569 blkif_free(info, 0);
570 return err;
571}
572
573
574/* Common code used when first setting up, and when resuming. */
575static int talk_to_backend(struct xenbus_device *dev,
576 struct blkfront_info *info)
577{
578 const char *message = NULL;
579 struct xenbus_transaction xbt;
580 int err;
581
582 /* Create shared ring, alloc event channel. */
583 err = setup_blkring(dev, info);
584 if (err)
585 goto out;
586
587again:
588 err = xenbus_transaction_start(&xbt);
589 if (err) {
590 xenbus_dev_fatal(dev, err, "starting transaction");
591 goto destroy_blkring;
592 }
593
594 err = xenbus_printf(xbt, dev->nodename,
595 "ring-ref", "%u", info->ring_ref);
596 if (err) {
597 message = "writing ring-ref";
598 goto abort_transaction;
599 }
600 err = xenbus_printf(xbt, dev->nodename,
601 "event-channel", "%u", info->evtchn);
602 if (err) {
603 message = "writing event-channel";
604 goto abort_transaction;
605 }
606
607 err = xenbus_transaction_end(xbt, 0);
608 if (err) {
609 if (err == -EAGAIN)
610 goto again;
611 xenbus_dev_fatal(dev, err, "completing transaction");
612 goto destroy_blkring;
613 }
614
615 xenbus_switch_state(dev, XenbusStateInitialised);
616
617 return 0;
618
619 abort_transaction:
620 xenbus_transaction_end(xbt, 1);
621 if (message)
622 xenbus_dev_fatal(dev, err, "%s", message);
623 destroy_blkring:
624 blkif_free(info, 0);
625 out:
626 return err;
627}
628
629
630/**
631 * Entry point to this code when a new device is created. Allocate the basic
632 * structures and the ring buffer for communication with the backend, and
633 * inform the backend of the appropriate details for those. Switch to
634 * Initialised state.
635 */
636static int blkfront_probe(struct xenbus_device *dev,
637 const struct xenbus_device_id *id)
638{
639 int err, vdevice, i;
640 struct blkfront_info *info;
641
642 /* FIXME: Use dynamic device id if this is not set. */
643 err = xenbus_scanf(XBT_NIL, dev->nodename,
644 "virtual-device", "%i", &vdevice);
645 if (err != 1) {
646 xenbus_dev_fatal(dev, err, "reading virtual-device");
647 return err;
648 }
649
650 info = kzalloc(sizeof(*info), GFP_KERNEL);
651 if (!info) {
652 xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
653 return -ENOMEM;
654 }
655
656 info->xbdev = dev;
657 info->vdevice = vdevice;
658 info->connected = BLKIF_STATE_DISCONNECTED;
659 INIT_WORK(&info->work, blkif_restart_queue);
660
661 for (i = 0; i < BLK_RING_SIZE; i++)
662 info->shadow[i].req.id = i+1;
663 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
664
665 /* Front end dir is a number, which is used as the id. */
666 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
667 dev->dev.driver_data = info;
668
669 err = talk_to_backend(dev, info);
670 if (err) {
671 kfree(info);
672 dev->dev.driver_data = NULL;
673 return err;
674 }
675
676 return 0;
677}
678
679
680static int blkif_recover(struct blkfront_info *info)
681{
682 int i;
683 struct blkif_request *req;
684 struct blk_shadow *copy;
685 int j;
686
687 /* Stage 1: Make a safe copy of the shadow state. */
688 copy = kmalloc(sizeof(info->shadow), GFP_KERNEL);
689 if (!copy)
690 return -ENOMEM;
691 memcpy(copy, info->shadow, sizeof(info->shadow));
692
693 /* Stage 2: Set up free list. */
694 memset(&info->shadow, 0, sizeof(info->shadow));
695 for (i = 0; i < BLK_RING_SIZE; i++)
696 info->shadow[i].req.id = i+1;
697 info->shadow_free = info->ring.req_prod_pvt;
698 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
699
700 /* Stage 3: Find pending requests and requeue them. */
701 for (i = 0; i < BLK_RING_SIZE; i++) {
702 /* Not in use? */
703 if (copy[i].request == 0)
704 continue;
705
706 /* Grab a request slot and copy shadow state into it. */
707 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
708 *req = copy[i].req;
709
710 /* We get a new request id, and must reset the shadow state. */
711 req->id = get_id_from_freelist(info);
712 memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
713
714 /* Rewrite any grant references invalidated by susp/resume. */
715 for (j = 0; j < req->nr_segments; j++)
716 gnttab_grant_foreign_access_ref(
717 req->seg[j].gref,
718 info->xbdev->otherend_id,
719 pfn_to_mfn(info->shadow[req->id].frame[j]),
720 rq_data_dir(
721 (struct request *)
722 info->shadow[req->id].request));
723 info->shadow[req->id].req = *req;
724
725 info->ring.req_prod_pvt++;
726 }
727
728 kfree(copy);
729
730 xenbus_switch_state(info->xbdev, XenbusStateConnected);
731
732 spin_lock_irq(&blkif_io_lock);
733
734 /* Now safe for us to use the shared ring */
735 info->connected = BLKIF_STATE_CONNECTED;
736
737 /* Send off requeued requests */
738 flush_requests(info);
739
740 /* Kick any other new requests queued since we resumed */
741 kick_pending_request_queues(info);
742
743 spin_unlock_irq(&blkif_io_lock);
744
745 return 0;
746}
747
748/**
749 * We are reconnecting to the backend, due to a suspend/resume, or a backend
750 * driver restart. We tear down our blkif structure and recreate it, but
751 * leave the device-layer structures intact so that this is transparent to the
752 * rest of the kernel.
753 */
754static int blkfront_resume(struct xenbus_device *dev)
755{
756 struct blkfront_info *info = dev->dev.driver_data;
757 int err;
758
759 dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
760
761 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
762
763 err = talk_to_backend(dev, info);
764 if (info->connected == BLKIF_STATE_SUSPENDED && !err)
765 err = blkif_recover(info);
766
767 return err;
768}
769
770
771/*
772 * Invoked when the backend is finally 'ready' (and has produced
773 * the details about the physical device - #sectors, size, etc).
774 */
775static void blkfront_connect(struct blkfront_info *info)
776{
777 unsigned long long sectors;
778 unsigned long sector_size;
779 unsigned int binfo;
780 int err;
781
782 if ((info->connected == BLKIF_STATE_CONNECTED) ||
783 (info->connected == BLKIF_STATE_SUSPENDED) )
784 return;
785
786 dev_dbg(&info->xbdev->dev, "%s:%s.\n",
787 __func__, info->xbdev->otherend);
788
789 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
790 "sectors", "%llu", &sectors,
791 "info", "%u", &binfo,
792 "sector-size", "%lu", &sector_size,
793 NULL);
794 if (err) {
795 xenbus_dev_fatal(info->xbdev, err,
796 "reading backend fields at %s",
797 info->xbdev->otherend);
798 return;
799 }
800
801 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
802 "feature-barrier", "%lu", &info->feature_barrier,
803 NULL);
804 if (err)
805 info->feature_barrier = 0;
806
807 err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice),
808 sectors, info->vdevice,
809 binfo, sector_size, info);
810 if (err) {
811 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
812 info->xbdev->otherend);
813 return;
814 }
815
816 xenbus_switch_state(info->xbdev, XenbusStateConnected);
817
818 /* Kick pending requests. */
819 spin_lock_irq(&blkif_io_lock);
820 info->connected = BLKIF_STATE_CONNECTED;
821 kick_pending_request_queues(info);
822 spin_unlock_irq(&blkif_io_lock);
823
824 add_disk(info->gd);
825}
826
827/**
828 * Handle the change of state of the backend to Closing. We must delete our
829 * device-layer structures now, to ensure that writes are flushed through to
830 * the backend.  Once this is done, we can switch to Closed in
831 * acknowledgement.
832 */
833static void blkfront_closing(struct xenbus_device *dev)
834{
835 struct blkfront_info *info = dev->dev.driver_data;
836 unsigned long flags;
837
838 dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
839
840 if (info->rq == NULL)
841 goto out;
842
843 spin_lock_irqsave(&blkif_io_lock, flags);
844
845 del_gendisk(info->gd);
846
847 /* No more blkif_request(). */
848 blk_stop_queue(info->rq);
849
850 /* No more gnttab callback work. */
851 gnttab_cancel_free_callback(&info->callback);
852 spin_unlock_irqrestore(&blkif_io_lock, flags);
853
854 /* Flush gnttab callback work. Must be done with no locks held. */
855 flush_scheduled_work();
856
857 blk_cleanup_queue(info->rq);
858 info->rq = NULL;
859
860 out:
861 xenbus_frontend_closed(dev);
862}
863
864/**
865 * Callback received when the backend's state changes.
866 */
867static void backend_changed(struct xenbus_device *dev,
868 enum xenbus_state backend_state)
869{
870 struct blkfront_info *info = dev->dev.driver_data;
871 struct block_device *bd;
872
873 dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
874
875 switch (backend_state) {
876 case XenbusStateInitialising:
877 case XenbusStateInitWait:
878 case XenbusStateInitialised:
879 case XenbusStateUnknown:
880 case XenbusStateClosed:
881 break;
882
883 case XenbusStateConnected:
884 blkfront_connect(info);
885 break;
886
887 case XenbusStateClosing:
888 bd = bdget(info->dev);
889 if (bd == NULL)
890 xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
891
892 mutex_lock(&bd->bd_mutex);
893 if (info->users > 0)
894 xenbus_dev_error(dev, -EBUSY,
895 "Device in use; refusing to close");
896 else
897 blkfront_closing(dev);
898 mutex_unlock(&bd->bd_mutex);
899 bdput(bd);
900 break;
901 }
902}
903
904static int blkfront_remove(struct xenbus_device *dev)
905{
906 struct blkfront_info *info = dev->dev.driver_data;
907
908 dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
909
910 blkif_free(info, 0);
911
912 kfree(info);
913
914 return 0;
915}
916
917static int blkif_open(struct inode *inode, struct file *filep)
918{
919 struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
920 info->users++;
921 return 0;
922}
923
924static int blkif_release(struct inode *inode, struct file *filep)
925{
926 struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
927 info->users--;
928 if (info->users == 0) {
929 /* Check whether we have been instructed to close. We will
930 have ignored this request initially, as the device was
931 still mounted. */
932 struct xenbus_device *dev = info->xbdev;
933 enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
934
935 if (state == XenbusStateClosing)
936 blkfront_closing(dev);
937 }
938 return 0;
939}
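
The open/release handlers above implement a deferred close: a Closing request that arrives while the device is still open is refused in backend_changed(), and the teardown is finally run by the last blkif_release(). A minimal user-space sketch of that reference-counting pattern (vbd_open, vbd_release and backend_wants_close are illustrative names, not part of the driver):

#include <stdbool.h>
#include <stdio.h>

static int users;                 /* open handles, like info->users   */
static bool backend_wants_close;  /* set when backend goes to Closing */

static void do_close(void) { puts("tearing down device"); }

static void vbd_open(void)  { users++; }

static void vbd_release(void)
{
	users--;
	/* Last reference gone: honour a close request we deferred earlier. */
	if (users == 0 && backend_wants_close)
		do_close();
}

int main(void)
{
	vbd_open();
	backend_wants_close = true;   /* Closing arrives while device is busy */
	vbd_release();                /* last user goes away -> close now     */
	return 0;
}
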
940
941static struct block_device_operations xlvbd_block_fops =
942{
943 .owner = THIS_MODULE,
944 .open = blkif_open,
945 .release = blkif_release,
946};
947
948
949static struct xenbus_device_id blkfront_ids[] = {
950 { "vbd" },
951 { "" }
952};
953
954static struct xenbus_driver blkfront = {
955 .name = "vbd",
956 .owner = THIS_MODULE,
957 .ids = blkfront_ids,
958 .probe = blkfront_probe,
959 .remove = blkfront_remove,
960 .resume = blkfront_resume,
961 .otherend_changed = backend_changed,
962};
963
964static int __init xlblk_init(void)
965{
966 if (!is_running_on_xen())
967 return -ENODEV;
968
969 if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
970 printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
971 XENVBD_MAJOR, DEV_NAME);
972 return -ENODEV;
973 }
974
975 return xenbus_register_frontend(&blkfront);
976}
977module_init(xlblk_init);
978
979
980static void xlblk_exit(void)
981{
982 return xenbus_unregister_driver(&blkfront);
983}
984module_exit(xlblk_exit);
985
986MODULE_DESCRIPTION("Xen virtual block device frontend");
987MODULE_LICENSE("GPL");
988MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 97bd71bc3aea..9e8f21410d2d 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -604,6 +604,14 @@ config HVC_BEAT
604 help 604 help
605 Toshiba's Cell Reference Set Beat Console device driver 605 Toshiba's Cell Reference Set Beat Console device driver
606 606
607config HVC_XEN
608 bool "Xen Hypervisor Console support"
609 depends on XEN
610 select HVC_DRIVER
611 default y
612 help
613 Xen virtual console device driver
614
607config HVCS 615config HVCS
608 tristate "IBM Hypervisor Virtual Console Server support" 616 tristate "IBM Hypervisor Virtual Console Server support"
609 depends on PPC_PSERIES 617 depends on PPC_PSERIES
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index f2996a95eb07..8852b8d643cf 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
48obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o 48obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
49obj-$(CONFIG_HVC_BEAT) += hvc_beat.o 49obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
50obj-$(CONFIG_HVC_DRIVER) += hvc_console.o 50obj-$(CONFIG_HVC_DRIVER) += hvc_console.o
51obj-$(CONFIG_HVC_XEN) += hvc_xen.o
51obj-$(CONFIG_RAW_DRIVER) += raw.o 52obj-$(CONFIG_RAW_DRIVER) += raw.o
52obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o 53obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
53obj-$(CONFIG_MSPEC) += mspec.o 54obj-$(CONFIG_MSPEC) += mspec.o
diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
new file mode 100644
index 000000000000..dd68f8541c2d
--- /dev/null
+++ b/drivers/char/hvc_xen.c
@@ -0,0 +1,159 @@
1/*
2 * xen console driver interface to hvc_console.c
3 *
4 * (c) 2007 Gerd Hoffmann <kraxel@suse.de>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <linux/console.h>
22#include <linux/delay.h>
23#include <linux/err.h>
24#include <linux/init.h>
25#include <linux/types.h>
26
27#include <asm/xen/hypervisor.h>
28#include <xen/page.h>
29#include <xen/events.h>
30#include <xen/interface/io/console.h>
31#include <xen/hvc-console.h>
32
33#include "hvc_console.h"
34
35#define HVC_COOKIE 0x58656e /* "Xen" in hex */
36
37static struct hvc_struct *hvc;
38static int xencons_irq;
39
40/* ------------------------------------------------------------------ */
41
42static inline struct xencons_interface *xencons_interface(void)
43{
44 return mfn_to_virt(xen_start_info->console.domU.mfn);
45}
46
47static inline void notify_daemon(void)
48{
49 /* Use evtchn: this is called early, before irq is set up. */
50 notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
51}
52
53static int write_console(uint32_t vtermno, const char *data, int len)
54{
55 struct xencons_interface *intf = xencons_interface();
56 XENCONS_RING_IDX cons, prod;
57 int sent = 0;
58
59 cons = intf->out_cons;
60 prod = intf->out_prod;
61 mb(); /* update queue values before going on */
62 BUG_ON((prod - cons) > sizeof(intf->out));
63
64 while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
65 intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
66
67 wmb(); /* write ring before updating pointer */
68 intf->out_prod = prod;
69
70 notify_daemon();
71 return sent;
72}
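
write_console() and read_console() follow the usual Xen console ring convention: out_prod and out_cons are free-running indices, a slot is selected by masking against the power-of-two ring size, and prod - cons gives the fill level even after the indices wrap. A stand-alone sketch of the same indexing scheme, assuming a power-of-two RING_SIZE (the names here are illustrative, not the Xen ring definitions):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RING_SIZE 8                      /* must be a power of two */
#define MASK(i)   ((i) & (RING_SIZE - 1))

static char ring[RING_SIZE];
static uint32_t prod, cons;              /* free-running, never reset */

static int ring_write(const char *data, int len)
{
	int sent = 0;
	/* prod - cons is the fill level even after the indices wrap. */
	while (sent < len && (prod - cons) < RING_SIZE)
		ring[MASK(prod++)] = data[sent++];
	return sent;
}

static int ring_read(char *buf, int len)
{
	int recv = 0;
	while (cons != prod && recv < len)
		buf[recv++] = ring[MASK(cons++)];
	return recv;
}

int main(void)
{
	char out[16] = "";
	int n;

	ring_write("hello", 5);
	n = ring_read(out, sizeof(out) - 1);
	out[n] = '\0';
	assert(strcmp(out, "hello") == 0);
	printf("%s\n", out);
	return 0;
}
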
73
74static int read_console(uint32_t vtermno, char *buf, int len)
75{
76 struct xencons_interface *intf = xencons_interface();
77 XENCONS_RING_IDX cons, prod;
78 int recv = 0;
79
80 cons = intf->in_cons;
81 prod = intf->in_prod;
82 mb(); /* get pointers before reading ring */
83 BUG_ON((prod - cons) > sizeof(intf->in));
84
85 while (cons != prod && recv < len)
86 buf[recv++] = intf->in[MASK_XENCONS_IDX(cons++, intf->in)];
87
88 mb(); /* read ring before consuming */
89 intf->in_cons = cons;
90
91 notify_daemon();
92 return recv;
93}
94
95static struct hv_ops hvc_ops = {
96 .get_chars = read_console,
97 .put_chars = write_console,
98};
99
100static int __init xen_init(void)
101{
102 struct hvc_struct *hp;
103
104 if (!is_running_on_xen())
105 return 0;
106
107 xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
108 if (xencons_irq < 0)
109 xencons_irq = 0 /* NO_IRQ */;
110 hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
111 if (IS_ERR(hp))
112 return PTR_ERR(hp);
113
114 hvc = hp;
115 return 0;
116}
117
118static void __exit xen_fini(void)
119{
120 if (hvc)
121 hvc_remove(hvc);
122}
123
124static int xen_cons_init(void)
125{
126 if (!is_running_on_xen())
127 return 0;
128
129 hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
130 return 0;
131}
132
133module_init(xen_init);
134module_exit(xen_fini);
135console_initcall(xen_cons_init);
136
137static void xenboot_write_console(struct console *console, const char *string,
138 unsigned len)
139{
140 unsigned int linelen, off = 0;
141 const char *pos;
142
143 while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
144		linelen = pos - (string + off);
145 if (off + linelen > len)
146 break;
147 write_console(0, string+off, linelen);
148 write_console(0, "\r\n", 2);
149 off += linelen + 1;
150 }
151 if (off < len)
152 write_console(0, string+off, len-off);
153}
154
155struct console xenboot_console = {
156 .name = "xenboot",
157 .write = xenboot_write_console,
158 .flags = CON_PRINTBUFFER | CON_BOOT,
159};
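
xenboot_write_console() splits its output at newlines so that a carriage return can be emitted before each line feed. A user-space sketch of that splitting, with write_console() replaced by a hypothetical emit() helper:

#include <stdio.h>
#include <string.h>

/* Stand-in for write_console(): here it just prints to stdout. */
static void emit(const char *data, unsigned len)
{
	fwrite(data, 1, len, stdout);
}

/* Emit "\r\n" in place of every "\n", as the boot console does. */
static void write_with_crlf(const char *string, unsigned len)
{
	unsigned off = 0;
	const char *pos;

	while (off < len && (pos = memchr(string + off, '\n', len - off))) {
		unsigned linelen = pos - (string + off);  /* bytes before '\n' */
		emit(string + off, linelen);
		emit("\r\n", 2);
		off += linelen + 1;
	}
	if (off < len)
		emit(string + off, len - off);
}

int main(void)
{
	const char msg[] = "xen: booting\nconsole ready\n";
	write_with_crlf(msg, sizeof(msg) - 1);
	return 0;
}
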
diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c
index dbb22403979f..3d90fc002097 100644
--- a/drivers/macintosh/therm_pm72.c
+++ b/drivers/macintosh/therm_pm72.c
@@ -1770,7 +1770,8 @@ static int call_critical_overtemp(void)
1770 "PATH=/sbin:/usr/sbin:/bin:/usr/bin", 1770 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
1771 NULL }; 1771 NULL };
1772 1772
1773 return call_usermodehelper(critical_overtemp_path, argv, envp, 0); 1773 return call_usermodehelper(critical_overtemp_path,
1774 argv, envp, UMH_WAIT_EXEC);
1774} 1775}
1775 1776
1776 1777
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index e18d265d5d33..516d943227e2 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -80,7 +80,8 @@ int wf_critical_overtemp(void)
80 "PATH=/sbin:/usr/sbin:/bin:/usr/bin", 80 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
81 NULL }; 81 NULL };
82 82
83 return call_usermodehelper(critical_overtemp_path, argv, envp, 0); 83 return call_usermodehelper(critical_overtemp_path,
84 argv, envp, UMH_WAIT_EXEC);
84} 85}
85EXPORT_SYMBOL_GPL(wf_critical_overtemp); 86EXPORT_SYMBOL_GPL(wf_critical_overtemp);
86 87
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 43d03178064d..5fb659f8b20e 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2486,6 +2486,18 @@ source "drivers/atm/Kconfig"
2486 2486
2487source "drivers/s390/net/Kconfig" 2487source "drivers/s390/net/Kconfig"
2488 2488
2489config XEN_NETDEV_FRONTEND
2490 tristate "Xen network device frontend driver"
2491 depends on XEN
2492 default y
2493 help
2494 The network device frontend driver allows the kernel to
2495	  access network devices exported by a virtual
2496 machine containing a physical network device driver. The
2497 frontend driver is intended for unprivileged guest domains;
2498 if you are compiling a kernel for a Xen guest, you almost
2499 certainly want to enable this.
2500
2489config ISERIES_VETH 2501config ISERIES_VETH
2490 tristate "iSeries Virtual Ethernet driver support" 2502 tristate "iSeries Virtual Ethernet driver support"
2491 depends on PPC_ISERIES 2503 depends on PPC_ISERIES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index eb4167622a6a..0e286ab8855a 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -127,6 +127,8 @@ obj-$(CONFIG_PPPOL2TP) += pppox.o pppol2tp.o
127obj-$(CONFIG_SLIP) += slip.o 127obj-$(CONFIG_SLIP) += slip.o
128obj-$(CONFIG_SLHC) += slhc.o 128obj-$(CONFIG_SLHC) += slhc.o
129 129
130obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
131
130obj-$(CONFIG_DUMMY) += dummy.o 132obj-$(CONFIG_DUMMY) += dummy.o
131obj-$(CONFIG_IFB) += ifb.o 133obj-$(CONFIG_IFB) += ifb.o
132obj-$(CONFIG_MACVLAN) += macvlan.o 134obj-$(CONFIG_MACVLAN) += macvlan.o
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 84aa2117c0ee..355c6cf3d112 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -320,7 +320,7 @@ static int eppconfig(struct baycom_state *bc)
320 sprintf(portarg, "%ld", bc->pdev->port->base); 320 sprintf(portarg, "%ld", bc->pdev->port->base);
321 printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg); 321 printk(KERN_DEBUG "%s: %s -s -p %s -m %s\n", bc_drvname, eppconfig_path, portarg, modearg);
322 322
323 return call_usermodehelper(eppconfig_path, argv, envp, 1); 323 return call_usermodehelper(eppconfig_path, argv, envp, UMH_WAIT_PROC);
324} 324}
325 325
326/* ---------------------------------------------------------------------- */ 326/* ---------------------------------------------------------------------- */
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
new file mode 100644
index 000000000000..489f69c5d6ca
--- /dev/null
+++ b/drivers/net/xen-netfront.c
@@ -0,0 +1,1863 @@
1/*
2 * Virtual network driver for conversing with remote driver backends.
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 * Copyright (c) 2005, XenSource Ltd
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License version 2
9 * as published by the Free Software Foundation; or, when distributed
10 * separately from the Linux kernel or incorporated into other
11 * software packages, subject to the following license:
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining a copy
14 * of this source file (the "Software"), to deal in the Software without
15 * restriction, including without limitation the rights to use, copy, modify,
16 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17 * and to permit persons to whom the Software is furnished to do so, subject to
18 * the following conditions:
19 *
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
29 * IN THE SOFTWARE.
30 */
31
32#include <linux/module.h>
33#include <linux/kernel.h>
34#include <linux/netdevice.h>
35#include <linux/etherdevice.h>
36#include <linux/skbuff.h>
37#include <linux/ethtool.h>
38#include <linux/if_ether.h>
39#include <linux/tcp.h>
40#include <linux/udp.h>
41#include <linux/moduleparam.h>
42#include <linux/mm.h>
43#include <net/ip.h>
44
45#include <xen/xenbus.h>
46#include <xen/events.h>
47#include <xen/page.h>
48#include <xen/grant_table.h>
49
50#include <xen/interface/io/netif.h>
51#include <xen/interface/memory.h>
52#include <xen/interface/grant_table.h>
53
54static struct ethtool_ops xennet_ethtool_ops;
55
56struct netfront_cb {
57 struct page *page;
58 unsigned offset;
59};
60
61#define NETFRONT_SKB_CB(skb) ((struct netfront_cb *)((skb)->cb))
62
63#define RX_COPY_THRESHOLD 256
64
65#define GRANT_INVALID_REF 0
66
67#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
68#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
69#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
70
71struct netfront_info {
72 struct list_head list;
73 struct net_device *netdev;
74
75 struct net_device_stats stats;
76
77 struct xen_netif_tx_front_ring tx;
78 struct xen_netif_rx_front_ring rx;
79
80 spinlock_t tx_lock;
81 spinlock_t rx_lock;
82
83 unsigned int evtchn;
84
85 /* Receive-ring batched refills. */
86#define RX_MIN_TARGET 8
87#define RX_DFL_MIN_TARGET 64
88#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
89 unsigned rx_min_target, rx_max_target, rx_target;
90 struct sk_buff_head rx_batch;
91
92 struct timer_list rx_refill_timer;
93
94 /*
95 * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries
96 * are linked from tx_skb_freelist through skb_entry.link.
97 *
98 * NB. Freelist index entries are always going to be less than
99 * PAGE_OFFSET, whereas pointers to skbs will always be equal or
100 * greater than PAGE_OFFSET: we use this property to distinguish
101 * them.
102 */
103 union skb_entry {
104 struct sk_buff *skb;
105 unsigned link;
106 } tx_skbs[NET_TX_RING_SIZE];
107 grant_ref_t gref_tx_head;
108 grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
109 unsigned tx_skb_freelist;
110
111 struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
112 grant_ref_t gref_rx_head;
113 grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
114
115 struct xenbus_device *xbdev;
116 int tx_ring_ref;
117 int rx_ring_ref;
118
119 unsigned long rx_pfn_array[NET_RX_RING_SIZE];
120 struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
121 struct mmu_update rx_mmu[NET_RX_RING_SIZE];
122};
123
124struct netfront_rx_info {
125 struct xen_netif_rx_response rx;
126 struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
127};
128
129/*
130 * Access macros for acquiring and freeing slots in tx_skbs[].
131 */
132
133static void add_id_to_freelist(unsigned *head, union skb_entry *list,
134 unsigned short id)
135{
136 list[id].link = *head;
137 *head = id;
138}
139
140static unsigned short get_id_from_freelist(unsigned *head,
141 union skb_entry *list)
142{
143 unsigned int id = *head;
144 *head = list[id].link;
145 return id;
146}
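
As the comment above tx_skbs explains, the array doubles as a LIFO freelist of ring ids: a free slot stores the index of the next free slot, and ids can be told apart from skb pointers because they always sit below PAGE_OFFSET. A stand-alone sketch of the push/pop behaviour with the pointer/index tagging left out (the helper signatures are simplified relative to the driver's):

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8

static unsigned links[RING_SIZE];   /* links[i] = next free id */
static unsigned freelist_head;      /* id of first free entry  */

static void add_id_to_freelist(unsigned short id)
{
	links[id] = freelist_head;  /* push: new head points at old head */
	freelist_head = id;
}

static unsigned short get_id_from_freelist(void)
{
	unsigned short id = freelist_head;
	freelist_head = links[id];  /* pop: follow the link */
	return id;
}

int main(void)
{
	unsigned i, a, b;

	/* Initialise as a chain 0 -> 1 -> 2 -> ..., as xennet_create_dev does. */
	for (i = 0; i < RING_SIZE; i++)
		links[i] = i + 1;
	freelist_head = 0;

	a = get_id_from_freelist();   /* 0 */
	b = get_id_from_freelist();   /* 1 */
	add_id_to_freelist(a);        /* 0 becomes the head again */
	assert(get_id_from_freelist() == a);
	printf("allocated %u then %u\n", a, b);
	return 0;
}
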
147
148static int xennet_rxidx(RING_IDX idx)
149{
150 return idx & (NET_RX_RING_SIZE - 1);
151}
152
153static struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
154 RING_IDX ri)
155{
156 int i = xennet_rxidx(ri);
157 struct sk_buff *skb = np->rx_skbs[i];
158 np->rx_skbs[i] = NULL;
159 return skb;
160}
161
162static grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
163 RING_IDX ri)
164{
165 int i = xennet_rxidx(ri);
166 grant_ref_t ref = np->grant_rx_ref[i];
167 np->grant_rx_ref[i] = GRANT_INVALID_REF;
168 return ref;
169}
170
171#ifdef CONFIG_SYSFS
172static int xennet_sysfs_addif(struct net_device *netdev);
173static void xennet_sysfs_delif(struct net_device *netdev);
174#else /* !CONFIG_SYSFS */
175#define xennet_sysfs_addif(dev) (0)
176#define xennet_sysfs_delif(dev) do { } while (0)
177#endif
178
179static int xennet_can_sg(struct net_device *dev)
180{
181 return dev->features & NETIF_F_SG;
182}
183
184
185static void rx_refill_timeout(unsigned long data)
186{
187 struct net_device *dev = (struct net_device *)data;
188 netif_rx_schedule(dev);
189}
190
191static int netfront_tx_slot_available(struct netfront_info *np)
192{
193 return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
194 (TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
195}
196
197static void xennet_maybe_wake_tx(struct net_device *dev)
198{
199 struct netfront_info *np = netdev_priv(dev);
200
201 if (unlikely(netif_queue_stopped(dev)) &&
202 netfront_tx_slot_available(np) &&
203 likely(netif_running(dev)))
204 netif_wake_queue(dev);
205}
206
207static void xennet_alloc_rx_buffers(struct net_device *dev)
208{
209 unsigned short id;
210 struct netfront_info *np = netdev_priv(dev);
211 struct sk_buff *skb;
212 struct page *page;
213 int i, batch_target, notify;
214 RING_IDX req_prod = np->rx.req_prod_pvt;
215 struct xen_memory_reservation reservation;
216 grant_ref_t ref;
217 unsigned long pfn;
218 void *vaddr;
219 int nr_flips;
220 struct xen_netif_rx_request *req;
221
222 if (unlikely(!netif_carrier_ok(dev)))
223 return;
224
225 /*
226 * Allocate skbuffs greedily, even though we batch updates to the
227 * receive ring. This creates a less bursty demand on the memory
228 * allocator, so should reduce the chance of failed allocation requests
229 * both for ourself and for other kernel subsystems.
230 * both for ourselves and for other kernel subsystems.
231 batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
232 for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
233 skb = __netdev_alloc_skb(dev, RX_COPY_THRESHOLD,
234 GFP_ATOMIC | __GFP_NOWARN);
235 if (unlikely(!skb))
236 goto no_skb;
237
238 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
239 if (!page) {
240 kfree_skb(skb);
241no_skb:
242 /* Any skbuffs queued for refill? Force them out. */
243 if (i != 0)
244 goto refill;
245 /* Could not allocate any skbuffs. Try again later. */
246 mod_timer(&np->rx_refill_timer,
247 jiffies + (HZ/10));
248 break;
249 }
250
251 skb_shinfo(skb)->frags[0].page = page;
252 skb_shinfo(skb)->nr_frags = 1;
253 __skb_queue_tail(&np->rx_batch, skb);
254 }
255
256 /* Is the batch large enough to be worthwhile? */
257 if (i < (np->rx_target/2)) {
258 if (req_prod > np->rx.sring->req_prod)
259 goto push;
260 return;
261 }
262
263 /* Adjust our fill target if we risked running out of buffers. */
264 if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
265 ((np->rx_target *= 2) > np->rx_max_target))
266 np->rx_target = np->rx_max_target;
267
268 refill:
269 for (nr_flips = i = 0; ; i++) {
270 skb = __skb_dequeue(&np->rx_batch);
271 if (skb == NULL)
272 break;
273
274 skb->dev = dev;
275
276 id = xennet_rxidx(req_prod + i);
277
278 BUG_ON(np->rx_skbs[id]);
279 np->rx_skbs[id] = skb;
280
281 ref = gnttab_claim_grant_reference(&np->gref_rx_head);
282 BUG_ON((signed short)ref < 0);
283 np->grant_rx_ref[id] = ref;
284
285 pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
286 vaddr = page_address(skb_shinfo(skb)->frags[0].page);
287
288 req = RING_GET_REQUEST(&np->rx, req_prod + i);
289 gnttab_grant_foreign_access_ref(ref,
290 np->xbdev->otherend_id,
291 pfn_to_mfn(pfn),
292 0);
293
294 req->id = id;
295 req->gref = ref;
296 }
297
298 if (nr_flips != 0) {
299 reservation.extent_start = np->rx_pfn_array;
300 reservation.nr_extents = nr_flips;
301 reservation.extent_order = 0;
302 reservation.address_bits = 0;
303 reservation.domid = DOMID_SELF;
304
305 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
306 /* After all PTEs have been zapped, flush the TLB. */
307 np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
308 UVMF_TLB_FLUSH|UVMF_ALL;
309
310 /* Give away a batch of pages. */
311 np->rx_mcl[i].op = __HYPERVISOR_memory_op;
312 np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
313 np->rx_mcl[i].args[1] = (unsigned long)&reservation;
314
315 /* Zap PTEs and give away pages in one big
316 * multicall. */
317 (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
318
319 /* Check return status of HYPERVISOR_memory_op(). */
320 if (unlikely(np->rx_mcl[i].result != i))
321 panic("Unable to reduce memory reservation\n");
322 } else {
323 if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
324 &reservation) != i)
325 panic("Unable to reduce memory reservation\n");
326 }
327 } else {
328		wmb();		/* barrier so backend sees requests */
329 }
330
331 /* Above is a suitable barrier to ensure backend will see requests. */
332 np->rx.req_prod_pvt = req_prod + i;
333 push:
334 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
335 if (notify)
336 notify_remote_via_irq(np->netdev->irq);
337}
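
xennet_alloc_rx_buffers() doubles rx_target (capped at rx_max_target) whenever the ring came close to running dry, while xennet_poll() later walks it back down one step at a time; the comment there calls this exponential increase, linear decrease. A small sketch of that policy in isolation (constants taken from the driver, function names illustrative):

#include <stdio.h>

#define RX_MIN_TARGET 8
#define RX_MAX_TARGET 256

static int rx_target = 64;              /* RX_DFL_MIN_TARGET */

/* Called when the ring came close to running out of buffers. */
static void grow_target(void)
{
	rx_target *= 2;                 /* exponential increase */
	if (rx_target > RX_MAX_TARGET)
		rx_target = RX_MAX_TARGET;
}

/* Called when a poll saw only a few responses outstanding. */
static void shrink_target(void)
{
	if (--rx_target < RX_MIN_TARGET)  /* linear decrease */
		rx_target = RX_MIN_TARGET;
}

int main(void)
{
	grow_target();                  /* 64 -> 128            */
	grow_target();                  /* 128 -> 256 (clamped) */
	shrink_target();                /* 256 -> 255           */
	printf("rx_target = %d\n", rx_target);
	return 0;
}
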
338
339static int xennet_open(struct net_device *dev)
340{
341 struct netfront_info *np = netdev_priv(dev);
342
343 memset(&np->stats, 0, sizeof(np->stats));
344
345 spin_lock_bh(&np->rx_lock);
346 if (netif_carrier_ok(dev)) {
347 xennet_alloc_rx_buffers(dev);
348 np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
349 if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
350 netif_rx_schedule(dev);
351 }
352 spin_unlock_bh(&np->rx_lock);
353
354 xennet_maybe_wake_tx(dev);
355
356 return 0;
357}
358
359static void xennet_tx_buf_gc(struct net_device *dev)
360{
361 RING_IDX cons, prod;
362 unsigned short id;
363 struct netfront_info *np = netdev_priv(dev);
364 struct sk_buff *skb;
365
366 BUG_ON(!netif_carrier_ok(dev));
367
368 do {
369 prod = np->tx.sring->rsp_prod;
370 rmb(); /* Ensure we see responses up to 'rp'. */
371
372 for (cons = np->tx.rsp_cons; cons != prod; cons++) {
373 struct xen_netif_tx_response *txrsp;
374
375 txrsp = RING_GET_RESPONSE(&np->tx, cons);
376 if (txrsp->status == NETIF_RSP_NULL)
377 continue;
378
379 id = txrsp->id;
380 skb = np->tx_skbs[id].skb;
381 if (unlikely(gnttab_query_foreign_access(
382 np->grant_tx_ref[id]) != 0)) {
383 printk(KERN_ALERT "xennet_tx_buf_gc: warning "
384 "-- grant still in use by backend "
385 "domain.\n");
386 BUG();
387 }
388 gnttab_end_foreign_access_ref(
389 np->grant_tx_ref[id], GNTMAP_readonly);
390 gnttab_release_grant_reference(
391 &np->gref_tx_head, np->grant_tx_ref[id]);
392 np->grant_tx_ref[id] = GRANT_INVALID_REF;
393 add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id);
394 dev_kfree_skb_irq(skb);
395 }
396
397 np->tx.rsp_cons = prod;
398
399 /*
400 * Set a new event, then check for race with update of tx_cons.
401 * Note that it is essential to schedule a callback, no matter
402 * how few buffers are pending. Even if there is space in the
403 * transmit ring, higher layers may be blocked because too much
404 * data is outstanding: in such cases notification from Xen is
405 * likely to be the only kick that we'll get.
406 */
407 np->tx.sring->rsp_event =
408 prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
409 mb(); /* update shared area */
410 } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
411
412 xennet_maybe_wake_tx(dev);
413}
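
The rsp_event update in xennet_tx_buf_gc() asks the backend for the next notification only after roughly half of the still-outstanding requests have completed, so the interrupt rate scales with the amount of work in flight. A worked example of the arithmetic (the numbers are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned prod = 100;      /* responses consumed so far             */
	unsigned req_prod = 140;  /* requests posted so far: 40 in flight  */

	/* Same formula as xennet_tx_buf_gc(): request the next event once
	 * about half of the in-flight requests have completed. */
	unsigned rsp_event = prod + ((req_prod - prod) >> 1) + 1;

	printf("next notification at response #%u (%u completions away)\n",
	       rsp_event, rsp_event - prod);   /* 121, i.e. 21 away */
	return 0;
}
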
414
415static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
416 struct xen_netif_tx_request *tx)
417{
418 struct netfront_info *np = netdev_priv(dev);
419 char *data = skb->data;
420 unsigned long mfn;
421 RING_IDX prod = np->tx.req_prod_pvt;
422 int frags = skb_shinfo(skb)->nr_frags;
423 unsigned int offset = offset_in_page(data);
424 unsigned int len = skb_headlen(skb);
425 unsigned int id;
426 grant_ref_t ref;
427 int i;
428
429 /* While the header overlaps a page boundary (including being
430	   larger than a page), split it into page-sized chunks. */
431 while (len > PAGE_SIZE - offset) {
432 tx->size = PAGE_SIZE - offset;
433 tx->flags |= NETTXF_more_data;
434 len -= tx->size;
435 data += tx->size;
436 offset = 0;
437
438 id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
439 np->tx_skbs[id].skb = skb_get(skb);
440 tx = RING_GET_REQUEST(&np->tx, prod++);
441 tx->id = id;
442 ref = gnttab_claim_grant_reference(&np->gref_tx_head);
443 BUG_ON((signed short)ref < 0);
444
445 mfn = virt_to_mfn(data);
446 gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
447 mfn, GNTMAP_readonly);
448
449 tx->gref = np->grant_tx_ref[id] = ref;
450 tx->offset = offset;
451 tx->size = len;
452 tx->flags = 0;
453 }
454
455 /* Grant backend access to each skb fragment page. */
456 for (i = 0; i < frags; i++) {
457 skb_frag_t *frag = skb_shinfo(skb)->frags + i;
458
459 tx->flags |= NETTXF_more_data;
460
461 id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
462 np->tx_skbs[id].skb = skb_get(skb);
463 tx = RING_GET_REQUEST(&np->tx, prod++);
464 tx->id = id;
465 ref = gnttab_claim_grant_reference(&np->gref_tx_head);
466 BUG_ON((signed short)ref < 0);
467
468 mfn = pfn_to_mfn(page_to_pfn(frag->page));
469 gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
470 mfn, GNTMAP_readonly);
471
472 tx->gref = np->grant_tx_ref[id] = ref;
473 tx->offset = frag->page_offset;
474 tx->size = frag->size;
475 tx->flags = 0;
476 }
477
478 np->tx.req_prod_pvt = prod;
479}
480
481static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
482{
483 unsigned short id;
484 struct netfront_info *np = netdev_priv(dev);
485 struct xen_netif_tx_request *tx;
486 struct xen_netif_extra_info *extra;
487 char *data = skb->data;
488 RING_IDX i;
489 grant_ref_t ref;
490 unsigned long mfn;
491 int notify;
492 int frags = skb_shinfo(skb)->nr_frags;
493 unsigned int offset = offset_in_page(data);
494 unsigned int len = skb_headlen(skb);
495
496 frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
497 if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
498 printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
499 frags);
500 dump_stack();
501 goto drop;
502 }
503
504 spin_lock_irq(&np->tx_lock);
505
506 if (unlikely(!netif_carrier_ok(dev) ||
507 (frags > 1 && !xennet_can_sg(dev)) ||
508 netif_needs_gso(dev, skb))) {
509 spin_unlock_irq(&np->tx_lock);
510 goto drop;
511 }
512
513 i = np->tx.req_prod_pvt;
514
515 id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
516 np->tx_skbs[id].skb = skb;
517
518 tx = RING_GET_REQUEST(&np->tx, i);
519
520 tx->id = id;
521 ref = gnttab_claim_grant_reference(&np->gref_tx_head);
522 BUG_ON((signed short)ref < 0);
523 mfn = virt_to_mfn(data);
524 gnttab_grant_foreign_access_ref(
525 ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
526 tx->gref = np->grant_tx_ref[id] = ref;
527 tx->offset = offset;
528 tx->size = len;
529 extra = NULL;
530
531 tx->flags = 0;
532 if (skb->ip_summed == CHECKSUM_PARTIAL)
533 /* local packet? */
534 tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
535 else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
536 /* remote but checksummed. */
537 tx->flags |= NETTXF_data_validated;
538
539 if (skb_shinfo(skb)->gso_size) {
540 struct xen_netif_extra_info *gso;
541
542 gso = (struct xen_netif_extra_info *)
543 RING_GET_REQUEST(&np->tx, ++i);
544
545 if (extra)
546 extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
547 else
548 tx->flags |= NETTXF_extra_info;
549
550 gso->u.gso.size = skb_shinfo(skb)->gso_size;
551 gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
552 gso->u.gso.pad = 0;
553 gso->u.gso.features = 0;
554
555 gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
556 gso->flags = 0;
557 extra = gso;
558 }
559
560 np->tx.req_prod_pvt = i + 1;
561
562 xennet_make_frags(skb, dev, tx);
563 tx->size = skb->len;
564
565 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
566 if (notify)
567 notify_remote_via_irq(np->netdev->irq);
568
569 xennet_tx_buf_gc(dev);
570
571 if (!netfront_tx_slot_available(np))
572 netif_stop_queue(dev);
573
574 spin_unlock_irq(&np->tx_lock);
575
576 np->stats.tx_bytes += skb->len;
577 np->stats.tx_packets++;
578
579 return 0;
580
581 drop:
582 np->stats.tx_dropped++;
583 dev_kfree_skb(skb);
584 return 0;
585}
586
587static int xennet_close(struct net_device *dev)
588{
589 struct netfront_info *np = netdev_priv(dev);
590 netif_stop_queue(np->netdev);
591 return 0;
592}
593
594static struct net_device_stats *xennet_get_stats(struct net_device *dev)
595{
596 struct netfront_info *np = netdev_priv(dev);
597 return &np->stats;
598}
599
600static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
601 grant_ref_t ref)
602{
603 int new = xennet_rxidx(np->rx.req_prod_pvt);
604
605 BUG_ON(np->rx_skbs[new]);
606 np->rx_skbs[new] = skb;
607 np->grant_rx_ref[new] = ref;
608 RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
609 RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
610 np->rx.req_prod_pvt++;
611}
612
613static int xennet_get_extras(struct netfront_info *np,
614 struct xen_netif_extra_info *extras,
615 RING_IDX rp)
616
617{
618 struct xen_netif_extra_info *extra;
619 struct device *dev = &np->netdev->dev;
620 RING_IDX cons = np->rx.rsp_cons;
621 int err = 0;
622
623 do {
624 struct sk_buff *skb;
625 grant_ref_t ref;
626
627 if (unlikely(cons + 1 == rp)) {
628 if (net_ratelimit())
629 dev_warn(dev, "Missing extra info\n");
630 err = -EBADR;
631 break;
632 }
633
634 extra = (struct xen_netif_extra_info *)
635 RING_GET_RESPONSE(&np->rx, ++cons);
636
637 if (unlikely(!extra->type ||
638 extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
639 if (net_ratelimit())
640 dev_warn(dev, "Invalid extra type: %d\n",
641 extra->type);
642 err = -EINVAL;
643 } else {
644 memcpy(&extras[extra->type - 1], extra,
645 sizeof(*extra));
646 }
647
648 skb = xennet_get_rx_skb(np, cons);
649 ref = xennet_get_rx_ref(np, cons);
650 xennet_move_rx_slot(np, skb, ref);
651 } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
652
653 np->rx.rsp_cons = cons;
654 return err;
655}
656
657static int xennet_get_responses(struct netfront_info *np,
658 struct netfront_rx_info *rinfo, RING_IDX rp,
659 struct sk_buff_head *list)
660{
661 struct xen_netif_rx_response *rx = &rinfo->rx;
662 struct xen_netif_extra_info *extras = rinfo->extras;
663 struct device *dev = &np->netdev->dev;
664 RING_IDX cons = np->rx.rsp_cons;
665 struct sk_buff *skb = xennet_get_rx_skb(np, cons);
666 grant_ref_t ref = xennet_get_rx_ref(np, cons);
667 int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
668 int frags = 1;
669 int err = 0;
670 unsigned long ret;
671
672 if (rx->flags & NETRXF_extra_info) {
673 err = xennet_get_extras(np, extras, rp);
674 cons = np->rx.rsp_cons;
675 }
676
677 for (;;) {
678 if (unlikely(rx->status < 0 ||
679 rx->offset + rx->status > PAGE_SIZE)) {
680 if (net_ratelimit())
681 dev_warn(dev, "rx->offset: %x, size: %u\n",
682 rx->offset, rx->status);
683 xennet_move_rx_slot(np, skb, ref);
684 err = -EINVAL;
685 goto next;
686 }
687
688 /*
689 * This definitely indicates a bug, either in this driver or in
690 * the backend driver. In future this should flag the bad
691		 * situation to the system controller to reboot the backend.
692 */
693 if (ref == GRANT_INVALID_REF) {
694 if (net_ratelimit())
695 dev_warn(dev, "Bad rx response id %d.\n",
696 rx->id);
697 err = -EINVAL;
698 goto next;
699 }
700
701 ret = gnttab_end_foreign_access_ref(ref, 0);
702 BUG_ON(!ret);
703
704 gnttab_release_grant_reference(&np->gref_rx_head, ref);
705
706 __skb_queue_tail(list, skb);
707
708next:
709 if (!(rx->flags & NETRXF_more_data))
710 break;
711
712 if (cons + frags == rp) {
713 if (net_ratelimit())
714 dev_warn(dev, "Need more frags\n");
715 err = -ENOENT;
716 break;
717 }
718
719 rx = RING_GET_RESPONSE(&np->rx, cons + frags);
720 skb = xennet_get_rx_skb(np, cons + frags);
721 ref = xennet_get_rx_ref(np, cons + frags);
722 frags++;
723 }
724
725 if (unlikely(frags > max)) {
726 if (net_ratelimit())
727 dev_warn(dev, "Too many frags\n");
728 err = -E2BIG;
729 }
730
731 if (unlikely(err))
732 np->rx.rsp_cons = cons + frags;
733
734 return err;
735}
736
737static int xennet_set_skb_gso(struct sk_buff *skb,
738 struct xen_netif_extra_info *gso)
739{
740 if (!gso->u.gso.size) {
741 if (net_ratelimit())
742 printk(KERN_WARNING "GSO size must not be zero.\n");
743 return -EINVAL;
744 }
745
746	/* Currently only TCPv4 segmentation offload (S.O.) is supported. */
747 if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
748 if (net_ratelimit())
749 printk(KERN_WARNING "Bad GSO type %d.\n", gso->u.gso.type);
750 return -EINVAL;
751 }
752
753 skb_shinfo(skb)->gso_size = gso->u.gso.size;
754 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
755
756 /* Header must be checked, and gso_segs computed. */
757 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
758 skb_shinfo(skb)->gso_segs = 0;
759
760 return 0;
761}
762
763static RING_IDX xennet_fill_frags(struct netfront_info *np,
764 struct sk_buff *skb,
765 struct sk_buff_head *list)
766{
767 struct skb_shared_info *shinfo = skb_shinfo(skb);
768 int nr_frags = shinfo->nr_frags;
769 RING_IDX cons = np->rx.rsp_cons;
770 skb_frag_t *frag = shinfo->frags + nr_frags;
771 struct sk_buff *nskb;
772
773 while ((nskb = __skb_dequeue(list))) {
774 struct xen_netif_rx_response *rx =
775 RING_GET_RESPONSE(&np->rx, ++cons);
776
777 frag->page = skb_shinfo(nskb)->frags[0].page;
778 frag->page_offset = rx->offset;
779 frag->size = rx->status;
780
781 skb->data_len += rx->status;
782
783 skb_shinfo(nskb)->nr_frags = 0;
784 kfree_skb(nskb);
785
786 frag++;
787 nr_frags++;
788 }
789
790 shinfo->nr_frags = nr_frags;
791 return cons;
792}
793
794static int skb_checksum_setup(struct sk_buff *skb)
795{
796 struct iphdr *iph;
797 unsigned char *th;
798 int err = -EPROTO;
799
800 if (skb->protocol != htons(ETH_P_IP))
801 goto out;
802
803 iph = (void *)skb->data;
804 th = skb->data + 4 * iph->ihl;
805 if (th >= skb_tail_pointer(skb))
806 goto out;
807
808 skb->csum_start = th - skb->head;
809 switch (iph->protocol) {
810 case IPPROTO_TCP:
811 skb->csum_offset = offsetof(struct tcphdr, check);
812 break;
813 case IPPROTO_UDP:
814 skb->csum_offset = offsetof(struct udphdr, check);
815 break;
816 default:
817 if (net_ratelimit())
818 printk(KERN_ERR "Attempting to checksum a non-"
819 "TCP/UDP packet, dropping a protocol"
820 " %d packet", iph->protocol);
821 goto out;
822 }
823
824 if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
825 goto out;
826
827 err = 0;
828
829out:
830 return err;
831}
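
For CHECKSUM_PARTIAL packets, skb_checksum_setup() records where the transport header starts (csum_start) and where the checksum field sits inside it (csum_offset); the two offsets depend only on the IP header length and the L4 protocol. A self-contained sketch of the same arithmetic using a locally defined TCP header layout rather than the kernel's struct tcphdr:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal local TCP header layout, only for the offset arithmetic. */
struct tcp_hdr {
	uint16_t source, dest;
	uint32_t seq, ack_seq;
	uint16_t flags, window, check, urg_ptr;
};

int main(void)
{
	/* ihl is in 32-bit words; 5 means a 20-byte IP header, no options. */
	unsigned ihl = 5;
	size_t csum_start  = 4 * ihl;                         /* transport header offset */
	size_t csum_offset = offsetof(struct tcp_hdr, check); /* 16 for TCP              */

	printf("checksum lives %zu bytes past the IP header start\n",
	       csum_start + csum_offset);                     /* 20 + 16 = 36 */
	return 0;
}
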
832
833static int handle_incoming_queue(struct net_device *dev,
834 struct sk_buff_head *rxq)
835{
836 struct netfront_info *np = netdev_priv(dev);
837 int packets_dropped = 0;
838 struct sk_buff *skb;
839
840 while ((skb = __skb_dequeue(rxq)) != NULL) {
841 struct page *page = NETFRONT_SKB_CB(skb)->page;
842 void *vaddr = page_address(page);
843 unsigned offset = NETFRONT_SKB_CB(skb)->offset;
844
845 memcpy(skb->data, vaddr + offset,
846 skb_headlen(skb));
847
848 if (page != skb_shinfo(skb)->frags[0].page)
849 __free_page(page);
850
851 /* Ethernet work: Delayed to here as it peeks the header. */
852 skb->protocol = eth_type_trans(skb, dev);
853
854 if (skb->ip_summed == CHECKSUM_PARTIAL) {
855 if (skb_checksum_setup(skb)) {
856 kfree_skb(skb);
857 packets_dropped++;
858 np->stats.rx_errors++;
859 continue;
860 }
861 }
862
863 np->stats.rx_packets++;
864 np->stats.rx_bytes += skb->len;
865
866 /* Pass it up. */
867 netif_receive_skb(skb);
868 dev->last_rx = jiffies;
869 }
870
871 return packets_dropped;
872}
873
874static int xennet_poll(struct net_device *dev, int *pbudget)
875{
876 struct netfront_info *np = netdev_priv(dev);
877 struct sk_buff *skb;
878 struct netfront_rx_info rinfo;
879 struct xen_netif_rx_response *rx = &rinfo.rx;
880 struct xen_netif_extra_info *extras = rinfo.extras;
881 RING_IDX i, rp;
882 int work_done, budget, more_to_do = 1;
883 struct sk_buff_head rxq;
884 struct sk_buff_head errq;
885 struct sk_buff_head tmpq;
886 unsigned long flags;
887 unsigned int len;
888 int err;
889
890 spin_lock(&np->rx_lock);
891
892 if (unlikely(!netif_carrier_ok(dev))) {
893 spin_unlock(&np->rx_lock);
894 return 0;
895 }
896
897 skb_queue_head_init(&rxq);
898 skb_queue_head_init(&errq);
899 skb_queue_head_init(&tmpq);
900
901 budget = *pbudget;
902 if (budget > dev->quota)
903 budget = dev->quota;
904 rp = np->rx.sring->rsp_prod;
905 rmb(); /* Ensure we see queued responses up to 'rp'. */
906
907 i = np->rx.rsp_cons;
908 work_done = 0;
909 while ((i != rp) && (work_done < budget)) {
910 memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
911 memset(extras, 0, sizeof(rinfo.extras));
912
913 err = xennet_get_responses(np, &rinfo, rp, &tmpq);
914
915 if (unlikely(err)) {
916err:
917 while ((skb = __skb_dequeue(&tmpq)))
918 __skb_queue_tail(&errq, skb);
919 np->stats.rx_errors++;
920 i = np->rx.rsp_cons;
921 continue;
922 }
923
924 skb = __skb_dequeue(&tmpq);
925
926 if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
927 struct xen_netif_extra_info *gso;
928 gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
929
930 if (unlikely(xennet_set_skb_gso(skb, gso))) {
931 __skb_queue_head(&tmpq, skb);
932 np->rx.rsp_cons += skb_queue_len(&tmpq);
933 goto err;
934 }
935 }
936
937 NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
938 NETFRONT_SKB_CB(skb)->offset = rx->offset;
939
940 len = rx->status;
941 if (len > RX_COPY_THRESHOLD)
942 len = RX_COPY_THRESHOLD;
943 skb_put(skb, len);
944
945 if (rx->status > len) {
946 skb_shinfo(skb)->frags[0].page_offset =
947 rx->offset + len;
948 skb_shinfo(skb)->frags[0].size = rx->status - len;
949 skb->data_len = rx->status - len;
950 } else {
951 skb_shinfo(skb)->frags[0].page = NULL;
952 skb_shinfo(skb)->nr_frags = 0;
953 }
954
955 i = xennet_fill_frags(np, skb, &tmpq);
956
957 /*
958 * Truesize approximates the size of true data plus
959 * any supervisor overheads. Adding hypervisor
960 * overheads has been shown to significantly reduce
961 * achievable bandwidth with the default receive
962 * buffer size. It is therefore not wise to account
963 * for it here.
964 *
965 * After alloc_skb(RX_COPY_THRESHOLD), truesize is set
966 * to RX_COPY_THRESHOLD + the supervisor
967 * overheads. Here, we add the size of the data pulled
968 * in xennet_fill_frags().
969 *
970 * We also adjust for any unused space in the main
971 * data area by subtracting (RX_COPY_THRESHOLD -
972 * len). This is especially important with drivers
973 * which split incoming packets into header and data,
974 * using only 66 bytes of the main data area (see the
975 * e1000 driver for example.) On such systems,
976		 * without this last adjustment, our achievable
977		 * receive throughput using the standard receive
978 * buffer size was cut by 25%(!!!).
979 */
980 skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
981 skb->len += skb->data_len;
982
983 if (rx->flags & NETRXF_csum_blank)
984 skb->ip_summed = CHECKSUM_PARTIAL;
985 else if (rx->flags & NETRXF_data_validated)
986 skb->ip_summed = CHECKSUM_UNNECESSARY;
987
988 __skb_queue_tail(&rxq, skb);
989
990 np->rx.rsp_cons = ++i;
991 work_done++;
992 }
993
994 while ((skb = __skb_dequeue(&errq)))
995 kfree_skb(skb);
996
997 work_done -= handle_incoming_queue(dev, &rxq);
998
999 /* If we get a callback with very few responses, reduce fill target. */
1000 /* NB. Note exponential increase, linear decrease. */
1001 if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
1002 ((3*np->rx_target) / 4)) &&
1003 (--np->rx_target < np->rx_min_target))
1004 np->rx_target = np->rx_min_target;
1005
1006 xennet_alloc_rx_buffers(dev);
1007
1008 *pbudget -= work_done;
1009 dev->quota -= work_done;
1010
1011 if (work_done < budget) {
1012 local_irq_save(flags);
1013
1014 RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
1015 if (!more_to_do)
1016 __netif_rx_complete(dev);
1017
1018 local_irq_restore(flags);
1019 }
1020
1021 spin_unlock(&np->rx_lock);
1022
1023 return more_to_do;
1024}
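
The truesize comment in xennet_poll() boils down to one adjustment: count the data_len bytes that stay in the fragment page, and give back the unused part of the RX_COPY_THRESHOLD-byte linear area. A quick numeric check (len and data_len are illustrative values):

#include <stdio.h>

#define RX_COPY_THRESHOLD 256

int main(void)
{
	unsigned len = 100;        /* bytes copied into the linear head */
	unsigned data_len = 1200;  /* bytes left in the fragment page   */

	/* Same adjustment as xennet_poll(): add the fragment data, then
	 * subtract the unused part of the 256-byte linear area. */
	int delta = (int)data_len - (RX_COPY_THRESHOLD - (int)len);

	printf("truesize grows by %d bytes\n", delta);  /* 1200 - 156 = 1044 */
	return 0;
}
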
1025
1026static int xennet_change_mtu(struct net_device *dev, int mtu)
1027{
1028 int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
1029
1030 if (mtu > max)
1031 return -EINVAL;
1032 dev->mtu = mtu;
1033 return 0;
1034}
1035
1036static void xennet_release_tx_bufs(struct netfront_info *np)
1037{
1038 struct sk_buff *skb;
1039 int i;
1040
1041 for (i = 0; i < NET_TX_RING_SIZE; i++) {
1042 /* Skip over entries which are actually freelist references */
1043 if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
1044 continue;
1045
1046 skb = np->tx_skbs[i].skb;
1047 gnttab_end_foreign_access_ref(np->grant_tx_ref[i],
1048 GNTMAP_readonly);
1049 gnttab_release_grant_reference(&np->gref_tx_head,
1050 np->grant_tx_ref[i]);
1051 np->grant_tx_ref[i] = GRANT_INVALID_REF;
1052 add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, i);
1053 dev_kfree_skb_irq(skb);
1054 }
1055}
1056
1057static void xennet_release_rx_bufs(struct netfront_info *np)
1058{
1059 struct mmu_update *mmu = np->rx_mmu;
1060 struct multicall_entry *mcl = np->rx_mcl;
1061 struct sk_buff_head free_list;
1062 struct sk_buff *skb;
1063 unsigned long mfn;
1064 int xfer = 0, noxfer = 0, unused = 0;
1065 int id, ref;
1066
1067 dev_warn(&np->netdev->dev, "%s: fix me for copying receiver.\n",
1068 __func__);
1069 return;
1070
1071 skb_queue_head_init(&free_list);
1072
1073 spin_lock_bh(&np->rx_lock);
1074
1075 for (id = 0; id < NET_RX_RING_SIZE; id++) {
1076 ref = np->grant_rx_ref[id];
1077 if (ref == GRANT_INVALID_REF) {
1078 unused++;
1079 continue;
1080 }
1081
1082 skb = np->rx_skbs[id];
1083 mfn = gnttab_end_foreign_transfer_ref(ref);
1084 gnttab_release_grant_reference(&np->gref_rx_head, ref);
1085 np->grant_rx_ref[id] = GRANT_INVALID_REF;
1086
1087 if (0 == mfn) {
1088 skb_shinfo(skb)->nr_frags = 0;
1089 dev_kfree_skb(skb);
1090 noxfer++;
1091 continue;
1092 }
1093
1094 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1095 /* Remap the page. */
1096 struct page *page = skb_shinfo(skb)->frags[0].page;
1097 unsigned long pfn = page_to_pfn(page);
1098 void *vaddr = page_address(page);
1099
1100 MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
1101 mfn_pte(mfn, PAGE_KERNEL),
1102 0);
1103 mcl++;
1104 mmu->ptr = ((u64)mfn << PAGE_SHIFT)
1105 | MMU_MACHPHYS_UPDATE;
1106 mmu->val = pfn;
1107 mmu++;
1108
1109 set_phys_to_machine(pfn, mfn);
1110 }
1111 __skb_queue_tail(&free_list, skb);
1112 xfer++;
1113 }
1114
1115 dev_info(&np->netdev->dev, "%s: %d xfer, %d noxfer, %d unused\n",
1116 __func__, xfer, noxfer, unused);
1117
1118 if (xfer) {
1119 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1120 /* Do all the remapping work and M2P updates. */
1121 MULTI_mmu_update(mcl, np->rx_mmu, mmu - np->rx_mmu,
1122 0, DOMID_SELF);
1123 mcl++;
1124 HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
1125 }
1126 }
1127
1128 while ((skb = __skb_dequeue(&free_list)) != NULL)
1129 dev_kfree_skb(skb);
1130
1131 spin_unlock_bh(&np->rx_lock);
1132}
1133
1134static void xennet_uninit(struct net_device *dev)
1135{
1136 struct netfront_info *np = netdev_priv(dev);
1137 xennet_release_tx_bufs(np);
1138 xennet_release_rx_bufs(np);
1139 gnttab_free_grant_references(np->gref_tx_head);
1140 gnttab_free_grant_references(np->gref_rx_head);
1141}
1142
1143static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev)
1144{
1145 int i, err;
1146 struct net_device *netdev;
1147 struct netfront_info *np;
1148
1149 netdev = alloc_etherdev(sizeof(struct netfront_info));
1150 if (!netdev) {
1151 printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
1152 __func__);
1153 return ERR_PTR(-ENOMEM);
1154 }
1155
1156 np = netdev_priv(netdev);
1157 np->xbdev = dev;
1158
1159 spin_lock_init(&np->tx_lock);
1160 spin_lock_init(&np->rx_lock);
1161
1162 skb_queue_head_init(&np->rx_batch);
1163 np->rx_target = RX_DFL_MIN_TARGET;
1164 np->rx_min_target = RX_DFL_MIN_TARGET;
1165 np->rx_max_target = RX_MAX_TARGET;
1166
1167 init_timer(&np->rx_refill_timer);
1168 np->rx_refill_timer.data = (unsigned long)netdev;
1169 np->rx_refill_timer.function = rx_refill_timeout;
1170
1171 /* Initialise tx_skbs as a free chain containing every entry. */
1172 np->tx_skb_freelist = 0;
1173 for (i = 0; i < NET_TX_RING_SIZE; i++) {
1174 np->tx_skbs[i].link = i+1;
1175 np->grant_tx_ref[i] = GRANT_INVALID_REF;
1176 }
1177
1178 /* Clear out rx_skbs */
1179 for (i = 0; i < NET_RX_RING_SIZE; i++) {
1180 np->rx_skbs[i] = NULL;
1181 np->grant_rx_ref[i] = GRANT_INVALID_REF;
1182 }
1183
1184 /* A grant for every tx ring slot */
1185 if (gnttab_alloc_grant_references(TX_MAX_TARGET,
1186 &np->gref_tx_head) < 0) {
1187 printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
1188 err = -ENOMEM;
1189 goto exit;
1190 }
1191 /* A grant for every rx ring slot */
1192 if (gnttab_alloc_grant_references(RX_MAX_TARGET,
1193 &np->gref_rx_head) < 0) {
1194 printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
1195 err = -ENOMEM;
1196 goto exit_free_tx;
1197 }
1198
1199 netdev->open = xennet_open;
1200 netdev->hard_start_xmit = xennet_start_xmit;
1201 netdev->stop = xennet_close;
1202 netdev->get_stats = xennet_get_stats;
1203 netdev->poll = xennet_poll;
1204 netdev->uninit = xennet_uninit;
1205 netdev->change_mtu = xennet_change_mtu;
1206 netdev->weight = 64;
1207 netdev->features = NETIF_F_IP_CSUM;
1208
1209 SET_ETHTOOL_OPS(netdev, &xennet_ethtool_ops);
1210 SET_MODULE_OWNER(netdev);
1211 SET_NETDEV_DEV(netdev, &dev->dev);
1212
1213 np->netdev = netdev;
1214
1215 netif_carrier_off(netdev);
1216
1217 return netdev;
1218
1219 exit_free_tx:
1220 gnttab_free_grant_references(np->gref_tx_head);
1221 exit:
1222 free_netdev(netdev);
1223 return ERR_PTR(err);
1224}
1225
1226/**
1227 * Entry point to this code when a new device is created. Allocate the basic
1228 * structures and the ring buffers for communication with the backend, and
1229 * inform the backend of the appropriate details for those.
1230 */
1231static int __devinit netfront_probe(struct xenbus_device *dev,
1232 const struct xenbus_device_id *id)
1233{
1234 int err;
1235 struct net_device *netdev;
1236 struct netfront_info *info;
1237
1238 netdev = xennet_create_dev(dev);
1239 if (IS_ERR(netdev)) {
1240 err = PTR_ERR(netdev);
1241 xenbus_dev_fatal(dev, err, "creating netdev");
1242 return err;
1243 }
1244
1245 info = netdev_priv(netdev);
1246 dev->dev.driver_data = info;
1247
1248 err = register_netdev(info->netdev);
1249 if (err) {
1250 printk(KERN_WARNING "%s: register_netdev err=%d\n",
1251 __func__, err);
1252 goto fail;
1253 }
1254
1255 err = xennet_sysfs_addif(info->netdev);
1256 if (err) {
1257 unregister_netdev(info->netdev);
1258 printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
1259 __func__, err);
1260 goto fail;
1261 }
1262
1263 return 0;
1264
1265 fail:
1266 free_netdev(netdev);
1267 dev->dev.driver_data = NULL;
1268 return err;
1269}
1270
1271static void xennet_end_access(int ref, void *page)
1272{
1273 /* This frees the page as a side-effect */
1274 if (ref != GRANT_INVALID_REF)
1275 gnttab_end_foreign_access(ref, 0, (unsigned long)page);
1276}
1277
1278static void xennet_disconnect_backend(struct netfront_info *info)
1279{
1280 /* Stop old i/f to prevent errors whilst we rebuild the state. */
1281 spin_lock_bh(&info->rx_lock);
1282 spin_lock_irq(&info->tx_lock);
1283 netif_carrier_off(info->netdev);
1284 spin_unlock_irq(&info->tx_lock);
1285 spin_unlock_bh(&info->rx_lock);
1286
1287 if (info->netdev->irq)
1288 unbind_from_irqhandler(info->netdev->irq, info->netdev);
1289 info->evtchn = info->netdev->irq = 0;
1290
1291 /* End access and free the pages */
1292 xennet_end_access(info->tx_ring_ref, info->tx.sring);
1293 xennet_end_access(info->rx_ring_ref, info->rx.sring);
1294
1295 info->tx_ring_ref = GRANT_INVALID_REF;
1296 info->rx_ring_ref = GRANT_INVALID_REF;
1297 info->tx.sring = NULL;
1298 info->rx.sring = NULL;
1299}
1300
1301/**
1302 * We are reconnecting to the backend, due to a suspend/resume, or a backend
1303 * driver restart. We tear down our netif structure and recreate it, but
1304 * leave the device-layer structures intact so that this is transparent to the
1305 * rest of the kernel.
1306 */
1307static int netfront_resume(struct xenbus_device *dev)
1308{
1309 struct netfront_info *info = dev->dev.driver_data;
1310
1311 dev_dbg(&dev->dev, "%s\n", dev->nodename);
1312
1313 xennet_disconnect_backend(info);
1314 return 0;
1315}
1316
1317static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
1318{
1319 char *s, *e, *macstr;
1320 int i;
1321
1322 macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
1323 if (IS_ERR(macstr))
1324 return PTR_ERR(macstr);
1325
1326 for (i = 0; i < ETH_ALEN; i++) {
1327 mac[i] = simple_strtoul(s, &e, 16);
1328 if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
1329 kfree(macstr);
1330 return -ENOENT;
1331 }
1332 s = e+1;
1333 }
1334
1335 kfree(macstr);
1336 return 0;
1337}
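
xen_net_read_mac() parses the xenstore "mac" node, which holds a colon-separated hex string such as "00:16:3e:12:34:56", insisting on ':' after every byte except the last. A user-space equivalent of the parsing loop, with strtoul() in place of simple_strtoul():

#include <stdio.h>
#include <stdlib.h>

#define ETH_ALEN 6

static int parse_mac(const char *s, unsigned char mac[ETH_ALEN])
{
	char *e;
	int i;

	for (i = 0; i < ETH_ALEN; i++) {
		mac[i] = (unsigned char)strtoul(s, &e, 16);
		/* Each byte must be followed by ':', except the last by '\0'. */
		if (s == e || *e != (i == ETH_ALEN - 1 ? '\0' : ':'))
			return -1;
		s = e + 1;
	}
	return 0;
}

int main(void)
{
	unsigned char mac[ETH_ALEN];

	if (parse_mac("00:16:3e:12:34:56", mac) == 0)
		printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
		       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	return 0;
}
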
1338
1339static irqreturn_t xennet_interrupt(int irq, void *dev_id)
1340{
1341 struct net_device *dev = dev_id;
1342 struct netfront_info *np = netdev_priv(dev);
1343 unsigned long flags;
1344
1345 spin_lock_irqsave(&np->tx_lock, flags);
1346
1347 if (likely(netif_carrier_ok(dev))) {
1348 xennet_tx_buf_gc(dev);
1349 /* Under tx_lock: protects access to rx shared-ring indexes. */
1350 if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
1351 netif_rx_schedule(dev);
1352 }
1353
1354 spin_unlock_irqrestore(&np->tx_lock, flags);
1355
1356 return IRQ_HANDLED;
1357}
1358
1359static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
1360{
1361 struct xen_netif_tx_sring *txs;
1362 struct xen_netif_rx_sring *rxs;
1363 int err;
1364 struct net_device *netdev = info->netdev;
1365
1366 info->tx_ring_ref = GRANT_INVALID_REF;
1367 info->rx_ring_ref = GRANT_INVALID_REF;
1368 info->rx.sring = NULL;
1369 info->tx.sring = NULL;
1370 netdev->irq = 0;
1371
1372 err = xen_net_read_mac(dev, netdev->dev_addr);
1373 if (err) {
1374 xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
1375 goto fail;
1376 }
1377
1378 txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
1379 if (!txs) {
1380 err = -ENOMEM;
1381 xenbus_dev_fatal(dev, err, "allocating tx ring page");
1382 goto fail;
1383 }
1384 SHARED_RING_INIT(txs);
1385 FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
1386
1387 err = xenbus_grant_ring(dev, virt_to_mfn(txs));
1388 if (err < 0) {
1389 free_page((unsigned long)txs);
1390 goto fail;
1391 }
1392
1393 info->tx_ring_ref = err;
1394 rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
1395 if (!rxs) {
1396 err = -ENOMEM;
1397 xenbus_dev_fatal(dev, err, "allocating rx ring page");
1398 goto fail;
1399 }
1400 SHARED_RING_INIT(rxs);
1401 FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
1402
1403 err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
1404 if (err < 0) {
1405 free_page((unsigned long)rxs);
1406 goto fail;
1407 }
1408 info->rx_ring_ref = err;
1409
1410 err = xenbus_alloc_evtchn(dev, &info->evtchn);
1411 if (err)
1412 goto fail;
1413
1414 err = bind_evtchn_to_irqhandler(info->evtchn, xennet_interrupt,
1415 IRQF_SAMPLE_RANDOM, netdev->name,
1416 netdev);
1417 if (err < 0)
1418 goto fail;
1419 netdev->irq = err;
1420 return 0;
1421
1422 fail:
1423 return err;
1424}
1425
1426/* Common code used when first setting up, and when resuming. */
1427static int talk_to_backend(struct xenbus_device *dev,
1428 struct netfront_info *info)
1429{
1430 const char *message;
1431 struct xenbus_transaction xbt;
1432 int err;
1433
1434 /* Create shared ring, alloc event channel. */
1435 err = setup_netfront(dev, info);
1436 if (err)
1437 goto out;
1438
1439again:
1440 err = xenbus_transaction_start(&xbt);
1441 if (err) {
1442 xenbus_dev_fatal(dev, err, "starting transaction");
1443 goto destroy_ring;
1444 }
1445
1446 err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref", "%u",
1447 info->tx_ring_ref);
1448 if (err) {
1449 message = "writing tx ring-ref";
1450 goto abort_transaction;
1451 }
1452 err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref", "%u",
1453 info->rx_ring_ref);
1454 if (err) {
1455 message = "writing rx ring-ref";
1456 goto abort_transaction;
1457 }
1458 err = xenbus_printf(xbt, dev->nodename,
1459 "event-channel", "%u", info->evtchn);
1460 if (err) {
1461 message = "writing event-channel";
1462 goto abort_transaction;
1463 }
1464
1465 err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
1466 1);
1467 if (err) {
1468 message = "writing request-rx-copy";
1469 goto abort_transaction;
1470 }
1471
1472 err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
1473 if (err) {
1474 message = "writing feature-rx-notify";
1475 goto abort_transaction;
1476 }
1477
1478 err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
1479 if (err) {
1480 message = "writing feature-sg";
1481 goto abort_transaction;
1482 }
1483
1484 err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
1485 if (err) {
1486 message = "writing feature-gso-tcpv4";
1487 goto abort_transaction;
1488 }
1489
1490 err = xenbus_transaction_end(xbt, 0);
1491 if (err) {
1492 if (err == -EAGAIN)
1493 goto again;
1494 xenbus_dev_fatal(dev, err, "completing transaction");
1495 goto destroy_ring;
1496 }
1497
1498 return 0;
1499
1500 abort_transaction:
1501 xenbus_transaction_end(xbt, 1);
1502 xenbus_dev_fatal(dev, err, "%s", message);
1503 destroy_ring:
1504 xennet_disconnect_backend(info);
1505 out:
1506 return err;
1507}
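
talk_to_backend() publishes all of the frontend's xenstore keys inside one transaction; if another writer raced with it, xenbus_transaction_end() returns -EAGAIN and the whole block is replayed from the transaction start. A stripped-down sketch of that retry shape with stubbed-out transaction calls (the txn_* stubs only mimic the -EAGAIN behaviour and the key/value pairs are illustrative; this is not the xenbus API):

#include <errno.h>
#include <stdio.h>

/* Stubs standing in for xenbus_transaction_start/printf/end. */
static int attempts;
static int txn_start(void) { return 0; }
static int txn_write(const char *k, const char *v)
{
	printf("write %s=%s\n", k, v);
	return 0;
}
static int txn_end(int abort)
{
	/* Pretend the first non-aborted attempt raced with another writer. */
	return (!abort && ++attempts < 2) ? -EAGAIN : 0;
}

static int publish_ring_refs(void)
{
	int err;

again:
	err = txn_start();
	if (err)
		return err;

	err = txn_write("tx-ring-ref", "8");
	if (err)
		goto abort;
	err = txn_write("event-channel", "12");
	if (err)
		goto abort;

	err = txn_end(0);
	if (err == -EAGAIN)
		goto again;     /* somebody raced with us: replay the writes */
	return err;

abort:
	txn_end(1);
	return err;
}

int main(void)
{
	return publish_ring_refs() ? 1 : 0;
}
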
1508
1509static int xennet_set_sg(struct net_device *dev, u32 data)
1510{
1511 if (data) {
1512 struct netfront_info *np = netdev_priv(dev);
1513 int val;
1514
1515 if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
1516 "%d", &val) < 0)
1517 val = 0;
1518 if (!val)
1519 return -ENOSYS;
1520 } else if (dev->mtu > ETH_DATA_LEN)
1521 dev->mtu = ETH_DATA_LEN;
1522
1523 return ethtool_op_set_sg(dev, data);
1524}
1525
1526static int xennet_set_tso(struct net_device *dev, u32 data)
1527{
1528 if (data) {
1529 struct netfront_info *np = netdev_priv(dev);
1530 int val;
1531
1532 if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1533 "feature-gso-tcpv4", "%d", &val) < 0)
1534 val = 0;
1535 if (!val)
1536 return -ENOSYS;
1537 }
1538
1539 return ethtool_op_set_tso(dev, data);
1540}
1541
1542static void xennet_set_features(struct net_device *dev)
1543{
1544 /* Turn off all GSO bits except ROBUST. */
1545 dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
1546 dev->features |= NETIF_F_GSO_ROBUST;
1547 xennet_set_sg(dev, 0);
1548
1549 /* We need checksum offload to enable scatter/gather and TSO. */
1550 if (!(dev->features & NETIF_F_IP_CSUM))
1551 return;
1552
1553 if (!xennet_set_sg(dev, 1))
1554 xennet_set_tso(dev, 1);
1555}
1556
1557static int xennet_connect(struct net_device *dev)
1558{
1559 struct netfront_info *np = netdev_priv(dev);
1560 int i, requeue_idx, err;
1561 struct sk_buff *skb;
1562 grant_ref_t ref;
1563 struct xen_netif_rx_request *req;
1564 unsigned int feature_rx_copy;
1565
1566 err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1567 "feature-rx-copy", "%u", &feature_rx_copy);
1568 if (err != 1)
1569 feature_rx_copy = 0;
1570
1571 if (!feature_rx_copy) {
1572 dev_info(&dev->dev,
1573			 "backend does not support copying receive path");
1574 return -ENODEV;
1575 }
1576
1577 err = talk_to_backend(np->xbdev, np);
1578 if (err)
1579 return err;
1580
1581 xennet_set_features(dev);
1582
1583 spin_lock_bh(&np->rx_lock);
1584 spin_lock_irq(&np->tx_lock);
1585
1586 /* Step 1: Discard all pending TX packet fragments. */
1587 xennet_release_tx_bufs(np);
1588
1589 /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
1590 for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
1591 if (!np->rx_skbs[i])
1592 continue;
1593
1594 skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
1595 ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
1596 req = RING_GET_REQUEST(&np->rx, requeue_idx);
1597
1598 gnttab_grant_foreign_access_ref(
1599 ref, np->xbdev->otherend_id,
1600 pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
1601 frags->page)),
1602 0);
1603 req->gref = ref;
1604 req->id = requeue_idx;
1605
1606 requeue_idx++;
1607 }
1608
1609 np->rx.req_prod_pvt = requeue_idx;
1610
1611 /*
1612 * Step 3: All public and private state should now be sane. Get
1613 * ready to start sending and receiving packets and give the driver
1614 * domain a kick because we've probably just requeued some
1615 * packets.
1616 */
1617 netif_carrier_on(np->netdev);
1618 notify_remote_via_irq(np->netdev->irq);
1619 xennet_tx_buf_gc(dev);
1620 xennet_alloc_rx_buffers(dev);
1621
1622 spin_unlock_irq(&np->tx_lock);
1623 spin_unlock_bh(&np->rx_lock);
1624
1625 return 0;
1626}
1627
1628/**
1629 * Callback received when the backend's state changes.
1630 */
1631static void backend_changed(struct xenbus_device *dev,
1632 enum xenbus_state backend_state)
1633{
1634 struct netfront_info *np = dev->dev.driver_data;
1635 struct net_device *netdev = np->netdev;
1636
1637 dev_dbg(&dev->dev, "%s\n", xenbus_strstate(backend_state));
1638
1639 switch (backend_state) {
1640 case XenbusStateInitialising:
1641 case XenbusStateInitialised:
1642 case XenbusStateConnected:
1643 case XenbusStateUnknown:
1644 case XenbusStateClosed:
1645 break;
1646
1647 case XenbusStateInitWait:
1648 if (dev->state != XenbusStateInitialising)
1649 break;
1650 if (xennet_connect(netdev) != 0)
1651 break;
1652 xenbus_switch_state(dev, XenbusStateConnected);
1653 break;
1654
1655 case XenbusStateClosing:
1656 xenbus_frontend_closed(dev);
1657 break;
1658 }
1659}
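/*
 * Rough sketch of the handshake handled above: once the backend reaches
 * InitWait, the frontend (still Initialising) sets up its rings and event
 * channel via xennet_connect() and advertises Connected; a backend moving
 * to Closing is handed to the generic xenbus_frontend_closed() path.
 */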
1660
1661static struct ethtool_ops xennet_ethtool_ops =
1662{
1663 .get_tx_csum = ethtool_op_get_tx_csum,
1664 .set_tx_csum = ethtool_op_set_tx_csum,
1665 .get_sg = ethtool_op_get_sg,
1666 .set_sg = xennet_set_sg,
1667 .get_tso = ethtool_op_get_tso,
1668 .set_tso = xennet_set_tso,
1669 .get_link = ethtool_op_get_link,
1670};
1671
1672#ifdef CONFIG_SYSFS
1673static ssize_t show_rxbuf_min(struct device *dev,
1674 struct device_attribute *attr, char *buf)
1675{
1676 struct net_device *netdev = to_net_dev(dev);
1677 struct netfront_info *info = netdev_priv(netdev);
1678
1679 return sprintf(buf, "%u\n", info->rx_min_target);
1680}
1681
1682static ssize_t store_rxbuf_min(struct device *dev,
1683 struct device_attribute *attr,
1684 const char *buf, size_t len)
1685{
1686 struct net_device *netdev = to_net_dev(dev);
1687 struct netfront_info *np = netdev_priv(netdev);
1688 char *endp;
1689 unsigned long target;
1690
1691 if (!capable(CAP_NET_ADMIN))
1692 return -EPERM;
1693
1694 target = simple_strtoul(buf, &endp, 0);
1695 if (endp == buf)
1696 return -EBADMSG;
1697
1698 if (target < RX_MIN_TARGET)
1699 target = RX_MIN_TARGET;
1700 if (target > RX_MAX_TARGET)
1701 target = RX_MAX_TARGET;
1702
1703 spin_lock_bh(&np->rx_lock);
1704 if (target > np->rx_max_target)
1705 np->rx_max_target = target;
1706 np->rx_min_target = target;
1707 if (target > np->rx_target)
1708 np->rx_target = target;
1709
1710 xennet_alloc_rx_buffers(netdev);
1711
1712 spin_unlock_bh(&np->rx_lock);
1713 return len;
1714}
1715
1716static ssize_t show_rxbuf_max(struct device *dev,
1717 struct device_attribute *attr, char *buf)
1718{
1719 struct net_device *netdev = to_net_dev(dev);
1720 struct netfront_info *info = netdev_priv(netdev);
1721
1722 return sprintf(buf, "%u\n", info->rx_max_target);
1723}
1724
1725static ssize_t store_rxbuf_max(struct device *dev,
1726 struct device_attribute *attr,
1727 const char *buf, size_t len)
1728{
1729 struct net_device *netdev = to_net_dev(dev);
1730 struct netfront_info *np = netdev_priv(netdev);
1731 char *endp;
1732 unsigned long target;
1733
1734 if (!capable(CAP_NET_ADMIN))
1735 return -EPERM;
1736
1737 target = simple_strtoul(buf, &endp, 0);
1738 if (endp == buf)
1739 return -EBADMSG;
1740
1741 if (target < RX_MIN_TARGET)
1742 target = RX_MIN_TARGET;
1743 if (target > RX_MAX_TARGET)
1744 target = RX_MAX_TARGET;
1745
1746 spin_lock_bh(&np->rx_lock);
1747 if (target < np->rx_min_target)
1748 np->rx_min_target = target;
1749 np->rx_max_target = target;
1750 if (target < np->rx_target)
1751 np->rx_target = target;
1752
1753 xennet_alloc_rx_buffers(netdev);
1754
1755 spin_unlock_bh(&np->rx_lock);
1756 return len;
1757}
1758
1759static ssize_t show_rxbuf_cur(struct device *dev,
1760 struct device_attribute *attr, char *buf)
1761{
1762 struct net_device *netdev = to_net_dev(dev);
1763 struct netfront_info *info = netdev_priv(netdev);
1764
1765 return sprintf(buf, "%u\n", info->rx_target);
1766}
1767
1768static struct device_attribute xennet_attrs[] = {
1769 __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
1770 __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
1771 __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
1772};
1773
1774static int xennet_sysfs_addif(struct net_device *netdev)
1775{
1776 int i;
1777 int err;
1778
1779 for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
1780 err = device_create_file(&netdev->dev,
1781 &xennet_attrs[i]);
1782 if (err)
1783 goto fail;
1784 }
1785 return 0;
1786
1787 fail:
1788 while (--i >= 0)
1789 device_remove_file(&netdev->dev, &xennet_attrs[i]);
1790 return err;
1791}
1792
1793static void xennet_sysfs_delif(struct net_device *netdev)
1794{
1795 int i;
1796
1797 for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++)
1798 device_remove_file(&netdev->dev, &xennet_attrs[i]);
1799}
1800
1801#endif /* CONFIG_SYSFS */
1802
1803static struct xenbus_device_id netfront_ids[] = {
1804 { "vif" },
1805 { "" }
1806};
1807
1808
1809static int __devexit xennet_remove(struct xenbus_device *dev)
1810{
1811 struct netfront_info *info = dev->dev.driver_data;
1812
1813 dev_dbg(&dev->dev, "%s\n", dev->nodename);
1814
1815 unregister_netdev(info->netdev);
1816
1817 xennet_disconnect_backend(info);
1818
1819 del_timer_sync(&info->rx_refill_timer);
1820
1821 xennet_sysfs_delif(info->netdev);
1822
1823 free_netdev(info->netdev);
1824
1825 return 0;
1826}
1827
1828static struct xenbus_driver netfront = {
1829 .name = "vif",
1830 .owner = THIS_MODULE,
1831 .ids = netfront_ids,
1832 .probe = netfront_probe,
1833 .remove = __devexit_p(xennet_remove),
1834 .resume = netfront_resume,
1835 .otherend_changed = backend_changed,
1836};
1837
1838static int __init netif_init(void)
1839{
1840 if (!is_running_on_xen())
1841 return -ENODEV;
1842
1843 if (is_initial_xendomain())
1844 return 0;
1845
1846 printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n");
1847
1848 return xenbus_register_frontend(&netfront);
1849}
1850module_init(netif_init);
1851
1852
1853static void __exit netif_exit(void)
1854{
1855 if (is_initial_xendomain())
1856 return;
1857
858 xenbus_unregister_driver(&netfront);
1859}
1860module_exit(netif_exit);
1861
1862MODULE_DESCRIPTION("Xen virtual network device frontend");
1863MODULE_LICENSE("GPL");
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c
index 03baf1c64a2e..ed112ee16012 100644
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -147,7 +147,7 @@ static int pnp_dock_event(int dock, struct pnp_docking_station_info *info)
 		info->location_id, info->serial, info->capabilities);
 	envp[i] = NULL;
 
-	value = call_usermodehelper (argv [0], argv, envp, 0);
+	value = call_usermodehelper (argv [0], argv, envp, UMH_WAIT_EXEC);
 	kfree (buf);
 	kfree (envp);
 	return 0;
diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c
index a54e4140683a..e821a155b658 100644
--- a/drivers/sbus/char/bbc_envctrl.c
+++ b/drivers/sbus/char/bbc_envctrl.c
@@ -7,6 +7,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/kmod.h>
+#include <linux/reboot.h>
 #include <asm/oplib.h>
 #include <asm/ebus.h>
 
@@ -170,8 +171,6 @@ static void get_current_temps(struct bbc_cpu_temperature *tp)
 static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
 {
 	static int shutting_down = 0;
-	static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
-	char *argv[] = { "/sbin/shutdown", "-h", "now", NULL };
 	char *type = "???";
 	s8 val = -1;
 
@@ -195,7 +194,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp)
 	printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n");
 
 	shutting_down = 1;
-	if (call_usermodehelper("/sbin/shutdown", argv, envp, 0) < 0)
+	if (orderly_poweroff(true) < 0)
 		printk(KERN_CRIT "envctrl: shutdown execution failed\n");
 }
 
diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c
index 8328acab47fd..dadabef116b6 100644
--- a/drivers/sbus/char/envctrl.c
+++ b/drivers/sbus/char/envctrl.c
@@ -26,6 +26,7 @@
 #include <linux/ioport.h>
 #include <linux/miscdevice.h>
 #include <linux/kmod.h>
+#include <linux/reboot.h>
 
 #include <asm/ebus.h>
 #include <asm/uaccess.h>
@@ -966,10 +967,6 @@ static struct i2c_child_t *envctrl_get_i2c_child(unsigned char mon_type)
 static void envctrl_do_shutdown(void)
 {
 	static int inprog = 0;
-	static char *envp[] = {
-		"HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
-	char *argv[] = {
-		"/sbin/shutdown", "-h", "now", NULL };
 	int ret;
 
 	if (inprog != 0)
@@ -977,7 +974,7 @@ static void envctrl_do_shutdown(void)
 
 	inprog = 1;
 	printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n");
-	ret = call_usermodehelper("/sbin/shutdown", argv, envp, 0);
+	ret = orderly_poweroff(true);
 	if (ret < 0) {
 		printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n");
 		inprog = 0; /* unlikely to succeed, but we could try again */
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
new file mode 100644
index 000000000000..56592f0d6cef
--- /dev/null
+++ b/drivers/xen/Makefile
@@ -0,0 +1,2 @@
1obj-y += grant-table.o
2obj-y += xenbus/
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
new file mode 100644
index 000000000000..ea94dbabf9a9
--- /dev/null
+++ b/drivers/xen/grant-table.c
@@ -0,0 +1,582 @@
1/******************************************************************************
2 * grant_table.c
3 *
4 * Granting foreign access to our memory reservation.
5 *
6 * Copyright (c) 2005-2006, Christopher Clark
7 * Copyright (c) 2004-2005, K A Fraser
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#include <linux/module.h>
35#include <linux/sched.h>
36#include <linux/mm.h>
37#include <linux/vmalloc.h>
38#include <linux/uaccess.h>
39
40#include <xen/interface/xen.h>
41#include <xen/page.h>
42#include <xen/grant_table.h>
43
44#include <asm/pgtable.h>
45#include <asm/sync_bitops.h>
46
47
48/* External tools reserve first few grant table entries. */
49#define NR_RESERVED_ENTRIES 8
50#define GNTTAB_LIST_END 0xffffffff
51#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))
52
53static grant_ref_t **gnttab_list;
54static unsigned int nr_grant_frames;
55static unsigned int boot_max_nr_grant_frames;
56static int gnttab_free_count;
57static grant_ref_t gnttab_free_head;
58static DEFINE_SPINLOCK(gnttab_list_lock);
59
60static struct grant_entry *shared;
61
62static struct gnttab_free_callback *gnttab_free_callback_list;
63
64static int gnttab_expand(unsigned int req_entries);
65
66#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
67
68static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
69{
70 return &gnttab_list[(entry) / RPP][(entry) % RPP];
71}
72/* This can be used as an l-value */
73#define gnttab_entry(entry) (*__gnttab_entry(entry))
74
75static int get_free_entries(unsigned count)
76{
77 unsigned long flags;
78 int ref, rc;
79 grant_ref_t head;
80
81 spin_lock_irqsave(&gnttab_list_lock, flags);
82
83 if ((gnttab_free_count < count) &&
84 ((rc = gnttab_expand(count - gnttab_free_count)) < 0)) {
85 spin_unlock_irqrestore(&gnttab_list_lock, flags);
86 return rc;
87 }
88
89 ref = head = gnttab_free_head;
90 gnttab_free_count -= count;
91 while (count-- > 1)
92 head = gnttab_entry(head);
93 gnttab_free_head = gnttab_entry(head);
94 gnttab_entry(head) = GNTTAB_LIST_END;
95
96 spin_unlock_irqrestore(&gnttab_list_lock, flags);
97
98 return ref;
99}
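/*
 * Free-list layout: gnttab_list is an array of pages of grant_ref_t links,
 * and gnttab_entry(ref) acts as the "next" pointer for ref.  The function
 * above detaches a chain of 'count' references from the head of the list
 * and terminates the chain with GNTTAB_LIST_END so callers can walk it.
 */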
100
101static void do_free_callbacks(void)
102{
103 struct gnttab_free_callback *callback, *next;
104
105 callback = gnttab_free_callback_list;
106 gnttab_free_callback_list = NULL;
107
108 while (callback != NULL) {
109 next = callback->next;
110 if (gnttab_free_count >= callback->count) {
111 callback->next = NULL;
112 callback->fn(callback->arg);
113 } else {
114 callback->next = gnttab_free_callback_list;
115 gnttab_free_callback_list = callback;
116 }
117 callback = next;
118 }
119}
120
121static inline void check_free_callbacks(void)
122{
123 if (unlikely(gnttab_free_callback_list))
124 do_free_callbacks();
125}
126
127static void put_free_entry(grant_ref_t ref)
128{
129 unsigned long flags;
130 spin_lock_irqsave(&gnttab_list_lock, flags);
131 gnttab_entry(ref) = gnttab_free_head;
132 gnttab_free_head = ref;
133 gnttab_free_count++;
134 check_free_callbacks();
135 spin_unlock_irqrestore(&gnttab_list_lock, flags);
136}
137
138static void update_grant_entry(grant_ref_t ref, domid_t domid,
139 unsigned long frame, unsigned flags)
140{
141 /*
142 * Introducing a valid entry into the grant table:
143 * 1. Write ent->domid.
144 * 2. Write ent->frame:
145 * GTF_permit_access: Frame to which access is permitted.
146 * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
147 * frame, or zero if none.
148 * 3. Write memory barrier (WMB).
149 * 4. Write ent->flags, inc. valid type.
150 */
151 shared[ref].frame = frame;
152 shared[ref].domid = domid;
153 wmb();
154 shared[ref].flags = flags;
155}
156
157/*
158 * Public grant-issuing interface functions
159 */
160void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
161 unsigned long frame, int readonly)
162{
163 update_grant_entry(ref, domid, frame,
164 GTF_permit_access | (readonly ? GTF_readonly : 0));
165}
166EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
167
168int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
169 int readonly)
170{
171 int ref;
172
173 ref = get_free_entries(1);
174 if (unlikely(ref < 0))
175 return -ENOSPC;
176
177 gnttab_grant_foreign_access_ref(ref, domid, frame, readonly);
178
179 return ref;
180}
181EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
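/*
 * Illustrative use only (names assumed): a frontend granting its backend
 * read/write access to a shared ring page might do
 *
 *	ref = gnttab_grant_foreign_access(dev->otherend_id,
 *					  virt_to_mfn(ring_page), 0);
 *	if (ref < 0)
 *		goto fail;
 */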
182
183int gnttab_query_foreign_access(grant_ref_t ref)
184{
185 u16 nflags;
186
187 nflags = shared[ref].flags;
188
189 return (nflags & (GTF_reading|GTF_writing));
190}
191EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
192
193int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
194{
195 u16 flags, nflags;
196
197 nflags = shared[ref].flags;
198 do {
199 flags = nflags;
200 if (flags & (GTF_reading|GTF_writing)) {
201 printk(KERN_ALERT "WARNING: g.e. still in use!\n");
202 return 0;
203 }
204 } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags);
205
206 return 1;
207}
208EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
209
210void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
211 unsigned long page)
212{
213 if (gnttab_end_foreign_access_ref(ref, readonly)) {
214 put_free_entry(ref);
215 if (page != 0)
216 free_page(page);
217 } else {
218 /* XXX This needs to be fixed so that the ref and page are
219 placed on a list to be freed up later. */
220 printk(KERN_WARNING
221 "WARNING: leaking g.e. and page still in use!\n");
222 }
223}
224EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
225
226int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
227{
228 int ref;
229
230 ref = get_free_entries(1);
231 if (unlikely(ref < 0))
232 return -ENOSPC;
233 gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
234
235 return ref;
236}
237EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
238
239void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
240 unsigned long pfn)
241{
242 update_grant_entry(ref, domid, pfn, GTF_accept_transfer);
243}
244EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
245
246unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
247{
248 unsigned long frame;
249 u16 flags;
250
251 /*
252 * If a transfer is not even yet started, try to reclaim the grant
253 * reference and return failure (== 0).
254 */
255 while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
256 if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
257 return 0;
258 cpu_relax();
259 }
260
261 /* If a transfer is in progress then wait until it is completed. */
262 while (!(flags & GTF_transfer_completed)) {
263 flags = shared[ref].flags;
264 cpu_relax();
265 }
266
267 rmb(); /* Read the frame number /after/ reading completion status. */
268 frame = shared[ref].frame;
269 BUG_ON(frame == 0);
270
271 return frame;
272}
273EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
274
275unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
276{
277 unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
278 put_free_entry(ref);
279 return frame;
280}
281EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
282
283void gnttab_free_grant_reference(grant_ref_t ref)
284{
285 put_free_entry(ref);
286}
287EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
288
289void gnttab_free_grant_references(grant_ref_t head)
290{
291 grant_ref_t ref;
292 unsigned long flags;
293 int count = 1;
294 if (head == GNTTAB_LIST_END)
295 return;
296 spin_lock_irqsave(&gnttab_list_lock, flags);
297 ref = head;
298 while (gnttab_entry(ref) != GNTTAB_LIST_END) {
299 ref = gnttab_entry(ref);
300 count++;
301 }
302 gnttab_entry(ref) = gnttab_free_head;
303 gnttab_free_head = head;
304 gnttab_free_count += count;
305 check_free_callbacks();
306 spin_unlock_irqrestore(&gnttab_list_lock, flags);
307}
308EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
309
310int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
311{
312 int h = get_free_entries(count);
313
314 if (h < 0)
315 return -ENOSPC;
316
317 *head = h;
318
319 return 0;
320}
321EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
322
323int gnttab_empty_grant_references(const grant_ref_t *private_head)
324{
325 return (*private_head == GNTTAB_LIST_END);
326}
327EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
328
329int gnttab_claim_grant_reference(grant_ref_t *private_head)
330{
331 grant_ref_t g = *private_head;
332 if (unlikely(g == GNTTAB_LIST_END))
333 return -ENOSPC;
334 *private_head = gnttab_entry(g);
335 return g;
336}
337EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
338
339void gnttab_release_grant_reference(grant_ref_t *private_head,
340 grant_ref_t release)
341{
342 gnttab_entry(release) = *private_head;
343 *private_head = release;
344}
345EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
346
347void gnttab_request_free_callback(struct gnttab_free_callback *callback,
348 void (*fn)(void *), void *arg, u16 count)
349{
350 unsigned long flags;
351 spin_lock_irqsave(&gnttab_list_lock, flags);
352 if (callback->next)
353 goto out;
354 callback->fn = fn;
355 callback->arg = arg;
356 callback->count = count;
357 callback->next = gnttab_free_callback_list;
358 gnttab_free_callback_list = callback;
359 check_free_callbacks();
360out:
361 spin_unlock_irqrestore(&gnttab_list_lock, flags);
362}
363EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
364
365void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
366{
367 struct gnttab_free_callback **pcb;
368 unsigned long flags;
369
370 spin_lock_irqsave(&gnttab_list_lock, flags);
371 for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) {
372 if (*pcb == callback) {
373 *pcb = callback->next;
374 break;
375 }
376 }
377 spin_unlock_irqrestore(&gnttab_list_lock, flags);
378}
379EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
380
381static int grow_gnttab_list(unsigned int more_frames)
382{
383 unsigned int new_nr_grant_frames, extra_entries, i;
384
385 new_nr_grant_frames = nr_grant_frames + more_frames;
386 extra_entries = more_frames * GREFS_PER_GRANT_FRAME;
387
388 for (i = nr_grant_frames; i < new_nr_grant_frames; i++) {
389 gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
390 if (!gnttab_list[i])
391 goto grow_nomem;
392 }
393
394
395 for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames;
396 i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++)
397 gnttab_entry(i) = i + 1;
398
399 gnttab_entry(i) = gnttab_free_head;
400 gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames;
401 gnttab_free_count += extra_entries;
402
403 nr_grant_frames = new_nr_grant_frames;
404
405 check_free_callbacks();
406
407 return 0;
408
409grow_nomem:
410 for ( ; i >= nr_grant_frames; i--)
411 free_page((unsigned long) gnttab_list[i]);
412 return -ENOMEM;
413}
414
415static unsigned int __max_nr_grant_frames(void)
416{
417 struct gnttab_query_size query;
418 int rc;
419
420 query.dom = DOMID_SELF;
421
422 rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1);
423 if ((rc < 0) || (query.status != GNTST_okay))
424 return 4; /* Legacy max supported number of frames */
425
426 return query.max_nr_frames;
427}
428
429static inline unsigned int max_nr_grant_frames(void)
430{
431 unsigned int xen_max = __max_nr_grant_frames();
432
433 if (xen_max > boot_max_nr_grant_frames)
434 return boot_max_nr_grant_frames;
435 return xen_max;
436}
437
438static int map_pte_fn(pte_t *pte, struct page *pmd_page,
439 unsigned long addr, void *data)
440{
441 unsigned long **frames = (unsigned long **)data;
442
443 set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
444 (*frames)++;
445 return 0;
446}
447
448static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
449 unsigned long addr, void *data)
450{
451
452 set_pte_at(&init_mm, addr, pte, __pte(0));
453 return 0;
454}
455
456static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
457{
458 struct gnttab_setup_table setup;
459 unsigned long *frames;
460 unsigned int nr_gframes = end_idx + 1;
461 int rc;
462
463 frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
464 if (!frames)
465 return -ENOMEM;
466
467 setup.dom = DOMID_SELF;
468 setup.nr_frames = nr_gframes;
469 setup.frame_list = frames;
470
471 rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
472 if (rc == -ENOSYS) {
473 kfree(frames);
474 return -ENOSYS;
475 }
476
477 BUG_ON(rc || setup.status);
478
479 if (shared == NULL) {
480 struct vm_struct *area;
481 area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
482 BUG_ON(area == NULL);
483 shared = area->addr;
484 }
485 rc = apply_to_page_range(&init_mm, (unsigned long)shared,
486 PAGE_SIZE * nr_gframes,
487 map_pte_fn, &frames);
488 BUG_ON(rc);
489 frames -= nr_gframes; /* adjust after map_pte_fn() */
490
491 kfree(frames);
492
493 return 0;
494}
495
496static int gnttab_resume(void)
497{
498 if (max_nr_grant_frames() < nr_grant_frames)
499 return -ENOSYS;
500 return gnttab_map(0, nr_grant_frames - 1);
501}
502
503static int gnttab_suspend(void)
504{
505 apply_to_page_range(&init_mm, (unsigned long)shared,
506 PAGE_SIZE * nr_grant_frames,
507 unmap_pte_fn, NULL);
508
509 return 0;
510}
511
512static int gnttab_expand(unsigned int req_entries)
513{
514 int rc;
515 unsigned int cur, extra;
516
517 cur = nr_grant_frames;
518 extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
519 GREFS_PER_GRANT_FRAME);
520 if (cur + extra > max_nr_grant_frames())
521 return -ENOSPC;
522
523 rc = gnttab_map(cur, cur + extra - 1);
524 if (rc == 0)
525 rc = grow_gnttab_list(extra);
526
527 return rc;
528}
529
530static int __devinit gnttab_init(void)
531{
532 int i;
533 unsigned int max_nr_glist_frames;
534 unsigned int nr_init_grefs;
535
536 if (!is_running_on_xen())
537 return -ENODEV;
538
539 nr_grant_frames = 1;
540 boot_max_nr_grant_frames = __max_nr_grant_frames();
541
542 /* Determine the maximum number of frames required for the
543 * grant reference free list on the current hypervisor.
544 */
545 max_nr_glist_frames = (boot_max_nr_grant_frames *
546 GREFS_PER_GRANT_FRAME /
547 (PAGE_SIZE / sizeof(grant_ref_t)));
548
549 gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
550 GFP_KERNEL);
551 if (gnttab_list == NULL)
552 return -ENOMEM;
553
554 for (i = 0; i < nr_grant_frames; i++) {
555 gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
556 if (gnttab_list[i] == NULL)
557 goto ini_nomem;
558 }
559
560 if (gnttab_resume() < 0)
561 return -ENODEV;
562
563 nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME;
564
565 for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
566 gnttab_entry(i) = i + 1;
567
568 gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
569 gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
570 gnttab_free_head = NR_RESERVED_ENTRIES;
571
572 printk("Grant table initialized\n");
573 return 0;
574
575 ini_nomem:
576 for (i--; i >= 0; i--)
577 free_page((unsigned long)gnttab_list[i]);
578 kfree(gnttab_list);
579 return -ENOMEM;
580}
581
582core_initcall(gnttab_init);
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
new file mode 100644
index 000000000000..5571f5b84223
--- /dev/null
+++ b/drivers/xen/xenbus/Makefile
@@ -0,0 +1,7 @@
1obj-y += xenbus.o
2
3xenbus-objs =
4xenbus-objs += xenbus_client.o
5xenbus-objs += xenbus_comms.o
6xenbus-objs += xenbus_xs.o
7xenbus-objs += xenbus_probe.o
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
new file mode 100644
index 000000000000..9fd2f70ab46d
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -0,0 +1,569 @@
1/******************************************************************************
2 * Client-facing interface for the Xenbus driver. In other words, the
3 * interface between the Xenbus and the device-specific code, be it the
4 * frontend or the backend of that driver.
5 *
6 * Copyright (C) 2005 XenSource Ltd
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#include <linux/types.h>
34#include <linux/vmalloc.h>
35#include <asm/xen/hypervisor.h>
36#include <xen/interface/xen.h>
37#include <xen/interface/event_channel.h>
38#include <xen/events.h>
39#include <xen/grant_table.h>
40#include <xen/xenbus.h>
41
42const char *xenbus_strstate(enum xenbus_state state)
43{
44 static const char *const name[] = {
45 [ XenbusStateUnknown ] = "Unknown",
46 [ XenbusStateInitialising ] = "Initialising",
47 [ XenbusStateInitWait ] = "InitWait",
48 [ XenbusStateInitialised ] = "Initialised",
49 [ XenbusStateConnected ] = "Connected",
50 [ XenbusStateClosing ] = "Closing",
51 [ XenbusStateClosed ] = "Closed",
52 };
53 return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
54}
55EXPORT_SYMBOL_GPL(xenbus_strstate);
56
57/**
58 * xenbus_watch_path - register a watch
59 * @dev: xenbus device
60 * @path: path to watch
61 * @watch: watch to register
62 * @callback: callback to register
63 *
64 * Register a @watch on the given path, using the given xenbus_watch structure
65 * for storage, and the given @callback function as the callback. Return 0 on
66 * success, or -errno on error. On success, the given @path will be saved as
67 * @watch->node, and remains the caller's to free. On error, @watch->node will
68 * be NULL, the device will switch to %XenbusStateClosing, and the error will
69 * be saved in the store.
70 */
71int xenbus_watch_path(struct xenbus_device *dev, const char *path,
72 struct xenbus_watch *watch,
73 void (*callback)(struct xenbus_watch *,
74 const char **, unsigned int))
75{
76 int err;
77
78 watch->node = path;
79 watch->callback = callback;
80
81 err = register_xenbus_watch(watch);
82
83 if (err) {
84 watch->node = NULL;
85 watch->callback = NULL;
86 xenbus_dev_fatal(dev, err, "adding watch on %s", path);
87 }
88
89 return err;
90}
91EXPORT_SYMBOL_GPL(xenbus_watch_path);
92
93
94/**
95 * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path
96 * @dev: xenbus device
97 * @watch: watch to register
98 * @callback: callback to register
99 * @pathfmt: format of path to watch
100 *
101 * Register a watch on the path built from the given @pathfmt format string
102 * and its arguments, using the given xenbus_watch structure for storage,
103 * and the given @callback function as the callback. Return 0 on success,
104 * or -errno on error. On success, the formatted path will be saved as
105 * @watch->node, and becomes the caller's to kfree(). On error, watch->node
106 * will be NULL, so the caller has nothing to free, the device will switch
107 * to %XenbusStateClosing, and the error will be saved in the store.
108 */
109int xenbus_watch_pathfmt(struct xenbus_device *dev,
110 struct xenbus_watch *watch,
111 void (*callback)(struct xenbus_watch *,
112 const char **, unsigned int),
113 const char *pathfmt, ...)
114{
115 int err;
116 va_list ap;
117 char *path;
118
119 va_start(ap, pathfmt);
120 path = kvasprintf(GFP_KERNEL, pathfmt, ap);
121 va_end(ap);
122
123 if (!path) {
124 xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
125 return -ENOMEM;
126 }
127 err = xenbus_watch_path(dev, path, watch, callback);
128
129 if (err)
130 kfree(path);
131 return err;
132}
133EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
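/*
 * Illustrative use only (watch structure and callback names assumed): a
 * frontend typically watches its peer's state node like this:
 *
 *	err = xenbus_watch_pathfmt(dev, &dev->otherend_watch,
 *				   otherend_changed, "%s/state", dev->otherend);
 */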
134
135
136/**
137 * xenbus_switch_state
138 * @dev: xenbus device
140 * @state: new state
141 *
142 * Advertise in the store a change of the given driver to the given @state.
143 * Return 0 on success, or -errno on error. On error, the device will switch
144 * to XenbusStateClosing, and the error will be saved in the store.
145 */
146int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
147{
148 /* We check whether the state is currently set to the given value, and
149 if not, then the state is set. We don't want to unconditionally
150 write the given state, because we don't want to fire watches
151 unnecessarily. Furthermore, if the node has gone, we don't write
152 to it, as the device will be tearing down, and we don't want to
153 resurrect that directory.
154
155 Note that, because of this cached value of our state, this function
156 will not work inside a Xenstore transaction (something it
157 attempted to do in the past) because dev->state would not get
158 reset if the transaction was aborted.
159
160 */
161
162 int current_state;
163 int err;
164
165 if (state == dev->state)
166 return 0;
167
168 err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
169 &current_state);
170 if (err != 1)
171 return 0;
172
173 err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
174 if (err) {
175 if (state != XenbusStateClosing) /* Avoid looping */
176 xenbus_dev_fatal(dev, err, "writing new state");
177 return err;
178 }
179
180 dev->state = state;
181
182 return 0;
183}
184EXPORT_SYMBOL_GPL(xenbus_switch_state);
185
186int xenbus_frontend_closed(struct xenbus_device *dev)
187{
188 xenbus_switch_state(dev, XenbusStateClosed);
189 complete(&dev->down);
190 return 0;
191}
192EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
193
194/**
195 * Return the path to the error node for the given device, or NULL on failure.
196 * If the value returned is non-NULL, then it is the caller's to kfree.
197 */
198static char *error_path(struct xenbus_device *dev)
199{
200 return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
201}
202
203
204static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
205 const char *fmt, va_list ap)
206{
207 int ret;
208 unsigned int len;
209 char *printf_buffer = NULL;
210 char *path_buffer = NULL;
211
212#define PRINTF_BUFFER_SIZE 4096
213 printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
214 if (printf_buffer == NULL)
215 goto fail;
216
217 len = sprintf(printf_buffer, "%i ", -err);
218 ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
219
220 BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
221
222 dev_err(&dev->dev, "%s\n", printf_buffer);
223
224 path_buffer = error_path(dev);
225
226 if (path_buffer == NULL) {
227 dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
228 dev->nodename, printf_buffer);
229 goto fail;
230 }
231
232 if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
233 dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
234 dev->nodename, printf_buffer);
235 goto fail;
236 }
237
238fail:
239 kfree(printf_buffer);
240 kfree(path_buffer);
241}
242
243
244/**
245 * xenbus_dev_error
246 * @dev: xenbus device
247 * @err: error to report
248 * @fmt: error message format
249 *
250 * Report the given negative errno into the store, along with the given
251 * formatted message.
252 */
253void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...)
254{
255 va_list ap;
256
257 va_start(ap, fmt);
258 xenbus_va_dev_error(dev, err, fmt, ap);
259 va_end(ap);
260}
261EXPORT_SYMBOL_GPL(xenbus_dev_error);
262
263/**
264 * xenbus_dev_fatal
265 * @dev: xenbus device
266 * @err: error to report
267 * @fmt: error message format
268 *
269 * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
270 * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
271 * closedown of this driver and its peer.
272 */
273
274void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
275{
276 va_list ap;
277
278 va_start(ap, fmt);
279 xenbus_va_dev_error(dev, err, fmt, ap);
280 va_end(ap);
281
282 xenbus_switch_state(dev, XenbusStateClosing);
283}
284EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
285
286/**
287 * xenbus_grant_ring
288 * @dev: xenbus device
289 * @ring_mfn: mfn of ring to grant
290
291 * Grant access to the given @ring_mfn to the peer of the given device. Return
292 * 0 on success, or -errno on error. On error, the device will switch to
293 * XenbusStateClosing, and the error will be saved in the store.
294 */
295int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
296{
297 int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
298 if (err < 0)
299 xenbus_dev_fatal(dev, err, "granting access to ring page");
300 return err;
301}
302EXPORT_SYMBOL_GPL(xenbus_grant_ring);
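/*
 * Illustrative use only (names assumed): frontends typically grant their
 * freshly allocated shared ring and stash the reference for the xenstore
 * handshake:
 *
 *	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
 *	if (err < 0)
 *		goto fail;
 *	info->ring_ref = err;
 */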
303
304
305/**
306 * Allocate an event channel for the given xenbus_device, assigning the newly
307 * created local port to *port. Return 0 on success, or -errno on error. On
308 * error, the device will switch to XenbusStateClosing, and the error will be
309 * saved in the store.
310 */
311int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
312{
313 struct evtchn_alloc_unbound alloc_unbound;
314 int err;
315
316 alloc_unbound.dom = DOMID_SELF;
317 alloc_unbound.remote_dom = dev->otherend_id;
318
319 err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
320 &alloc_unbound);
321 if (err)
322 xenbus_dev_fatal(dev, err, "allocating event channel");
323 else
324 *port = alloc_unbound.port;
325
326 return err;
327}
328EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
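/*
 * Illustrative use only (names assumed): allocate the channel, then bind it
 * to a local irq handler:
 *
 *	err = xenbus_alloc_evtchn(dev, &info->evtchn);
 *	if (err)
 *		goto fail;
 *	err = bind_evtchn_to_irqhandler(info->evtchn, my_interrupt, 0,
 *					"mydev", info);
 */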
329
330
331/**
332 * Bind to an existing interdomain event channel in another domain. Returns 0
333 * on success and stores the local port in *port. On error, returns -errno,
334 * switches the device to XenbusStateClosing, and saves the error in XenStore.
335 */
336int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
337{
338 struct evtchn_bind_interdomain bind_interdomain;
339 int err;
340
341 bind_interdomain.remote_dom = dev->otherend_id;
342 bind_interdomain.remote_port = remote_port;
343
344 err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
345 &bind_interdomain);
346 if (err)
347 xenbus_dev_fatal(dev, err,
348 "binding to event channel %d from domain %d",
349 remote_port, dev->otherend_id);
350 else
351 *port = bind_interdomain.local_port;
352
353 return err;
354}
355EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
356
357
358/**
359 * Free an existing event channel. Returns 0 on success or -errno on error.
360 */
361int xenbus_free_evtchn(struct xenbus_device *dev, int port)
362{
363 struct evtchn_close close;
364 int err;
365
366 close.port = port;
367
368 err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
369 if (err)
370 xenbus_dev_error(dev, err, "freeing event channel %d", port);
371
372 return err;
373}
374EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
375
376
377/**
378 * xenbus_map_ring_valloc
379 * @dev: xenbus device
380 * @gnt_ref: grant reference
381 * @vaddr: pointer to address to be filled out by mapping
382 *
383 * Based on Rusty Russell's skeleton driver's map_page.
384 * Map a page of memory into this domain from another domain's grant table.
385 * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
386 * page to that address, and sets *vaddr to that address.
387 * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
388 * or -ENOMEM on error. If an error is returned, device will switch to
389 * XenbusStateClosing and the error message will be saved in XenStore.
390 */
391int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
392{
393 struct gnttab_map_grant_ref op = {
394 .flags = GNTMAP_host_map,
395 .ref = gnt_ref,
396 .dom = dev->otherend_id,
397 };
398 struct vm_struct *area;
399
400 *vaddr = NULL;
401
402 area = alloc_vm_area(PAGE_SIZE);
403 if (!area)
404 return -ENOMEM;
405
406 op.host_addr = (unsigned long)area->addr;
407
408 if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
409 BUG();
410
411 if (op.status != GNTST_okay) {
412 free_vm_area(area);
413 xenbus_dev_fatal(dev, op.status,
414 "mapping in shared page %d from domain %d",
415 gnt_ref, dev->otherend_id);
416 return op.status;
417 }
418
419 /* Stuff the handle in an unused field */
420 area->phys_addr = (unsigned long)op.handle;
421
422 *vaddr = area->addr;
423 return 0;
424}
425EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
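/*
 * Illustrative use only (backend side, names assumed): map the frontend's
 * ring into a fresh vmalloc area and initialise the shared ring over it:
 *
 *	err = xenbus_map_ring_valloc(dev, ring_ref, &addr);
 *	if (err)
 *		goto fail;
 *	BACK_RING_INIT(&info->ring, (struct myif_sring *)addr, PAGE_SIZE);
 */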
426
427
428/**
429 * xenbus_map_ring
430 * @dev: xenbus device
431 * @gnt_ref: grant reference
432 * @handle: pointer to grant handle to be filled
433 * @vaddr: address to be mapped to
434 *
435 * Map a page of memory into this domain from another domain's grant table.
436 * xenbus_map_ring does not allocate the virtual address space (you must do
437 * this yourself!). It only maps in the page to the specified address.
438 * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
439 * or -ENOMEM on error. If an error is returned, device will switch to
440 * XenbusStateClosing and the error message will be saved in XenStore.
441 */
442int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
443 grant_handle_t *handle, void *vaddr)
444{
445 struct gnttab_map_grant_ref op = {
446 .host_addr = (unsigned long)vaddr,
447 .flags = GNTMAP_host_map,
448 .ref = gnt_ref,
449 .dom = dev->otherend_id,
450 };
451
452 if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
453 BUG();
454
455 if (op.status != GNTST_okay) {
456 xenbus_dev_fatal(dev, op.status,
457 "mapping in shared page %d from domain %d",
458 gnt_ref, dev->otherend_id);
459 } else
460 *handle = op.handle;
461
462 return op.status;
463}
464EXPORT_SYMBOL_GPL(xenbus_map_ring);
465
466
467/**
468 * xenbus_unmap_ring_vfree
469 * @dev: xenbus device
470 * @vaddr: addr to unmap
471 *
472 * Based on Rusty Russell's skeleton driver's unmap_page.
473 * Unmap a page of memory in this domain that was imported from another domain.
474 * Use xenbus_unmap_ring_vfree if you mapped in your memory with
475 * xenbus_map_ring_valloc (it will free the virtual address space).
476 * Returns 0 on success and returns GNTST_* on error
477 * (see xen/include/interface/grant_table.h).
478 */
479int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
480{
481 struct vm_struct *area;
482 struct gnttab_unmap_grant_ref op = {
483 .host_addr = (unsigned long)vaddr,
484 };
485
486 /* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
487 * method so that we don't have to muck with vmalloc internals here.
488 * We could force the user to hang on to their struct vm_struct from
489 * xenbus_map_ring_valloc, but these 6 lines considerably simplify
490 * this API.
491 */
492 read_lock(&vmlist_lock);
493 for (area = vmlist; area != NULL; area = area->next) {
494 if (area->addr == vaddr)
495 break;
496 }
497 read_unlock(&vmlist_lock);
498
499 if (!area) {
500 xenbus_dev_error(dev, -ENOENT,
501 "can't find mapped virtual address %p", vaddr);
502 return GNTST_bad_virt_addr;
503 }
504
505 op.handle = (grant_handle_t)area->phys_addr;
506
507 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
508 BUG();
509
510 if (op.status == GNTST_okay)
511 free_vm_area(area);
512 else
513 xenbus_dev_error(dev, op.status,
514 "unmapping page at handle %d error %d",
515 (int16_t)area->phys_addr, op.status);
516
517 return op.status;
518}
519EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
520
521
522/**
523 * xenbus_unmap_ring
524 * @dev: xenbus device
525 * @handle: grant handle
526 * @vaddr: addr to unmap
527 *
528 * Unmap a page of memory in this domain that was imported from another domain.
529 * Returns 0 on success and returns GNTST_* on error
530 * (see xen/include/interface/grant_table.h).
531 */
532int xenbus_unmap_ring(struct xenbus_device *dev,
533 grant_handle_t handle, void *vaddr)
534{
535 struct gnttab_unmap_grant_ref op = {
536 .host_addr = (unsigned long)vaddr,
537 .handle = handle,
538 };
539
540 if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
541 BUG();
542
543 if (op.status != GNTST_okay)
544 xenbus_dev_error(dev, op.status,
545 "unmapping page at handle %d error %d",
546 handle, op.status);
547
548 return op.status;
549}
550EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
551
552
553/**
554 * xenbus_read_driver_state
555 * @path: path for driver
556 *
557 * Return the state of the driver rooted at the given store path, or
558 * XenbusStateUnknown if no state can be read.
559 */
560enum xenbus_state xenbus_read_driver_state(const char *path)
561{
562 enum xenbus_state result;
563 int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
564 if (err)
565 result = XenbusStateUnknown;
566
567 return result;
568}
569EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
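/*
 * Illustrative use only (names assumed): a frontend can check how far its
 * backend has progressed, e.g.
 *
 *	if (xenbus_read_driver_state(dev->otherend) == XenbusStateClosed)
 *		cleanup_can_proceed(info);
 */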
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
new file mode 100644
index 000000000000..6efbe3f29ca5
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -0,0 +1,233 @@
1/******************************************************************************
2 * xenbus_comms.c
3 *
4 * Low level code to talk to Xen Store: ringbuffer and event channel.
5 *
6 * Copyright (C) 2005 Rusty Russell, IBM Corporation
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#include <linux/wait.h>
34#include <linux/interrupt.h>
35#include <linux/sched.h>
36#include <linux/err.h>
37#include <xen/xenbus.h>
38#include <asm/xen/hypervisor.h>
39#include <xen/events.h>
40#include <xen/page.h>
41#include "xenbus_comms.h"
42
43static int xenbus_irq;
44
45static DECLARE_WORK(probe_work, xenbus_probe);
46
47static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
48
49static irqreturn_t wake_waiting(int irq, void *unused)
50{
51 if (unlikely(xenstored_ready == 0)) {
52 xenstored_ready = 1;
53 schedule_work(&probe_work);
54 }
55
56 wake_up(&xb_waitq);
57 return IRQ_HANDLED;
58}
59
60static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
61{
62 return ((prod - cons) <= XENSTORE_RING_SIZE);
63}
64
65static void *get_output_chunk(XENSTORE_RING_IDX cons,
66 XENSTORE_RING_IDX prod,
67 char *buf, uint32_t *len)
68{
69 *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
70 if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
71 *len = XENSTORE_RING_SIZE - (prod - cons);
72 return buf + MASK_XENSTORE_IDX(prod);
73}
74
75static const void *get_input_chunk(XENSTORE_RING_IDX cons,
76 XENSTORE_RING_IDX prod,
77 const char *buf, uint32_t *len)
78{
79 *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
80 if ((prod - cons) < *len)
81 *len = prod - cons;
82 return buf + MASK_XENSTORE_IDX(cons);
83}
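/*
 * The xenstore ring uses free-running producer/consumer indices; only the
 * low bits (MASK_XENSTORE_IDX) index into the buffer, while prod - cons is
 * the number of bytes in flight, which check_indexes() above sanity-checks.
 */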
84
85/**
86 * xb_write - low level write
87 * @data: buffer to send
88 * @len: length of buffer
89 *
90 * Returns 0 on success, error otherwise.
91 */
92int xb_write(const void *data, unsigned len)
93{
94 struct xenstore_domain_interface *intf = xen_store_interface;
95 XENSTORE_RING_IDX cons, prod;
96 int rc;
97
98 while (len != 0) {
99 void *dst;
100 unsigned int avail;
101
102 rc = wait_event_interruptible(
103 xb_waitq,
104 (intf->req_prod - intf->req_cons) !=
105 XENSTORE_RING_SIZE);
106 if (rc < 0)
107 return rc;
108
109 /* Read indexes, then verify. */
110 cons = intf->req_cons;
111 prod = intf->req_prod;
112 if (!check_indexes(cons, prod)) {
113 intf->req_cons = intf->req_prod = 0;
114 return -EIO;
115 }
116
117 dst = get_output_chunk(cons, prod, intf->req, &avail);
118 if (avail == 0)
119 continue;
120 if (avail > len)
121 avail = len;
122
123 /* Must write data /after/ reading the consumer index. */
124 mb();
125
126 memcpy(dst, data, avail);
127 data += avail;
128 len -= avail;
129
130 /* Other side must not see new producer until data is there. */
131 wmb();
132 intf->req_prod += avail;
133
134 /* Implies mb(): other side will see the updated producer. */
135 notify_remote_via_evtchn(xen_store_evtchn);
136 }
137
138 return 0;
139}
140
141int xb_data_to_read(void)
142{
143 struct xenstore_domain_interface *intf = xen_store_interface;
144 return (intf->rsp_cons != intf->rsp_prod);
145}
146
147int xb_wait_for_data_to_read(void)
148{
149 return wait_event_interruptible(xb_waitq, xb_data_to_read());
150}
151
152int xb_read(void *data, unsigned len)
153{
154 struct xenstore_domain_interface *intf = xen_store_interface;
155 XENSTORE_RING_IDX cons, prod;
156 int rc;
157
158 while (len != 0) {
159 unsigned int avail;
160 const char *src;
161
162 rc = xb_wait_for_data_to_read();
163 if (rc < 0)
164 return rc;
165
166 /* Read indexes, then verify. */
167 cons = intf->rsp_cons;
168 prod = intf->rsp_prod;
169 if (!check_indexes(cons, prod)) {
170 intf->rsp_cons = intf->rsp_prod = 0;
171 return -EIO;
172 }
173
174 src = get_input_chunk(cons, prod, intf->rsp, &avail);
175 if (avail == 0)
176 continue;
177 if (avail > len)
178 avail = len;
179
180 /* Must read data /after/ reading the producer index. */
181 rmb();
182
183 memcpy(data, src, avail);
184 data += avail;
185 len -= avail;
186
187 /* Other side must not see free space until we've copied out */
188 mb();
189 intf->rsp_cons += avail;
190
191 pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
192
193 /* Implies mb(): other side will see the updated consumer. */
194 notify_remote_via_evtchn(xen_store_evtchn);
195 }
196
197 return 0;
198}
199
200/**
201 * xb_init_comms - Set up interrupt handler off store event channel.
202 */
203int xb_init_comms(void)
204{
205 struct xenstore_domain_interface *intf = xen_store_interface;
206 int err;
207
208 if (intf->req_prod != intf->req_cons)
209 printk(KERN_ERR "XENBUS request ring is not quiescent "
210 "(%08x:%08x)!\n", intf->req_cons, intf->req_prod);
211
212 if (intf->rsp_prod != intf->rsp_cons) {
213 printk(KERN_WARNING "XENBUS response ring is not quiescent "
214 "(%08x:%08x): fixing up\n",
215 intf->rsp_cons, intf->rsp_prod);
216 intf->rsp_cons = intf->rsp_prod;
217 }
218
219 if (xenbus_irq)
220 unbind_from_irqhandler(xenbus_irq, &xb_waitq);
221
222 err = bind_evtchn_to_irqhandler(
223 xen_store_evtchn, wake_waiting,
224 0, "xenbus", &xb_waitq);
225 if (err <= 0) {
226 printk(KERN_ERR "XENBUS request irq failed %i\n", err);
227 return err;
228 }
229
230 xenbus_irq = err;
231
232 return 0;
233}
diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h
new file mode 100644
index 000000000000..c21db7513736
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_comms.h
@@ -0,0 +1,46 @@
1/*
2 * Private include for xenbus communications.
3 *
4 * Copyright (C) 2005 Rusty Russell, IBM Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License version 2
8 * as published by the Free Software Foundation; or, when distributed
9 * separately from the Linux kernel or incorporated into other
10 * software packages, subject to the following license:
11 *
12 * Permission is hereby granted, free of charge, to any person obtaining a copy
13 * of this source file (the "Software"), to deal in the Software without
14 * restriction, including without limitation the rights to use, copy, modify,
15 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
16 * and to permit persons to whom the Software is furnished to do so, subject to
17 * the following conditions:
18 *
19 * The above copyright notice and this permission notice shall be included in
20 * all copies or substantial portions of the Software.
21 *
22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
28 * IN THE SOFTWARE.
29 */
30
31#ifndef _XENBUS_COMMS_H
32#define _XENBUS_COMMS_H
33
34int xs_init(void);
35int xb_init_comms(void);
36
37/* Low level routines. */
38int xb_write(const void *data, unsigned len);
39int xb_read(void *data, unsigned len);
40int xb_data_to_read(void);
41int xb_wait_for_data_to_read(void);
42int xs_input_avail(void);
43extern struct xenstore_domain_interface *xen_store_interface;
44extern int xen_store_evtchn;
45
46#endif /* _XENBUS_COMMS_H */
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
new file mode 100644
index 000000000000..0b769f7c4a48
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -0,0 +1,935 @@
1/******************************************************************************
2 * Talks to Xen Store to figure out what devices we have.
3 *
4 * Copyright (C) 2005 Rusty Russell, IBM Corporation
5 * Copyright (C) 2005 Mike Wray, Hewlett-Packard
6 * Copyright (C) 2005, 2006 XenSource Ltd
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#define DPRINTK(fmt, args...) \
34 pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
35 __func__, __LINE__, ##args)
36
37#include <linux/kernel.h>
38#include <linux/err.h>
39#include <linux/string.h>
40#include <linux/ctype.h>
41#include <linux/fcntl.h>
42#include <linux/mm.h>
43#include <linux/notifier.h>
44#include <linux/kthread.h>
45#include <linux/mutex.h>
46#include <linux/io.h>
47
48#include <asm/page.h>
49#include <asm/pgtable.h>
50#include <asm/xen/hypervisor.h>
51#include <xen/xenbus.h>
52#include <xen/events.h>
53#include <xen/page.h>
54
55#include "xenbus_comms.h"
56#include "xenbus_probe.h"
57
58int xen_store_evtchn;
59struct xenstore_domain_interface *xen_store_interface;
60static unsigned long xen_store_mfn;
61
62static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
63
64static void wait_for_devices(struct xenbus_driver *xendrv);
65
66static int xenbus_probe_frontend(const char *type, const char *name);
67
68static void xenbus_dev_shutdown(struct device *_dev);
69
70/* If something in array of ids matches this device, return it. */
71static const struct xenbus_device_id *
72match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
73{
74 for (; *arr->devicetype != '\0'; arr++) {
75 if (!strcmp(arr->devicetype, dev->devicetype))
76 return arr;
77 }
78 return NULL;
79}
80
81int xenbus_match(struct device *_dev, struct device_driver *_drv)
82{
83 struct xenbus_driver *drv = to_xenbus_driver(_drv);
84
85 if (!drv->ids)
86 return 0;
87
88 return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
89}
90
91/* device/<type>/<id> => <type>-<id> */
92static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
93{
94 nodename = strchr(nodename, '/');
95 if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
96 printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
97 return -EINVAL;
98 }
99
100 strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
101 if (!strchr(bus_id, '/')) {
102 printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
103 return -EINVAL;
104 }
105 *strchr(bus_id, '/') = '-';
106 return 0;
107}
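/*
 * Illustrative example (not part of the original patch): for the xenstore
 * node "device/vif/0", frontend_bus_id() skips the leading "device/" and
 * rewrites the remaining '/' to '-', yielding the bus id "vif-0"; a node
 * without a second '/' is rejected with -EINVAL.
 */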
108
109
110static void free_otherend_details(struct xenbus_device *dev)
111{
112 kfree(dev->otherend);
113 dev->otherend = NULL;
114}
115
116
117static void free_otherend_watch(struct xenbus_device *dev)
118{
119 if (dev->otherend_watch.node) {
120 unregister_xenbus_watch(&dev->otherend_watch);
121 kfree(dev->otherend_watch.node);
122 dev->otherend_watch.node = NULL;
123 }
124}
125
126
127int read_otherend_details(struct xenbus_device *xendev,
128 char *id_node, char *path_node)
129{
130 int err = xenbus_gather(XBT_NIL, xendev->nodename,
131 id_node, "%i", &xendev->otherend_id,
132 path_node, NULL, &xendev->otherend,
133 NULL);
134 if (err) {
135 xenbus_dev_fatal(xendev, err,
136 "reading other end details from %s",
137 xendev->nodename);
138 return err;
139 }
140 if (strlen(xendev->otherend) == 0 ||
141 !xenbus_exists(XBT_NIL, xendev->otherend, "")) {
142 xenbus_dev_fatal(xendev, -ENOENT,
143 "unable to read other end from %s. "
144 "missing or inaccessible.",
145 xendev->nodename);
146 free_otherend_details(xendev);
147 return -ENOENT;
148 }
149
150 return 0;
151}
152
153
154static int read_backend_details(struct xenbus_device *xendev)
155{
156 return read_otherend_details(xendev, "backend-id", "backend");
157}
158
159
160/* Bus type for frontend drivers. */
161static struct xen_bus_type xenbus_frontend = {
162 .root = "device",
163 .levels = 2, /* device/type/<id> */
164 .get_bus_id = frontend_bus_id,
165 .probe = xenbus_probe_frontend,
166 .bus = {
167 .name = "xen",
168 .match = xenbus_match,
169 .probe = xenbus_dev_probe,
170 .remove = xenbus_dev_remove,
171 .shutdown = xenbus_dev_shutdown,
172 },
173};
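/*
 * Descriptive note (added for clarity): .root and .levels describe the
 * xenstore layout "device/<type>/<id>" that the probe code below walks,
 * and registering the embedded bus_type (done in xenbus_probe_init())
 * makes frontend devices visible under /sys/bus/xen when sysfs is enabled.
 */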
174
175static void otherend_changed(struct xenbus_watch *watch,
176 const char **vec, unsigned int len)
177{
178 struct xenbus_device *dev =
179 container_of(watch, struct xenbus_device, otherend_watch);
180 struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
181 enum xenbus_state state;
182
183 /* Protect us against watches firing on old details when the otherend
184 details change, say immediately after a resume. */
185 if (!dev->otherend ||
186 strncmp(dev->otherend, vec[XS_WATCH_PATH],
187 strlen(dev->otherend))) {
188 dev_dbg(&dev->dev, "Ignoring watch at %s", vec[XS_WATCH_PATH]);
189 return;
190 }
191
192 state = xenbus_read_driver_state(dev->otherend);
193
194 dev_dbg(&dev->dev, "state is %d, (%s), %s, %s",
195 state, xenbus_strstate(state), dev->otherend_watch.node,
196 vec[XS_WATCH_PATH]);
197
198 /*
199 * Ignore xenbus transitions during shutdown. This prevents us doing
200 * work that can fail e.g., when the rootfs is gone.
201 */
202 if (system_state > SYSTEM_RUNNING) {
 203		struct xen_bus_type *bus =
 204			container_of(dev->dev.bus, struct xen_bus_type, bus);
205 /* If we're frontend, drive the state machine to Closed. */
206 /* This should cause the backend to release our resources. */
207 if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
208 xenbus_frontend_closed(dev);
209 return;
210 }
211
212 if (drv->otherend_changed)
213 drv->otherend_changed(dev, state);
214}
215
216
217static int talk_to_otherend(struct xenbus_device *dev)
218{
219 struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
220
221 free_otherend_watch(dev);
222 free_otherend_details(dev);
223
224 return drv->read_otherend_details(dev);
225}
226
227
228static int watch_otherend(struct xenbus_device *dev)
229{
230 return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
231 "%s/%s", dev->otherend, "state");
232}
233
234
235int xenbus_dev_probe(struct device *_dev)
236{
237 struct xenbus_device *dev = to_xenbus_device(_dev);
238 struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
239 const struct xenbus_device_id *id;
240 int err;
241
242 DPRINTK("%s", dev->nodename);
243
244 if (!drv->probe) {
245 err = -ENODEV;
246 goto fail;
247 }
248
249 id = match_device(drv->ids, dev);
250 if (!id) {
251 err = -ENODEV;
252 goto fail;
253 }
254
255 err = talk_to_otherend(dev);
256 if (err) {
257 dev_warn(&dev->dev, "talk_to_otherend on %s failed.\n",
258 dev->nodename);
259 return err;
260 }
261
262 err = drv->probe(dev, id);
263 if (err)
264 goto fail;
265
266 err = watch_otherend(dev);
267 if (err) {
268 dev_warn(&dev->dev, "watch_otherend on %s failed.\n",
269 dev->nodename);
270 return err;
271 }
272
273 return 0;
274fail:
275 xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
276 xenbus_switch_state(dev, XenbusStateClosed);
277 return -ENODEV;
278}
279
280int xenbus_dev_remove(struct device *_dev)
281{
282 struct xenbus_device *dev = to_xenbus_device(_dev);
283 struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
284
285 DPRINTK("%s", dev->nodename);
286
287 free_otherend_watch(dev);
288 free_otherend_details(dev);
289
290 if (drv->remove)
291 drv->remove(dev);
292
293 xenbus_switch_state(dev, XenbusStateClosed);
294 return 0;
295}
296
297static void xenbus_dev_shutdown(struct device *_dev)
298{
299 struct xenbus_device *dev = to_xenbus_device(_dev);
300 unsigned long timeout = 5*HZ;
301
302 DPRINTK("%s", dev->nodename);
303
304 get_device(&dev->dev);
305 if (dev->state != XenbusStateConnected) {
306 printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__,
307 dev->nodename, xenbus_strstate(dev->state));
308 goto out;
309 }
310 xenbus_switch_state(dev, XenbusStateClosing);
311 timeout = wait_for_completion_timeout(&dev->down, timeout);
312 if (!timeout)
313 printk(KERN_INFO "%s: %s timeout closing device\n",
314 __func__, dev->nodename);
315 out:
316 put_device(&dev->dev);
317}
318
319int xenbus_register_driver_common(struct xenbus_driver *drv,
320 struct xen_bus_type *bus,
321 struct module *owner,
322 const char *mod_name)
323{
324 drv->driver.name = drv->name;
325 drv->driver.bus = &bus->bus;
326 drv->driver.owner = owner;
327 drv->driver.mod_name = mod_name;
328
329 return driver_register(&drv->driver);
330}
331
332int __xenbus_register_frontend(struct xenbus_driver *drv,
333 struct module *owner, const char *mod_name)
334{
335 int ret;
336
337 drv->read_otherend_details = read_backend_details;
338
339 ret = xenbus_register_driver_common(drv, &xenbus_frontend,
340 owner, mod_name);
341 if (ret)
342 return ret;
343
344 /* If this driver is loaded as a module wait for devices to attach. */
345 wait_for_devices(drv);
346
347 return 0;
348}
349EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
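/*
 * Usage sketch (illustrative, not part of this patch): a frontend driver
 * typically supplies a device-type table and a xenbus_driver, then
 * registers it, e.g.:
 *
 *	static struct xenbus_device_id foofront_ids[] = {
 *		{ "vfoo" },
 *		{ "" }
 *	};
 *
 *	static struct xenbus_driver foofront = {
 *		.name             = "vfoo",
 *		.ids              = foofront_ids,
 *		.probe            = foofront_probe,
 *		.otherend_changed = foofront_backend_changed,
 *	};
 *
 *	err = xenbus_register_frontend(&foofront);
 *
 * "vfoo", foofront_probe() and foofront_backend_changed() are hypothetical
 * names; xenbus_register_frontend() is assumed to be the <xen/xenbus.h>
 * wrapper that passes THIS_MODULE and KBUILD_MODNAME to this function.
 */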
350
351void xenbus_unregister_driver(struct xenbus_driver *drv)
352{
353 driver_unregister(&drv->driver);
354}
355EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
356
357struct xb_find_info
358{
359 struct xenbus_device *dev;
360 const char *nodename;
361};
362
363static int cmp_dev(struct device *dev, void *data)
364{
365 struct xenbus_device *xendev = to_xenbus_device(dev);
366 struct xb_find_info *info = data;
367
368 if (!strcmp(xendev->nodename, info->nodename)) {
369 info->dev = xendev;
370 get_device(dev);
371 return 1;
372 }
373 return 0;
374}
375
376struct xenbus_device *xenbus_device_find(const char *nodename,
377 struct bus_type *bus)
378{
379 struct xb_find_info info = { .dev = NULL, .nodename = nodename };
380
381 bus_for_each_dev(bus, NULL, &info, cmp_dev);
382 return info.dev;
383}
384
385static int cleanup_dev(struct device *dev, void *data)
386{
387 struct xenbus_device *xendev = to_xenbus_device(dev);
388 struct xb_find_info *info = data;
389 int len = strlen(info->nodename);
390
391 DPRINTK("%s", info->nodename);
392
393 /* Match the info->nodename path, or any subdirectory of that path. */
394 if (strncmp(xendev->nodename, info->nodename, len))
395 return 0;
396
397 /* If the node name is longer, ensure it really is a subdirectory. */
398 if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/'))
399 return 0;
400
401 info->dev = xendev;
402 get_device(dev);
403 return 1;
404}
405
406static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
407{
408 struct xb_find_info info = { .nodename = path };
409
410 do {
411 info.dev = NULL;
412 bus_for_each_dev(bus, NULL, &info, cleanup_dev);
413 if (info.dev) {
414 device_unregister(&info.dev->dev);
415 put_device(&info.dev->dev);
416 }
417 } while (info.dev);
418}
419
420static void xenbus_dev_release(struct device *dev)
421{
422 if (dev)
423 kfree(to_xenbus_device(dev));
424}
425
426static ssize_t xendev_show_nodename(struct device *dev,
427 struct device_attribute *attr, char *buf)
428{
429 return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
430}
431DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
432
433static ssize_t xendev_show_devtype(struct device *dev,
434 struct device_attribute *attr, char *buf)
435{
436 return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
437}
438DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
439
440
441int xenbus_probe_node(struct xen_bus_type *bus,
442 const char *type,
443 const char *nodename)
444{
445 int err;
446 struct xenbus_device *xendev;
447 size_t stringlen;
448 char *tmpstring;
449
450 enum xenbus_state state = xenbus_read_driver_state(nodename);
451
452 if (state != XenbusStateInitialising) {
453 /* Device is not new, so ignore it. This can happen if a
454 device is going away after switching to Closed. */
455 return 0;
456 }
457
458 stringlen = strlen(nodename) + 1 + strlen(type) + 1;
459 xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
460 if (!xendev)
461 return -ENOMEM;
462
463 xendev->state = XenbusStateInitialising;
464
465 /* Copy the strings into the extra space. */
466
467 tmpstring = (char *)(xendev + 1);
468 strcpy(tmpstring, nodename);
469 xendev->nodename = tmpstring;
470
471 tmpstring += strlen(tmpstring) + 1;
472 strcpy(tmpstring, type);
473 xendev->devicetype = tmpstring;
474 init_completion(&xendev->down);
475
476 xendev->dev.bus = &bus->bus;
477 xendev->dev.release = xenbus_dev_release;
478
479 err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
480 if (err)
481 goto fail;
482
483 /* Register with generic device framework. */
484 err = device_register(&xendev->dev);
485 if (err)
486 goto fail;
487
488 err = device_create_file(&xendev->dev, &dev_attr_nodename);
489 if (err)
490 goto fail_unregister;
491
492 err = device_create_file(&xendev->dev, &dev_attr_devtype);
493 if (err)
494 goto fail_remove_file;
495
496 return 0;
497fail_remove_file:
498 device_remove_file(&xendev->dev, &dev_attr_nodename);
499fail_unregister:
500 device_unregister(&xendev->dev);
501fail:
502 kfree(xendev);
503 return err;
504}
505
506/* device/<typename>/<name> */
507static int xenbus_probe_frontend(const char *type, const char *name)
508{
509 char *nodename;
510 int err;
511
512 nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
513 xenbus_frontend.root, type, name);
514 if (!nodename)
515 return -ENOMEM;
516
517 DPRINTK("%s", nodename);
518
519 err = xenbus_probe_node(&xenbus_frontend, type, nodename);
520 kfree(nodename);
521 return err;
522}
523
524static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
525{
526 int err = 0;
527 char **dir;
528 unsigned int dir_n = 0;
529 int i;
530
531 dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
532 if (IS_ERR(dir))
533 return PTR_ERR(dir);
534
535 for (i = 0; i < dir_n; i++) {
536 err = bus->probe(type, dir[i]);
537 if (err)
538 break;
539 }
540 kfree(dir);
541 return err;
542}
543
544int xenbus_probe_devices(struct xen_bus_type *bus)
545{
546 int err = 0;
547 char **dir;
548 unsigned int i, dir_n;
549
550 dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
551 if (IS_ERR(dir))
552 return PTR_ERR(dir);
553
554 for (i = 0; i < dir_n; i++) {
555 err = xenbus_probe_device_type(bus, dir[i]);
556 if (err)
557 break;
558 }
559 kfree(dir);
560 return err;
561}
562
563static unsigned int char_count(const char *str, char c)
564{
565 unsigned int i, ret = 0;
566
567 for (i = 0; str[i]; i++)
568 if (str[i] == c)
569 ret++;
570 return ret;
571}
572
573static int strsep_len(const char *str, char c, unsigned int len)
574{
575 unsigned int i;
576
577 for (i = 0; str[i]; i++)
578 if (str[i] == c) {
579 if (len == 0)
580 return i;
581 len--;
582 }
583 return (len == 0) ? i : -ERANGE;
584}
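/*
 * Illustrative note (not part of the original patch): for a node such as
 * "device/vif/0/state" with bus->levels == 2, strsep_len(node, '/', 2)
 * returns the offset of the third '/', so the device root computed in
 * xenbus_dev_changed() below is "device/vif/0".
 */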
585
586void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
587{
588 int exists, rootlen;
589 struct xenbus_device *dev;
590 char type[BUS_ID_SIZE];
591 const char *p, *root;
592
593 if (char_count(node, '/') < 2)
594 return;
595
596 exists = xenbus_exists(XBT_NIL, node, "");
597 if (!exists) {
598 xenbus_cleanup_devices(node, &bus->bus);
599 return;
600 }
601
602 /* backend/<type>/... or device/<type>/... */
603 p = strchr(node, '/') + 1;
604 snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
605 type[BUS_ID_SIZE-1] = '\0';
606
607 rootlen = strsep_len(node, '/', bus->levels);
608 if (rootlen < 0)
609 return;
610 root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node);
611 if (!root)
612 return;
613
614 dev = xenbus_device_find(root, &bus->bus);
615 if (!dev)
616 xenbus_probe_node(bus, type, root);
617 else
618 put_device(&dev->dev);
619
620 kfree(root);
621}
622
623static void frontend_changed(struct xenbus_watch *watch,
624 const char **vec, unsigned int len)
625{
626 DPRINTK("");
627
628 xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
629}
630
631/* We watch for devices appearing and vanishing. */
632static struct xenbus_watch fe_watch = {
633 .node = "device",
634 .callback = frontend_changed,
635};
636
637static int suspend_dev(struct device *dev, void *data)
638{
639 int err = 0;
640 struct xenbus_driver *drv;
641 struct xenbus_device *xdev;
642
643 DPRINTK("");
644
645 if (dev->driver == NULL)
646 return 0;
647 drv = to_xenbus_driver(dev->driver);
648 xdev = container_of(dev, struct xenbus_device, dev);
649 if (drv->suspend)
650 err = drv->suspend(xdev);
651 if (err)
652 printk(KERN_WARNING
653 "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
654 return 0;
655}
656
657static int suspend_cancel_dev(struct device *dev, void *data)
658{
659 int err = 0;
660 struct xenbus_driver *drv;
661 struct xenbus_device *xdev;
662
663 DPRINTK("");
664
665 if (dev->driver == NULL)
666 return 0;
667 drv = to_xenbus_driver(dev->driver);
668 xdev = container_of(dev, struct xenbus_device, dev);
669 if (drv->suspend_cancel)
670 err = drv->suspend_cancel(xdev);
671 if (err)
672 printk(KERN_WARNING
673 "xenbus: suspend_cancel %s failed: %i\n",
674 dev->bus_id, err);
675 return 0;
676}
677
678static int resume_dev(struct device *dev, void *data)
679{
680 int err;
681 struct xenbus_driver *drv;
682 struct xenbus_device *xdev;
683
684 DPRINTK("");
685
686 if (dev->driver == NULL)
687 return 0;
688
689 drv = to_xenbus_driver(dev->driver);
690 xdev = container_of(dev, struct xenbus_device, dev);
691
692 err = talk_to_otherend(xdev);
693 if (err) {
694 printk(KERN_WARNING
695 "xenbus: resume (talk_to_otherend) %s failed: %i\n",
696 dev->bus_id, err);
697 return err;
698 }
699
700 xdev->state = XenbusStateInitialising;
701
702 if (drv->resume) {
703 err = drv->resume(xdev);
704 if (err) {
705 printk(KERN_WARNING
706 "xenbus: resume %s failed: %i\n",
707 dev->bus_id, err);
708 return err;
709 }
710 }
711
712 err = watch_otherend(xdev);
713 if (err) {
714 printk(KERN_WARNING
715 "xenbus_probe: resume (watch_otherend) %s failed: "
716 "%d.\n", dev->bus_id, err);
717 return err;
718 }
719
720 return 0;
721}
722
723void xenbus_suspend(void)
724{
725 DPRINTK("");
726
727 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
728 xenbus_backend_suspend(suspend_dev);
729 xs_suspend();
730}
731EXPORT_SYMBOL_GPL(xenbus_suspend);
732
733void xenbus_resume(void)
734{
735 xb_init_comms();
736 xs_resume();
737 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
738 xenbus_backend_resume(resume_dev);
739}
740EXPORT_SYMBOL_GPL(xenbus_resume);
741
742void xenbus_suspend_cancel(void)
743{
744 xs_suspend_cancel();
745 bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
746 xenbus_backend_resume(suspend_cancel_dev);
747}
748EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
749
750/* A flag to determine if xenstored is 'ready' (i.e. has started) */
751int xenstored_ready = 0;
752
753
754int register_xenstore_notifier(struct notifier_block *nb)
755{
756 int ret = 0;
757
758 if (xenstored_ready > 0)
759 ret = nb->notifier_call(nb, 0, NULL);
760 else
761 blocking_notifier_chain_register(&xenstore_chain, nb);
762
763 return ret;
764}
765EXPORT_SYMBOL_GPL(register_xenstore_notifier);
766
767void unregister_xenstore_notifier(struct notifier_block *nb)
768{
769 blocking_notifier_chain_unregister(&xenstore_chain, nb);
770}
771EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
772
773void xenbus_probe(struct work_struct *unused)
774{
775 BUG_ON((xenstored_ready <= 0));
776
777 /* Enumerate devices in xenstore and watch for changes. */
778 xenbus_probe_devices(&xenbus_frontend);
779 register_xenbus_watch(&fe_watch);
780 xenbus_backend_probe_and_watch();
781
782 /* Notify others that xenstore is up */
783 blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
784}
785
786static int __init xenbus_probe_init(void)
787{
788 int err = 0;
789
790 DPRINTK("");
791
792 err = -ENODEV;
793 if (!is_running_on_xen())
794 goto out_error;
795
796 /* Register ourselves with the kernel bus subsystem */
797 err = bus_register(&xenbus_frontend.bus);
798 if (err)
799 goto out_error;
800
801 err = xenbus_backend_bus_register();
802 if (err)
803 goto out_unreg_front;
804
805 /*
806 * Domain0 doesn't have a store_evtchn or store_mfn yet.
807 */
808 if (is_initial_xendomain()) {
809 /* dom0 not yet supported */
810 } else {
811 xenstored_ready = 1;
812 xen_store_evtchn = xen_start_info->store_evtchn;
813 xen_store_mfn = xen_start_info->store_mfn;
814 }
815 xen_store_interface = mfn_to_virt(xen_store_mfn);
816
817 /* Initialize the interface to xenstore. */
818 err = xs_init();
819 if (err) {
820 printk(KERN_WARNING
821 "XENBUS: Error initializing xenstore comms: %i\n", err);
822 goto out_unreg_back;
823 }
824
825 if (!is_initial_xendomain())
826 xenbus_probe(NULL);
827
828 return 0;
829
830 out_unreg_back:
831 xenbus_backend_bus_unregister();
832
833 out_unreg_front:
834 bus_unregister(&xenbus_frontend.bus);
835
836 out_error:
837 return err;
838}
839
840postcore_initcall(xenbus_probe_init);
841
842MODULE_LICENSE("GPL");
843
844static int is_disconnected_device(struct device *dev, void *data)
845{
846 struct xenbus_device *xendev = to_xenbus_device(dev);
847 struct device_driver *drv = data;
848
849 /*
850 * A device with no driver will never connect. We care only about
851 * devices which should currently be in the process of connecting.
852 */
853 if (!dev->driver)
854 return 0;
855
856 /* Is this search limited to a particular driver? */
857 if (drv && (dev->driver != drv))
858 return 0;
859
860 return (xendev->state != XenbusStateConnected);
861}
862
863static int exists_disconnected_device(struct device_driver *drv)
864{
865 return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
866 is_disconnected_device);
867}
868
869static int print_device_status(struct device *dev, void *data)
870{
871 struct xenbus_device *xendev = to_xenbus_device(dev);
872 struct device_driver *drv = data;
873
874 /* Is this operation limited to a particular driver? */
875 if (drv && (dev->driver != drv))
876 return 0;
877
878 if (!dev->driver) {
879 /* Information only: is this too noisy? */
880 printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
881 xendev->nodename);
882 } else if (xendev->state != XenbusStateConnected) {
883 printk(KERN_WARNING "XENBUS: Timeout connecting "
884 "to device: %s (state %d)\n",
885 xendev->nodename, xendev->state);
886 }
887
888 return 0;
889}
890
891/* We only wait for device setup after most initcalls have run. */
892static int ready_to_wait_for_devices;
893
894/*
 895 * Wait, with a 10 second timeout, for all devices currently configured. We
 896 * need to do this to guarantee that the filesystems and/or network devices
 897 * needed for boot are available before we can allow the boot to proceed.
898 *
899 * This needs to be on a late_initcall, to happen after the frontend device
900 * drivers have been initialised, but before the root fs is mounted.
901 *
902 * A possible improvement here would be to have the tools add a per-device
903 * flag to the store entry, indicating whether it is needed at boot time.
904 * This would allow people who knew what they were doing to accelerate their
905 * boot slightly, but of course needs tools or manual intervention to set up
906 * those flags correctly.
907 */
908static void wait_for_devices(struct xenbus_driver *xendrv)
909{
910 unsigned long timeout = jiffies + 10*HZ;
911 struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
912
913 if (!ready_to_wait_for_devices || !is_running_on_xen())
914 return;
915
916 while (exists_disconnected_device(drv)) {
917 if (time_after(jiffies, timeout))
918 break;
919 schedule_timeout_interruptible(HZ/10);
920 }
921
922 bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
923 print_device_status);
924}
925
926#ifndef MODULE
927static int __init boot_wait_for_devices(void)
928{
929 ready_to_wait_for_devices = 1;
930 wait_for_devices(NULL);
931 return 0;
932}
933
934late_initcall(boot_wait_for_devices);
935#endif
diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h
new file mode 100644
index 000000000000..e09b19415a40
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@ -0,0 +1,74 @@
1/******************************************************************************
2 * xenbus_probe.h
3 *
4 * Talks to Xen Store to figure out what devices we have.
5 *
6 * Copyright (C) 2005 Rusty Russell, IBM Corporation
7 * Copyright (C) 2005 XenSource Ltd.
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#ifndef _XENBUS_PROBE_H
35#define _XENBUS_PROBE_H
36
37#ifdef CONFIG_XEN_BACKEND
38extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
39extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
40extern void xenbus_backend_probe_and_watch(void);
41extern int xenbus_backend_bus_register(void);
42extern void xenbus_backend_bus_unregister(void);
43#else
44static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
45static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
46static inline void xenbus_backend_probe_and_watch(void) {}
47static inline int xenbus_backend_bus_register(void) { return 0; }
48static inline void xenbus_backend_bus_unregister(void) {}
49#endif
50
51struct xen_bus_type
52{
53 char *root;
54 unsigned int levels;
55 int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
56 int (*probe)(const char *type, const char *dir);
57 struct bus_type bus;
58};
59
60extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
61extern int xenbus_dev_probe(struct device *_dev);
62extern int xenbus_dev_remove(struct device *_dev);
63extern int xenbus_register_driver_common(struct xenbus_driver *drv,
64 struct xen_bus_type *bus,
65 struct module *owner,
66 const char *mod_name);
67extern int xenbus_probe_node(struct xen_bus_type *bus,
68 const char *type,
69 const char *nodename);
70extern int xenbus_probe_devices(struct xen_bus_type *bus);
71
72extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
73
74#endif
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
new file mode 100644
index 000000000000..9e943fbce81b
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -0,0 +1,861 @@
1/******************************************************************************
2 * xenbus_xs.c
3 *
4 * This is the kernel equivalent of the "xs" library. We don't need everything
5 * and we use xenbus_comms for communication.
6 *
7 * Copyright (C) 2005 Rusty Russell, IBM Corporation
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#include <linux/unistd.h>
35#include <linux/errno.h>
36#include <linux/types.h>
37#include <linux/uio.h>
38#include <linux/kernel.h>
39#include <linux/string.h>
40#include <linux/err.h>
41#include <linux/slab.h>
42#include <linux/fcntl.h>
43#include <linux/kthread.h>
44#include <linux/rwsem.h>
45#include <linux/module.h>
46#include <linux/mutex.h>
47#include <xen/xenbus.h>
48#include "xenbus_comms.h"
49
50struct xs_stored_msg {
51 struct list_head list;
52
53 struct xsd_sockmsg hdr;
54
55 union {
56 /* Queued replies. */
57 struct {
58 char *body;
59 } reply;
60
61 /* Queued watch events. */
62 struct {
63 struct xenbus_watch *handle;
64 char **vec;
65 unsigned int vec_size;
66 } watch;
67 } u;
68};
69
70struct xs_handle {
71 /* A list of replies. Currently only one will ever be outstanding. */
72 struct list_head reply_list;
73 spinlock_t reply_lock;
74 wait_queue_head_t reply_waitq;
75
76 /*
77 * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
78 * response_mutex is never taken simultaneously with the other three.
79 */
80
81 /* One request at a time. */
82 struct mutex request_mutex;
83
84 /* Protect xenbus reader thread against save/restore. */
85 struct mutex response_mutex;
86
87 /* Protect transactions against save/restore. */
88 struct rw_semaphore transaction_mutex;
89
90 /* Protect watch (de)register against save/restore. */
91 struct rw_semaphore watch_mutex;
92};
93
94static struct xs_handle xs_state;
95
96/* List of registered watches, and a lock to protect it. */
97static LIST_HEAD(watches);
98static DEFINE_SPINLOCK(watches_lock);
99
100/* List of pending watch callback events, and a lock to protect it. */
101static LIST_HEAD(watch_events);
102static DEFINE_SPINLOCK(watch_events_lock);
103
104/*
105 * Details of the xenwatch callback kernel thread. The thread waits on the
106 * watch_events_waitq for work to do (queued on watch_events list). When it
107 * wakes up it acquires the xenwatch_mutex before reading the list and
108 * carrying out work.
109 */
110static pid_t xenwatch_pid;
111static DEFINE_MUTEX(xenwatch_mutex);
112static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
113
114static int get_error(const char *errorstring)
115{
116 unsigned int i;
117
118 for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
119 if (i == ARRAY_SIZE(xsd_errors) - 1) {
120 printk(KERN_WARNING
121 "XENBUS xen store gave: unknown error %s",
122 errorstring);
123 return EINVAL;
124 }
125 }
126 return xsd_errors[i].errnum;
127}
128
129static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
130{
131 struct xs_stored_msg *msg;
132 char *body;
133
134 spin_lock(&xs_state.reply_lock);
135
136 while (list_empty(&xs_state.reply_list)) {
137 spin_unlock(&xs_state.reply_lock);
138 /* XXX FIXME: Avoid synchronous wait for response here. */
139 wait_event(xs_state.reply_waitq,
140 !list_empty(&xs_state.reply_list));
141 spin_lock(&xs_state.reply_lock);
142 }
143
144 msg = list_entry(xs_state.reply_list.next,
145 struct xs_stored_msg, list);
146 list_del(&msg->list);
147
148 spin_unlock(&xs_state.reply_lock);
149
150 *type = msg->hdr.type;
151 if (len)
152 *len = msg->hdr.len;
153 body = msg->u.reply.body;
154
155 kfree(msg);
156
157 return body;
158}
159
160void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
161{
162 void *ret;
163 struct xsd_sockmsg req_msg = *msg;
164 int err;
165
166 if (req_msg.type == XS_TRANSACTION_START)
167 down_read(&xs_state.transaction_mutex);
168
169 mutex_lock(&xs_state.request_mutex);
170
171 err = xb_write(msg, sizeof(*msg) + msg->len);
172 if (err) {
173 msg->type = XS_ERROR;
174 ret = ERR_PTR(err);
175 } else
176 ret = read_reply(&msg->type, &msg->len);
177
178 mutex_unlock(&xs_state.request_mutex);
179
180 if ((msg->type == XS_TRANSACTION_END) ||
181 ((req_msg.type == XS_TRANSACTION_START) &&
182 (msg->type == XS_ERROR)))
183 up_read(&xs_state.transaction_mutex);
184
185 return ret;
186}
187
188/* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. */
189static void *xs_talkv(struct xenbus_transaction t,
190 enum xsd_sockmsg_type type,
191 const struct kvec *iovec,
192 unsigned int num_vecs,
193 unsigned int *len)
194{
195 struct xsd_sockmsg msg;
196 void *ret = NULL;
197 unsigned int i;
198 int err;
199
200 msg.tx_id = t.id;
201 msg.req_id = 0;
202 msg.type = type;
203 msg.len = 0;
204 for (i = 0; i < num_vecs; i++)
205 msg.len += iovec[i].iov_len;
206
207 mutex_lock(&xs_state.request_mutex);
208
209 err = xb_write(&msg, sizeof(msg));
210 if (err) {
211 mutex_unlock(&xs_state.request_mutex);
212 return ERR_PTR(err);
213 }
214
215 for (i = 0; i < num_vecs; i++) {
216 err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
217 if (err) {
218 mutex_unlock(&xs_state.request_mutex);
219 return ERR_PTR(err);
220 }
221 }
222
223 ret = read_reply(&msg.type, len);
224
225 mutex_unlock(&xs_state.request_mutex);
226
227 if (IS_ERR(ret))
228 return ret;
229
230 if (msg.type == XS_ERROR) {
231 err = get_error(ret);
232 kfree(ret);
233 return ERR_PTR(-err);
234 }
235
236 if (msg.type != type) {
237 if (printk_ratelimit())
238 printk(KERN_WARNING
239 "XENBUS unexpected type [%d], expected [%d]\n",
240 msg.type, type);
241 kfree(ret);
242 return ERR_PTR(-EINVAL);
243 }
244 return ret;
245}
246
247/* Simplified version of xs_talkv: single message. */
248static void *xs_single(struct xenbus_transaction t,
249 enum xsd_sockmsg_type type,
250 const char *string,
251 unsigned int *len)
252{
253 struct kvec iovec;
254
255 iovec.iov_base = (void *)string;
256 iovec.iov_len = strlen(string) + 1;
257 return xs_talkv(t, type, &iovec, 1, len);
258}
259
260/* Many commands only need an ack, don't care what it says. */
261static int xs_error(char *reply)
262{
263 if (IS_ERR(reply))
264 return PTR_ERR(reply);
265 kfree(reply);
266 return 0;
267}
268
269static unsigned int count_strings(const char *strings, unsigned int len)
270{
271 unsigned int num;
272 const char *p;
273
274 for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
275 num++;
276
277 return num;
278}
279
280/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
281static char *join(const char *dir, const char *name)
282{
283 char *buffer;
284
285 if (strlen(name) == 0)
286 buffer = kasprintf(GFP_KERNEL, "%s", dir);
287 else
288 buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
289 return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
290}
291
292static char **split(char *strings, unsigned int len, unsigned int *num)
293{
294 char *p, **ret;
295
296 /* Count the strings. */
297 *num = count_strings(strings, len);
298
299 /* Transfer to one big alloc for easy freeing. */
300 ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
301 if (!ret) {
302 kfree(strings);
303 return ERR_PTR(-ENOMEM);
304 }
305 memcpy(&ret[*num], strings, len);
306 kfree(strings);
307
308 strings = (char *)&ret[*num];
309 for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
310 ret[(*num)++] = p;
311
312 return ret;
313}
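/*
 * Descriptive note (added for clarity): split() hands back one allocation
 * laid out as [ptr 0 .. ptr *num-1][string data]; each pointer indexes into
 * the copied string data, so callers such as xenbus_directory() free the
 * pointers and the strings together with a single kfree().
 */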
314
315char **xenbus_directory(struct xenbus_transaction t,
316 const char *dir, const char *node, unsigned int *num)
317{
318 char *strings, *path;
319 unsigned int len;
320
321 path = join(dir, node);
322 if (IS_ERR(path))
323 return (char **)path;
324
325 strings = xs_single(t, XS_DIRECTORY, path, &len);
326 kfree(path);
327 if (IS_ERR(strings))
328 return (char **)strings;
329
330 return split(strings, len, num);
331}
332EXPORT_SYMBOL_GPL(xenbus_directory);
333
334/* Check if a path exists. Return 1 if it does. */
335int xenbus_exists(struct xenbus_transaction t,
336 const char *dir, const char *node)
337{
338 char **d;
339 int dir_n;
340
341 d = xenbus_directory(t, dir, node, &dir_n);
342 if (IS_ERR(d))
343 return 0;
344 kfree(d);
345 return 1;
346}
347EXPORT_SYMBOL_GPL(xenbus_exists);
348
349/* Get the value of a single file.
 350 * Returns a kmalloced value: call kfree() on it after use.
351 * len indicates length in bytes.
352 */
353void *xenbus_read(struct xenbus_transaction t,
354 const char *dir, const char *node, unsigned int *len)
355{
356 char *path;
357 void *ret;
358
359 path = join(dir, node);
360 if (IS_ERR(path))
361 return (void *)path;
362
363 ret = xs_single(t, XS_READ, path, len);
364 kfree(path);
365 return ret;
366}
367EXPORT_SYMBOL_GPL(xenbus_read);
368
369/* Write the value of a single file.
370 * Returns -err on failure.
371 */
372int xenbus_write(struct xenbus_transaction t,
373 const char *dir, const char *node, const char *string)
374{
375 const char *path;
376 struct kvec iovec[2];
377 int ret;
378
379 path = join(dir, node);
380 if (IS_ERR(path))
381 return PTR_ERR(path);
382
383 iovec[0].iov_base = (void *)path;
384 iovec[0].iov_len = strlen(path) + 1;
385 iovec[1].iov_base = (void *)string;
386 iovec[1].iov_len = strlen(string);
387
388 ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
389 kfree(path);
390 return ret;
391}
392EXPORT_SYMBOL_GPL(xenbus_write);
393
394/* Create a new directory. */
395int xenbus_mkdir(struct xenbus_transaction t,
396 const char *dir, const char *node)
397{
398 char *path;
399 int ret;
400
401 path = join(dir, node);
402 if (IS_ERR(path))
403 return PTR_ERR(path);
404
405 ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
406 kfree(path);
407 return ret;
408}
409EXPORT_SYMBOL_GPL(xenbus_mkdir);
410
411/* Destroy a file or directory (directories must be empty). */
412int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
413{
414 char *path;
415 int ret;
416
417 path = join(dir, node);
418 if (IS_ERR(path))
419 return PTR_ERR(path);
420
421 ret = xs_error(xs_single(t, XS_RM, path, NULL));
422 kfree(path);
423 return ret;
424}
425EXPORT_SYMBOL_GPL(xenbus_rm);
426
427/* Start a transaction: changes by others will not be seen during this
428 * transaction, and changes will not be visible to others until end.
429 */
430int xenbus_transaction_start(struct xenbus_transaction *t)
431{
432 char *id_str;
433
434 down_read(&xs_state.transaction_mutex);
435
436 id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
437 if (IS_ERR(id_str)) {
438 up_read(&xs_state.transaction_mutex);
439 return PTR_ERR(id_str);
440 }
441
442 t->id = simple_strtoul(id_str, NULL, 0);
443 kfree(id_str);
444 return 0;
445}
446EXPORT_SYMBOL_GPL(xenbus_transaction_start);
447
448/* End a transaction.
 449 * If abort is true, the transaction is discarded instead of committed.
450 */
451int xenbus_transaction_end(struct xenbus_transaction t, int abort)
452{
453 char abortstr[2];
454 int err;
455
456 if (abort)
457 strcpy(abortstr, "F");
458 else
459 strcpy(abortstr, "T");
460
461 err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
462
463 up_read(&xs_state.transaction_mutex);
464
465 return err;
466}
467EXPORT_SYMBOL_GPL(xenbus_transaction_end);
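/*
 * Typical transaction pattern (illustrative sketch, not part of this file);
 * xenstored reports a conflicting transaction as EAGAIN, so callers retry:
 *
 *	struct xenbus_transaction xbt;
 *	int err;
 * again:
 *	err = xenbus_transaction_start(&xbt);
 *	if (err)
 *		return err;
 *	err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", ref);
 *	if (err) {
 *		xenbus_transaction_end(xbt, 1);
 *		return err;
 *	}
 *	err = xenbus_transaction_end(xbt, 0);
 *	if (err == -EAGAIN)
 *		goto again;
 *
 * "ring-ref" and ref are placeholders for whatever the driver writes.
 */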
468
469/* Single read and scanf: returns -errno or num scanned. */
470int xenbus_scanf(struct xenbus_transaction t,
471 const char *dir, const char *node, const char *fmt, ...)
472{
473 va_list ap;
474 int ret;
475 char *val;
476
477 val = xenbus_read(t, dir, node, NULL);
478 if (IS_ERR(val))
479 return PTR_ERR(val);
480
481 va_start(ap, fmt);
482 ret = vsscanf(val, fmt, ap);
483 va_end(ap);
484 kfree(val);
485 /* Distinctive errno. */
486 if (ret == 0)
487 return -ERANGE;
488 return ret;
489}
490EXPORT_SYMBOL_GPL(xenbus_scanf);
491
492/* Single printf and write: returns -errno or 0. */
493int xenbus_printf(struct xenbus_transaction t,
494 const char *dir, const char *node, const char *fmt, ...)
495{
496 va_list ap;
497 int ret;
498#define PRINTF_BUFFER_SIZE 4096
499 char *printf_buffer;
500
501 printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
502 if (printf_buffer == NULL)
503 return -ENOMEM;
504
505 va_start(ap, fmt);
506 ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
507 va_end(ap);
508
509 BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
510 ret = xenbus_write(t, dir, node, printf_buffer);
511
512 kfree(printf_buffer);
513
514 return ret;
515}
516EXPORT_SYMBOL_GPL(xenbus_printf);
517
518/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
519int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
520{
521 va_list ap;
522 const char *name;
523 int ret = 0;
524
525 va_start(ap, dir);
526 while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
527 const char *fmt = va_arg(ap, char *);
528 void *result = va_arg(ap, void *);
529 char *p;
530
531 p = xenbus_read(t, dir, name, NULL);
532 if (IS_ERR(p)) {
533 ret = PTR_ERR(p);
534 break;
535 }
536 if (fmt) {
537 if (sscanf(p, fmt, result) == 0)
538 ret = -EINVAL;
539 kfree(p);
540 } else
541 *(char **)result = p;
542 }
543 va_end(ap);
544 return ret;
545}
546EXPORT_SYMBOL_GPL(xenbus_gather);
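/*
 * Usage example (taken from read_otherend_details() in xenbus_probe.c in
 * this same patch): two values are gathered in one call, and a NULL format
 * returns the raw kmalloc'ed string:
 *
 *	err = xenbus_gather(XBT_NIL, xendev->nodename,
 *			    "backend-id", "%i", &xendev->otherend_id,
 *			    "backend", NULL, &xendev->otherend,
 *			    NULL);
 */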
547
548static int xs_watch(const char *path, const char *token)
549{
550 struct kvec iov[2];
551
552 iov[0].iov_base = (void *)path;
553 iov[0].iov_len = strlen(path) + 1;
554 iov[1].iov_base = (void *)token;
555 iov[1].iov_len = strlen(token) + 1;
556
557 return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov,
558 ARRAY_SIZE(iov), NULL));
559}
560
561static int xs_unwatch(const char *path, const char *token)
562{
563 struct kvec iov[2];
564
565 iov[0].iov_base = (char *)path;
566 iov[0].iov_len = strlen(path) + 1;
567 iov[1].iov_base = (char *)token;
568 iov[1].iov_len = strlen(token) + 1;
569
570 return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov,
571 ARRAY_SIZE(iov), NULL));
572}
573
574static struct xenbus_watch *find_watch(const char *token)
575{
576 struct xenbus_watch *i, *cmp;
577
578 cmp = (void *)simple_strtoul(token, NULL, 16);
579
580 list_for_each_entry(i, &watches, list)
581 if (i == cmp)
582 return i;
583
584 return NULL;
585}
586
587/* Register callback to watch this node. */
588int register_xenbus_watch(struct xenbus_watch *watch)
589{
590 /* Pointer in ascii is the token. */
591 char token[sizeof(watch) * 2 + 1];
592 int err;
593
594 sprintf(token, "%lX", (long)watch);
595
596 down_read(&xs_state.watch_mutex);
597
598 spin_lock(&watches_lock);
599 BUG_ON(find_watch(token));
600 list_add(&watch->list, &watches);
601 spin_unlock(&watches_lock);
602
603 err = xs_watch(watch->node, token);
604
605 /* Ignore errors due to multiple registration. */
606 if ((err != 0) && (err != -EEXIST)) {
607 spin_lock(&watches_lock);
608 list_del(&watch->list);
609 spin_unlock(&watches_lock);
610 }
611
612 up_read(&xs_state.watch_mutex);
613
614 return err;
615}
616EXPORT_SYMBOL_GPL(register_xenbus_watch);
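/*
 * Illustrative example (from xenbus_probe.c in this same patch): a watch is
 * just a node plus a callback,
 *
 *	static struct xenbus_watch fe_watch = {
 *		.node     = "device",
 *		.callback = frontend_changed,
 *	};
 *	register_xenbus_watch(&fe_watch);
 *
 * The watch pointer, printed in hex, serves as the xenstore token so that
 * incoming events can be mapped back to their handler by find_watch().
 */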
617
618void unregister_xenbus_watch(struct xenbus_watch *watch)
619{
620 struct xs_stored_msg *msg, *tmp;
621 char token[sizeof(watch) * 2 + 1];
622 int err;
623
624 sprintf(token, "%lX", (long)watch);
625
626 down_read(&xs_state.watch_mutex);
627
628 spin_lock(&watches_lock);
629 BUG_ON(!find_watch(token));
630 list_del(&watch->list);
631 spin_unlock(&watches_lock);
632
633 err = xs_unwatch(watch->node, token);
634 if (err)
635 printk(KERN_WARNING
636 "XENBUS Failed to release watch %s: %i\n",
637 watch->node, err);
638
639 up_read(&xs_state.watch_mutex);
640
641 /* Make sure there are no callbacks running currently (unless
 642	   it's us) */
643 if (current->pid != xenwatch_pid)
644 mutex_lock(&xenwatch_mutex);
645
646 /* Cancel pending watch events. */
647 spin_lock(&watch_events_lock);
648 list_for_each_entry_safe(msg, tmp, &watch_events, list) {
649 if (msg->u.watch.handle != watch)
650 continue;
651 list_del(&msg->list);
652 kfree(msg->u.watch.vec);
653 kfree(msg);
654 }
655 spin_unlock(&watch_events_lock);
656
657 if (current->pid != xenwatch_pid)
658 mutex_unlock(&xenwatch_mutex);
659}
660EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
661
662void xs_suspend(void)
663{
664 down_write(&xs_state.transaction_mutex);
665 down_write(&xs_state.watch_mutex);
666 mutex_lock(&xs_state.request_mutex);
667 mutex_lock(&xs_state.response_mutex);
668}
669
670void xs_resume(void)
671{
672 struct xenbus_watch *watch;
673 char token[sizeof(watch) * 2 + 1];
674
675 mutex_unlock(&xs_state.response_mutex);
676 mutex_unlock(&xs_state.request_mutex);
677 up_write(&xs_state.transaction_mutex);
678
679 /* No need for watches_lock: the watch_mutex is sufficient. */
680 list_for_each_entry(watch, &watches, list) {
681 sprintf(token, "%lX", (long)watch);
682 xs_watch(watch->node, token);
683 }
684
685 up_write(&xs_state.watch_mutex);
686}
687
688void xs_suspend_cancel(void)
689{
690 mutex_unlock(&xs_state.response_mutex);
691 mutex_unlock(&xs_state.request_mutex);
692 up_write(&xs_state.watch_mutex);
693 up_write(&xs_state.transaction_mutex);
694}
695
696static int xenwatch_thread(void *unused)
697{
698 struct list_head *ent;
699 struct xs_stored_msg *msg;
700
701 for (;;) {
702 wait_event_interruptible(watch_events_waitq,
703 !list_empty(&watch_events));
704
705 if (kthread_should_stop())
706 break;
707
708 mutex_lock(&xenwatch_mutex);
709
710 spin_lock(&watch_events_lock);
711 ent = watch_events.next;
712 if (ent != &watch_events)
713 list_del(ent);
714 spin_unlock(&watch_events_lock);
715
716 if (ent != &watch_events) {
717 msg = list_entry(ent, struct xs_stored_msg, list);
718 msg->u.watch.handle->callback(
719 msg->u.watch.handle,
720 (const char **)msg->u.watch.vec,
721 msg->u.watch.vec_size);
722 kfree(msg->u.watch.vec);
723 kfree(msg);
724 }
725
726 mutex_unlock(&xenwatch_mutex);
727 }
728
729 return 0;
730}
731
732static int process_msg(void)
733{
734 struct xs_stored_msg *msg;
735 char *body;
736 int err;
737
738 /*
739 * We must disallow save/restore while reading a xenstore message.
740 * A partial read across s/r leaves us out of sync with xenstored.
741 */
742 for (;;) {
743 err = xb_wait_for_data_to_read();
744 if (err)
745 return err;
746 mutex_lock(&xs_state.response_mutex);
747 if (xb_data_to_read())
748 break;
749 /* We raced with save/restore: pending data 'disappeared'. */
750 mutex_unlock(&xs_state.response_mutex);
751 }
752
753
754 msg = kmalloc(sizeof(*msg), GFP_KERNEL);
755 if (msg == NULL) {
756 err = -ENOMEM;
757 goto out;
758 }
759
760 err = xb_read(&msg->hdr, sizeof(msg->hdr));
761 if (err) {
762 kfree(msg);
763 goto out;
764 }
765
766 body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
767 if (body == NULL) {
768 kfree(msg);
769 err = -ENOMEM;
770 goto out;
771 }
772
773 err = xb_read(body, msg->hdr.len);
774 if (err) {
775 kfree(body);
776 kfree(msg);
777 goto out;
778 }
779 body[msg->hdr.len] = '\0';
780
781 if (msg->hdr.type == XS_WATCH_EVENT) {
782 msg->u.watch.vec = split(body, msg->hdr.len,
783 &msg->u.watch.vec_size);
784 if (IS_ERR(msg->u.watch.vec)) {
785 kfree(msg);
786 err = PTR_ERR(msg->u.watch.vec);
787 goto out;
788 }
789
790 spin_lock(&watches_lock);
791 msg->u.watch.handle = find_watch(
792 msg->u.watch.vec[XS_WATCH_TOKEN]);
793 if (msg->u.watch.handle != NULL) {
794 spin_lock(&watch_events_lock);
795 list_add_tail(&msg->list, &watch_events);
796 wake_up(&watch_events_waitq);
797 spin_unlock(&watch_events_lock);
798 } else {
799 kfree(msg->u.watch.vec);
800 kfree(msg);
801 }
802 spin_unlock(&watches_lock);
803 } else {
804 msg->u.reply.body = body;
805 spin_lock(&xs_state.reply_lock);
806 list_add_tail(&msg->list, &xs_state.reply_list);
807 spin_unlock(&xs_state.reply_lock);
808 wake_up(&xs_state.reply_waitq);
809 }
810
811 out:
812 mutex_unlock(&xs_state.response_mutex);
813 return err;
814}
815
816static int xenbus_thread(void *unused)
817{
818 int err;
819
820 for (;;) {
821 err = process_msg();
822 if (err)
823 printk(KERN_WARNING "XENBUS error %d while reading "
824 "message\n", err);
825 if (kthread_should_stop())
826 break;
827 }
828
829 return 0;
830}
831
832int xs_init(void)
833{
834 int err;
835 struct task_struct *task;
836
837 INIT_LIST_HEAD(&xs_state.reply_list);
838 spin_lock_init(&xs_state.reply_lock);
839 init_waitqueue_head(&xs_state.reply_waitq);
840
841 mutex_init(&xs_state.request_mutex);
842 mutex_init(&xs_state.response_mutex);
843 init_rwsem(&xs_state.transaction_mutex);
844 init_rwsem(&xs_state.watch_mutex);
845
846 /* Initialize the shared memory rings to talk to xenstored */
847 err = xb_init_comms();
848 if (err)
849 return err;
850
851 task = kthread_run(xenwatch_thread, NULL, "xenwatch");
852 if (IS_ERR(task))
853 return PTR_ERR(task);
854 xenwatch_pid = task->pid;
855
856 task = kthread_run(xenbus_thread, NULL, "xenbus");
857 if (IS_ERR(task))
858 return PTR_ERR(task);
859
860 return 0;
861}
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 352eb4a13f98..c4c36171240d 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -209,7 +209,7 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
 	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[2] = NULL;
 
-	ret = call_usermodehelper(argv[0], argv, envp, 1);
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 	if (ret < 0)
 		mlog_errno(ret);
 }
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 9e15ce0006eb..36f310632c49 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -41,6 +41,7 @@ extern int irqbalance_disable(char *str);
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+unsigned int do_IRQ(struct pt_regs *regs);
 void init_IRQ(void);
 void __init native_init_IRQ(void);
 
diff --git a/include/asm-i386/mach-default/irq_vectors_limits.h b/include/asm-i386/mach-default/irq_vectors_limits.h
index 7f161e760be6..a90c7a60109f 100644
--- a/include/asm-i386/mach-default/irq_vectors_limits.h
+++ b/include/asm-i386/mach-default/irq_vectors_limits.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_IRQ_VECTORS_LIMITS_H
 #define _ASM_IRQ_VECTORS_LIMITS_H
 
-#ifdef CONFIG_X86_IO_APIC
+#if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT)
 #define NR_IRQS 224
 # if (224 >= 32 * NR_CPUS)
 # define NR_IRQ_VECTORS NR_IRQS
diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h
index 8198d1cca1f3..7eb0b0b1fb3c 100644
--- a/include/asm-i386/mmu_context.h
+++ b/include/asm-i386/mmu_context.h
@@ -32,6 +32,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 #endif
 }
 
+void leave_mm(unsigned long cpu);
+
 static inline void switch_mm(struct mm_struct *prev,
 			     struct mm_struct *next,
 			     struct task_struct *tsk)
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 7f846a7d6bcc..7df88be2dd9e 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -52,6 +52,8 @@ struct paravirt_ops
 	/* Basic arch-specific setup */
 	void (*arch_setup)(void);
 	char *(*memory_setup)(void);
+	void (*post_allocator_init)(void);
+
 	void (*init_IRQ)(void);
 	void (*time_init)(void);
 
@@ -116,7 +118,7 @@ struct paravirt_ops
 
 	u64 (*read_tsc)(void);
 	u64 (*read_pmc)(void);
-	u64 (*get_scheduled_cycles)(void);
+	unsigned long long (*sched_clock)(void);
 	unsigned long (*get_cpu_khz)(void);
 
 	/* Segment descriptor handling */
@@ -173,7 +175,7 @@ struct paravirt_ops
 			 unsigned long va);
 
 	/* Hooks for allocating/releasing pagetable pages */
-	void (*alloc_pt)(u32 pfn);
+	void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
 	void (*alloc_pd)(u32 pfn);
 	void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
 	void (*release_pt)(u32 pfn);
@@ -260,6 +262,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
 unsigned paravirt_patch_insns(void *site, unsigned len,
 			      const char *start, const char *end);
 
+int paravirt_disable_iospace(void);
 
 /*
  * This generates an indirect call based on the operation type number.
@@ -563,7 +566,10 @@ static inline u64 paravirt_read_tsc(void)
 
 #define rdtscll(val) (val = paravirt_read_tsc())
 
-#define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles())
+static inline unsigned long long paravirt_sched_clock(void)
+{
+	return PVOP_CALL0(unsigned long long, sched_clock);
+}
 #define calculate_cpu_khz() (paravirt_ops.get_cpu_khz())
 
 #define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
@@ -669,6 +675,12 @@ static inline void setup_secondary_clock(void)
 }
 #endif
 
+static inline void paravirt_post_allocator_init(void)
+{
+	if (paravirt_ops.post_allocator_init)
+		(*paravirt_ops.post_allocator_init)();
+}
+
 static inline void paravirt_pagetable_setup_start(pgd_t *base)
 {
 	if (paravirt_ops.pagetable_setup_start)
@@ -725,9 +737,9 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 	PVOP_VCALL3(flush_tlb_others, &cpumask, mm, va);
 }
 
-static inline void paravirt_alloc_pt(unsigned pfn)
+static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
 {
-	PVOP_VCALL1(alloc_pt, pfn);
+	PVOP_VCALL2(alloc_pt, mm, pfn);
 }
 static inline void paravirt_release_pt(unsigned pfn)
 {
diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h
index d07b7afc2692..f2fc33ceb9f2 100644
--- a/include/asm-i386/pgalloc.h
+++ b/include/asm-i386/pgalloc.h
@@ -7,7 +7,7 @@
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
-#define paravirt_alloc_pt(pfn) do { } while (0)
+#define paravirt_alloc_pt(mm, pfn) do { } while (0)
 #define paravirt_alloc_pd(pfn) do { } while (0)
 #define paravirt_alloc_pd(pfn) do { } while (0)
 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
@@ -17,13 +17,13 @@
 
 #define pmd_populate_kernel(mm, pmd, pte)				\
 do {									\
-	paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT);			\
+	paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);			\
 	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));			\
 } while (0)
 
 #define pmd_populate(mm, pmd, pte)					\
 do {									\
-	paravirt_alloc_pt(page_to_pfn(pte));				\
+	paravirt_alloc_pt(mm, page_to_pfn(pte));			\
 	set_pmd(pmd, __pmd(_PAGE_TABLE +				\
 		((unsigned long long)page_to_pfn(pte) <<		\
 			(unsigned long long) PAGE_SHIFT)));		\
diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h
index 0d5bff9dc4a5..7862fe858a9e 100644
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -81,6 +81,10 @@ void __init add_memory_region(unsigned long long start,
81 81
82extern unsigned long init_pg_tables_end; 82extern unsigned long init_pg_tables_end;
83 83
84#ifndef CONFIG_PARAVIRT
85#define paravirt_post_allocator_init() do {} while (0)
86#endif
87
84#endif /* __ASSEMBLY__ */ 88#endif /* __ASSEMBLY__ */
85 89
86#endif /* __KERNEL__ */ 90#endif /* __KERNEL__ */
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 0c7132787062..1f73bde165b1 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -43,9 +43,12 @@ extern u8 x86_cpu_to_apicid[];
43 43
44#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] 44#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
45 45
46extern void set_cpu_sibling_map(int cpu);
47
46#ifdef CONFIG_HOTPLUG_CPU 48#ifdef CONFIG_HOTPLUG_CPU
47extern void cpu_exit_clear(void); 49extern void cpu_exit_clear(void);
48extern void cpu_uninit(void); 50extern void cpu_uninit(void);
51extern void remove_siblinginfo(int cpu);
49#endif 52#endif
50 53
51struct smp_ops 54struct smp_ops
@@ -129,6 +132,8 @@ extern int __cpu_disable(void);
129extern void __cpu_die(unsigned int cpu); 132extern void __cpu_die(unsigned int cpu);
130extern unsigned int num_processors; 133extern unsigned int num_processors;
131 134
135void __cpuinit smp_store_cpu_info(int id);
136
132#endif /* !__ASSEMBLY__ */ 137#endif /* !__ASSEMBLY__ */
133 138
134#else /* CONFIG_SMP */ 139#else /* CONFIG_SMP */
diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h
index 153770e25faa..51a713e33a9e 100644
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -15,8 +15,38 @@ extern int no_sync_cmos_clock;
15extern int recalibrate_cpu_khz(void); 15extern int recalibrate_cpu_khz(void);
16 16
17#ifndef CONFIG_PARAVIRT 17#ifndef CONFIG_PARAVIRT
18#define get_scheduled_cycles(val) rdtscll(val)
19#define calculate_cpu_khz() native_calculate_cpu_khz() 18#define calculate_cpu_khz() native_calculate_cpu_khz()
20#endif 19#endif
21 20
 21/* Accelerators for sched_clock()
22 * convert from cycles(64bits) => nanoseconds (64bits)
23 * basic equation:
24 * ns = cycles / (freq / ns_per_sec)
25 * ns = cycles * (ns_per_sec / freq)
26 * ns = cycles * (10^9 / (cpu_khz * 10^3))
27 * ns = cycles * (10^6 / cpu_khz)
28 *
29 * Then we use scaling math (suggested by george@mvista.com) to get:
30 * ns = cycles * (10^6 * SC / cpu_khz) / SC
31 * ns = cycles * cyc2ns_scale / SC
32 *
33 * And since SC is a constant power of two, we can convert the div
34 * into a shift.
35 *
 36 * We can use a khz divisor instead of mhz to keep better precision, since
37 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
38 * (mathieu.desnoyers@polymtl.ca)
39 *
40 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
41 */
42extern unsigned long cyc2ns_scale __read_mostly;
43
44#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
45
46static inline unsigned long long cycles_2_ns(unsigned long long cyc)
47{
48 return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
49}
50
51
22#endif 52#endif
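
The scaling comment above reduces to cyc2ns_scale = (10^6 << CYC2NS_SCALE_FACTOR) / cpu_khz. A short sketch of how that constant could be recomputed once the CPU frequency is known, mirroring the intent of set_cyc2ns_scale() in arch/i386/kernel/tsc.c (the function name here is illustrative):

static void set_cyc2ns_scale_sketch(unsigned long cpu_khz)
{
	/* ns = cycles * (10^6 * 2^CYC2NS_SCALE_FACTOR / cpu_khz) >> CYC2NS_SCALE_FACTOR */
	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / cpu_khz;
}
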
diff --git a/include/asm-i386/vmi_time.h b/include/asm-i386/vmi_time.h
index 213930b995cb..478188130328 100644
--- a/include/asm-i386/vmi_time.h
+++ b/include/asm-i386/vmi_time.h
@@ -49,7 +49,7 @@ extern struct vmi_timer_ops {
49extern void __init vmi_time_init(void); 49extern void __init vmi_time_init(void);
50extern unsigned long vmi_get_wallclock(void); 50extern unsigned long vmi_get_wallclock(void);
51extern int vmi_set_wallclock(unsigned long now); 51extern int vmi_set_wallclock(unsigned long now);
52extern unsigned long long vmi_get_sched_cycles(void); 52extern unsigned long long vmi_sched_clock(void);
53extern unsigned long vmi_cpu_khz(void); 53extern unsigned long vmi_cpu_khz(void);
54 54
55#ifdef CONFIG_X86_LOCAL_APIC 55#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/include/asm-i386/xen/hypercall.h b/include/asm-i386/xen/hypercall.h
new file mode 100644
index 000000000000..bc0ee7d961ca
--- /dev/null
+++ b/include/asm-i386/xen/hypercall.h
@@ -0,0 +1,413 @@
1/******************************************************************************
2 * hypercall.h
3 *
4 * Linux-specific hypervisor handling.
5 *
6 * Copyright (c) 2002-2004, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef __HYPERCALL_H__
34#define __HYPERCALL_H__
35
36#include <linux/errno.h>
37#include <linux/string.h>
38
39#include <xen/interface/xen.h>
40#include <xen/interface/sched.h>
41#include <xen/interface/physdev.h>
42
43extern struct { char _entry[32]; } hypercall_page[];
44
45#define _hypercall0(type, name) \
46({ \
47 long __res; \
48 asm volatile ( \
49 "call %[call]" \
50 : "=a" (__res) \
51 : [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
52 : "memory" ); \
53 (type)__res; \
54})
55
56#define _hypercall1(type, name, a1) \
57({ \
58 long __res, __ign1; \
59 asm volatile ( \
60 "call %[call]" \
61 : "=a" (__res), "=b" (__ign1) \
62 : "1" ((long)(a1)), \
63 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
64 : "memory" ); \
65 (type)__res; \
66})
67
68#define _hypercall2(type, name, a1, a2) \
69({ \
70 long __res, __ign1, __ign2; \
71 asm volatile ( \
72 "call %[call]" \
73 : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
74 : "1" ((long)(a1)), "2" ((long)(a2)), \
75 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
76 : "memory" ); \
77 (type)__res; \
78})
79
80#define _hypercall3(type, name, a1, a2, a3) \
81({ \
82 long __res, __ign1, __ign2, __ign3; \
83 asm volatile ( \
84 "call %[call]" \
85 : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
86 "=d" (__ign3) \
87 : "1" ((long)(a1)), "2" ((long)(a2)), \
88 "3" ((long)(a3)), \
89 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
90 : "memory" ); \
91 (type)__res; \
92})
93
94#define _hypercall4(type, name, a1, a2, a3, a4) \
95({ \
96 long __res, __ign1, __ign2, __ign3, __ign4; \
97 asm volatile ( \
98 "call %[call]" \
99 : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
100 "=d" (__ign3), "=S" (__ign4) \
101 : "1" ((long)(a1)), "2" ((long)(a2)), \
102 "3" ((long)(a3)), "4" ((long)(a4)), \
103 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
104 : "memory" ); \
105 (type)__res; \
106})
107
108#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
109({ \
110 long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \
111 asm volatile ( \
112 "call %[call]" \
113 : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
114 "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
115 : "1" ((long)(a1)), "2" ((long)(a2)), \
116 "3" ((long)(a3)), "4" ((long)(a4)), \
117 "5" ((long)(a5)), \
118 [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
119 : "memory" ); \
120 (type)__res; \
121})
122
123static inline int
124HYPERVISOR_set_trap_table(struct trap_info *table)
125{
126 return _hypercall1(int, set_trap_table, table);
127}
128
129static inline int
130HYPERVISOR_mmu_update(struct mmu_update *req, int count,
131 int *success_count, domid_t domid)
132{
133 return _hypercall4(int, mmu_update, req, count, success_count, domid);
134}
135
136static inline int
137HYPERVISOR_mmuext_op(struct mmuext_op *op, int count,
138 int *success_count, domid_t domid)
139{
140 return _hypercall4(int, mmuext_op, op, count, success_count, domid);
141}
142
143static inline int
144HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
145{
146 return _hypercall2(int, set_gdt, frame_list, entries);
147}
148
149static inline int
150HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
151{
152 return _hypercall2(int, stack_switch, ss, esp);
153}
154
155static inline int
156HYPERVISOR_set_callbacks(unsigned long event_selector,
157 unsigned long event_address,
158 unsigned long failsafe_selector,
159 unsigned long failsafe_address)
160{
161 return _hypercall4(int, set_callbacks,
162 event_selector, event_address,
163 failsafe_selector, failsafe_address);
164}
165
166static inline int
167HYPERVISOR_fpu_taskswitch(int set)
168{
169 return _hypercall1(int, fpu_taskswitch, set);
170}
171
172static inline int
173HYPERVISOR_sched_op(int cmd, unsigned long arg)
174{
175 return _hypercall2(int, sched_op, cmd, arg);
176}
177
178static inline long
179HYPERVISOR_set_timer_op(u64 timeout)
180{
181 unsigned long timeout_hi = (unsigned long)(timeout>>32);
182 unsigned long timeout_lo = (unsigned long)timeout;
183 return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
184}
185
186static inline int
187HYPERVISOR_set_debugreg(int reg, unsigned long value)
188{
189 return _hypercall2(int, set_debugreg, reg, value);
190}
191
192static inline unsigned long
193HYPERVISOR_get_debugreg(int reg)
194{
195 return _hypercall1(unsigned long, get_debugreg, reg);
196}
197
198static inline int
199HYPERVISOR_update_descriptor(u64 ma, u64 desc)
200{
201 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
202}
203
204static inline int
205HYPERVISOR_memory_op(unsigned int cmd, void *arg)
206{
207 return _hypercall2(int, memory_op, cmd, arg);
208}
209
210static inline int
211HYPERVISOR_multicall(void *call_list, int nr_calls)
212{
213 return _hypercall2(int, multicall, call_list, nr_calls);
214}
215
216static inline int
217HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
218 unsigned long flags)
219{
220 unsigned long pte_hi = 0;
221#ifdef CONFIG_X86_PAE
222 pte_hi = new_val.pte_high;
223#endif
224 return _hypercall4(int, update_va_mapping, va,
225 new_val.pte_low, pte_hi, flags);
226}
227
228static inline int
229HYPERVISOR_event_channel_op(int cmd, void *arg)
230{
231 int rc = _hypercall2(int, event_channel_op, cmd, arg);
232 if (unlikely(rc == -ENOSYS)) {
233 struct evtchn_op op;
234 op.cmd = cmd;
235 memcpy(&op.u, arg, sizeof(op.u));
236 rc = _hypercall1(int, event_channel_op_compat, &op);
237 memcpy(arg, &op.u, sizeof(op.u));
238 }
239 return rc;
240}
241
242static inline int
243HYPERVISOR_xen_version(int cmd, void *arg)
244{
245 return _hypercall2(int, xen_version, cmd, arg);
246}
247
248static inline int
249HYPERVISOR_console_io(int cmd, int count, char *str)
250{
251 return _hypercall3(int, console_io, cmd, count, str);
252}
253
254static inline int
255HYPERVISOR_physdev_op(int cmd, void *arg)
256{
257 int rc = _hypercall2(int, physdev_op, cmd, arg);
258 if (unlikely(rc == -ENOSYS)) {
259 struct physdev_op op;
260 op.cmd = cmd;
261 memcpy(&op.u, arg, sizeof(op.u));
262 rc = _hypercall1(int, physdev_op_compat, &op);
263 memcpy(arg, &op.u, sizeof(op.u));
264 }
265 return rc;
266}
267
268static inline int
269HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
270{
271 return _hypercall3(int, grant_table_op, cmd, uop, count);
272}
273
274static inline int
275HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
276 unsigned long flags, domid_t domid)
277{
278 unsigned long pte_hi = 0;
279#ifdef CONFIG_X86_PAE
280 pte_hi = new_val.pte_high;
281#endif
282 return _hypercall5(int, update_va_mapping_otherdomain, va,
283 new_val.pte_low, pte_hi, flags, domid);
284}
285
286static inline int
287HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
288{
289 return _hypercall2(int, vm_assist, cmd, type);
290}
291
292static inline int
293HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
294{
295 return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
296}
297
298static inline int
299HYPERVISOR_suspend(unsigned long srec)
300{
301 return _hypercall3(int, sched_op, SCHEDOP_shutdown,
302 SHUTDOWN_suspend, srec);
303}
304
305static inline int
306HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
307{
308 return _hypercall2(int, nmi_op, op, arg);
309}
310
311static inline void
312MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
313 pte_t new_val, unsigned long flags)
314{
315 mcl->op = __HYPERVISOR_update_va_mapping;
316 mcl->args[0] = va;
317#ifdef CONFIG_X86_PAE
318 mcl->args[1] = new_val.pte_low;
319 mcl->args[2] = new_val.pte_high;
320#else
321 mcl->args[1] = new_val.pte_low;
322 mcl->args[2] = 0;
323#endif
324 mcl->args[3] = flags;
325}
326
327static inline void
328MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
329 void *uop, unsigned int count)
330{
331 mcl->op = __HYPERVISOR_grant_table_op;
332 mcl->args[0] = cmd;
333 mcl->args[1] = (unsigned long)uop;
334 mcl->args[2] = count;
335}
336
337static inline void
338MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
339 pte_t new_val, unsigned long flags,
340 domid_t domid)
341{
342 mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
343 mcl->args[0] = va;
344#ifdef CONFIG_X86_PAE
345 mcl->args[1] = new_val.pte_low;
346 mcl->args[2] = new_val.pte_high;
347#else
348 mcl->args[1] = new_val.pte_low;
349 mcl->args[2] = 0;
350#endif
351 mcl->args[3] = flags;
352 mcl->args[4] = domid;
353}
354
355static inline void
356MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
357 struct desc_struct desc)
358{
359 mcl->op = __HYPERVISOR_update_descriptor;
360 mcl->args[0] = maddr;
361 mcl->args[1] = maddr >> 32;
362 mcl->args[2] = desc.a;
363 mcl->args[3] = desc.b;
364}
365
366static inline void
367MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
368{
369 mcl->op = __HYPERVISOR_memory_op;
370 mcl->args[0] = cmd;
371 mcl->args[1] = (unsigned long)arg;
372}
373
374static inline void
375MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
376 int count, int *success_count, domid_t domid)
377{
378 mcl->op = __HYPERVISOR_mmu_update;
379 mcl->args[0] = (unsigned long)req;
380 mcl->args[1] = count;
381 mcl->args[2] = (unsigned long)success_count;
382 mcl->args[3] = domid;
383}
384
385static inline void
386MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
387 int *success_count, domid_t domid)
388{
389 mcl->op = __HYPERVISOR_mmuext_op;
390 mcl->args[0] = (unsigned long)op;
391 mcl->args[1] = count;
392 mcl->args[2] = (unsigned long)success_count;
393 mcl->args[3] = domid;
394}
395
396static inline void
397MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
398{
399 mcl->op = __HYPERVISOR_set_gdt;
400 mcl->args[0] = (unsigned long)frames;
401 mcl->args[1] = entries;
402}
403
404static inline void
405MULTI_stack_switch(struct multicall_entry *mcl,
406 unsigned long ss, unsigned long esp)
407{
408 mcl->op = __HYPERVISOR_stack_switch;
409 mcl->args[0] = ss;
410 mcl->args[1] = esp;
411}
412
413#endif /* __HYPERCALL_H__ */
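
For context, the _hypercallN() macros above return the result in EAX and pass arguments in EBX, ECX, EDX, ESI and EDI, calling through the fixed per-hypercall slot in hypercall_page. A hedged usage sketch built on the wrappers in this header; XENVER_version is defined in xen/interface/version.h, which is not pulled in by this file directly:

static int xen_version_sketch(void)
{
	/* returns (major << 16) | minor for the running hypervisor */
	return HYPERVISOR_xen_version(XENVER_version, NULL);
}
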
diff --git a/include/asm-i386/xen/hypervisor.h b/include/asm-i386/xen/hypervisor.h
new file mode 100644
index 000000000000..8e15dd28c91f
--- /dev/null
+++ b/include/asm-i386/xen/hypervisor.h
@@ -0,0 +1,73 @@
1/******************************************************************************
2 * hypervisor.h
3 *
4 * Linux-specific hypervisor handling.
5 *
6 * Copyright (c) 2002-2004, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef __HYPERVISOR_H__
34#define __HYPERVISOR_H__
35
36#include <linux/types.h>
37#include <linux/kernel.h>
38#include <linux/version.h>
39
40#include <xen/interface/xen.h>
41#include <xen/interface/version.h>
42
43#include <asm/ptrace.h>
44#include <asm/page.h>
45#include <asm/desc.h>
46#if defined(__i386__)
47# ifdef CONFIG_X86_PAE
48# include <asm-generic/pgtable-nopud.h>
49# else
50# include <asm-generic/pgtable-nopmd.h>
51# endif
52#endif
53#include <asm/xen/hypercall.h>
54
55/* arch/i386/kernel/setup.c */
56extern struct shared_info *HYPERVISOR_shared_info;
57extern struct start_info *xen_start_info;
58#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
59
60/* arch/i386/mach-xen/evtchn.c */
61/* Force a proper event-channel callback from Xen. */
62extern void force_evtchn_callback(void);
63
64/* Turn jiffies into Xen system time. */
65u64 jiffies_to_st(unsigned long jiffies);
66
67
68#define MULTI_UVMFLAGS_INDEX 3
69#define MULTI_UVMDOMID_INDEX 4
70
71#define is_running_on_xen() (xen_start_info ? 1 : 0)
72
73#endif /* __HYPERVISOR_H__ */
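
A small sketch of how the predicates above are typically used to guard Xen-only setup; the function name and message are placeholders:

static int xen_guest_check_sketch(void)
{
	if (!is_running_on_xen())	/* xen_start_info is NULL on bare metal */
		return -ENODEV;

	if (is_initial_xendomain())
		printk(KERN_INFO "xen: running as the initial (control) domain\n");

	return 0;
}
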
diff --git a/include/asm-i386/xen/interface.h b/include/asm-i386/xen/interface.h
new file mode 100644
index 000000000000..165c3968e138
--- /dev/null
+++ b/include/asm-i386/xen/interface.h
@@ -0,0 +1,188 @@
1/******************************************************************************
2 * arch-x86_32.h
3 *
4 * Guest OS interface to x86 32-bit Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
8
9#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
10#define __XEN_PUBLIC_ARCH_X86_32_H__
11
12#ifdef __XEN__
13#define __DEFINE_GUEST_HANDLE(name, type) \
14 typedef struct { type *p; } __guest_handle_ ## name
15#else
16#define __DEFINE_GUEST_HANDLE(name, type) \
17 typedef type * __guest_handle_ ## name
18#endif
19
20#define DEFINE_GUEST_HANDLE_STRUCT(name) \
21 __DEFINE_GUEST_HANDLE(name, struct name)
22#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
23#define GUEST_HANDLE(name) __guest_handle_ ## name
24
25#ifndef __ASSEMBLY__
26/* Guest handles for primitive C types. */
27__DEFINE_GUEST_HANDLE(uchar, unsigned char);
28__DEFINE_GUEST_HANDLE(uint, unsigned int);
29__DEFINE_GUEST_HANDLE(ulong, unsigned long);
30DEFINE_GUEST_HANDLE(char);
31DEFINE_GUEST_HANDLE(int);
32DEFINE_GUEST_HANDLE(long);
33DEFINE_GUEST_HANDLE(void);
34#endif
35
36/*
37 * SEGMENT DESCRIPTOR TABLES
38 */
39/*
40 * A number of GDT entries are reserved by Xen. These are not situated at the
41 * start of the GDT because some stupid OSes export hard-coded selector values
42 * in their ABI. These hard-coded values are always near the start of the GDT,
43 * so Xen places itself out of the way, at the far end of the GDT.
44 */
45#define FIRST_RESERVED_GDT_PAGE 14
46#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
47#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
48
49/*
50 * These flat segments are in the Xen-private section of every GDT. Since these
51 * are also present in the initial GDT, many OSes will be able to avoid
52 * installing their own GDT.
53 */
54#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
55#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
56#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
57#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
58#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
59#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
60
61#define FLAT_KERNEL_CS FLAT_RING1_CS
62#define FLAT_KERNEL_DS FLAT_RING1_DS
63#define FLAT_KERNEL_SS FLAT_RING1_SS
64#define FLAT_USER_CS FLAT_RING3_CS
65#define FLAT_USER_DS FLAT_RING3_DS
66#define FLAT_USER_SS FLAT_RING3_SS
67
68/* And the trap vector is... */
69#define TRAP_INSTR "int $0x82"
70
71/*
72 * Virtual addresses beyond this are not modifiable by guest OSes. The
73 * machine->physical mapping table starts at this address, read-only.
74 */
75#ifdef CONFIG_X86_PAE
76#define __HYPERVISOR_VIRT_START 0xF5800000
77#else
78#define __HYPERVISOR_VIRT_START 0xFC000000
79#endif
80
81#ifndef HYPERVISOR_VIRT_START
82#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
83#endif
84
85#ifndef machine_to_phys_mapping
86#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
87#endif
88
89/* Maximum number of virtual CPUs in multi-processor guests. */
90#define MAX_VIRT_CPUS 32
91
92#ifndef __ASSEMBLY__
93
94/*
95 * Send an array of these to HYPERVISOR_set_trap_table()
96 */
97#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
98#define TI_GET_IF(_ti) ((_ti)->flags & 4)
99#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
100#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
101
102struct trap_info {
103 uint8_t vector; /* exception vector */
104 uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
105 uint16_t cs; /* code selector */
106 unsigned long address; /* code offset */
107};
108DEFINE_GUEST_HANDLE_STRUCT(trap_info);
109
110struct cpu_user_regs {
111 uint32_t ebx;
112 uint32_t ecx;
113 uint32_t edx;
114 uint32_t esi;
115 uint32_t edi;
116 uint32_t ebp;
117 uint32_t eax;
118 uint16_t error_code; /* private */
119 uint16_t entry_vector; /* private */
120 uint32_t eip;
121 uint16_t cs;
122 uint8_t saved_upcall_mask;
123 uint8_t _pad0;
124 uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
125 uint32_t esp;
126 uint16_t ss, _pad1;
127 uint16_t es, _pad2;
128 uint16_t ds, _pad3;
129 uint16_t fs, _pad4;
130 uint16_t gs, _pad5;
131};
132DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
133
134typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
135
136/*
137 * The following is all CPU context. Note that the fpu_ctxt block is filled
138 * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
139 */
140struct vcpu_guest_context {
141 /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
142 struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
143#define VGCF_I387_VALID (1<<0)
144#define VGCF_HVM_GUEST (1<<1)
145#define VGCF_IN_KERNEL (1<<2)
146 unsigned long flags; /* VGCF_* flags */
147 struct cpu_user_regs user_regs; /* User-level CPU registers */
148 struct trap_info trap_ctxt[256]; /* Virtual IDT */
149 unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
150 unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
151 unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
152 unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
153 unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
154 unsigned long event_callback_cs; /* CS:EIP of event callback */
155 unsigned long event_callback_eip;
156 unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
157 unsigned long failsafe_callback_eip;
158 unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
159};
160DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
161
162struct arch_shared_info {
163 unsigned long max_pfn; /* max pfn that appears in table */
164 /* Frame containing list of mfns containing list of mfns containing p2m. */
165 unsigned long pfn_to_mfn_frame_list_list;
166 unsigned long nmi_reason;
167};
168
169struct arch_vcpu_info {
170 unsigned long cr2;
171 unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
172};
173
174#endif /* !__ASSEMBLY__ */
175
176/*
177 * Prefix forces emulation of some non-trapping instructions.
178 * Currently only CPUID.
179 */
180#ifdef __ASSEMBLY__
181#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
182#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
183#else
184#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
185#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
186#endif
187
188#endif
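
The emulate prefix above is how a paravirtualized guest forces Xen to intercept CPUID. A sketch of a wrapper along the lines of the one used by the Xen enlighten code, assuming GCC inline asm (the function name is illustrative):

static void xen_cpuid_sketch(unsigned int *ax, unsigned int *bx,
			     unsigned int *cx, unsigned int *dx)
{
	asm(XEN_EMULATE_PREFIX "cpuid"
	    : "=a" (*ax), "=b" (*bx), "=c" (*cx), "=d" (*dx)
	    : "0" (*ax), "2" (*cx));
}
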
diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
index 9a1e0674e56c..e831759b2fb5 100644
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -38,17 +38,25 @@
38 * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two") 38 * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two")
39 * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef) 39 * ELFNOTE(XYZCo, 12, .long, 0xdeadbeef)
40 */ 40 */
41#define ELFNOTE(name, type, desctype, descdata) \ 41#define ELFNOTE_START(name, type, flags) \
42.pushsection .note.name, "",@note ; \ 42.pushsection .note.name, flags,@note ; \
43 .align 4 ; \ 43 .balign 4 ; \
44 .long 2f - 1f /* namesz */ ; \ 44 .long 2f - 1f /* namesz */ ; \
45 .long 4f - 3f /* descsz */ ; \ 45 .long 4484f - 3f /* descsz */ ; \
46 .long type ; \ 46 .long type ; \
471:.asciz #name ; \ 471:.asciz #name ; \
482:.align 4 ; \ 482:.balign 4 ; \
493:desctype descdata ; \ 493:
504:.align 4 ; \ 50
51#define ELFNOTE_END \
524484:.balign 4 ; \
51.popsection ; 53.popsection ;
54
55#define ELFNOTE(name, type, desc) \
56 ELFNOTE_START(name, type, "") \
57 desc ; \
58 ELFNOTE_END
59
52#else /* !__ASSEMBLER__ */ 60#else /* !__ASSEMBLER__ */
53#include <linux/elf.h> 61#include <linux/elf.h>
54/* 62/*
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 10f505c8431d..5dc13848891b 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -36,13 +36,57 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; }
36#define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x))) 36#define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x)))
37 37
38struct key; 38struct key;
39extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[], 39struct file;
40 struct key *session_keyring, int wait); 40struct subprocess_info;
41
42/* Allocate a subprocess_info structure */
43struct subprocess_info *call_usermodehelper_setup(char *path,
44 char **argv, char **envp);
45
46/* Set various pieces of state into the subprocess_info structure */
47void call_usermodehelper_setkeys(struct subprocess_info *info,
48 struct key *session_keyring);
49int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
50 struct file **filp);
51void call_usermodehelper_setcleanup(struct subprocess_info *info,
52 void (*cleanup)(char **argv, char **envp));
53
54enum umh_wait {
55 UMH_NO_WAIT = -1, /* don't wait at all */
56 UMH_WAIT_EXEC = 0, /* wait for the exec, but not the process */
57 UMH_WAIT_PROC = 1, /* wait for the process to complete */
58};
59
60/* Actually execute the sub-process */
61int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait);
62
63/* Free the subprocess_info. This is only needed if you're not going
64 to call call_usermodehelper_exec */
65void call_usermodehelper_freeinfo(struct subprocess_info *info);
41 66
42static inline int 67static inline int
43call_usermodehelper(char *path, char **argv, char **envp, int wait) 68call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait)
44{ 69{
45 return call_usermodehelper_keys(path, argv, envp, NULL, wait); 70 struct subprocess_info *info;
71
72 info = call_usermodehelper_setup(path, argv, envp);
73 if (info == NULL)
74 return -ENOMEM;
75 return call_usermodehelper_exec(info, wait);
76}
77
78static inline int
79call_usermodehelper_keys(char *path, char **argv, char **envp,
80 struct key *session_keyring, enum umh_wait wait)
81{
82 struct subprocess_info *info;
83
84 info = call_usermodehelper_setup(path, argv, envp);
85 if (info == NULL)
86 return -ENOMEM;
87
88 call_usermodehelper_setkeys(info, session_keyring);
89 return call_usermodehelper_exec(info, wait);
46} 90}
47 91
48extern void usermodehelper_init(void); 92extern void usermodehelper_init(void);
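
A hedged sketch of the new split API as declared above: allocate the subprocess_info, optionally attach state, then exec. The helper path and arguments are placeholders:

static int run_helper_sketch(void)
{
	char *argv[] = { "/sbin/helper", "--example", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp);
	if (info == NULL)
		return -ENOMEM;

	/* wait until the helper process has exited */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}
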
diff --git a/include/linux/major.h b/include/linux/major.h
index 7e7c9093919a..0cb98053537a 100644
--- a/include/linux/major.h
+++ b/include/linux/major.h
@@ -158,6 +158,8 @@
158#define VXSPEC_MAJOR 200 /* VERITAS volume config driver */ 158#define VXSPEC_MAJOR 200 /* VERITAS volume config driver */
159#define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */ 159#define VXDMP_MAJOR 201 /* VERITAS volume multipath driver */
160 160
161#define XENVBD_MAJOR 202 /* Xen virtual block device */
162
161#define MSR_MAJOR 202 163#define MSR_MAJOR 202
162#define CPUID_MAJOR 203 164#define CPUID_MAJOR 203
163 165
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ae2d79f2107e..731cd2ac3227 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -92,6 +92,7 @@
92 92
93/* PG_owner_priv_1 users should have descriptive aliases */ 93/* PG_owner_priv_1 users should have descriptive aliases */
94#define PG_checked PG_owner_priv_1 /* Used by some filesystems */ 94#define PG_checked PG_owner_priv_1 /* Used by some filesystems */
95#define PG_pinned PG_owner_priv_1 /* Xen pinned pagetable */
95 96
96#if (BITS_PER_LONG > 32) 97#if (BITS_PER_LONG > 32)
97/* 98/*
@@ -170,6 +171,10 @@ static inline void SetPageUptodate(struct page *page)
170#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags) 171#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags)
171#define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags) 172#define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags)
172 173
174#define PagePinned(page) test_bit(PG_pinned, &(page)->flags)
175#define SetPagePinned(page) set_bit(PG_pinned, &(page)->flags)
176#define ClearPagePinned(page) clear_bit(PG_pinned, &(page)->flags)
177
173#define PageReserved(page) test_bit(PG_reserved, &(page)->flags) 178#define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
174#define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) 179#define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags)
175#define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) 180#define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags)
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 1dd1c707311f..85ea63f462af 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -67,6 +67,11 @@ extern void kernel_power_off(void);
67 67
68void ctrl_alt_del(void); 68void ctrl_alt_del(void);
69 69
70#define POWEROFF_CMD_PATH_LEN 256
71extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
72
73extern int orderly_poweroff(bool force);
74
70/* 75/*
71 * Emergency restart, callable from an interrupt handler. 76 * Emergency restart, callable from an interrupt handler.
72 */ 77 */
diff --git a/include/linux/string.h b/include/linux/string.h
index 7f2eb6a477f9..836062b7582a 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -105,8 +105,12 @@ extern void * memchr(const void *,int,__kernel_size_t);
105#endif 105#endif
106 106
107extern char *kstrdup(const char *s, gfp_t gfp); 107extern char *kstrdup(const char *s, gfp_t gfp);
108extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
108extern void *kmemdup(const void *src, size_t len, gfp_t gfp); 109extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
109 110
111extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
112extern void argv_free(char **argv);
113
110#ifdef __cplusplus 114#ifdef __cplusplus
111} 115}
112#endif 116#endif
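
A short sketch of the argv_split()/argv_free() pair declared above; the command string is a placeholder:

static int argv_split_sketch(void)
{
	int argc;
	char **argv = argv_split(GFP_KERNEL, "/sbin/poweroff -f", &argc);

	if (argv == NULL)
		return -ENOMEM;
	/* argv[0..argc-1] now hold the whitespace-separated words */
	argv_free(argv);
	return 0;
}
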
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 132b260aef1e..c2b10cae5da5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -70,6 +70,10 @@ extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
70 struct page ***pages); 70 struct page ***pages);
71extern void unmap_kernel_range(unsigned long addr, unsigned long size); 71extern void unmap_kernel_range(unsigned long addr, unsigned long size);
72 72
73/* Allocate/destroy a 'vmalloc' VM area. */
74extern struct vm_struct *alloc_vm_area(size_t size);
75extern void free_vm_area(struct vm_struct *area);
76
73/* 77/*
 74 * Internals. Don't use. 78 * Internals. Don't use.
75 */ 79 */
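
A sketch of the alloc_vm_area()/free_vm_area() pair declared above, as a caller might use it to reserve kernel virtual address space whose mappings are filled in later (for example by the hypervisor); the function name is illustrative:

static void *reserve_kva_sketch(void)
{
	struct vm_struct *area = alloc_vm_area(PAGE_SIZE);

	if (area == NULL)
		return NULL;
	/* area->addr is a page-aligned chunk of kernel VA reserved for later use;
	 * release it again with free_vm_area(area) */
	return area->addr;
}
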
diff --git a/include/xen/events.h b/include/xen/events.h
new file mode 100644
index 000000000000..2bde54d29be5
--- /dev/null
+++ b/include/xen/events.h
@@ -0,0 +1,48 @@
1#ifndef _XEN_EVENTS_H
2#define _XEN_EVENTS_H
3
4#include <linux/interrupt.h>
5
6#include <xen/interface/event_channel.h>
7#include <asm/xen/hypercall.h>
8
9enum ipi_vector {
10 XEN_RESCHEDULE_VECTOR,
11 XEN_CALL_FUNCTION_VECTOR,
12
13 XEN_NR_IPIS,
14};
15
16int bind_evtchn_to_irq(unsigned int evtchn);
17int bind_evtchn_to_irqhandler(unsigned int evtchn,
18 irq_handler_t handler,
19 unsigned long irqflags, const char *devname,
20 void *dev_id);
21int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
22 irq_handler_t handler,
23 unsigned long irqflags, const char *devname,
24 void *dev_id);
25int bind_ipi_to_irqhandler(enum ipi_vector ipi,
26 unsigned int cpu,
27 irq_handler_t handler,
28 unsigned long irqflags,
29 const char *devname,
30 void *dev_id);
31
32/*
33 * Common unbind function for all event sources. Takes IRQ to unbind from.
34 * Automatically closes the underlying event channel (even for bindings
35 * made with bind_evtchn_to_irqhandler()).
36 */
37void unbind_from_irqhandler(unsigned int irq, void *dev_id);
38
39void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector);
40
41static inline void notify_remote_via_evtchn(int port)
42{
43 struct evtchn_send send = { .port = port };
44 (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
45}
46
47extern void notify_remote_via_irq(int irq);
48#endif /* _XEN_EVENTS_H */
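
A sketch of binding a virtual IRQ with the API above; the handler body, flags and device name are placeholders, and VIRQ_TIMER comes from xen/interface/xen.h:

static irqreturn_t xen_virq_handler_sketch(int irq, void *dev_id)
{
	/* acknowledge/process the virtual interrupt here */
	return IRQ_HANDLED;
}

static int bind_timer_virq_sketch(unsigned int cpu)
{
	int irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu,
					  xen_virq_handler_sketch,
					  IRQF_DISABLED, "timer-sketch", NULL);

	return irq < 0 ? irq : 0;
}
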
diff --git a/include/xen/features.h b/include/xen/features.h
new file mode 100644
index 000000000000..27292d4d2a6a
--- /dev/null
+++ b/include/xen/features.h
@@ -0,0 +1,23 @@
1/******************************************************************************
2 * features.h
3 *
4 * Query the features reported by Xen.
5 *
6 * Copyright (c) 2006, Ian Campbell
7 */
8
9#ifndef __XEN_FEATURES_H__
10#define __XEN_FEATURES_H__
11
12#include <xen/interface/features.h>
13
14void xen_setup_features(void);
15
16extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
17
18static inline int xen_feature(int flag)
19{
20 return xen_features[flag];
21}
22
 23#endif /* __XEN_FEATURES_H__ */
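
A usage sketch: after xen_setup_features() has filled in the map, individual flags are tested with xen_feature(); the message is a placeholder:

static void report_features_sketch(void)
{
	xen_setup_features();

	if (xen_feature(XENFEAT_auto_translated_physmap))
		printk(KERN_INFO "xen: auto-translated physmap in use\n");
}
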
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
new file mode 100644
index 000000000000..761c83498e03
--- /dev/null
+++ b/include/xen/grant_table.h
@@ -0,0 +1,107 @@
1/******************************************************************************
2 * grant_table.h
3 *
4 * Two sets of functionality:
5 * 1. Granting foreign access to our memory reservation.
6 * 2. Accessing others' memory reservations via grant references.
7 * (i.e., mechanisms for both sender and recipient of grant references)
8 *
9 * Copyright (c) 2004-2005, K A Fraser
10 * Copyright (c) 2005, Christopher Clark
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation; or, when distributed
15 * separately from the Linux kernel or incorporated into other
16 * software packages, subject to the following license:
17 *
18 * Permission is hereby granted, free of charge, to any person obtaining a copy
19 * of this source file (the "Software"), to deal in the Software without
20 * restriction, including without limitation the rights to use, copy, modify,
21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
22 * and to permit persons to whom the Software is furnished to do so, subject to
23 * the following conditions:
24 *
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
34 * IN THE SOFTWARE.
35 */
36
37#ifndef __ASM_GNTTAB_H__
38#define __ASM_GNTTAB_H__
39
40#include <asm/xen/hypervisor.h>
41#include <xen/interface/grant_table.h>
42
43/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
44#define NR_GRANT_FRAMES 4
45
46struct gnttab_free_callback {
47 struct gnttab_free_callback *next;
48 void (*fn)(void *);
49 void *arg;
50 u16 count;
51};
52
53int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
54 int readonly);
55
56/*
57 * End access through the given grant reference, iff the grant entry is no
58 * longer in use. Return 1 if the grant entry was freed, 0 if it is still in
59 * use.
60 */
61int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
62
63/*
64 * Eventually end access through the given grant reference, and once that
65 * access has been ended, free the given page too. Access will be ended
66 * immediately iff the grant entry is not in use, otherwise it will happen
67 * some time later. page may be 0, in which case no freeing will occur.
68 */
69void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
70 unsigned long page);
71
72int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
73
74unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
75unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
76
77int gnttab_query_foreign_access(grant_ref_t ref);
78
79/*
80 * operations on reserved batches of grant references
81 */
82int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
83
84void gnttab_free_grant_reference(grant_ref_t ref);
85
86void gnttab_free_grant_references(grant_ref_t head);
87
88int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
89
90int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
91
92void gnttab_release_grant_reference(grant_ref_t *private_head,
93 grant_ref_t release);
94
95void gnttab_request_free_callback(struct gnttab_free_callback *callback,
96 void (*fn)(void *), void *arg, u16 count);
97void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
98
99void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
100 unsigned long frame, int readonly);
101
102void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
103 unsigned long pfn);
104
105#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
106
107#endif /* __ASM_GNTTAB_H__ */
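
A hedged sketch of the common frontend pattern built on the API above: grant a peer domain access to one of our frames, hand it the reference out of band, and revoke it later. virt_to_mfn() is assumed from the arch Xen page helpers, and remote_domid is a placeholder:

static int share_page_sketch(domid_t remote_domid, void *shared_page)
{
	unsigned long mfn = virt_to_mfn(shared_page);	/* assumed helper */
	int ref = gnttab_grant_foreign_access(remote_domid, mfn, 0 /* rw */);

	if (ref < 0)
		return ref;

	/* ...publish 'ref' to the peer (e.g. via xenstore), use it, then: */
	gnttab_end_foreign_access(ref, 0 /* rw */, 0 /* don't free the page */);
	return 0;
}
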
diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h
new file mode 100644
index 000000000000..21c0ecfd786d
--- /dev/null
+++ b/include/xen/hvc-console.h
@@ -0,0 +1,6 @@
1#ifndef XEN_HVC_CONSOLE_H
2#define XEN_HVC_CONSOLE_H
3
4extern struct console xenboot_console;
5
6#endif /* XEN_HVC_CONSOLE_H */
diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h
new file mode 100644
index 000000000000..a64d3df5bd95
--- /dev/null
+++ b/include/xen/interface/elfnote.h
@@ -0,0 +1,133 @@
1/******************************************************************************
2 * elfnote.h
3 *
4 * Definitions used for the Xen ELF notes.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
7 */
8
9#ifndef __XEN_PUBLIC_ELFNOTE_H__
10#define __XEN_PUBLIC_ELFNOTE_H__
11
12/*
13 * The notes should live in a SHT_NOTE segment and have "Xen" in the
14 * name field.
15 *
16 * Numeric types are either 4 or 8 bytes depending on the content of
17 * the desc field.
18 *
 19 * LEGACY indicates the fields in the legacy __xen_guest string which
 20 * this note type replaces.
21 */
22
23/*
24 * NAME=VALUE pair (string).
25 *
26 * LEGACY: FEATURES and PAE
27 */
28#define XEN_ELFNOTE_INFO 0
29
30/*
31 * The virtual address of the entry point (numeric).
32 *
33 * LEGACY: VIRT_ENTRY
34 */
35#define XEN_ELFNOTE_ENTRY 1
36
37/* The virtual address of the hypercall transfer page (numeric).
38 *
39 * LEGACY: HYPERCALL_PAGE. (n.b. legacy value is a physical page
40 * number not a virtual address)
41 */
42#define XEN_ELFNOTE_HYPERCALL_PAGE 2
43
44/* The virtual address where the kernel image should be mapped (numeric).
45 *
46 * Defaults to 0.
47 *
48 * LEGACY: VIRT_BASE
49 */
50#define XEN_ELFNOTE_VIRT_BASE 3
51
52/*
 53 * The offset of the ELF paddr field from the actual required
 54 * pseudo-physical address (numeric).
55 *
56 * This is used to maintain backwards compatibility with older kernels
57 * which wrote __PAGE_OFFSET into that field. This field defaults to 0
58 * if not present.
59 *
60 * LEGACY: ELF_PADDR_OFFSET. (n.b. legacy default is VIRT_BASE)
61 */
62#define XEN_ELFNOTE_PADDR_OFFSET 4
63
64/*
65 * The version of Xen that we work with (string).
66 *
67 * LEGACY: XEN_VER
68 */
69#define XEN_ELFNOTE_XEN_VERSION 5
70
71/*
72 * The name of the guest operating system (string).
73 *
74 * LEGACY: GUEST_OS
75 */
76#define XEN_ELFNOTE_GUEST_OS 6
77
78/*
79 * The version of the guest operating system (string).
80 *
81 * LEGACY: GUEST_VER
82 */
83#define XEN_ELFNOTE_GUEST_VERSION 7
84
85/*
86 * The loader type (string).
87 *
88 * LEGACY: LOADER
89 */
90#define XEN_ELFNOTE_LOADER 8
91
92/*
93 * The kernel supports PAE (x86/32 only, string = "yes" or "no").
94 *
95 * LEGACY: PAE (n.b. The legacy interface included a provision to
96 * indicate 'extended-cr3' support allowing L3 page tables to be
97 * placed above 4G. It is assumed that any kernel new enough to use
98 * these ELF notes will include this and therefore "yes" here is
 99 * equivalent to "yes[extended-cr3]" in the __xen_guest interface.)
100 */
101#define XEN_ELFNOTE_PAE_MODE 9
102
103/*
104 * The features supported/required by this kernel (string).
105 *
106 * The string must consist of a list of feature names (as given in
107 * features.h, without the "XENFEAT_" prefix) separated by '|'
108 * characters. If a feature is required for the kernel to function
109 * then the feature name must be preceded by a '!' character.
110 *
111 * LEGACY: FEATURES
112 */
113#define XEN_ELFNOTE_FEATURES 10
114
115/*
116 * The kernel requires the symbol table to be loaded (string = "yes" or "no")
117 * LEGACY: BSD_SYMTAB (n.b. The legacy treated the presence or absence
118 * of this string as a boolean flag rather than requiring "yes" or
119 * "no".
120 */
121#define XEN_ELFNOTE_BSD_SYMTAB 11
122
123#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
124
125/*
126 * Local variables:
127 * mode: C
128 * c-set-style: "BSD"
129 * c-basic-offset: 4
130 * tab-width: 4
131 * indent-tabs-mode: nil
132 * End:
133 */
diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h
new file mode 100644
index 000000000000..919b5bdcb2bd
--- /dev/null
+++ b/include/xen/interface/event_channel.h
@@ -0,0 +1,195 @@
1/******************************************************************************
2 * event_channel.h
3 *
4 * Event channels between domains.
5 *
6 * Copyright (c) 2003-2004, K A Fraser.
7 */
8
9#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
10#define __XEN_PUBLIC_EVENT_CHANNEL_H__
11
12typedef uint32_t evtchn_port_t;
13DEFINE_GUEST_HANDLE(evtchn_port_t);
14
15/*
16 * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
17 * accepting interdomain bindings from domain <remote_dom>. A fresh port
18 * is allocated in <dom> and returned as <port>.
19 * NOTES:
20 * 1. If the caller is unprivileged then <dom> must be DOMID_SELF.
21 * 2. <rdom> may be DOMID_SELF, allowing loopback connections.
22 */
23#define EVTCHNOP_alloc_unbound 6
24struct evtchn_alloc_unbound {
25 /* IN parameters */
26 domid_t dom, remote_dom;
27 /* OUT parameters */
28 evtchn_port_t port;
29};
30
31/*
32 * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
33 * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
34 * a port that is unbound and marked as accepting bindings from the calling
35 * domain. A fresh port is allocated in the calling domain and returned as
36 * <local_port>.
37 * NOTES:
38 * 2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
39 */
40#define EVTCHNOP_bind_interdomain 0
41struct evtchn_bind_interdomain {
42 /* IN parameters. */
43 domid_t remote_dom;
44 evtchn_port_t remote_port;
45 /* OUT parameters. */
46 evtchn_port_t local_port;
47};
48
49/*
50 * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
51 * vcpu.
52 * NOTES:
53 * 1. A virtual IRQ may be bound to at most one event channel per vcpu.
54 * 2. The allocated event channel is bound to the specified vcpu. The binding
55 * may not be changed.
56 */
57#define EVTCHNOP_bind_virq 1
58struct evtchn_bind_virq {
59 /* IN parameters. */
60 uint32_t virq;
61 uint32_t vcpu;
62 /* OUT parameters. */
63 evtchn_port_t port;
64};
65
66/*
67 * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
68 * NOTES:
69 * 1. A physical IRQ may be bound to at most one event channel per domain.
70 * 2. Only a sufficiently-privileged domain may bind to a physical IRQ.
71 */
72#define EVTCHNOP_bind_pirq 2
73struct evtchn_bind_pirq {
74 /* IN parameters. */
75 uint32_t pirq;
76#define BIND_PIRQ__WILL_SHARE 1
77 uint32_t flags; /* BIND_PIRQ__* */
78 /* OUT parameters. */
79 evtchn_port_t port;
80};
81
82/*
83 * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
84 * NOTES:
85 * 1. The allocated event channel is bound to the specified vcpu. The binding
86 * may not be changed.
87 */
88#define EVTCHNOP_bind_ipi 7
89struct evtchn_bind_ipi {
90 uint32_t vcpu;
91 /* OUT parameters. */
92 evtchn_port_t port;
93};
94
95/*
96 * EVTCHNOP_close: Close a local event channel <port>. If the channel is
97 * interdomain then the remote end is placed in the unbound state
98 * (EVTCHNSTAT_unbound), awaiting a new connection.
99 */
100#define EVTCHNOP_close 3
101struct evtchn_close {
102 /* IN parameters. */
103 evtchn_port_t port;
104};
105
106/*
107 * EVTCHNOP_send: Send an event to the remote end of the channel whose local
108 * endpoint is <port>.
109 */
110#define EVTCHNOP_send 4
111struct evtchn_send {
112 /* IN parameters. */
113 evtchn_port_t port;
114};
115
116/*
117 * EVTCHNOP_status: Get the current status of the communication channel which
118 * has an endpoint at <dom, port>.
119 * NOTES:
120 * 1. <dom> may be specified as DOMID_SELF.
121 * 2. Only a sufficiently-privileged domain may obtain the status of an event
122 * channel for which <dom> is not DOMID_SELF.
123 */
124#define EVTCHNOP_status 5
125struct evtchn_status {
126 /* IN parameters */
127 domid_t dom;
128 evtchn_port_t port;
129 /* OUT parameters */
130#define EVTCHNSTAT_closed 0 /* Channel is not in use. */
131#define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/
132#define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */
133#define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */
134#define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */
135#define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */
136 uint32_t status;
137 uint32_t vcpu; /* VCPU to which this channel is bound. */
138 union {
139 struct {
140 domid_t dom;
141 } unbound; /* EVTCHNSTAT_unbound */
142 struct {
143 domid_t dom;
144 evtchn_port_t port;
145 } interdomain; /* EVTCHNSTAT_interdomain */
146 uint32_t pirq; /* EVTCHNSTAT_pirq */
147 uint32_t virq; /* EVTCHNSTAT_virq */
148 } u;
149};
150
151/*
152 * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
153 * event is pending.
154 * NOTES:
155 * 1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
156 * the binding. This binding cannot be changed.
157 * 2. All other channels notify vcpu0 by default. This default is set when
158 * the channel is allocated (a port that is freed and subsequently reused
159 * has its binding reset to vcpu0).
160 */
161#define EVTCHNOP_bind_vcpu 8
162struct evtchn_bind_vcpu {
163 /* IN parameters. */
164 evtchn_port_t port;
165 uint32_t vcpu;
166};
167
168/*
169 * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
170 * a notification to the appropriate VCPU if an event is pending.
171 */
172#define EVTCHNOP_unmask 9
173struct evtchn_unmask {
174 /* IN parameters. */
175 evtchn_port_t port;
176};
177
178struct evtchn_op {
179 uint32_t cmd; /* EVTCHNOP_* */
180 union {
181 struct evtchn_alloc_unbound alloc_unbound;
182 struct evtchn_bind_interdomain bind_interdomain;
183 struct evtchn_bind_virq bind_virq;
184 struct evtchn_bind_pirq bind_pirq;
185 struct evtchn_bind_ipi bind_ipi;
186 struct evtchn_close close;
187 struct evtchn_send send;
188 struct evtchn_status status;
189 struct evtchn_bind_vcpu bind_vcpu;
190 struct evtchn_unmask unmask;
191 } u;
192};
193DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);
194
195#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
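
A sketch of issuing one of these operations through the HYPERVISOR_event_channel_op() wrapper from asm-i386/xen/hypercall.h; DOMID_SELF comes from xen/interface/xen.h and remote_domid is a placeholder:

static int alloc_unbound_port_sketch(domid_t remote_domid, evtchn_port_t *port)
{
	struct evtchn_alloc_unbound op = {
		.dom        = DOMID_SELF,	/* allocate the port locally */
		.remote_dom = remote_domid,	/* peer allowed to bind to it */
	};
	int err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);

	if (err == 0)
		*port = op.port;
	return err;
}
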
diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h
new file mode 100644
index 000000000000..d73228d16488
--- /dev/null
+++ b/include/xen/interface/features.h
@@ -0,0 +1,43 @@
1/******************************************************************************
2 * features.h
3 *
4 * Feature flags, reported by XENVER_get_features.
5 *
6 * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
7 */
8
9#ifndef __XEN_PUBLIC_FEATURES_H__
10#define __XEN_PUBLIC_FEATURES_H__
11
12/*
13 * If set, the guest does not need to write-protect its pagetables, and can
14 * update them via direct writes.
15 */
16#define XENFEAT_writable_page_tables 0
17
18/*
19 * If set, the guest does not need to write-protect its segment descriptor
20 * tables, and can update them via direct writes.
21 */
22#define XENFEAT_writable_descriptor_tables 1
23
24/*
25 * If set, translation between the guest's 'pseudo-physical' address space
26 * and the host's machine address space are handled by the hypervisor. In this
27 * mode the guest does not need to perform phys-to/from-machine translations
28 * when performing page table operations.
29 */
30#define XENFEAT_auto_translated_physmap 2
31
32/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
33#define XENFEAT_supervisor_mode_kernel 3
34
35/*
36 * If set, the guest does not need to allocate x86 PAE page directories
37 * below 4GB. This flag is usually implied by auto_translated_physmap.
38 */
39#define XENFEAT_pae_pgdir_above_4gb 4
40
41#define XENFEAT_NR_SUBMAPS 1
42
43#endif /* __XEN_PUBLIC_FEATURES_H__ */
diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
new file mode 100644
index 000000000000..219049802cf2
--- /dev/null
+++ b/include/xen/interface/grant_table.h
@@ -0,0 +1,375 @@
1/******************************************************************************
2 * grant_table.h
3 *
4 * Interface for granting foreign access to page frames, and receiving
5 * page-ownership transfers.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to
9 * deal in the Software without restriction, including without limitation the
10 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
11 * sell copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 *
25 * Copyright (c) 2004, K A Fraser
26 */
27
28#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
29#define __XEN_PUBLIC_GRANT_TABLE_H__
30
31
32/***********************************
33 * GRANT TABLE REPRESENTATION
34 */
35
36/* Some rough guidelines on accessing and updating grant-table entries
37 * in a concurrency-safe manner. For more information, Linux contains a
38 * reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
39 *
40 * NB. WMB is a no-op on current-generation x86 processors. However, a
41 * compiler barrier will still be required.
42 *
43 * Introducing a valid entry into the grant table:
44 * 1. Write ent->domid.
45 * 2. Write ent->frame:
46 * GTF_permit_access: Frame to which access is permitted.
47 * GTF_accept_transfer: Pseudo-phys frame slot being filled by new
48 * frame, or zero if none.
49 * 3. Write memory barrier (WMB).
50 * 4. Write ent->flags, inc. valid type.
51 *
52 * Invalidating an unused GTF_permit_access entry:
53 * 1. flags = ent->flags.
54 * 2. Observe that !(flags & (GTF_reading|GTF_writing)).
55 * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
56 * NB. No need for WMB as reuse of entry is control-dependent on success of
57 * step 3, and all architectures guarantee ordering of ctrl-dep writes.
58 *
59 * Invalidating an in-use GTF_permit_access entry:
60 * This cannot be done directly. Request assistance from the domain controller
61 * which can set a timeout on the use of a grant entry and take necessary
62 * action. (NB. This is not yet implemented!).
63 *
64 * Invalidating an unused GTF_accept_transfer entry:
65 * 1. flags = ent->flags.
66 * 2. Observe that !(flags & GTF_transfer_committed). [*]
67 * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
68 * NB. No need for WMB as reuse of entry is control-dependent on success of
69 * step 3, and all architectures guarantee ordering of ctrl-dep writes.
70 * [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
71 * The guest must /not/ modify the grant entry until the address of the
72 * transferred frame is written. It is safe for the guest to spin waiting
73 * for this to occur (detect by observing GTF_transfer_completed in
74 * ent->flags).
75 *
76 * Invalidating a committed GTF_accept_transfer entry:
77 * 1. Wait for (ent->flags & GTF_transfer_completed).
78 *
79 * Changing a GTF_permit_access from writable to read-only:
80 * Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
81 *
82 * Changing a GTF_permit_access from read-only to writable:
83 * Use SMP-safe bit-setting instruction.
84 */
85
86/*
87 * A grant table comprises a packed array of grant entries in one or more
88 * page frames shared between Xen and a guest.
89 * [XEN]: This field is written by Xen and read by the sharing guest.
90 * [GST]: This field is written by the guest and read by Xen.
91 */
92struct grant_entry {
93 /* GTF_xxx: various type and flag information. [XEN,GST] */
94 uint16_t flags;
95 /* The domain being granted foreign privileges. [GST] */
96 domid_t domid;
97 /*
98 * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
99 * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
100 */
101 uint32_t frame;
102};
103
104/*
105 * Type of grant entry.
106 * GTF_invalid: This grant entry grants no privileges.
107 * GTF_permit_access: Allow @domid to map/access @frame.
108 * GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
109 * to this guest. Xen writes the page number to @frame.
110 */
111#define GTF_invalid (0U<<0)
112#define GTF_permit_access (1U<<0)
113#define GTF_accept_transfer (2U<<0)
114#define GTF_type_mask (3U<<0)
115
116/*
117 * Subflags for GTF_permit_access.
118 * GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
119 * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
120 * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
121 */
122#define _GTF_readonly (2)
123#define GTF_readonly (1U<<_GTF_readonly)
124#define _GTF_reading (3)
125#define GTF_reading (1U<<_GTF_reading)
126#define _GTF_writing (4)
127#define GTF_writing (1U<<_GTF_writing)
128
129/*
130 * Subflags for GTF_accept_transfer:
131 * GTF_transfer_committed: Xen sets this flag to indicate that it is committed
132 * to transferring ownership of a page frame. When a guest sees this flag
133 * it must /not/ modify the grant entry until GTF_transfer_completed is
134 * set by Xen.
135 * GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
136 * after reading GTF_transfer_committed. Xen will always write the frame
137 * address, followed by ORing this flag, in a timely manner.
138 */
139#define _GTF_transfer_committed (2)
140#define GTF_transfer_committed (1U<<_GTF_transfer_committed)
141#define _GTF_transfer_completed (3)
142#define GTF_transfer_completed (1U<<_GTF_transfer_completed)
143
144
145/***********************************
146 * GRANT TABLE QUERIES AND USES
147 */
148
149/*
150 * Reference to a grant entry in a specified domain's grant table.
151 */
152typedef uint32_t grant_ref_t;
153
154/*
155 * Handle to track a mapping created via a grant reference.
156 */
157typedef uint32_t grant_handle_t;
158
159/*
160 * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
161 * by devices and/or host CPUs. If successful, <handle> is a tracking number
162 * that must be presented later to destroy the mapping(s). On error, <handle>
163 * is a negative status code.
164 * NOTES:
165 * 1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
166 * via which I/O devices may access the granted frame.
167 * 2. If GNTMAP_host_map is specified then a mapping will be added at
168 * either a host virtual address in the current address space, or at
169 * a PTE at the specified machine address. The type of mapping to
170 * perform is selected through the GNTMAP_contains_pte flag, and the
171 * address is specified in <host_addr>.
172 * 3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
173 * host mapping is destroyed by other means then it is *NOT* guaranteed
174 * to be accounted to the correct grant reference!
175 */
176#define GNTTABOP_map_grant_ref 0
177struct gnttab_map_grant_ref {
178 /* IN parameters. */
179 uint64_t host_addr;
180 uint32_t flags; /* GNTMAP_* */
181 grant_ref_t ref;
182 domid_t dom;
183 /* OUT parameters. */
184 int16_t status; /* GNTST_* */
185 grant_handle_t handle;
186 uint64_t dev_bus_addr;
187};
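A hedged sketch of a single host mapping, assuming the HYPERVISOR_grant_table_op(cmd, uop, count) wrapper from the hypercall header added elsewhere in this series; the function name and its arguments are placeholders:

static int sketch_map_foreign_page(unsigned long vaddr, domid_t remote_domid,
				   grant_ref_t remote_ref,
				   grant_handle_t *handle)
{
	struct gnttab_map_grant_ref op = {
		.host_addr = vaddr,		/* map here in our address space */
		.flags     = GNTMAP_host_map,	/* host CPU mapping, read/write */
		.ref       = remote_ref,	/* grant issued by the peer */
		.dom       = remote_domid,
	};

	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
		BUG();			/* the hypercall itself should not fail */
	if (op.status != GNTST_okay)
		return op.status;	/* e.g. GNTST_bad_gntref */
	*handle = op.handle;		/* needed for GNTTABOP_unmap_grant_ref */
	return 0;
}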
188
189/*
190 * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
191 * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
192 * field is ignored. If non-zero, they must refer to a device/host mapping
193 * that is tracked by <handle>.
194 * NOTES:
195 * 1. The call may fail in an undefined manner if either mapping is not
196 * tracked by <handle>.
197 * 2. After executing a batch of unmaps, it is guaranteed that no stale
198 * mappings will remain in the device or host TLBs.
199 */
200#define GNTTABOP_unmap_grant_ref 1
201struct gnttab_unmap_grant_ref {
202 /* IN parameters. */
203 uint64_t host_addr;
204 uint64_t dev_bus_addr;
205 grant_handle_t handle;
206 /* OUT parameters. */
207 int16_t status; /* GNTST_* */
208};
209
210/*
211 * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
212 * <nr_frames> pages. The frame addresses are written to the <frame_list>.
213 * Only <nr_frames> addresses are written, even if the table is larger.
214 * NOTES:
215 * 1. <dom> may be specified as DOMID_SELF.
216 * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
217 * 3. Xen may not support more than a single grant-table page per domain.
218 */
219#define GNTTABOP_setup_table 2
220struct gnttab_setup_table {
221 /* IN parameters. */
222 domid_t dom;
223 uint32_t nr_frames;
224 /* OUT parameters. */
225 int16_t status; /* GNTST_* */
226 ulong *frame_list;
227};
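A sketch of how a guest might fetch the frames backing its own grant table; note that in this Linux copy of the header frame_list is a plain pointer rather than a guest handle. The function name is hypothetical:

static int sketch_get_gnttab_frames(unsigned long *frames, unsigned int nr)
{
	struct gnttab_setup_table setup = {
		.dom        = DOMID_SELF,
		.nr_frames  = nr,
		.frame_list = frames,
	};

	if (HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1))
		BUG();
	if (setup.status != GNTST_okay)
		return setup.status;
	/* frames[0..nr-1] now hold the machine frames of the shared table. */
	return 0;
}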
228
229/*
230 * GNTTABOP_dump_table: Dump the contents of the grant table to the
231 * xen console. Debugging use only.
232 */
233#define GNTTABOP_dump_table 3
234struct gnttab_dump_table {
235 /* IN parameters. */
236 domid_t dom;
237 /* OUT parameters. */
238 int16_t status; /* GNTST_* */
239};
240
241/*
242 * GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The
243 * foreign domain has previously registered its interest in the transfer via
244 * <domid, ref>.
245 *
246 * Note that, even if the transfer fails, the specified page no longer belongs
247 * to the calling domain *unless* the error is GNTST_bad_page.
248 */
249#define GNTTABOP_transfer 4
250struct gnttab_transfer {
251 /* IN parameters. */
252 unsigned long mfn;
253 domid_t domid;
254 grant_ref_t ref;
255 /* OUT parameters. */
256 int16_t status;
257};
258
259
260/*
261 * GNTTABOP_copy: Hypervisor-based copy.
262 * The source and destination can be either MFNs or, for foreign domains,
263 * grant references. The foreign domain has to grant read/write access
264 * in its grant table.
265 *
266 * The flags specify whether the source and destination are an MFN
267 * or a grant reference.
268 *
269 * Note that this can also be used to copy data between two domains
270 * via a third party if the source and destination domains have previously
271 * granted appropriate access to their pages to the third party.
272 *
273 * source_offset specifies an offset in the source frame, dest_offset
274 * the offset in the target frame, and len specifies the number of
275 * bytes to be copied.
276 */
277
278#define _GNTCOPY_source_gref (0)
279#define GNTCOPY_source_gref (1<<_GNTCOPY_source_gref)
280#define _GNTCOPY_dest_gref (1)
281#define GNTCOPY_dest_gref (1<<_GNTCOPY_dest_gref)
282
283#define GNTTABOP_copy 5
284struct gnttab_copy {
285 /* IN parameters. */
286 struct {
287 union {
288 grant_ref_t ref;
289 unsigned long gmfn;
290 } u;
291 domid_t domid;
292 uint16_t offset;
293 } source, dest;
294 uint16_t len;
295 uint16_t flags; /* GNTCOPY_* */
296 /* OUT parameters. */
297 int16_t status;
298};
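A hedged sketch of one copy descriptor: len bytes are copied out of one of our own frames into a frame the peer has granted us write access to; the function name and arguments are placeholders:

static int sketch_copy_to_peer(unsigned long local_mfn, domid_t remote_domid,
			       grant_ref_t remote_gref, uint16_t len)
{
	struct gnttab_copy copy = {
		.source.u.gmfn = local_mfn,	/* our frame: plain MFN */
		.source.domid  = DOMID_SELF,
		.source.offset = 0,
		.dest.u.ref    = remote_gref,	/* peer's frame: grant ref */
		.dest.domid    = remote_domid,
		.dest.offset   = 0,
		.len           = len,
		.flags         = GNTCOPY_dest_gref, /* only dest is a gref */
	};

	HYPERVISOR_grant_table_op(GNTTABOP_copy, &copy, 1);
	return copy.status;	/* e.g. GNTST_bad_copy_arg on page overrun */
}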
299
300/*
301 * GNTTABOP_query_size: Query the current and maximum sizes of the shared
302 * grant table.
303 * NOTES:
304 * 1. <dom> may be specified as DOMID_SELF.
305 * 2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
306 */
307#define GNTTABOP_query_size 6
308struct gnttab_query_size {
309 /* IN parameters. */
310 domid_t dom;
311 /* OUT parameters. */
312 uint32_t nr_frames;
313 uint32_t max_nr_frames;
314 int16_t status; /* GNTST_* */
315};
316
317
318/*
319 * Bitfield values for update_pin_status.flags.
320 */
321 /* Map the grant entry for access by I/O devices. */
322#define _GNTMAP_device_map (0)
323#define GNTMAP_device_map (1<<_GNTMAP_device_map)
324 /* Map the grant entry for access by host CPUs. */
325#define _GNTMAP_host_map (1)
326#define GNTMAP_host_map (1<<_GNTMAP_host_map)
327 /* Accesses to the granted frame will be restricted to read-only access. */
328#define _GNTMAP_readonly (2)
329#define GNTMAP_readonly (1<<_GNTMAP_readonly)
330 /*
331 * GNTMAP_host_map subflag:
332 * 0 => The host mapping is usable only by the guest OS.
333 * 1 => The host mapping is usable by guest OS + current application.
334 */
335#define _GNTMAP_application_map (3)
336#define GNTMAP_application_map (1<<_GNTMAP_application_map)
337
338 /*
339 * GNTMAP_contains_pte subflag:
340 * 0 => This map request contains a host virtual address.
341 * 1 => This map request contains the machine address of the PTE to update.
342 */
343#define _GNTMAP_contains_pte (4)
344#define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte)
345
346/*
347 * Values for error status returns. All errors are -ve.
348 */
349#define GNTST_okay (0) /* Normal return. */
350#define GNTST_general_error (-1) /* General undefined error. */
351#define GNTST_bad_domain (-2) /* Unrecognised domain id. */
352#define GNTST_bad_gntref (-3) /* Unrecognised or inappropriate gntref. */
353#define GNTST_bad_handle (-4) /* Unrecognised or inappropriate handle. */
354#define GNTST_bad_virt_addr (-5) /* Inappropriate virtual address to map. */
355#define GNTST_bad_dev_addr (-6) /* Inappropriate device address to unmap.*/
356#define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */
357#define GNTST_permission_denied (-8) /* Not enough privilege for operation. */
358#define GNTST_bad_page (-9) /* Specified page was invalid for op. */
359#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary */
360
361#define GNTTABOP_error_msgs { \
362 "okay", \
363 "undefined error", \
364 "unrecognised domain id", \
365 "invalid grant reference", \
366 "invalid mapping handle", \
367 "invalid virtual address", \
368 "invalid device address", \
369 "no spare translation slot in the I/O MMU", \
370 "permission denied", \
371 "bad page", \
372 "copy arguments cross page boundary" \
373}
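The string table lines up with the GNTST_* codes above (index 0 is GNTST_okay, index 1 corresponds to GNTST_general_error, and so on), so a small helper can turn a status into a message. This is only a sketch, not something the header itself provides:

static const char *sketch_gnttab_strerror(int status)
{
	static const char *const msgs[] = GNTTABOP_error_msgs;

	status = -status;		/* GNTST_* errors are negative */
	if (status < 0 || status >= (int)(sizeof(msgs) / sizeof(msgs[0])))
		return "unknown grant-table status";
	return msgs[status];
}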
374
375#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
new file mode 100644
index 000000000000..c2d1fa4dc1ee
--- /dev/null
+++ b/include/xen/interface/io/blkif.h
@@ -0,0 +1,94 @@
1/******************************************************************************
2 * blkif.h
3 *
4 * Unified block-device I/O interface for Xen guest OSes.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser
7 */
8
9#ifndef __XEN_PUBLIC_IO_BLKIF_H__
10#define __XEN_PUBLIC_IO_BLKIF_H__
11
12#include "ring.h"
13#include "../grant_table.h"
14
15/*
16 * Front->back notifications: When enqueuing a new request, sending a
17 * notification can be made conditional on req_event (i.e., the generic
18 * hold-off mechanism provided by the ring macros). Backends must set
19 * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
20 *
21 * Back->front notifications: When enqueuing a new response, sending a
22 * notification can be made conditional on rsp_event (i.e., the generic
23 * hold-off mechanism provided by the ring macros). Frontends must set
24 * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
25 */
26
27typedef uint16_t blkif_vdev_t;
28typedef uint64_t blkif_sector_t;
29
30/*
31 * REQUEST CODES.
32 */
33#define BLKIF_OP_READ 0
34#define BLKIF_OP_WRITE 1
35/*
36 * Recognised only if "feature-barrier" is present in backend xenbus info.
37 * The "feature-barrier" node contains a boolean indicating whether barrier
38 * requests are likely to succeed or fail. Either way, a barrier request
39 * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
40 * the underlying block-device hardware. The boolean simply indicates whether
41 * or not it is worthwhile for the frontend to attempt barrier requests.
42 * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
43 * create the "feature-barrier" node!
44 */
45#define BLKIF_OP_WRITE_BARRIER 2
46
47/*
48 * Maximum scatter/gather segments per request.
49 * This is carefully chosen so that sizeof(struct blkif_sring) <= PAGE_SIZE.
50 * NB. This could be 12 if the ring indexes weren't stored in the same page.
51 */
52#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
53
54struct blkif_request {
55 uint8_t operation; /* BLKIF_OP_??? */
56 uint8_t nr_segments; /* number of segments */
57 blkif_vdev_t handle; /* only for read/write requests */
58 uint64_t id; /* private guest value, echoed in resp */
59 blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
60 struct blkif_request_segment {
61 grant_ref_t gref; /* reference to I/O buffer frame */
62 /* @first_sect: first sector in frame to transfer (inclusive). */
63 /* @last_sect: last sector in frame to transfer (inclusive). */
64 uint8_t first_sect, last_sect;
65 } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
66};
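To show how the segment fields fit together, here is a hedged sketch of a one-segment, whole-page read request placed on a blkif front ring (the ring macros are defined in ring.h, added later in this patch); the function name and arguments are placeholders:

static void sketch_queue_page_read(struct blkif_front_ring *ring,
				   blkif_vdev_t vdev, grant_ref_t gref,
				   uint64_t id, blkif_sector_t start_sector)
{
	struct blkif_request *req =
		RING_GET_REQUEST(ring, ring->req_prod_pvt);

	req->operation     = BLKIF_OP_READ;
	req->handle        = vdev;		/* virtual device to read */
	req->id            = id;		/* echoed in the response */
	req->sector_number = start_sector;	/* in 512-byte sectors */
	req->nr_segments   = 1;
	req->seg[0].gref       = gref;		/* grant covering the page */
	req->seg[0].first_sect = 0;
	req->seg[0].last_sect  = 7;		/* sectors 0..7: one 4K page */

	ring->req_prod_pvt++;	/* then push and notify as usual */
}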
67
68struct blkif_response {
69 uint64_t id; /* copied from request */
70 uint8_t operation; /* copied from request */
71 int16_t status; /* BLKIF_RSP_??? */
72};
73
74/*
75 * STATUS RETURN CODES.
76 */
77 /* Operation not supported (only happens on barrier writes). */
78#define BLKIF_RSP_EOPNOTSUPP -2
79 /* Operation failed for some unspecified reason (-EIO). */
80#define BLKIF_RSP_ERROR -1
81 /* Operation completed successfully. */
82#define BLKIF_RSP_OKAY 0
83
84/*
85 * Generate blkif ring structures and types.
86 */
87
88DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
89
90#define VDISK_CDROM 0x1
91#define VDISK_REMOVABLE 0x2
92#define VDISK_READONLY 0x4
93
94#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
diff --git a/include/xen/interface/io/console.h b/include/xen/interface/io/console.h
new file mode 100644
index 000000000000..e563de70f784
--- /dev/null
+++ b/include/xen/interface/io/console.h
@@ -0,0 +1,23 @@
1/******************************************************************************
2 * console.h
3 *
4 * Console I/O interface for Xen guest OSes.
5 *
6 * Copyright (c) 2005, Keir Fraser
7 */
8
9#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
10#define __XEN_PUBLIC_IO_CONSOLE_H__
11
12typedef uint32_t XENCONS_RING_IDX;
13
14#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
15
16struct xencons_interface {
17 char in[1024];
18 char out[2048];
19 XENCONS_RING_IDX in_cons, in_prod;
20 XENCONS_RING_IDX out_cons, out_prod;
21};
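A minimal producer sketch for the output ring, assuming intf points at the shared console page; the event-channel notification that wakes the backend is left to the caller, and the function name is hypothetical:

static int sketch_console_write(struct xencons_interface *intf,
				const char *data, int len)
{
	int sent = 0;
	XENCONS_RING_IDX cons = intf->out_cons;
	XENCONS_RING_IDX prod = intf->out_prod;

	mb();			/* read the indexes before filling the ring */
	while (sent < len && (prod - cons) < sizeof(intf->out))
		intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
	wmb();			/* ring contents visible before producer bump */
	intf->out_prod = prod;
	return sent;		/* caller kicks the console event channel */
}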
22
23#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h
new file mode 100644
index 000000000000..518481c95f18
--- /dev/null
+++ b/include/xen/interface/io/netif.h
@@ -0,0 +1,158 @@
1/******************************************************************************
2 * netif.h
3 *
4 * Unified network-device I/O interface for Xen guest OSes.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser
7 */
8
9#ifndef __XEN_PUBLIC_IO_NETIF_H__
10#define __XEN_PUBLIC_IO_NETIF_H__
11
12#include "ring.h"
13#include "../grant_table.h"
14
15/*
16 * Notifications after enqueuing any type of message should be conditional on
17 * the appropriate req_event or rsp_event field in the shared ring.
18 * If the client sends notification for rx requests then it should specify
19 * feature 'feature-rx-notify' via xenbus. Otherwise the backend will assume
20 * that it cannot safely queue packets (as it may not be kicked to send them).
21 */
22
23/*
24 * This is the 'wire' format for packets:
25 * Request 1: netif_tx_request -- NETTXF_* (any flags)
26 * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info)
27 * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_FLAG_MORE)
28 * Request 4: netif_tx_request -- NETTXF_more_data
29 * Request 5: netif_tx_request -- NETTXF_more_data
30 * ...
31 * Request N: netif_tx_request -- 0
32 */
33
34/* Protocol checksum field is blank in the packet (hardware offload)? */
35#define _NETTXF_csum_blank (0)
36#define NETTXF_csum_blank (1U<<_NETTXF_csum_blank)
37
38/* Packet data has been validated against protocol checksum. */
39#define _NETTXF_data_validated (1)
40#define NETTXF_data_validated (1U<<_NETTXF_data_validated)
41
42/* Packet continues in the next request descriptor. */
43#define _NETTXF_more_data (2)
44#define NETTXF_more_data (1U<<_NETTXF_more_data)
45
46/* Packet to be followed by extra descriptor(s). */
47#define _NETTXF_extra_info (3)
48#define NETTXF_extra_info (1U<<_NETTXF_extra_info)
49
50struct xen_netif_tx_request {
51 grant_ref_t gref; /* Reference to buffer page */
52 uint16_t offset; /* Offset within buffer page */
53 uint16_t flags; /* NETTXF_* */
54 uint16_t id; /* Echoed in response message. */
55 uint16_t size; /* Packet size in bytes. */
56};
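A hedged sketch of the first (and, for a small single-page packet, only) slot, following the wire format described above; the function name and arguments are placeholders:

static void sketch_queue_tx(struct xen_netif_tx_front_ring *ring,
			    grant_ref_t gref, uint16_t data_offset,
			    uint16_t pkt_len, uint16_t slot_id)
{
	struct xen_netif_tx_request *tx =
		RING_GET_REQUEST(ring, ring->req_prod_pvt);

	tx->gref   = gref;		/* grant covering the packet's page */
	tx->offset = data_offset;	/* packet start within that page */
	tx->size   = pkt_len;		/* total packet length in bytes */
	tx->id     = slot_id;		/* echoed in xen_netif_tx_response */
	tx->flags  = NETTXF_csum_blank | NETTXF_data_validated;

	ring->req_prod_pvt++;		/* then push and notify the backend */
}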
57
58/* Types of netif_extra_info descriptors. */
59#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */
60#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */
61#define XEN_NETIF_EXTRA_TYPE_MAX (2)
62
63/* netif_extra_info flags. */
64#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
65#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
66
67/* GSO types - only TCPv4 currently supported. */
68#define XEN_NETIF_GSO_TYPE_TCPV4 (1)
69
70/*
71 * This structure needs to fit within both netif_tx_request and
72 * netif_rx_response for compatibility.
73 */
74struct xen_netif_extra_info {
75 uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */
76 uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
77
78 union {
79 struct {
80 /*
81 * Maximum payload size of each segment. For
82 * example, for TCP this is just the path MSS.
83 */
84 uint16_t size;
85
86 /*
87 * GSO type. This determines the protocol of
88 * the packet and any extra features required
89 * to segment the packet properly.
90 */
91 uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
92
93 /* Future expansion. */
94 uint8_t pad;
95
96 /*
97 * GSO features. This specifies any extra GSO
98 * features required to process this packet,
99 * such as ECN support for TCPv4.
100 */
101 uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
102 } gso;
103
104 uint16_t pad[3];
105 } u;
106};
107
108struct xen_netif_tx_response {
109 uint16_t id;
110 int16_t status; /* NETIF_RSP_* */
111};
112
113struct xen_netif_rx_request {
114 uint16_t id; /* Echoed in response message. */
115 grant_ref_t gref; /* Reference to incoming granted frame */
116};
117
118/* Packet data has been validated against protocol checksum. */
119#define _NETRXF_data_validated (0)
120#define NETRXF_data_validated (1U<<_NETRXF_data_validated)
121
122/* Protocol checksum field is blank in the packet (hardware offload)? */
123#define _NETRXF_csum_blank (1)
124#define NETRXF_csum_blank (1U<<_NETRXF_csum_blank)
125
126/* Packet continues in the next request descriptor. */
127#define _NETRXF_more_data (2)
128#define NETRXF_more_data (1U<<_NETRXF_more_data)
129
130/* Packet to be followed by extra descriptor(s). */
131#define _NETRXF_extra_info (3)
132#define NETRXF_extra_info (1U<<_NETRXF_extra_info)
133
134struct xen_netif_rx_response {
135 uint16_t id;
136 uint16_t offset; /* Offset in page of start of received packet */
137 uint16_t flags; /* NETRXF_* */
138 int16_t status; /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
139};
140
141/*
142 * Generate netif ring structures and types.
143 */
144
145DEFINE_RING_TYPES(xen_netif_tx,
146 struct xen_netif_tx_request,
147 struct xen_netif_tx_response);
148DEFINE_RING_TYPES(xen_netif_rx,
149 struct xen_netif_rx_request,
150 struct xen_netif_rx_response);
151
152#define NETIF_RSP_DROPPED -2
153#define NETIF_RSP_ERROR -1
154#define NETIF_RSP_OKAY 0
155/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
156#define NETIF_RSP_NULL 1
157
158#endif
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
new file mode 100644
index 000000000000..e8cbf431c8cc
--- /dev/null
+++ b/include/xen/interface/io/ring.h
@@ -0,0 +1,260 @@
1/******************************************************************************
2 * ring.h
3 *
4 * Shared producer-consumer ring macros.
5 *
6 * Tim Deegan and Andrew Warfield November 2004.
7 */
8
9#ifndef __XEN_PUBLIC_IO_RING_H__
10#define __XEN_PUBLIC_IO_RING_H__
11
12typedef unsigned int RING_IDX;
13
14/* Round a 32-bit unsigned constant down to the nearest power of two. */
15#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1))
16#define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x))
17#define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x))
18#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x))
19#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
20
21/*
22 * Calculate size of a shared ring, given the total available space for the
23 * ring and indexes (_sz), and the name tag of the request/response structure.
24 * A ring contains as many entries as will fit, rounded down to the nearest
25 * power of two (so we can mask with (size-1) to loop around).
26 */
27#define __RING_SIZE(_s, _sz) \
28 (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
29
30/*
31 * Macros to make the correct C datatypes for a new kind of ring.
32 *
33 * To make a new ring datatype, you need to have two message structures,
34 * let's say struct request, and struct response already defined.
35 *
36 * In a header where you want the ring datatype declared, you then do:
37 *
38 * DEFINE_RING_TYPES(mytag, struct request, struct response);
39 *
40 * These expand out to give you a set of types, as you can see below.
41 * The most important of these are:
42 *
43 * struct mytag_sring - The shared ring.
44 * struct mytag_front_ring - The 'front' half of the ring.
45 * struct mytag_back_ring - The 'back' half of the ring.
46 *
47 * To initialize a ring in your code you need to know the location and size
48 * of the shared memory area (PAGE_SIZE, for instance). To initialise
49 * the front half:
50 *
51 * struct mytag_front_ring front_ring;
52 * SHARED_RING_INIT((struct mytag_sring *)shared_page);
53 * FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page,
54 * PAGE_SIZE);
55 *
56 * Initializing the back follows similarly (note that only the front
57 * initializes the shared ring):
58 *
59 * struct mytag_back_ring back_ring;
60 * BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page,
61 * PAGE_SIZE);
62 */
63
64#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \
65 \
66/* Shared ring entry */ \
67union __name##_sring_entry { \
68 __req_t req; \
69 __rsp_t rsp; \
70}; \
71 \
72/* Shared ring page */ \
73struct __name##_sring { \
74 RING_IDX req_prod, req_event; \
75 RING_IDX rsp_prod, rsp_event; \
76 uint8_t pad[48]; \
77 union __name##_sring_entry ring[1]; /* variable-length */ \
78}; \
79 \
80/* "Front" end's private variables */ \
81struct __name##_front_ring { \
82 RING_IDX req_prod_pvt; \
83 RING_IDX rsp_cons; \
84 unsigned int nr_ents; \
85 struct __name##_sring *sring; \
86}; \
87 \
88/* "Back" end's private variables */ \
89struct __name##_back_ring { \
90 RING_IDX rsp_prod_pvt; \
91 RING_IDX req_cons; \
92 unsigned int nr_ents; \
93 struct __name##_sring *sring; \
94};
95
96/*
97 * Macros for manipulating rings.
98 *
99 * FRONT_RING_whatever works on the "front end" of a ring: here
100 * requests are pushed on to the ring and responses taken off it.
101 *
102 * BACK_RING_whatever works on the "back end" of a ring: here
103 * requests are taken off the ring and responses put on.
104 *
105 * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL.
106 * This is OK in 1-for-1 request-response situations where the
107 * requestor (front end) never has more than RING_SIZE()-1
108 * outstanding requests.
109 */
110
111/* Initialising empty rings */
112#define SHARED_RING_INIT(_s) do { \
113 (_s)->req_prod = (_s)->rsp_prod = 0; \
114 (_s)->req_event = (_s)->rsp_event = 1; \
115 memset((_s)->pad, 0, sizeof((_s)->pad)); \
116} while (0)
117
118#define FRONT_RING_INIT(_r, _s, __size) do { \
119 (_r)->req_prod_pvt = 0; \
120 (_r)->rsp_cons = 0; \
121 (_r)->nr_ents = __RING_SIZE(_s, __size); \
122 (_r)->sring = (_s); \
123} while (0)
124
125#define BACK_RING_INIT(_r, _s, __size) do { \
126 (_r)->rsp_prod_pvt = 0; \
127 (_r)->req_cons = 0; \
128 (_r)->nr_ents = __RING_SIZE(_s, __size); \
129 (_r)->sring = (_s); \
130} while (0)
131
132/* Initialize to existing shared indexes -- for recovery */
133#define FRONT_RING_ATTACH(_r, _s, __size) do { \
134 (_r)->sring = (_s); \
135 (_r)->req_prod_pvt = (_s)->req_prod; \
136 (_r)->rsp_cons = (_s)->rsp_prod; \
137 (_r)->nr_ents = __RING_SIZE(_s, __size); \
138} while (0)
139
140#define BACK_RING_ATTACH(_r, _s, __size) do { \
141 (_r)->sring = (_s); \
142 (_r)->rsp_prod_pvt = (_s)->rsp_prod; \
143 (_r)->req_cons = (_s)->req_prod; \
144 (_r)->nr_ents = __RING_SIZE(_s, __size); \
145} while (0)
146
147/* How big is this ring? */
148#define RING_SIZE(_r) \
149 ((_r)->nr_ents)
150
151/* Number of free requests (for use on front side only). */
152#define RING_FREE_REQUESTS(_r) \
153 (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
154
155/* Test if there is an empty slot available on the front ring.
156 * (This is only meaningful from the front.)
157 */
158#define RING_FULL(_r) \
159 (RING_FREE_REQUESTS(_r) == 0)
160
161/* Test if there are outstanding messages to be processed on a ring. */
162#define RING_HAS_UNCONSUMED_RESPONSES(_r) \
163 ((_r)->sring->rsp_prod - (_r)->rsp_cons)
164
165#define RING_HAS_UNCONSUMED_REQUESTS(_r) \
166 ({ \
167 unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \
168 unsigned int rsp = RING_SIZE(_r) - \
169 ((_r)->req_cons - (_r)->rsp_prod_pvt); \
170 req < rsp ? req : rsp; \
171 })
172
173/* Direct access to individual ring elements, by index. */
174#define RING_GET_REQUEST(_r, _idx) \
175 (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
176
177#define RING_GET_RESPONSE(_r, _idx) \
178 (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
179
180/* Loop termination condition: Would the specified index overflow the ring? */
181#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
182 (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
183
184#define RING_PUSH_REQUESTS(_r) do { \
185 wmb(); /* back sees requests /before/ updated producer index */ \
186 (_r)->sring->req_prod = (_r)->req_prod_pvt; \
187} while (0)
188
189#define RING_PUSH_RESPONSES(_r) do { \
190 wmb(); /* front sees responses /before/ updated producer index */ \
191 (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \
192} while (0)
193
194/*
195 * Notification hold-off (req_event and rsp_event):
196 *
197 * When queueing requests or responses on a shared ring, it may not always be
198 * necessary to notify the remote end. For example, if requests are in flight
199 * in a backend, the front may be able to queue further requests without
200 * notifying the back (if the back checks for new requests when it queues
201 * responses).
202 *
203 * When enqueuing requests or responses:
204 *
205 * Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
206 * is a boolean return value. True indicates that the receiver requires an
207 * asynchronous notification.
208 *
209 * After dequeuing requests or responses (before sleeping the connection):
210 *
211 * Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
212 * The second argument is a boolean return value. True indicates that there
213 * are pending messages on the ring (i.e., the connection should not be put
214 * to sleep).
215 *
216 * These macros will set the req_event/rsp_event field to trigger a
217 * notification on the very next message that is enqueued. If you want to
218 * create batches of work (i.e., only receive a notification after several
219 * messages have been enqueued) then you will need to create a customised
220 * version of the FINAL_CHECK macro in your own code, which sets the event
221 * field appropriately.
222 */
223
224#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \
225 RING_IDX __old = (_r)->sring->req_prod; \
226 RING_IDX __new = (_r)->req_prod_pvt; \
227 wmb(); /* back sees requests /before/ updated producer index */ \
228 (_r)->sring->req_prod = __new; \
229 mb(); /* back sees new requests /before/ we check req_event */ \
230 (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \
231 (RING_IDX)(__new - __old)); \
232} while (0)
233
234#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \
235 RING_IDX __old = (_r)->sring->rsp_prod; \
236 RING_IDX __new = (_r)->rsp_prod_pvt; \
237 wmb(); /* front sees responses /before/ updated producer index */ \
238 (_r)->sring->rsp_prod = __new; \
239 mb(); /* front sees new responses /before/ we check rsp_event */ \
240 (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \
241 (RING_IDX)(__new - __old)); \
242} while (0)
243
244#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \
245 (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
246 if (_work_to_do) break; \
247 (_r)->sring->req_event = (_r)->req_cons + 1; \
248 mb(); \
249 (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \
250} while (0)
251
252#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \
253 (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
254 if (_work_to_do) break; \
255 (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \
256 mb(); \
257 (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \
258} while (0)
259
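Putting the pieces together, a hedged sketch of both halves of the protocol, assuming DEFINE_RING_TYPES(mytag, struct mytag_request, struct mytag_response) has been invoked; mytag, the ring variables and notify_remote() are placeholders for whatever the driver actually uses:

/* Front end: queue one request and honour the hold-off scheme above. */
static void sketch_produce(struct mytag_front_ring *front)
{
	struct mytag_request *req;
	int notify;

	req = RING_GET_REQUEST(front, front->req_prod_pvt);
	/* ... fill in *req ... */
	front->req_prod_pvt++;

	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(front, notify);
	if (notify)
		notify_remote();	/* placeholder event-channel kick */
}

/* Back end: drain requests, then re-arm req_event before sleeping. */
static void sketch_consume(struct mytag_back_ring *back)
{
	int work_to_do;

	do {
		while (RING_HAS_UNCONSUMED_REQUESTS(back)) {
			struct mytag_request *req =
				RING_GET_REQUEST(back, back->req_cons++);
			/* ... process *req, queue a response ... */
		}
		RING_FINAL_CHECK_FOR_REQUESTS(back, work_to_do);
	} while (work_to_do);
}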
260#endif /* __XEN_PUBLIC_IO_RING_H__ */
diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h
new file mode 100644
index 000000000000..46508c7fa399
--- /dev/null
+++ b/include/xen/interface/io/xenbus.h
@@ -0,0 +1,44 @@
1/*****************************************************************************
2 * xenbus.h
3 *
4 * Xenbus protocol details.
5 *
6 * Copyright (C) 2005 XenSource Ltd.
7 */
8
9#ifndef _XEN_PUBLIC_IO_XENBUS_H
10#define _XEN_PUBLIC_IO_XENBUS_H
11
12/* The state of either end of the Xenbus, i.e. the current communication
13 status of initialisation across the bus. States here imply nothing about
14 the state of the connection between the driver and the kernel's device
15 layers. */
16enum xenbus_state
17{
18 XenbusStateUnknown = 0,
19 XenbusStateInitialising = 1,
20 XenbusStateInitWait = 2, /* Finished early
21 initialisation, but waiting
22 for information from the peer
23 or hotplug scripts. */
24 XenbusStateInitialised = 3, /* Initialised and waiting for a
25 connection from the peer. */
26 XenbusStateConnected = 4,
27 XenbusStateClosing = 5, /* The device is being closed
28 due to an error or an unplug
29 event. */
30 XenbusStateClosed = 6
31
32};
33
34#endif /* _XEN_PUBLIC_IO_XENBUS_H */
35
36/*
37 * Local variables:
38 * c-file-style: "linux"
39 * indent-tabs-mode: t
40 * c-indent-level: 8
41 * c-basic-offset: 8
42 * tab-width: 8
43 * End:
44 */
diff --git a/include/xen/interface/io/xs_wire.h b/include/xen/interface/io/xs_wire.h
new file mode 100644
index 000000000000..99fcffb372d1
--- /dev/null
+++ b/include/xen/interface/io/xs_wire.h
@@ -0,0 +1,87 @@
1/*
2 * Details of the "wire" protocol between Xen Store Daemon and client
3 * library or guest kernel.
4 * Copyright (C) 2005 Rusty Russell IBM Corporation
5 */
6
7#ifndef _XS_WIRE_H
8#define _XS_WIRE_H
9
10enum xsd_sockmsg_type
11{
12 XS_DEBUG,
13 XS_DIRECTORY,
14 XS_READ,
15 XS_GET_PERMS,
16 XS_WATCH,
17 XS_UNWATCH,
18 XS_TRANSACTION_START,
19 XS_TRANSACTION_END,
20 XS_INTRODUCE,
21 XS_RELEASE,
22 XS_GET_DOMAIN_PATH,
23 XS_WRITE,
24 XS_MKDIR,
25 XS_RM,
26 XS_SET_PERMS,
27 XS_WATCH_EVENT,
28 XS_ERROR,
29 XS_IS_DOMAIN_INTRODUCED
30};
31
32#define XS_WRITE_NONE "NONE"
33#define XS_WRITE_CREATE "CREATE"
34#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
35
36/* We hand errors as strings, for portability. */
37struct xsd_errors
38{
39 int errnum;
40 const char *errstring;
41};
42#define XSD_ERROR(x) { x, #x }
43static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
44 XSD_ERROR(EINVAL),
45 XSD_ERROR(EACCES),
46 XSD_ERROR(EEXIST),
47 XSD_ERROR(EISDIR),
48 XSD_ERROR(ENOENT),
49 XSD_ERROR(ENOMEM),
50 XSD_ERROR(ENOSPC),
51 XSD_ERROR(EIO),
52 XSD_ERROR(ENOTEMPTY),
53 XSD_ERROR(ENOSYS),
54 XSD_ERROR(EROFS),
55 XSD_ERROR(EBUSY),
56 XSD_ERROR(EAGAIN),
57 XSD_ERROR(EISCONN)
58};
59
60struct xsd_sockmsg
61{
62 uint32_t type; /* XS_??? */
63 uint32_t req_id;/* Request identifier, echoed in daemon's response. */
64 uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
65 uint32_t len; /* Length of data following this. */
66
67 /* Generally followed by nul-terminated string(s). */
68};
69
70enum xs_watch_type
71{
72 XS_WATCH_PATH = 0,
73 XS_WATCH_TOKEN
74};
75
76/* Inter-domain shared memory communications. */
77#define XENSTORE_RING_SIZE 1024
78typedef uint32_t XENSTORE_RING_IDX;
79#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
80struct xenstore_domain_interface {
81 char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
82 char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
83 XENSTORE_RING_IDX req_cons, req_prod;
84 XENSTORE_RING_IDX rsp_cons, rsp_prod;
85};
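A hedged sketch of how a request is framed: the fixed-size xsd_sockmsg header goes into the req ring first, followed by msg.len bytes of payload (here a nul-terminated path for XS_READ). sketch_xs_put() is a placeholder for the usual masked-index copy loop, and the daemon still has to be notified via the xenstore event channel:

static void sketch_xs_read_request(struct xenstore_domain_interface *intf,
				   const char *path)
{
	struct xsd_sockmsg msg = {
		.type   = XS_READ,
		.req_id = 1,			/* echoed in the reply header */
		.tx_id  = 0,			/* not part of a transaction */
		.len    = strlen(path) + 1,	/* nul-terminated payload */
	};

	/* Placeholder: copy into intf->req[] using MASK_XENSTORE_IDX() and
	 * the req_prod/req_cons indexes, as in the console sketch earlier. */
	sketch_xs_put(intf, &msg, sizeof(msg));
	sketch_xs_put(intf, path, msg.len);
	/* ...then kick the event channel and wait for a reply in rsp[]. */
}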
86
87#endif /* _XS_WIRE_H */
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
new file mode 100644
index 000000000000..af36ead16817
--- /dev/null
+++ b/include/xen/interface/memory.h
@@ -0,0 +1,145 @@
1/******************************************************************************
2 * memory.h
3 *
4 * Memory reservation and information.
5 *
6 * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
7 */
8
9#ifndef __XEN_PUBLIC_MEMORY_H__
10#define __XEN_PUBLIC_MEMORY_H__
11
12/*
13 * Increase or decrease the specified domain's memory reservation. Returns a
14 * -ve errcode on failure, or the # extents successfully allocated or freed.
15 * arg == addr of struct xen_memory_reservation.
16 */
17#define XENMEM_increase_reservation 0
18#define XENMEM_decrease_reservation 1
19#define XENMEM_populate_physmap 6
20struct xen_memory_reservation {
21
22 /*
23 * XENMEM_increase_reservation:
24 * OUT: MFN (*not* GMFN) bases of extents that were allocated
25 * XENMEM_decrease_reservation:
26 * IN: GMFN bases of extents to free
27 * XENMEM_populate_physmap:
28 * IN: GPFN bases of extents to populate with memory
29 * OUT: GMFN bases of extents that were allocated
30 * (NB. This command also updates the mach_to_phys translation table)
31 */
32 GUEST_HANDLE(ulong) extent_start;
33
34 /* Number of extents, and size/alignment of each (2^extent_order pages). */
35 unsigned long nr_extents;
36 unsigned int extent_order;
37
38 /*
39 * Maximum # bits addressable by the user of the allocated region (e.g.,
40 * I/O devices often have a 32-bit limitation even in 64-bit systems). If
41 * zero then the user has no addressing restriction.
42 * This field is not used by XENMEM_decrease_reservation.
43 */
44 unsigned int address_bits;
45
46 /*
47 * Domain whose reservation is being changed.
48 * Unprivileged domains can specify only DOMID_SELF.
49 */
50 domid_t domid;
51
52};
53DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
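A hedged sketch of handing a single page back to Xen (the core of a balloon driver), assuming the HYPERVISOR_memory_op(cmd, arg) wrapper and the set_xen_guest_handle() helper from the interface headers added in this series; the function name is hypothetical:

static int sketch_release_page(unsigned long gmfn)
{
	struct xen_memory_reservation reservation = {
		.nr_extents   = 1,
		.extent_order = 0,	/* a single 4K page */
		.address_bits = 0,	/* ignored by decrease_reservation */
		.domid        = DOMID_SELF,
	};
	int rc;

	set_xen_guest_handle(reservation.extent_start, &gmfn);
	rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	return rc == 1 ? 0 : -1;	/* rc is the number of extents freed */
}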
54
55/*
56 * Returns the maximum machine frame number of mapped RAM in this system.
57 * This command always succeeds (it never returns an error code).
58 * arg == NULL.
59 */
60#define XENMEM_maximum_ram_page 2
61
62/*
63 * Returns the current or maximum memory reservation, in pages, of the
64 * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
65 * arg == addr of domid_t.
66 */
67#define XENMEM_current_reservation 3
68#define XENMEM_maximum_reservation 4
69
70/*
71 * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
72 * mapping table. Architectures which do not have a m2p table do not implement
73 * this command.
74 * arg == addr of xen_machphys_mfn_list_t.
75 */
76#define XENMEM_machphys_mfn_list 5
77struct xen_machphys_mfn_list {
78 /*
79 * Size of the 'extent_start' array. Fewer entries will be filled if the
80 * machphys table is smaller than max_extents * 2MB.
81 */
82 unsigned int max_extents;
83
84 /*
85 * Pointer to buffer to fill with list of extent starts. If there are
86 * any large discontiguities in the machine address space, 2MB gaps in
87 * the machphys table will be represented by an MFN base of zero.
88 */
89 GUEST_HANDLE(ulong) extent_start;
90
91 /*
92 * Number of extents written to the above array. This will be smaller
93 * than 'max_extents' if the machphys table is smaller than max_e * 2MB.
94 */
95 unsigned int nr_extents;
96};
97DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
98
99/*
100 * Sets the GPFN at which a particular page appears in the specified guest's
101 * pseudophysical address space.
102 * arg == addr of xen_add_to_physmap_t.
103 */
104#define XENMEM_add_to_physmap 7
105struct xen_add_to_physmap {
106 /* Which domain to change the mapping for. */
107 domid_t domid;
108
109 /* Source mapping space. */
110#define XENMAPSPACE_shared_info 0 /* shared info page */
111#define XENMAPSPACE_grant_table 1 /* grant table page */
112 unsigned int space;
113
114 /* Index into source mapping space. */
115 unsigned long idx;
116
117 /* GPFN where the source mapping page should appear. */
118 unsigned long gpfn;
119};
120DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
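On an auto-translated guest, for instance, mapping the shared-info page into the pseudo-physical space might look like this sketch; shared_info_pfn is a placeholder chosen by the guest:

static void sketch_map_shared_info(unsigned long shared_info_pfn)
{
	struct xen_add_to_physmap xatp = {
		.domid = DOMID_SELF,
		.space = XENMAPSPACE_shared_info,
		.idx   = 0,			/* only one shared-info page */
		.gpfn  = shared_info_pfn,	/* where we want it to appear */
	};

	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
		BUG();
}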
121
122/*
123 * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
124 * code on failure. This call only works for auto-translated guests.
125 */
126#define XENMEM_translate_gpfn_list 8
127struct xen_translate_gpfn_list {
128 /* Which domain to translate for? */
129 domid_t domid;
130
131 /* Length of list. */
132 unsigned long nr_gpfns;
133
134 /* List of GPFNs to translate. */
135 GUEST_HANDLE(ulong) gpfn_list;
136
137 /*
138 * Output list to contain MFN translations. May be the same as the input
139 * list (in which case each input GPFN is overwritten with the output MFN).
140 */
141 GUEST_HANDLE(ulong) mfn_list;
142};
143DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
144
145#endif /* __XEN_PUBLIC_MEMORY_H__ */
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
new file mode 100644
index 000000000000..cd6939147cb6
--- /dev/null
+++ b/include/xen/interface/physdev.h
@@ -0,0 +1,145 @@
1/*
2 * Permission is hereby granted, free of charge, to any person obtaining a copy
3 * of this software and associated documentation files (the "Software"), to
4 * deal in the Software without restriction, including without limitation the
5 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6 * sell copies of the Software, and to permit persons to whom the Software is
7 * furnished to do so, subject to the following conditions:
8 *
9 * The above copyright notice and this permission notice shall be included in
10 * all copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
18 * DEALINGS IN THE SOFTWARE.
19 */
20
21#ifndef __XEN_PUBLIC_PHYSDEV_H__
22#define __XEN_PUBLIC_PHYSDEV_H__
23
24/*
25 * Prototype for this hypercall is:
26 * int physdev_op(int cmd, void *args)
27 * @cmd == PHYSDEVOP_??? (physdev operation).
28 * @args == Operation-specific extra arguments (NULL if none).
29 */
30
31/*
32 * Notify end-of-interrupt (EOI) for the specified IRQ.
33 * @arg == pointer to physdev_eoi structure.
34 */
35#define PHYSDEVOP_eoi 12
36struct physdev_eoi {
37 /* IN */
38 uint32_t irq;
39};
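A hedged sketch of acknowledging a PIRQ, assuming the HYPERVISOR_physdev_op(cmd, arg) wrapper; normally this is done only when the status query below reported XENIRQSTAT_needs_eoi for the line:

static void sketch_pirq_eoi(uint32_t irq)
{
	struct physdev_eoi eoi = { .irq = irq };
	int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);

	/* rc == 0 on success; a real handler would decide how to log errors. */
	(void)rc;
}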
40
41/*
42 * Query the status of an IRQ line.
43 * @arg == pointer to physdev_irq_status_query structure.
44 */
45#define PHYSDEVOP_irq_status_query 5
46struct physdev_irq_status_query {
47 /* IN */
48 uint32_t irq;
49 /* OUT */
50 uint32_t flags; /* XENIRQSTAT_* */
51};
52
53/* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
54#define _XENIRQSTAT_needs_eoi (0)
55#define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi)
56
57/* IRQ shared by multiple guests? */
58#define _XENIRQSTAT_shared (1)
59#define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared)
60
61/*
62 * Set the current VCPU's I/O privilege level.
63 * @arg == pointer to physdev_set_iopl structure.
64 */
65#define PHYSDEVOP_set_iopl 6
66struct physdev_set_iopl {
67 /* IN */
68 uint32_t iopl;
69};
70
71/*
72 * Set the current VCPU's I/O-port permissions bitmap.
73 * @arg == pointer to physdev_set_iobitmap structure.
74 */
75#define PHYSDEVOP_set_iobitmap 7
76struct physdev_set_iobitmap {
77 /* IN */
78 uint8_t * bitmap;
79 uint32_t nr_ports;
80};
81
82/*
83 * Read or write an IO-APIC register.
84 * @arg == pointer to physdev_apic structure.
85 */
86#define PHYSDEVOP_apic_read 8
87#define PHYSDEVOP_apic_write 9
88struct physdev_apic {
89 /* IN */
90 unsigned long apic_physbase;
91 uint32_t reg;
92 /* IN or OUT */
93 uint32_t value;
94};
95
96/*
97 * Allocate or free a physical upcall vector for the specified IRQ line.
98 * @arg == pointer to physdev_irq structure.
99 */
100#define PHYSDEVOP_alloc_irq_vector 10
101#define PHYSDEVOP_free_irq_vector 11
102struct physdev_irq {
103 /* IN */
104 uint32_t irq;
105 /* IN or OUT */
106 uint32_t vector;
107};
108
109/*
110 * Argument to physdev_op_compat() hypercall. Superseded by the new physdev_op()
111 * hypercall since 0x00030202.
112 */
113struct physdev_op {
114 uint32_t cmd;
115 union {
116 struct physdev_irq_status_query irq_status_query;
117 struct physdev_set_iopl set_iopl;
118 struct physdev_set_iobitmap set_iobitmap;
119 struct physdev_apic apic_op;
120 struct physdev_irq irq_op;
121 } u;
122};
123
124/*
125 * Notify that some PIRQ-bound event channels have been unmasked.
126 * ** This command is obsolete since interface version 0x00030202 and is **
127 * ** unsupported by newer versions of Xen. **
128 */
129#define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4
130
131/*
132 * These all-capitals physdev operation names are superseded by the new names
133 * (defined above) since interface version 0x00030202.
134 */
135#define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query
136#define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl
137#define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap
138#define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read
139#define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write
140#define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector
141#define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector
142#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
143#define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared
144
145#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h
new file mode 100644
index 000000000000..5fec575a800a
--- /dev/null
+++ b/include/xen/interface/sched.h
@@ -0,0 +1,77 @@
1/******************************************************************************
2 * sched.h
3 *
4 * Scheduler state interactions
5 *
6 * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
7 */
8
9#ifndef __XEN_PUBLIC_SCHED_H__
10#define __XEN_PUBLIC_SCHED_H__
11
12#include "event_channel.h"
13
14/*
15 * The prototype for this hypercall is:
16 * long sched_op_new(int cmd, void *arg)
17 * @cmd == SCHEDOP_??? (scheduler operation).
18 * @arg == Operation-specific extra argument(s), as described below.
19 *
20 * **NOTE**:
21 * Versions of Xen prior to 3.0.2 provide only the following legacy version
22 * of this hypercall, supporting only the commands yield, block and shutdown:
23 * long sched_op(int cmd, unsigned long arg)
24 * @cmd == SCHEDOP_??? (scheduler operation).
25 * @arg == 0 (SCHEDOP_yield and SCHEDOP_block)
26 * == SHUTDOWN_* code (SCHEDOP_shutdown)
27 */
28
29/*
30 * Voluntarily yield the CPU.
31 * @arg == NULL.
32 */
33#define SCHEDOP_yield 0
34
35/*
36 * Block execution of this VCPU until an event is received for processing.
37 * If called with event upcalls masked, this operation will atomically
38 * reenable event delivery and check for pending events before blocking the
39 * VCPU. This avoids a "wakeup waiting" race.
40 * @arg == NULL.
41 */
42#define SCHEDOP_block 1
43
44/*
45 * Halt execution of this domain (all VCPUs) and notify the system controller.
46 * @arg == pointer to sched_shutdown structure.
47 */
48#define SCHEDOP_shutdown 2
49struct sched_shutdown {
50 unsigned int reason; /* SHUTDOWN_* */
51};
52DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
53
54/*
55 * Poll a set of event-channel ports. Return when one or more are pending. An
56 * optional timeout may be specified.
57 * @arg == pointer to sched_poll structure.
58 */
59#define SCHEDOP_poll 3
60struct sched_poll {
61 GUEST_HANDLE(evtchn_port_t) ports;
62 unsigned int nr_ports;
63 uint64_t timeout;
64};
65DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
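A hedged sketch of blocking on a single event-channel port, assuming the HYPERVISOR_sched_op(cmd, arg) wrapper and set_xen_guest_handle(); a timeout of 0 is taken here to mean 'no timeout', and the function name is hypothetical:

static void sketch_poll_port(evtchn_port_t port)
{
	struct sched_poll poll = {
		.nr_ports = 1,
		.timeout  = 0,		/* assumed: 0 == wait indefinitely */
	};

	set_xen_guest_handle(poll.ports, &port);
	if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll))
		BUG();			/* returns once the port is pending */
}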
66
67/*
68 * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
69 * software to determine the appropriate action. For the most part, Xen does
70 * not care about the shutdown code.
71 */
72#define SHUTDOWN_poweroff 0 /* Domain exited normally. Clean up and kill. */
73#define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */
74#define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */
75#define SHUTDOWN_crash 3 /* Tell controller we've crashed. */
76
77#endif /* __XEN_PUBLIC_SCHED_H__ */
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h
new file mode 100644
index 000000000000..ff61ea365997
--- /dev/null
+++ b/include/xen/interface/vcpu.h
@@ -0,0 +1,167 @@
1/******************************************************************************
2 * vcpu.h
3 *
4 * VCPU initialisation, query, and hotplug.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 *
24 * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
25 */
26
27#ifndef __XEN_PUBLIC_VCPU_H__
28#define __XEN_PUBLIC_VCPU_H__
29
30/*
31 * Prototype for this hypercall is:
32 * int vcpu_op(int cmd, int vcpuid, void *extra_args)
33 * @cmd == VCPUOP_??? (VCPU operation).
34 * @vcpuid == VCPU to operate on.
35 * @extra_args == Operation-specific extra arguments (NULL if none).
36 */
37
38/*
39 * Initialise a VCPU. Each VCPU can be initialised only once. A
40 * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
41 *
42 * @extra_arg == pointer to vcpu_guest_context structure containing initial
43 * state for the VCPU.
44 */
45#define VCPUOP_initialise 0
46
47/*
48 * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
49 * if the VCPU has not been initialised (VCPUOP_initialise).
50 */
51#define VCPUOP_up 1
52
53/*
54 * Bring down a VCPU (i.e., make it non-runnable).
55 * There are a few caveats that callers should observe:
56 * 1. This operation may return, and VCPUOP_is_up may return false, before the
57 * VCPU stops running (i.e., the command is asynchronous). It is a good
58 * idea to ensure that the VCPU has entered a non-critical loop before
59 * bringing it down. Alternatively, this operation is guaranteed
60 * synchronous if invoked by the VCPU itself.
61 * 2. After a VCPU is initialised, there is currently no way to drop all its
62 * references to domain memory. Even a VCPU that is down still holds
63 * memory references via its pagetable base pointer and GDT. It is good
64 * practice to move a VCPU onto an 'idle' or default page table, LDT and
65 * GDT before bringing it down.
66 */
67#define VCPUOP_down 2
68
69/* Returns 1 if the given VCPU is up. */
70#define VCPUOP_is_up 3
71
72/*
73 * Return information about the state and running time of a VCPU.
74 * @extra_arg == pointer to vcpu_runstate_info structure.
75 */
76#define VCPUOP_get_runstate_info 4
77struct vcpu_runstate_info {
78 /* VCPU's current state (RUNSTATE_*). */
79 int state;
80 /* When was current state entered (system time, ns)? */
81 uint64_t state_entry_time;
82 /*
83 * Time spent in each RUNSTATE_* (ns). The sum of these times is
84 * guaranteed not to drift from system time.
85 */
86 uint64_t time[4];
87};
88
89/* VCPU is currently running on a physical CPU. */
90#define RUNSTATE_running 0
91
92/* VCPU is runnable, but not currently scheduled on any physical CPU. */
93#define RUNSTATE_runnable 1
94
95/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
96#define RUNSTATE_blocked 2
97
98/*
99 * VCPU is not runnable, but it is not blocked.
100 * This is a 'catch all' state for things like hotplug and pauses by the
101 * system administrator (or for critical sections in the hypervisor).
102 * RUNSTATE_blocked dominates this state (it is the preferred state).
103 */
104#define RUNSTATE_offline 3
105
106/*
107 * Register a shared memory area from which the guest may obtain its own
108 * runstate information without needing to execute a hypercall.
109 * Notes:
110 * 1. The registered address may be virtual or physical, depending on the
111 * platform. The virtual address should be registered on x86 systems.
112 * 2. Only one shared area may be registered per VCPU. The shared area is
113 * updated by the hypervisor each time the VCPU is scheduled. Thus
114 * runstate.state will always be RUNSTATE_running and
115 * runstate.state_entry_time will indicate the system time at which the
116 * VCPU was last scheduled to run.
117 * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
118 */
119#define VCPUOP_register_runstate_memory_area 5
120struct vcpu_register_runstate_memory_area {
121 union {
122 struct vcpu_runstate_info *v;
123 uint64_t p;
124 } addr;
125};
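A hedged sketch of registering a per-CPU runstate area, as a clocksource or stolen-time implementation might do; the per-CPU variable, the printk fallback and the function name are assumptions, not something this header mandates:

static DEFINE_PER_CPU(struct vcpu_runstate_info, sketch_runstate);

static void sketch_register_runstate(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(sketch_runstate, cpu);	/* virtual addr on x86 */
	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		printk(KERN_WARNING
		       "runstate area registration failed for cpu %d\n", cpu);
}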
126
127/*
128 * Set or stop a VCPU's periodic timer. Every VCPU has one periodic timer
129 * which can be set via these commands. Periods smaller than one millisecond
130 * may not be supported.
131 */
132#define VCPUOP_set_periodic_timer 6 /* arg == vcpu_set_periodic_timer_t */
133#define VCPUOP_stop_periodic_timer 7 /* arg == NULL */
134struct vcpu_set_periodic_timer {
135 uint64_t period_ns;
136};
137
138/*
139 * Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot
140 * timer which can be set via these commands.
141 */
142#define VCPUOP_set_singleshot_timer 8 /* arg == vcpu_set_singleshot_timer_t */
143#define VCPUOP_stop_singleshot_timer 9 /* arg == NULL */
144struct vcpu_set_singleshot_timer {
145 uint64_t timeout_abs_ns;
146 uint32_t flags; /* VCPU_SSHOTTMR_??? */
147};
148
149/* Flags to VCPUOP_set_singleshot_timer. */
150 /* Require the timeout to be in the future (return -ETIME if it has already passed). */
151#define _VCPU_SSHOTTMR_future (0)
152#define VCPU_SSHOTTMR_future (1U << _VCPU_SSHOTTMR_future)
153
154/*
155 * Register a memory location in the guest address space for the
156 * vcpu_info structure. This allows the guest to place the vcpu_info
157 * structure in a convenient place, such as in a per-cpu data area.
158 * The pointer need not be page aligned, but the structure must not
159 * cross a page boundary.
160 */
161#define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */
162struct vcpu_register_vcpu_info {
163 uint32_t mfn; /* mfn of page to place vcpu_info */
164 uint32_t offset; /* offset within page */
165};
166
167#endif /* __XEN_PUBLIC_VCPU_H__ */
diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h
new file mode 100644
index 000000000000..453235e923f0
--- /dev/null
+++ b/include/xen/interface/version.h
@@ -0,0 +1,60 @@
1/******************************************************************************
2 * version.h
3 *
4 * Xen version, type, and compile information.
5 *
6 * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
7 * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
8 */
9
10#ifndef __XEN_PUBLIC_VERSION_H__
11#define __XEN_PUBLIC_VERSION_H__
12
13/* NB. All ops return zero on success, except XENVER_version. */
14
15/* arg == NULL; returns major:minor (16:16). */
16#define XENVER_version 0
17
18/* arg == xen_extraversion_t. */
19#define XENVER_extraversion 1
20struct xen_extraversion {
21 char extraversion[16];
22};
23#define XEN_EXTRAVERSION_LEN (sizeof(struct xen_extraversion))
24
25/* arg == xen_compile_info_t. */
26#define XENVER_compile_info 2
27struct xen_compile_info {
28 char compiler[64];
29 char compile_by[16];
30 char compile_domain[32];
31 char compile_date[32];
32};
33
34#define XENVER_capabilities 3
35struct xen_capabilities_info {
36 char info[1024];
37};
38#define XEN_CAPABILITIES_INFO_LEN (sizeof(struct xen_capabilities_info))
39
40#define XENVER_changeset 4
41struct xen_changeset_info {
42 char info[64];
43};
44#define XEN_CHANGESET_INFO_LEN (sizeof(struct xen_changeset_info))
45
46#define XENVER_platform_parameters 5
47struct xen_platform_parameters {
48 unsigned long virt_start;
49};
50
51#define XENVER_get_features 6
52struct xen_feature_info {
53 unsigned int submap_idx; /* IN: which 32-bit submap to return */
54 uint32_t submap; /* OUT: 32-bit submap */
55};
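A hedged sketch of walking every submap into a flat per-bit array, roughly what a feature-setup helper would do; XENFEAT_NR_SUBMAPS comes from features.h, included just below, and the array name is a placeholder:

static u8 sketch_xen_features[XENFEAT_NR_SUBMAPS * 32];

static void sketch_read_features(void)
{
	struct xen_feature_info fi;
	int i, j;

	for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
		fi.submap_idx = i;
		if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
			break;
		for (j = 0; j < 32; j++)
			sketch_xen_features[i * 32 + j] =
				!!(fi.submap & (1U << j));
	}
}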
56
57/* Declares the features reported by XENVER_get_features. */
58#include "features.h"
59
60#endif /* __XEN_PUBLIC_VERSION_H__ */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
new file mode 100644
index 000000000000..518a5bf79ed3
--- /dev/null
+++ b/include/xen/interface/xen.h
@@ -0,0 +1,447 @@
1/******************************************************************************
2 * xen.h
3 *
4 * Guest OS interface to Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
8
9#ifndef __XEN_PUBLIC_XEN_H__
10#define __XEN_PUBLIC_XEN_H__
11
12#include <asm/xen/interface.h>
13
14/*
15 * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
16 */
17
18/*
19 * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
20 * EAX = return value
21 * (argument registers may be clobbered on return)
22 * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6.
23 * RAX = return value
24 * (argument registers not clobbered on return; RCX, R11 are)
25 */
26#define __HYPERVISOR_set_trap_table 0
27#define __HYPERVISOR_mmu_update 1
28#define __HYPERVISOR_set_gdt 2
29#define __HYPERVISOR_stack_switch 3
30#define __HYPERVISOR_set_callbacks 4
31#define __HYPERVISOR_fpu_taskswitch 5
32#define __HYPERVISOR_sched_op 6
33#define __HYPERVISOR_dom0_op 7
34#define __HYPERVISOR_set_debugreg 8
35#define __HYPERVISOR_get_debugreg 9
36#define __HYPERVISOR_update_descriptor 10
37#define __HYPERVISOR_memory_op 12
38#define __HYPERVISOR_multicall 13
39#define __HYPERVISOR_update_va_mapping 14
40#define __HYPERVISOR_set_timer_op 15
41#define __HYPERVISOR_event_channel_op_compat 16
42#define __HYPERVISOR_xen_version 17
43#define __HYPERVISOR_console_io 18
44#define __HYPERVISOR_physdev_op_compat 19
45#define __HYPERVISOR_grant_table_op 20
46#define __HYPERVISOR_vm_assist 21
47#define __HYPERVISOR_update_va_mapping_otherdomain 22
48#define __HYPERVISOR_iret 23 /* x86 only */
49#define __HYPERVISOR_vcpu_op 24
50#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
51#define __HYPERVISOR_mmuext_op 26
52#define __HYPERVISOR_acm_op 27
53#define __HYPERVISOR_nmi_op 28
54#define __HYPERVISOR_sched_op_new 29
55#define __HYPERVISOR_callback_op 30
56#define __HYPERVISOR_xenoprof_op 31
57#define __HYPERVISOR_event_channel_op 32
58#define __HYPERVISOR_physdev_op 33
59#define __HYPERVISOR_hvm_op 34
60
61/*
62 * VIRTUAL INTERRUPTS
63 *
64 * Virtual interrupts that a guest OS may receive from Xen.
65 */
66#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */
67#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */
68#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
69#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
70#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
71#define NR_VIRQS 8
72
73/*
74 * MMU-UPDATE REQUESTS
75 *
76 * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
77 * A foreigndom (FD) can be specified (or DOMID_SELF for none).
78 * Where the FD has some effect, it is described below.
79 * ptr[1:0] specifies the appropriate MMU_* command.
80 *
81 * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
82 * Updates an entry in a page table. If updating an L1 table, and the new
83 * table entry is valid/present, the mapped frame must belong to the FD, if
84 * an FD has been specified. If attempting to map an I/O page then the
85 * caller assumes the privilege of the FD.
86 * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
87 * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
88 * ptr[:2] -- Machine address of the page-table entry to modify.
89 * val -- Value to write.
90 *
91 * ptr[1:0] == MMU_MACHPHYS_UPDATE:
92 * Updates an entry in the machine->pseudo-physical mapping table.
93 * ptr[:2] -- Machine address within the frame whose mapping to modify.
94 * The frame must belong to the FD, if one is specified.
95 * val -- Value to write into the mapping entry.
96 */
97#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
98#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */
99
100/*
101 * MMU EXTENDED OPERATIONS
102 *
103 * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
104 * A foreigndom (FD) can be specified (or DOMID_SELF for none).
105 * Where the FD has some effect, it is described below.
106 *
107 * cmd: MMUEXT_(UN)PIN_*_TABLE
108 * mfn: Machine frame number to be (un)pinned as a p.t. page.
109 * The frame must belong to the FD, if one is specified.
110 *
111 * cmd: MMUEXT_NEW_BASEPTR
112 * mfn: Machine frame number of new page-table base to install in MMU.
113 *
114 * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
115 * mfn: Machine frame number of new page-table base to install in MMU
116 * when in user space.
117 *
118 * cmd: MMUEXT_TLB_FLUSH_LOCAL
119 * No additional arguments. Flushes local TLB.
120 *
121 * cmd: MMUEXT_INVLPG_LOCAL
122 * linear_addr: Linear address to be flushed from the local TLB.
123 *
124 * cmd: MMUEXT_TLB_FLUSH_MULTI
125 * vcpumask: Pointer to bitmap of VCPUs to be flushed.
126 *
127 * cmd: MMUEXT_INVLPG_MULTI
128 * linear_addr: Linear address to be flushed.
129 * vcpumask: Pointer to bitmap of VCPUs to be flushed.
130 *
131 * cmd: MMUEXT_TLB_FLUSH_ALL
132 * No additional arguments. Flushes all VCPUs' TLBs.
133 *
134 * cmd: MMUEXT_INVLPG_ALL
135 * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
136 *
137 * cmd: MMUEXT_FLUSH_CACHE
138 * No additional arguments. Writes back and flushes cache contents.
139 *
140 * cmd: MMUEXT_SET_LDT
141 * linear_addr: Linear address of LDT base (NB. must be page-aligned).
142 * nr_ents: Number of entries in LDT.
143 */
144#define MMUEXT_PIN_L1_TABLE 0
145#define MMUEXT_PIN_L2_TABLE 1
146#define MMUEXT_PIN_L3_TABLE 2
147#define MMUEXT_PIN_L4_TABLE 3
148#define MMUEXT_UNPIN_TABLE 4
149#define MMUEXT_NEW_BASEPTR 5
150#define MMUEXT_TLB_FLUSH_LOCAL 6
151#define MMUEXT_INVLPG_LOCAL 7
152#define MMUEXT_TLB_FLUSH_MULTI 8
153#define MMUEXT_INVLPG_MULTI 9
154#define MMUEXT_TLB_FLUSH_ALL 10
155#define MMUEXT_INVLPG_ALL 11
156#define MMUEXT_FLUSH_CACHE 12
157#define MMUEXT_SET_LDT 13
158#define MMUEXT_NEW_USER_BASEPTR 15
159
160#ifndef __ASSEMBLY__
161struct mmuext_op {
162 unsigned int cmd;
163 union {
164 /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
165 unsigned long mfn;
166 /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
167 unsigned long linear_addr;
168 } arg1;
169 union {
170 /* SET_LDT */
171 unsigned int nr_ents;
172 /* TLB_FLUSH_MULTI, INVLPG_MULTI */
173 void *vcpumask;
174 } arg2;
175};
176DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
177#endif
178
179/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
180/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */
181/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */
182#define UVMF_NONE (0UL<<0) /* No flushing at all. */
183#define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */
184#define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */
185#define UVMF_FLUSHTYPE_MASK (3UL<<0)
186#define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */
187#define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */
188#define UVMF_ALL (1UL<<2) /* Flush all TLBs. */
189
190/*
191 * Commands to HYPERVISOR_console_io().
192 */
193#define CONSOLEIO_write 0
194#define CONSOLEIO_read 1
195
196/*
197 * Commands to HYPERVISOR_vm_assist().
198 */
199#define VMASST_CMD_enable 0
200#define VMASST_CMD_disable 1
201#define VMASST_TYPE_4gb_segments 0
202#define VMASST_TYPE_4gb_segments_notify 1
203#define VMASST_TYPE_writable_pagetables 2
204#define VMASST_TYPE_pae_extended_cr3 3
205#define MAX_VMASST_TYPE 3
206
207#ifndef __ASSEMBLY__
208
209typedef uint16_t domid_t;
210
211/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
212#define DOMID_FIRST_RESERVED (0x7FF0U)
213
214/* DOMID_SELF is used in certain contexts to refer to oneself. */
215#define DOMID_SELF (0x7FF0U)
216
217/*
218 * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
219 * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
220 * is useful to ensure that no mappings to the OS's own heap are accidentally
221 * installed. (e.g., in Linux this could cause havoc as reference counts
222 * aren't adjusted on the I/O-mapping code path).
223 * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
224 * be specified by any calling domain.
225 */
226#define DOMID_IO (0x7FF1U)
227
228/*
229 * DOMID_XEN is used to allow privileged domains to map restricted parts of
230 * Xen's heap space (e.g., the machine_to_phys table).
231 * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
232 * the caller is privileged.
233 */
234#define DOMID_XEN (0x7FF2U)
235
236/*
237 * Send an array of these to HYPERVISOR_mmu_update().
238 * NB. The fields are natural pointer/address size for this architecture.
239 */
240struct mmu_update {
241 uint64_t ptr; /* Machine address of PTE. */
242 uint64_t val; /* New contents of PTE. */
243};
244DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
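
/*
 * Illustrative sketch only (not part of this interface): batching two
 * (ptr, val) requests as described under MMU-UPDATE REQUESTS above.
 * HYPERVISOR_mmu_update() is assumed to be the usual per-arch hypercall
 * wrapper taking (req, count, success_count, domid); it is declared
 * elsewhere.
 */
static inline int example_mmu_batch(uint64_t pte_maddr, uint64_t new_pte,
				    uint64_t frame_maddr, uint64_t new_pfn)
{
	struct mmu_update req[2];
	int success_count;

	/* ptr[1:0] selects the command; the rest is the machine address. */
	req[0].ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;
	req[0].val = new_pte;

	req[1].ptr = frame_maddr | MMU_MACHPHYS_UPDATE;
	req[1].val = new_pfn;

	return HYPERVISOR_mmu_update(req, 2, &success_count, DOMID_SELF);
}
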
245
246/*
247 * Send an array of these to HYPERVISOR_multicall().
248 * NB. The fields are natural register size for this architecture.
249 */
250struct multicall_entry {
251 unsigned long op;
252 long result;
253 unsigned long args[6];
254};
255DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
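
/*
 * Illustrative sketch only: queueing two hypercalls in one multicall
 * batch. HYPERVISOR_multicall() is assumed to be the per-arch wrapper
 * taking the entry array and a count; the ops and arguments chosen here
 * are arbitrary examples.
 */
static inline int example_multicall(unsigned long new_ss, unsigned long new_esp)
{
	struct multicall_entry mc[2] = { };

	mc[0].op = __HYPERVISOR_fpu_taskswitch;
	mc[0].args[0] = 1;			/* set CR0.TS */

	mc[1].op = __HYPERVISOR_stack_switch;
	mc[1].args[0] = new_ss;
	mc[1].args[1] = new_esp;

	/* On return, each entry's 'result' holds that op's return value. */
	return HYPERVISOR_multicall(mc, 2);
}
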
256
257/*
258 * Event channel endpoints per domain:
259 * 1024 if a long is 32 bits; 4096 if a long is 64 bits.
260 */
261#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
262
263struct vcpu_time_info {
264 /*
265 * Updates to the following values are preceded and followed
266 * by an increment of 'version'. The guest can therefore
267 * detect updates by looking for changes to 'version'. If the
268 * least-significant bit of the version number is set then an
269 * update is in progress and the guest must wait to read a
270 * consistent set of values. The correct way to interact with
271 * the version number is similar to Linux's seqlock: see the
272 * implementations of read_seqbegin/read_seqretry.
273 */
274 uint32_t version;
275 uint32_t pad0;
276 uint64_t tsc_timestamp; /* TSC at last update of time vals. */
277 uint64_t system_time; /* Time, in nanosecs, since boot. */
278 /*
279 * Current system time:
280 * system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
281 * CPU frequency (Hz):
282 * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
283 */
284 uint32_t tsc_to_system_mul;
285 int8_t tsc_shift;
286 int8_t pad1[3];
287}; /* 32 bytes */
288
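
/*
 * Illustrative sketch only: the seqlock-style read the comment above
 * prescribes. read_tsc() stands in for the raw TSC read, rmb() is the
 * usual read barrier, and tsc_to_system_mul is treated as a 32.32
 * fixed-point multiplier per the formula above; intermediate-overflow
 * handling is omitted for brevity.
 */
static inline uint64_t example_system_time(const volatile struct vcpu_time_info *t,
					   uint64_t (*read_tsc)(void))
{
	uint32_t version, mul;
	uint64_t delta, sys_time;
	int8_t shift;

	do {
		version = t->version;
		rmb();				/* version before payload */
		delta = read_tsc() - t->tsc_timestamp;
		sys_time = t->system_time;
		mul = t->tsc_to_system_mul;
		shift = t->tsc_shift;
		rmb();				/* payload before re-check */
	} while ((version & 1) || version != t->version);

	if (shift >= 0)
		delta <<= shift;
	else
		delta >>= -shift;

	return sys_time + ((delta * mul) >> 32);
}
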
289struct vcpu_info {
290 /*
291 * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
292 * a pending notification for a particular VCPU. It is then cleared
293 * by the guest OS /before/ checking for pending work, thus avoiding
294 * a set-and-check race. Note that the mask is only accessed by Xen
295 * on the CPU that is currently hosting the VCPU. This means that the
296 * pending and mask flags can be updated by the guest without special
297 * synchronisation (i.e., no need for the x86 LOCK prefix).
298 * This may seem suboptimal because if the pending flag is set by
299 * a different CPU then an IPI may be scheduled even when the mask
300 * is set. However, note:
301 * 1. The task of 'interrupt holdoff' is covered by the per-event-
302 * channel mask bits. A 'noisy' event that is continually being
303 * triggered can be masked at source at this very precise
304 * granularity.
305 * 2. The main purpose of the per-VCPU mask is therefore to restrict
306 * reentrant execution: whether for concurrency control, or to
307 * prevent unbounded stack usage. Whatever the purpose, we expect
308 * that the mask will be asserted only for short periods at a time,
309 * and so the likelihood of a 'spurious' IPI is suitably small.
310 * The mask is read before making an event upcall to the guest: a
311 * non-zero mask therefore guarantees that the VCPU will not receive
312 * an upcall activation. The mask is cleared when the VCPU requests
313 * to block: this avoids wakeup-waiting races.
314 */
315 uint8_t evtchn_upcall_pending;
316 uint8_t evtchn_upcall_mask;
317 unsigned long evtchn_pending_sel;
318 struct arch_vcpu_info arch;
319 struct vcpu_time_info time;
320}; /* 64 bytes (x86) */
321
322/*
323 * Xen/kernel shared data -- pointer provided in start_info.
324 * NB. We expect that this struct is smaller than a page.
325 */
326struct shared_info {
327 struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
328
329 /*
330 * A domain can create "event channels" on which it can send and receive
331 * asynchronous event notifications. There are three classes of event that
332 * are delivered by this mechanism:
333 * 1. Bi-directional inter- and intra-domain connections. Domains must
334 * arrange out-of-band to set up a connection (usually by allocating
 335 * an unbound 'listener' port and advertising that via a storage service
336 * such as xenstore).
337 * 2. Physical interrupts. A domain with suitable hardware-access
338 * privileges can bind an event-channel port to a physical interrupt
339 * source.
340 * 3. Virtual interrupts ('events'). A domain can bind an event-channel
341 * port to a virtual interrupt source, such as the virtual-timer
342 * device or the emergency console.
343 *
344 * Event channels are addressed by a "port index". Each channel is
345 * associated with two bits of information:
346 * 1. PENDING -- notifies the domain that there is a pending notification
347 * to be processed. This bit is cleared by the guest.
348 * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING
349 * will cause an asynchronous upcall to be scheduled. This bit is only
350 * updated by the guest. It is read-only within Xen. If a channel
351 * becomes pending while the channel is masked then the 'edge' is lost
352 * (i.e., when the channel is unmasked, the guest must manually handle
353 * pending notifications as no upcall will be scheduled by Xen).
354 *
355 * To expedite scanning of pending notifications, any 0->1 pending
356 * transition on an unmasked channel causes a corresponding bit in a
357 * per-vcpu selector word to be set. Each bit in the selector covers a
358 * 'C long' in the PENDING bitfield array.
359 */
360 unsigned long evtchn_pending[sizeof(unsigned long) * 8];
361 unsigned long evtchn_mask[sizeof(unsigned long) * 8];
362
363 /*
364 * Wallclock time: updated only by control software. Guests should base
365 * their gettimeofday() syscall on this wallclock-base value.
366 */
367 uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
368 uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
369 uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
370
371 struct arch_shared_info arch;
372
373};
374
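
/*
 * Illustrative sketch only: the two-level scan described above. The
 * per-VCPU selector word is drained first, then each covered word of
 * evtchn_pending is scanned against evtchn_mask. xchg(), __ffs() and
 * clear_bit() are ordinary kernel primitives; handle_port() is a
 * hypothetical dispatch callback.
 */
static inline void example_scan_pending(struct shared_info *s,
					struct vcpu_info *v,
					void (*handle_port)(unsigned int port))
{
	unsigned long sel, pending;
	unsigned int word, bit;

	sel = xchg(&v->evtchn_pending_sel, 0);
	while (sel) {
		word = __ffs(sel);
		sel &= ~(1UL << word);

		pending = s->evtchn_pending[word] & ~s->evtchn_mask[word];
		while (pending) {
			bit = __ffs(pending);
			pending &= ~(1UL << bit);

			/* The guest clears PENDING before dispatch (the
			 * in-tree code uses a synchronised bitop here). */
			clear_bit(bit, &s->evtchn_pending[word]);
			handle_port(word * BITS_PER_LONG + bit);
		}
	}
}
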
375/*
376 * Start-of-day memory layout for the initial domain (DOM0):
377 * 1. The domain is started within contiguous virtual-memory region.
378 * 2. The contiguous region begins and ends on an aligned 4MB boundary.
379 * 3. The region start corresponds to the load address of the OS image.
380 * If the load address is not 4MB aligned then the address is rounded down.
 381 * 4. This is the order of bootstrap elements in the initial virtual region:
382 * a. relocated kernel image
383 * b. initial ram disk [mod_start, mod_len]
384 * c. list of allocated page frames [mfn_list, nr_pages]
385 * d. start_info_t structure [register ESI (x86)]
386 * e. bootstrap page tables [pt_base, CR3 (x86)]
387 * f. bootstrap stack [register ESP (x86)]
388 * 5. Bootstrap elements are packed together, but each is 4kB-aligned.
389 * 6. The initial ram disk may be omitted.
390 * 7. The list of page frames forms a contiguous 'pseudo-physical' memory
391 * layout for the domain. In particular, the bootstrap virtual-memory
392 * region is a 1:1 mapping to the first section of the pseudo-physical map.
393 * 8. All bootstrap elements are mapped read-writable for the guest OS. The
394 * only exception is the bootstrap page table, which is mapped read-only.
395 * 9. There is guaranteed to be at least 512kB padding after the final
396 * bootstrap element. If necessary, the bootstrap virtual region is
397 * extended by an extra 4MB to ensure this.
398 */
399
400#define MAX_GUEST_CMDLINE 1024
401struct start_info {
402 /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */
403 char magic[32]; /* "xen-<version>-<platform>". */
404 unsigned long nr_pages; /* Total pages allocated to this domain. */
405 unsigned long shared_info; /* MACHINE address of shared info struct. */
406 uint32_t flags; /* SIF_xxx flags. */
407 unsigned long store_mfn; /* MACHINE page number of shared page. */
408 uint32_t store_evtchn; /* Event channel for store communication. */
409 union {
410 struct {
411 unsigned long mfn; /* MACHINE page number of console page. */
412 uint32_t evtchn; /* Event channel for console page. */
413 } domU;
414 struct {
415 uint32_t info_off; /* Offset of console_info struct. */
416 uint32_t info_size; /* Size of console_info struct from start.*/
417 } dom0;
418 } console;
419 /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */
420 unsigned long pt_base; /* VIRTUAL address of page directory. */
421 unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */
422 unsigned long mfn_list; /* VIRTUAL address of page-frame list. */
423 unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
424 unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
425 int8_t cmd_line[MAX_GUEST_CMDLINE];
426};
427
428/* These flags are passed in the 'flags' field of start_info_t. */
429#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
430#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
431
432typedef uint64_t cpumap_t;
433
434typedef uint8_t xen_domain_handle_t[16];
435
436/* Turn a plain number into a C unsigned long constant. */
437#define __mk_unsigned_long(x) x ## UL
438#define mk_unsigned_long(x) __mk_unsigned_long(x)
439
440#else /* __ASSEMBLY__ */
441
442/* In assembly code we cannot use C numeric constant suffixes. */
443#define mk_unsigned_long(x) x
444
445#endif /* !__ASSEMBLY__ */
446
447#endif /* __XEN_PUBLIC_XEN_H__ */
diff --git a/include/xen/page.h b/include/xen/page.h
new file mode 100644
index 000000000000..1df6c1930578
--- /dev/null
+++ b/include/xen/page.h
@@ -0,0 +1,179 @@
1#ifndef __XEN_PAGE_H
2#define __XEN_PAGE_H
3
4#include <linux/pfn.h>
5
6#include <asm/uaccess.h>
7
8#include <xen/features.h>
9
10#ifdef CONFIG_X86_PAE
11/* Xen machine address */
12typedef struct xmaddr {
13 unsigned long long maddr;
14} xmaddr_t;
15
16/* Xen pseudo-physical address */
17typedef struct xpaddr {
18 unsigned long long paddr;
19} xpaddr_t;
20#else
21/* Xen machine address */
22typedef struct xmaddr {
23 unsigned long maddr;
24} xmaddr_t;
25
26/* Xen pseudo-physical address */
27typedef struct xpaddr {
28 unsigned long paddr;
29} xpaddr_t;
30#endif
31
32#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
33#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
34
35/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
36#define INVALID_P2M_ENTRY (~0UL)
37#define FOREIGN_FRAME_BIT (1UL<<31)
38#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
39
40extern unsigned long *phys_to_machine_mapping;
41
42static inline unsigned long pfn_to_mfn(unsigned long pfn)
43{
44 if (xen_feature(XENFEAT_auto_translated_physmap))
45 return pfn;
46
47 return phys_to_machine_mapping[(unsigned int)(pfn)] &
48 ~FOREIGN_FRAME_BIT;
49}
50
51static inline int phys_to_machine_mapping_valid(unsigned long pfn)
52{
53 if (xen_feature(XENFEAT_auto_translated_physmap))
54 return 1;
55
56 return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
57}
58
59static inline unsigned long mfn_to_pfn(unsigned long mfn)
60{
61 unsigned long pfn;
62
63 if (xen_feature(XENFEAT_auto_translated_physmap))
64 return mfn;
65
66#if 0
67 if (unlikely((mfn >> machine_to_phys_order) != 0))
68 return max_mapnr;
69#endif
70
71 pfn = 0;
72 /*
73 * The array access can fail (e.g., device space beyond end of RAM).
74 * In such cases it doesn't matter what we return (we return garbage),
75 * but we must handle the fault without crashing!
76 */
77 __get_user(pfn, &machine_to_phys_mapping[mfn]);
78
79 return pfn;
80}
81
82static inline xmaddr_t phys_to_machine(xpaddr_t phys)
83{
84 unsigned offset = phys.paddr & ~PAGE_MASK;
85 return XMADDR(PFN_PHYS((u64)pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
86}
87
88static inline xpaddr_t machine_to_phys(xmaddr_t machine)
89{
90 unsigned offset = machine.maddr & ~PAGE_MASK;
91 return XPADDR(PFN_PHYS((u64)mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
92}
93
94/*
95 * We detect special mappings in one of two ways:
96 * 1. If the MFN is an I/O page then Xen will set the m2p entry
97 * to be outside our maximum possible pseudophys range.
98 * 2. If the MFN belongs to a different domain then we will certainly
99 * not have MFN in our p2m table. Conversely, if the page is ours,
100 * then we'll have p2m(m2p(MFN))==MFN.
101 * If we detect a special mapping then it doesn't have a 'struct page'.
102 * We force !pfn_valid() by returning an out-of-range pointer.
103 *
104 * NB. These checks require that, for any MFN that is not in our reservation,
105 * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
 106 * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
107 * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
108 *
109 * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
110 * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
111 * require. In all the cases we care about, the FOREIGN_FRAME bit is
112 * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
113 */
114static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
115{
116 extern unsigned long max_mapnr;
117 unsigned long pfn = mfn_to_pfn(mfn);
118 if ((pfn < max_mapnr)
119 && !xen_feature(XENFEAT_auto_translated_physmap)
120 && (phys_to_machine_mapping[pfn] != mfn))
121 return max_mapnr; /* force !pfn_valid() */
122 return pfn;
123}
124
125static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
126{
127 if (xen_feature(XENFEAT_auto_translated_physmap)) {
128 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
129 return;
130 }
131 phys_to_machine_mapping[pfn] = mfn;
132}
133
134/* VIRT <-> MACHINE conversion */
135#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
136#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v))))
137#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
138
139#ifdef CONFIG_X86_PAE
140#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
141 (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
142
143static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
144{
145 pte_t pte;
146
147 pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) |
148 (pgprot_val(pgprot) >> 32);
149 pte.pte_high &= (__supported_pte_mask >> 32);
150 pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
151 pte.pte_low &= __supported_pte_mask;
152
153 return pte;
154}
155
156static inline unsigned long long pte_val_ma(pte_t x)
157{
158 return ((unsigned long long)x.pte_high << 32) | x.pte_low;
159}
160#define pmd_val_ma(v) ((v).pmd)
161#define pud_val_ma(v) ((v).pgd.pgd)
162#define __pte_ma(x) ((pte_t) { .pte_low = (x), .pte_high = (x)>>32 } )
163#define __pmd_ma(x) ((pmd_t) { (x) } )
164#else /* !X86_PAE */
165#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
166#define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
167#define pte_val_ma(x) ((x).pte_low)
168#define pmd_val_ma(v) ((v).pud.pgd.pgd)
169#define __pte_ma(x) ((pte_t) { (x) } )
170#endif /* CONFIG_X86_PAE */
171
172#define pgd_val_ma(x) ((x).pgd)
173
174
175xmaddr_t arbitrary_virt_to_machine(unsigned long address);
176void make_lowmem_page_readonly(void *vaddr);
177void make_lowmem_page_readwrite(void *vaddr);
178
179#endif /* __XEN_PAGE_H */
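
To show how the helpers above compose, here is a hedged sketch (not part of the header): starting from a kernel-virtual buffer, a driver derives the pseudo-physical frame and then the machine frame that Xen and other domains actually address. example_share_page() and its pr_debug() output are hypothetical.

/* Illustrative only: go from a kernel buffer to the frames Xen wants. */
static void example_share_page(void *ring_page)
{
	unsigned long pfn = PFN_DOWN(__pa(ring_page));	/* pseudo-physical */
	unsigned long mfn = pfn_to_mfn(pfn);		/* machine frame */
	xmaddr_t maddr = virt_to_machine(ring_page);	/* full machine addr */

	/* A backend in another domain must be handed the MFN (or a grant
	 * reference to it), never the pseudo-physical or virtual address. */
	pr_debug("pfn %lx -> mfn %lx (maddr %llx)\n",
		 pfn, mfn, (unsigned long long)maddr.maddr);
}
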
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
new file mode 100644
index 000000000000..6f7c290651ae
--- /dev/null
+++ b/include/xen/xenbus.h
@@ -0,0 +1,234 @@
1/******************************************************************************
2 * xenbus.h
3 *
4 * Talks to Xen Store to figure out what devices we have.
5 *
6 * Copyright (C) 2005 Rusty Russell, IBM Corporation
7 * Copyright (C) 2005 XenSource Ltd.
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License version 2
11 * as published by the Free Software Foundation; or, when distributed
12 * separately from the Linux kernel or incorporated into other
13 * software packages, subject to the following license:
14 *
15 * Permission is hereby granted, free of charge, to any person obtaining a copy
16 * of this source file (the "Software"), to deal in the Software without
17 * restriction, including without limitation the rights to use, copy, modify,
18 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
19 * and to permit persons to whom the Software is furnished to do so, subject to
20 * the following conditions:
21 *
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 * IN THE SOFTWARE.
32 */
33
34#ifndef _XEN_XENBUS_H
35#define _XEN_XENBUS_H
36
37#include <linux/device.h>
38#include <linux/notifier.h>
39#include <linux/mutex.h>
40#include <linux/completion.h>
41#include <linux/init.h>
42#include <xen/interface/xen.h>
43#include <xen/interface/grant_table.h>
44#include <xen/interface/io/xenbus.h>
45#include <xen/interface/io/xs_wire.h>
46
47/* Register callback to watch this node. */
48struct xenbus_watch
49{
50 struct list_head list;
51
52 /* Path being watched. */
53 const char *node;
54
55 /* Callback (executed in a process context with no locks held). */
56 void (*callback)(struct xenbus_watch *,
57 const char **vec, unsigned int len);
58};
59
60
61/* A xenbus device. */
62struct xenbus_device {
63 const char *devicetype;
64 const char *nodename;
65 const char *otherend;
66 int otherend_id;
67 struct xenbus_watch otherend_watch;
68 struct device dev;
69 enum xenbus_state state;
70 struct completion down;
71};
72
73static inline struct xenbus_device *to_xenbus_device(struct device *dev)
74{
75 return container_of(dev, struct xenbus_device, dev);
76}
77
78struct xenbus_device_id
79{
80 /* .../device/<device_type>/<identifier> */
81 char devicetype[32]; /* General class of device. */
82};
83
84/* A xenbus driver. */
85struct xenbus_driver {
86 char *name;
87 struct module *owner;
88 const struct xenbus_device_id *ids;
89 int (*probe)(struct xenbus_device *dev,
90 const struct xenbus_device_id *id);
91 void (*otherend_changed)(struct xenbus_device *dev,
92 enum xenbus_state backend_state);
93 int (*remove)(struct xenbus_device *dev);
94 int (*suspend)(struct xenbus_device *dev);
95 int (*suspend_cancel)(struct xenbus_device *dev);
96 int (*resume)(struct xenbus_device *dev);
97 int (*uevent)(struct xenbus_device *, char **, int, char *, int);
98 struct device_driver driver;
99 int (*read_otherend_details)(struct xenbus_device *dev);
100};
101
102static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
103{
104 return container_of(drv, struct xenbus_driver, driver);
105}
106
107int __must_check __xenbus_register_frontend(struct xenbus_driver *drv,
108 struct module *owner,
109 const char *mod_name);
110
111static inline int __must_check
112xenbus_register_frontend(struct xenbus_driver *drv)
113{
114 WARN_ON(drv->owner != THIS_MODULE);
115 return __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME);
116}
117
118int __must_check __xenbus_register_backend(struct xenbus_driver *drv,
119 struct module *owner,
120 const char *mod_name);
121static inline int __must_check
122xenbus_register_backend(struct xenbus_driver *drv)
123{
124 WARN_ON(drv->owner != THIS_MODULE);
125 return __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME);
126}
127
128void xenbus_unregister_driver(struct xenbus_driver *drv);
129
130struct xenbus_transaction
131{
132 u32 id;
133};
134
135/* Nil transaction ID. */
136#define XBT_NIL ((struct xenbus_transaction) { 0 })
137
138int __init xenbus_dev_init(void);
139
140char **xenbus_directory(struct xenbus_transaction t,
141 const char *dir, const char *node, unsigned int *num);
142void *xenbus_read(struct xenbus_transaction t,
143 const char *dir, const char *node, unsigned int *len);
144int xenbus_write(struct xenbus_transaction t,
145 const char *dir, const char *node, const char *string);
146int xenbus_mkdir(struct xenbus_transaction t,
147 const char *dir, const char *node);
148int xenbus_exists(struct xenbus_transaction t,
149 const char *dir, const char *node);
150int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
151int xenbus_transaction_start(struct xenbus_transaction *t);
152int xenbus_transaction_end(struct xenbus_transaction t, int abort);
153
154/* Single read and scanf: returns -errno or num scanned if > 0. */
155int xenbus_scanf(struct xenbus_transaction t,
156 const char *dir, const char *node, const char *fmt, ...)
157 __attribute__((format(scanf, 4, 5)));
158
159/* Single printf and write: returns -errno or 0. */
160int xenbus_printf(struct xenbus_transaction t,
161 const char *dir, const char *node, const char *fmt, ...)
162 __attribute__((format(printf, 4, 5)));
163
164/* Generic read function: NULL-terminated triples of name,
165 * sprintf-style type string, and pointer. Returns 0 or errno.*/
166int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
167
 168/* notifier routines for when the xenstore comes up */
169extern int xenstored_ready;
170int register_xenstore_notifier(struct notifier_block *nb);
171void unregister_xenstore_notifier(struct notifier_block *nb);
172
173int register_xenbus_watch(struct xenbus_watch *watch);
174void unregister_xenbus_watch(struct xenbus_watch *watch);
175void xs_suspend(void);
176void xs_resume(void);
177void xs_suspend_cancel(void);
178
179/* Used by xenbus_dev to borrow kernel's store connection. */
180void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
181
182struct work_struct;
183
184/* Prepare for domain suspend: then resume or cancel the suspend. */
185void xenbus_suspend(void);
186void xenbus_resume(void);
187void xenbus_probe(struct work_struct *);
188void xenbus_suspend_cancel(void);
189
190#define XENBUS_IS_ERR_READ(str) ({ \
191 if (!IS_ERR(str) && strlen(str) == 0) { \
192 kfree(str); \
193 str = ERR_PTR(-ERANGE); \
194 } \
195 IS_ERR(str); \
196})
197
198#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
199
200int xenbus_watch_path(struct xenbus_device *dev, const char *path,
201 struct xenbus_watch *watch,
202 void (*callback)(struct xenbus_watch *,
203 const char **, unsigned int));
204int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
205 void (*callback)(struct xenbus_watch *,
206 const char **, unsigned int),
207 const char *pathfmt, ...)
208 __attribute__ ((format (printf, 4, 5)));
209
210int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
211int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
212int xenbus_map_ring_valloc(struct xenbus_device *dev,
213 int gnt_ref, void **vaddr);
214int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
215 grant_handle_t *handle, void *vaddr);
216
217int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
218int xenbus_unmap_ring(struct xenbus_device *dev,
219 grant_handle_t handle, void *vaddr);
220
221int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
222int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
223int xenbus_free_evtchn(struct xenbus_device *dev, int port);
224
225enum xenbus_state xenbus_read_driver_state(const char *path);
226
227void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...);
228void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...);
229
230const char *xenbus_strstate(enum xenbus_state state);
231int xenbus_dev_is_online(struct xenbus_device *dev);
232int xenbus_frontend_closed(struct xenbus_device *dev);
233
234#endif /* _XEN_XENBUS_H */
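
A hedged skeleton of a frontend driver built on these declarations; the "example" device type, the "feature-foo" backend key and the examplefront names are hypothetical, and the usual module boilerplate (linux/module.h) is assumed.

/* Illustrative only: minimal frontend driver skeleton. */
static int examplefront_probe(struct xenbus_device *dev,
			      const struct xenbus_device_id *id)
{
	unsigned int feature = 0;

	/* "feature-foo" is a made-up backend property, read for show. */
	if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-foo",
			 "%u", &feature) < 0)
		feature = 0;

	return xenbus_switch_state(dev, XenbusStateInitialised);
}

static void examplefront_otherend_changed(struct xenbus_device *dev,
					  enum xenbus_state backend_state)
{
	if (backend_state == XenbusStateConnected)
		xenbus_switch_state(dev, XenbusStateConnected);
}

static const struct xenbus_device_id examplefront_ids[] = {
	{ "example" },		/* hypothetical device type */
	{ "" }
};

static struct xenbus_driver examplefront = {
	.name			= "examplefront",
	.owner			= THIS_MODULE,
	.ids			= examplefront_ids,
	.probe			= examplefront_probe,
	.otherend_changed	= examplefront_otherend_changed,
};

static int __init examplefront_init(void)
{
	return xenbus_register_frontend(&examplefront);
}
module_init(examplefront_init);
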
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b4796d850140..57e6448b171e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf)
516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
517 envp[i] = NULL; 517 envp[i] = NULL;
518 518
519 call_usermodehelper(argv[0], argv, envp, 0); 519 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
520 kfree(pathbuf); 520 kfree(pathbuf);
521} 521}
522 522
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4d32eb077179..78d365c524ed 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -119,9 +119,10 @@ struct subprocess_info {
119 char **argv; 119 char **argv;
120 char **envp; 120 char **envp;
121 struct key *ring; 121 struct key *ring;
122 int wait; 122 enum umh_wait wait;
123 int retval; 123 int retval;
124 struct file *stdin; 124 struct file *stdin;
125 void (*cleanup)(char **argv, char **envp);
125}; 126};
126 127
127/* 128/*
@@ -180,6 +181,14 @@ static int ____call_usermodehelper(void *data)
180 do_exit(0); 181 do_exit(0);
181} 182}
182 183
184void call_usermodehelper_freeinfo(struct subprocess_info *info)
185{
186 if (info->cleanup)
187 (*info->cleanup)(info->argv, info->envp);
188 kfree(info);
189}
190EXPORT_SYMBOL(call_usermodehelper_freeinfo);
191
183/* Keventd can't block, but this (a child) can. */ 192/* Keventd can't block, but this (a child) can. */
184static int wait_for_helper(void *data) 193static int wait_for_helper(void *data)
185{ 194{
@@ -216,8 +225,8 @@ static int wait_for_helper(void *data)
216 sub_info->retval = ret; 225 sub_info->retval = ret;
217 } 226 }
218 227
219 if (sub_info->wait < 0) 228 if (sub_info->wait == UMH_NO_WAIT)
220 kfree(sub_info); 229 call_usermodehelper_freeinfo(sub_info);
221 else 230 else
222 complete(sub_info->complete); 231 complete(sub_info->complete);
223 return 0; 232 return 0;
@@ -229,34 +238,122 @@ static void __call_usermodehelper(struct work_struct *work)
229 struct subprocess_info *sub_info = 238 struct subprocess_info *sub_info =
230 container_of(work, struct subprocess_info, work); 239 container_of(work, struct subprocess_info, work);
231 pid_t pid; 240 pid_t pid;
232 int wait = sub_info->wait; 241 enum umh_wait wait = sub_info->wait;
233 242
234 /* CLONE_VFORK: wait until the usermode helper has execve'd 243 /* CLONE_VFORK: wait until the usermode helper has execve'd
235 * successfully We need the data structures to stay around 244 * successfully We need the data structures to stay around
236 * until that is done. */ 245 * until that is done. */
237 if (wait) 246 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
238 pid = kernel_thread(wait_for_helper, sub_info, 247 pid = kernel_thread(wait_for_helper, sub_info,
239 CLONE_FS | CLONE_FILES | SIGCHLD); 248 CLONE_FS | CLONE_FILES | SIGCHLD);
240 else 249 else
241 pid = kernel_thread(____call_usermodehelper, sub_info, 250 pid = kernel_thread(____call_usermodehelper, sub_info,
242 CLONE_VFORK | SIGCHLD); 251 CLONE_VFORK | SIGCHLD);
243 252
244 if (wait < 0) 253 switch (wait) {
245 return; 254 case UMH_NO_WAIT:
255 break;
246 256
247 if (pid < 0) { 257 case UMH_WAIT_PROC:
258 if (pid > 0)
259 break;
248 sub_info->retval = pid; 260 sub_info->retval = pid;
261 /* FALLTHROUGH */
262
263 case UMH_WAIT_EXEC:
249 complete(sub_info->complete); 264 complete(sub_info->complete);
250 } else if (!wait) 265 }
251 complete(sub_info->complete); 266}
267
268/**
269 * call_usermodehelper_setup - prepare to call a usermode helper
 270 * @path: path to usermode executable
 271 * @argv: arg vector for process
 272 * @envp: environment for process
273 *
274 * Returns either NULL on allocation failure, or a subprocess_info
275 * structure. This should be passed to call_usermodehelper_exec to
276 * exec the process and free the structure.
277 */
278struct subprocess_info *call_usermodehelper_setup(char *path,
279 char **argv, char **envp)
280{
281 struct subprocess_info *sub_info;
282 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
283 if (!sub_info)
284 goto out;
285
286 INIT_WORK(&sub_info->work, __call_usermodehelper);
287 sub_info->path = path;
288 sub_info->argv = argv;
289 sub_info->envp = envp;
290
291 out:
292 return sub_info;
252} 293}
294EXPORT_SYMBOL(call_usermodehelper_setup);
253 295
254/** 296/**
255 * call_usermodehelper_keys - start a usermode application 297 * call_usermodehelper_setkeys - set the session keys for usermode helper
256 * @path: pathname for the application 298 * @info: a subprocess_info returned by call_usermodehelper_setup
257 * @argv: null-terminated argument list 299 * @session_keyring: the session keyring for the process
258 * @envp: null-terminated environment list 300 */
259 * @session_keyring: session keyring for process (NULL for an empty keyring) 301void call_usermodehelper_setkeys(struct subprocess_info *info,
302 struct key *session_keyring)
303{
304 info->ring = session_keyring;
305}
306EXPORT_SYMBOL(call_usermodehelper_setkeys);
307
308/**
309 * call_usermodehelper_setcleanup - set a cleanup function
310 * @info: a subprocess_info returned by call_usermodehelper_setup
311 * @cleanup: a cleanup function
312 *
 313 * The cleanup function is called just before the subprocess_info is about to
 314 * be freed. This can be used for freeing the argv and envp. The
 315 * function must be runnable in either a process context or the
316 * context in which call_usermodehelper_exec is called.
317 */
318void call_usermodehelper_setcleanup(struct subprocess_info *info,
319 void (*cleanup)(char **argv, char **envp))
320{
321 info->cleanup = cleanup;
322}
323EXPORT_SYMBOL(call_usermodehelper_setcleanup);
324
325/**
326 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
327 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
328 * @filp: set to the write-end of a pipe
329 *
330 * This constructs a pipe, and sets the read end to be the stdin of the
331 * subprocess, and returns the write-end in *@filp.
332 */
333int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
334 struct file **filp)
335{
336 struct file *f;
337
338 f = create_write_pipe();
339 if (IS_ERR(f))
340 return PTR_ERR(f);
341 *filp = f;
342
343 f = create_read_pipe(f);
344 if (IS_ERR(f)) {
345 free_write_pipe(*filp);
346 return PTR_ERR(f);
347 }
348 sub_info->stdin = f;
349
350 return 0;
351}
352EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
353
354/**
355 * call_usermodehelper_exec - start a usermode application
 356 * @sub_info: information about the subprocess
260 * @wait: wait for the application to finish and return status. 357 * @wait: wait for the application to finish and return status.
261 * when -1 don't wait at all, but you get no useful error back when 358 * when -1 don't wait at all, but you get no useful error back when
262 * the program couldn't be exec'ed. This makes it safe to call 359 * the program couldn't be exec'ed. This makes it safe to call
@@ -265,81 +362,68 @@ static void __call_usermodehelper(struct work_struct *work)
265 * Runs a user-space application. The application is started 362 * Runs a user-space application. The application is started
266 * asynchronously if wait is not set, and runs as a child of keventd. 363 * asynchronously if wait is not set, and runs as a child of keventd.
267 * (ie. it runs with full root capabilities). 364 * (ie. it runs with full root capabilities).
268 *
269 * Must be called from process context. Returns a negative error code
270 * if program was not execed successfully, or 0.
271 */ 365 */
272int call_usermodehelper_keys(char *path, char **argv, char **envp, 366int call_usermodehelper_exec(struct subprocess_info *sub_info,
273 struct key *session_keyring, int wait) 367 enum umh_wait wait)
274{ 368{
275 DECLARE_COMPLETION_ONSTACK(done); 369 DECLARE_COMPLETION_ONSTACK(done);
276 struct subprocess_info *sub_info;
277 int retval; 370 int retval;
278 371
279 if (!khelper_wq) 372 if (sub_info->path[0] == '\0') {
280 return -EBUSY; 373 retval = 0;
281 374 goto out;
282 if (path[0] == '\0') 375 }
283 return 0;
284 376
285 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); 377 if (!khelper_wq) {
286 if (!sub_info) 378 retval = -EBUSY;
287 return -ENOMEM; 379 goto out;
380 }
288 381
289 INIT_WORK(&sub_info->work, __call_usermodehelper);
290 sub_info->complete = &done; 382 sub_info->complete = &done;
291 sub_info->path = path;
292 sub_info->argv = argv;
293 sub_info->envp = envp;
294 sub_info->ring = session_keyring;
295 sub_info->wait = wait; 383 sub_info->wait = wait;
296 384
297 queue_work(khelper_wq, &sub_info->work); 385 queue_work(khelper_wq, &sub_info->work);
298 if (wait < 0) /* task has freed sub_info */ 386 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
299 return 0; 387 return 0;
300 wait_for_completion(&done); 388 wait_for_completion(&done);
301 retval = sub_info->retval; 389 retval = sub_info->retval;
302 kfree(sub_info); 390
391 out:
392 call_usermodehelper_freeinfo(sub_info);
303 return retval; 393 return retval;
304} 394}
305EXPORT_SYMBOL(call_usermodehelper_keys); 395EXPORT_SYMBOL(call_usermodehelper_exec);
306 396
397/**
398 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
399 * @path: path to usermode executable
400 * @argv: arg vector for process
401 * @envp: environment for process
402 * @filp: set to the write-end of a pipe
403 *
404 * This is a simple wrapper which executes a usermode-helper function
405 * with a pipe as stdin. It is implemented entirely in terms of
406 * lower-level call_usermodehelper_* functions.
407 */
307int call_usermodehelper_pipe(char *path, char **argv, char **envp, 408int call_usermodehelper_pipe(char *path, char **argv, char **envp,
308 struct file **filp) 409 struct file **filp)
309{ 410{
310 DECLARE_COMPLETION(done); 411 struct subprocess_info *sub_info;
311 struct subprocess_info sub_info = { 412 int ret;
312 .work = __WORK_INITIALIZER(sub_info.work,
313 __call_usermodehelper),
314 .complete = &done,
315 .path = path,
316 .argv = argv,
317 .envp = envp,
318 .retval = 0,
319 };
320 struct file *f;
321 413
322 if (!khelper_wq) 414 sub_info = call_usermodehelper_setup(path, argv, envp);
323 return -EBUSY; 415 if (sub_info == NULL)
416 return -ENOMEM;
324 417
325 if (path[0] == '\0') 418 ret = call_usermodehelper_stdinpipe(sub_info, filp);
326 return 0; 419 if (ret < 0)
420 goto out;
327 421
328 f = create_write_pipe(); 422 return call_usermodehelper_exec(sub_info, 1);
329 if (IS_ERR(f))
330 return PTR_ERR(f);
331 *filp = f;
332
333 f = create_read_pipe(f);
334 if (IS_ERR(f)) {
335 free_write_pipe(*filp);
336 return PTR_ERR(f);
337 }
338 sub_info.stdin = f;
339 423
340 queue_work(khelper_wq, &sub_info.work); 424 out:
341 wait_for_completion(&done); 425 call_usermodehelper_freeinfo(sub_info);
342 return sub_info.retval; 426 return ret;
343} 427}
344EXPORT_SYMBOL(call_usermodehelper_pipe); 428EXPORT_SYMBOL(call_usermodehelper_pipe);
345 429
diff --git a/kernel/sys.c b/kernel/sys.c
index 4d141ae3e802..18987c7f6add 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2286,3 +2286,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2286 } 2286 }
2287 return err ? -EFAULT : 0; 2287 return err ? -EFAULT : 0;
2288} 2288}
2289
2290char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2291
2292static void argv_cleanup(char **argv, char **envp)
2293{
2294 argv_free(argv);
2295}
2296
2297/**
2298 * orderly_poweroff - Trigger an orderly system poweroff
2299 * @force: force poweroff if command execution fails
2300 *
2301 * This may be called from any context to trigger a system shutdown.
2302 * If the orderly shutdown fails, it will force an immediate shutdown.
2303 */
2304int orderly_poweroff(bool force)
2305{
2306 int argc;
2307 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2308 static char *envp[] = {
2309 "HOME=/",
2310 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2311 NULL
2312 };
2313 int ret = -ENOMEM;
2314 struct subprocess_info *info;
2315
2316 if (argv == NULL) {
2317 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2318 __func__, poweroff_cmd);
2319 goto out;
2320 }
2321
2322 info = call_usermodehelper_setup(argv[0], argv, envp);
2323 if (info == NULL) {
2324 argv_free(argv);
2325 goto out;
2326 }
2327
2328 call_usermodehelper_setcleanup(info, argv_cleanup);
2329
2330 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
2331
2332 out:
2333 if (ret && force) {
2334 printk(KERN_WARNING "Failed to start orderly shutdown: "
2335 "forcing the issue\n");
2336
2337 /* I guess this should try to kick off some daemon to
2338 sync and poweroff asap. Or not even bother syncing
2339 if we're doing an emergency shutdown? */
2340 emergency_sync();
2341 kernel_power_off();
2342 }
2343
2344 return ret;
2345}
2346EXPORT_SYMBOL_GPL(orderly_poweroff);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7063ebc6db05..44a1d699aad7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -46,6 +46,7 @@
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
48#include <linux/acpi.h> 48#include <linux/acpi.h>
49#include <linux/reboot.h>
49 50
50#include <asm/uaccess.h> 51#include <asm/uaccess.h>
51#include <asm/processor.h> 52#include <asm/processor.h>
@@ -705,6 +706,15 @@ static ctl_table kern_table[] = {
705 .proc_handler = &proc_dointvec, 706 .proc_handler = &proc_dointvec,
706 }, 707 },
707#endif 708#endif
709 {
710 .ctl_name = CTL_UNNUMBERED,
711 .procname = "poweroff_cmd",
712 .data = &poweroff_cmd,
713 .maxlen = POWEROFF_CMD_PATH_LEN,
714 .mode = 0644,
715 .proc_handler = &proc_dostring,
716 .strategy = &sysctl_string,
717 },
708 718
709 { .ctl_name = 0 } 719 { .ctl_name = 0 }
710}; 720};
diff --git a/lib/Makefile b/lib/Makefile
index da68b2ca0606..614966387402 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -5,7 +5,7 @@
5lib-y := ctype.o string.o vsprintf.o cmdline.o \ 5lib-y := ctype.o string.o vsprintf.o cmdline.o \
6 rbtree.o radix-tree.o dump_stack.o \ 6 rbtree.o radix-tree.o dump_stack.o \
7 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ 7 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
8 sha1.o irq_regs.o reciprocal_div.o 8 sha1.o irq_regs.o reciprocal_div.o argv_split.o
9 9
10lib-$(CONFIG_MMU) += ioremap.o 10lib-$(CONFIG_MMU) += ioremap.o
11lib-$(CONFIG_SMP) += cpumask.o 11lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/argv_split.c b/lib/argv_split.c
new file mode 100644
index 000000000000..4096ed42f490
--- /dev/null
+++ b/lib/argv_split.c
@@ -0,0 +1,105 @@
1/*
2 * Helper function for splitting a string into an argv-like array.
3 */
4
5#include <linux/kernel.h>
6#include <linux/ctype.h>
7#include <linux/bug.h>
8
9static const char *skip_sep(const char *cp)
10{
11 while (*cp && isspace(*cp))
12 cp++;
13
14 return cp;
15}
16
17static const char *skip_arg(const char *cp)
18{
19 while (*cp && !isspace(*cp))
20 cp++;
21
22 return cp;
23}
24
25static int count_argc(const char *str)
26{
27 int count = 0;
28
29 while (*str) {
30 str = skip_sep(str);
31 if (*str) {
32 count++;
33 str = skip_arg(str);
34 }
35 }
36
37 return count;
38}
39
40/**
41 * argv_free - free an argv
 42 * @argv: the argument vector to be freed
43 *
44 * Frees an argv and the strings it points to.
45 */
46void argv_free(char **argv)
47{
48 char **p;
49 for (p = argv; *p; p++)
50 kfree(*p);
51
52 kfree(argv);
53}
54EXPORT_SYMBOL(argv_free);
55
56/**
57 * argv_split - split a string at whitespace, returning an argv
58 * @gfp: the GFP mask used to allocate memory
59 * @str: the string to be split
60 * @argcp: returned argument count
61 *
62 * Returns an array of pointers to strings which are split out from
63 * @str. This is performed by strictly splitting on white-space; no
64 * quote processing is performed. Multiple whitespace characters are
65 * considered to be a single argument separator. The returned array
66 * is always NULL-terminated. Returns NULL on memory allocation
67 * failure.
68 */
69char **argv_split(gfp_t gfp, const char *str, int *argcp)
70{
71 int argc = count_argc(str);
72 char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp);
73 char **argvp;
74
75 if (argv == NULL)
76 goto out;
77
78 *argcp = argc;
79 argvp = argv;
80
81 while (*str) {
82 str = skip_sep(str);
83
84 if (*str) {
85 const char *p = str;
86 char *t;
87
88 str = skip_arg(str);
89
90 t = kstrndup(p, str-p, gfp);
91 if (t == NULL)
92 goto fail;
93 *argvp++ = t;
94 }
95 }
96 *argvp = NULL;
97
98 out:
99 return argv;
100
101 fail:
102 argv_free(argv);
103 return NULL;
104}
105EXPORT_SYMBOL(argv_split);
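
A minimal usage sketch for the helpers above; example_argv() and its pr_debug() output are hypothetical.

/* Illustrative only: split a command line, use it, and free it. */
static int example_argv(const char *cmdline)
{
	int argc;
	char **argv = argv_split(GFP_KERNEL, cmdline, &argc);

	if (!argv)
		return -ENOMEM;

	pr_debug("argc=%d argv[0]=%s\n", argc, argc ? argv[0] : "(none)");

	argv_free(argv);
	return 0;
}
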
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 12e311dc664c..bd5ecbbafab1 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -208,7 +208,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
208 argv [0] = uevent_helper; 208 argv [0] = uevent_helper;
209 argv [1] = (char *)subsystem; 209 argv [1] = (char *)subsystem;
210 argv [2] = NULL; 210 argv [2] = NULL;
211 call_usermodehelper (argv[0], argv, envp, 0); 211 call_usermodehelper (argv[0], argv, envp, UMH_WAIT_EXEC);
212 } 212 }
213 213
214exit: 214exit:
diff --git a/mm/util.c b/mm/util.c
index 78f3783bdcc8..bf340d806868 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,7 +6,6 @@
6 6
7/** 7/**
8 * kstrdup - allocate space for and copy an existing string 8 * kstrdup - allocate space for and copy an existing string
9 *
10 * @s: the string to duplicate 9 * @s: the string to duplicate
11 * @gfp: the GFP mask used in the kmalloc() call when allocating memory 10 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
12 */ 11 */
@@ -27,6 +26,30 @@ char *kstrdup(const char *s, gfp_t gfp)
27EXPORT_SYMBOL(kstrdup); 26EXPORT_SYMBOL(kstrdup);
28 27
29/** 28/**
29 * kstrndup - allocate space for and copy an existing string
30 * @s: the string to duplicate
31 * @max: read at most @max chars from @s
32 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
33 */
34char *kstrndup(const char *s, size_t max, gfp_t gfp)
35{
36 size_t len;
37 char *buf;
38
39 if (!s)
40 return NULL;
41
42 len = strnlen(s, max);
43 buf = kmalloc_track_caller(len+1, gfp);
44 if (buf) {
45 memcpy(buf, s, len);
46 buf[len] = '\0';
47 }
48 return buf;
49}
50EXPORT_SYMBOL(kstrndup);
51
52/**
30 * kmemdup - duplicate region of memory 53 * kmemdup - duplicate region of memory
31 * 54 *
32 * @src: memory region to duplicate 55 * @src: memory region to duplicate
@@ -80,7 +103,6 @@ EXPORT_SYMBOL(krealloc);
80 103
81/* 104/*
82 * strndup_user - duplicate an existing string from user space 105 * strndup_user - duplicate an existing string from user space
83 *
84 * @s: The string to duplicate 106 * @s: The string to duplicate
85 * @n: Maximum number of bytes to copy, including the trailing NUL. 107 * @n: Maximum number of bytes to copy, including the trailing NUL.
86 */ 108 */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8e05a11155c9..3130c343088f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -767,3 +767,56 @@ EXPORT_SYMBOL(remap_vmalloc_range);
767void __attribute__((weak)) vmalloc_sync_all(void) 767void __attribute__((weak)) vmalloc_sync_all(void)
768{ 768{
769} 769}
770
771
772static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
773{
774 /* apply_to_page_range() does all the hard work. */
775 return 0;
776}
777
778/**
779 * alloc_vm_area - allocate a range of kernel address space
780 * @size: size of the area
781 * @returns: NULL on failure, vm_struct on success
782 *
783 * This function reserves a range of kernel address space, and
784 * allocates pagetables to map that range. No actual mappings
785 * are created. If the kernel address space is not shared
786 * between processes, it syncs the pagetable across all
787 * processes.
788 */
789struct vm_struct *alloc_vm_area(size_t size)
790{
791 struct vm_struct *area;
792
793 area = get_vm_area(size, VM_IOREMAP);
794 if (area == NULL)
795 return NULL;
796
797 /*
798 * This ensures that page tables are constructed for this region
799 * of kernel virtual address space and mapped into init_mm.
800 */
801 if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
802 area->size, f, NULL)) {
803 free_vm_area(area);
804 return NULL;
805 }
806
807 /* Make sure the pagetables are constructed in process kernel
808 mappings */
809 vmalloc_sync_all();
810
811 return area;
812}
813EXPORT_SYMBOL_GPL(alloc_vm_area);
814
815void free_vm_area(struct vm_struct *area)
816{
817 struct vm_struct *ret;
818 ret = remove_vm_area(area->addr);
819 BUG_ON(ret != area);
820 kfree(area);
821}
822EXPORT_SYMBOL_GPL(free_vm_area);
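
A brief usage sketch, assuming a caller such as the Xen ring-mapping code above: it reserves address space here, installs its own PTEs into the returned area, and eventually releases it. example_reserve() and example_release() are hypothetical.

/* Illustrative only: reserve kernel address space without backing pages. */
static void *example_reserve(size_t size, struct vm_struct **out)
{
	struct vm_struct *area = alloc_vm_area(size);

	if (!area)
		return NULL;

	/* area->addr now has live page tables but no mappings; the caller
	 * installs its own PTEs (e.g. hypervisor grant mappings). */
	*out = area;
	return area->addr;
}

static void example_release(struct vm_struct *area)
{
	free_vm_area(area);
}
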
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index a786e7863200..1ea2f86f7683 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -125,7 +125,7 @@ static void br_stp_start(struct net_bridge *br)
125 char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; 125 char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
126 char *envp[] = { NULL }; 126 char *envp[] = { NULL };
127 127
128 r = call_usermodehelper(BR_STP_PROG, argv, envp, 1); 128 r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
129 if (r == 0) { 129 if (r == 0) {
130 br->stp_enabled = BR_USER_STP; 130 br->stp_enabled = BR_USER_STP;
131 printk(KERN_INFO "%s: userspace STP started\n", br->dev->name); 131 printk(KERN_INFO "%s: userspace STP started\n", br->dev->name);
diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c
index 4adaae242b9e..cf302457097b 100644
--- a/net/irda/irias_object.c
+++ b/net/irda/irias_object.c
@@ -36,39 +36,6 @@ hashbin_t *irias_objects;
36 */ 36 */
37struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}}; 37struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}};
38 38
39/*
40 * Function strndup (str, max)
41 *
42 * My own kernel version of strndup!
43 *
44 * Faster, check boundary... Jean II
45 */
46static char *strndup(char *str, size_t max)
47{
48 char *new_str;
49 int len;
50
51 /* Check string */
52 if (str == NULL)
53 return NULL;
54 /* Check length, truncate */
55 len = strlen(str);
56 if(len > max)
57 len = max;
58
59 /* Allocate new string */
60 new_str = kmalloc(len + 1, GFP_ATOMIC);
61 if (new_str == NULL) {
62 IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__);
63 return NULL;
64 }
65
66 /* Copy and truncate */
67 memcpy(new_str, str, len);
68 new_str[len] = '\0';
69
70 return new_str;
71}
72 39
73/* 40/*
74 * Function ias_new_object (name, id) 41 * Function ias_new_object (name, id)
@@ -90,7 +57,7 @@ struct ias_object *irias_new_object( char *name, int id)
90 } 57 }
91 58
92 obj->magic = IAS_OBJECT_MAGIC; 59 obj->magic = IAS_OBJECT_MAGIC;
93 obj->name = strndup(name, IAS_MAX_CLASSNAME); 60 obj->name = kstrndup(name, IAS_MAX_CLASSNAME, GFP_ATOMIC);
94 if (!obj->name) { 61 if (!obj->name) {
95 IRDA_WARNING("%s(), Unable to allocate name!\n", 62 IRDA_WARNING("%s(), Unable to allocate name!\n",
96 __FUNCTION__); 63 __FUNCTION__);
@@ -360,7 +327,7 @@ void irias_add_integer_attrib(struct ias_object *obj, char *name, int value,
360 } 327 }
361 328
362 attrib->magic = IAS_ATTRIB_MAGIC; 329 attrib->magic = IAS_ATTRIB_MAGIC;
363 attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); 330 attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
364 331
365 /* Insert value */ 332 /* Insert value */
366 attrib->value = irias_new_integer_value(value); 333 attrib->value = irias_new_integer_value(value);
@@ -404,7 +371,7 @@ void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets,
404 } 371 }
405 372
406 attrib->magic = IAS_ATTRIB_MAGIC; 373 attrib->magic = IAS_ATTRIB_MAGIC;
407 attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); 374 attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
408 375
409 attrib->value = irias_new_octseq_value( octets, len); 376 attrib->value = irias_new_octseq_value( octets, len);
410 if (!attrib->name || !attrib->value) { 377 if (!attrib->name || !attrib->value) {
@@ -446,7 +413,7 @@ void irias_add_string_attrib(struct ias_object *obj, char *name, char *value,
446 } 413 }
447 414
448 attrib->magic = IAS_ATTRIB_MAGIC; 415 attrib->magic = IAS_ATTRIB_MAGIC;
449 attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); 416 attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
450 417
451 attrib->value = irias_new_string_value(value); 418 attrib->value = irias_new_string_value(value);
452 if (!attrib->name || !attrib->value) { 419 if (!attrib->name || !attrib->value) {
@@ -506,7 +473,7 @@ struct ias_value *irias_new_string_value(char *string)
506 473
507 value->type = IAS_STRING; 474 value->type = IAS_STRING;
508 value->charset = CS_ASCII; 475 value->charset = CS_ASCII;
509 value->t.string = strndup(string, IAS_MAX_STRING); 476 value->t.string = kstrndup(string, IAS_MAX_STRING, GFP_ATOMIC);
510 if (!value->t.string) { 477 if (!value->t.string) {
511 IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); 478 IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__);
512 kfree(value); 479 kfree(value);
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index f573ac189a0a..557500110a13 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -108,7 +108,8 @@ static int call_sbin_request_key(struct key *key,
108 argv[i] = NULL; 108 argv[i] = NULL;
109 109
110 /* do it */ 110 /* do it */
111 ret = call_usermodehelper_keys(argv[0], argv, envp, keyring, 1); 111 ret = call_usermodehelper_keys(argv[0], argv, envp, keyring,
112 UMH_WAIT_PROC);
112 113
113error_link: 114error_link:
114 key_put(keyring); 115 key_put(keyring);