aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/xen
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Kconfig11
-rw-r--r--arch/x86/xen/Makefile4
-rw-r--r--arch/x86/xen/enlighten.c1146
-rw-r--r--arch/x86/xen/events.c591
-rw-r--r--arch/x86/xen/features.c29
-rw-r--r--arch/x86/xen/manage.c143
-rw-r--r--arch/x86/xen/mmu.c567
-rw-r--r--arch/x86/xen/mmu.h60
-rw-r--r--arch/x86/xen/multicalls.c90
-rw-r--r--arch/x86/xen/multicalls.h45
-rw-r--r--arch/x86/xen/setup.c111
-rw-r--r--arch/x86/xen/smp.c404
-rw-r--r--arch/x86/xen/time.c593
-rw-r--r--arch/x86/xen/vdso.h4
-rw-r--r--arch/x86/xen/xen-asm.S291
-rw-r--r--arch/x86/xen/xen-head.S38
-rw-r--r--arch/x86/xen/xen-ops.h71
17 files changed, 4198 insertions, 0 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
new file mode 100644
index 000000000000..9df99e1885a4
--- /dev/null
+++ b/arch/x86/xen/Kconfig
@@ -0,0 +1,11 @@
#
# This Kconfig describes xen options
#

config XEN
	bool "Enable support for Xen hypervisor"
	# NOTE(review): requires PARAVIRT (hooks), CMPXCHG/TSC (assumed by the
	# port's synchronization and clock code), and no NUMA — confirm against
	# the rest of the series if relaxing any of these.
	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
	help
	  This is the Linux Xen port.  Enabling this will allow the
	  kernel to boot in a paravirtualized environment under the
	  Xen hypervisor.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
new file mode 100644
index 000000000000..343df246bd3e
--- /dev/null
+++ b/arch/x86/xen/Makefile
@@ -0,0 +1,4 @@
# Core Xen guest support; smp.o is built only for CONFIG_SMP kernels.
obj-y		:= enlighten.o setup.o features.o multicalls.o mmu.o \
			events.o time.o manage.o xen-asm.o

obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
new file mode 100644
index 000000000000..f01bfcd4bdee
--- /dev/null
+++ b/arch/x86/xen/enlighten.c
@@ -0,0 +1,1146 @@
1/*
2 * Core of Xen paravirt_ops implementation.
3 *
4 * This file contains the xen_paravirt_ops structure itself, and the
5 * implementations for:
6 * - privileged instructions
7 * - interrupt flags
8 * - segment operations
9 * - booting and setup
10 *
11 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
12 */
13
14#include <linux/kernel.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17#include <linux/preempt.h>
18#include <linux/hardirq.h>
19#include <linux/percpu.h>
20#include <linux/delay.h>
21#include <linux/start_kernel.h>
22#include <linux/sched.h>
23#include <linux/bootmem.h>
24#include <linux/module.h>
25#include <linux/mm.h>
26#include <linux/page-flags.h>
27#include <linux/highmem.h>
28#include <linux/smp.h>
29
30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h>
35#include <xen/page.h>
36
37#include <asm/paravirt.h>
38#include <asm/page.h>
39#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h>
42#include <asm/processor.h>
43#include <asm/setup.h>
44#include <asm/desc.h>
45#include <asm/pgtable.h>
46#include <asm/tlbflush.h>
47#include <asm/reboot.h>
48
49#include "xen-ops.h"
50#include "mmu.h"
51#include "multicalls.h"
52
/* Hypercall trampoline page (presumably set up in xen-head.S — the
   assembly half of this port). */
EXPORT_SYMBOL_GPL(hypercall_page);

/* Per-cpu lazy batching mode; see xen_set_lazy_mode()/xen_mc_issue(). */
DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);

/* Pointer to this cpu's vcpu_info: either the shared_info slot or,
   when placement succeeds, the percpu xen_vcpu_info copy below. */
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
/* Shadow of this cpu's current pagetable base (physical address). */
DEFINE_PER_CPU(unsigned long, xen_cr3);

struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

static /* __initdata */ struct shared_info dummy_shared_info;

/*
 * Point at some empty memory to start with. We map the real shared_info
 * page as soon as fixmap is up and running.
 */
struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;

/*
 * Flag to determine whether vcpu info placement is available on all
 * VCPUs.  We assume it is to start with, and then set it to zero on
 * the first failure.  This is because it can succeed on some VCPUs
 * and not others, since it can involve hypervisor memory allocation,
 * or because the guest failed to guarantee all the appropriate
 * constraints on all VCPUs (ie buffer can't cross a page boundary).
 *
 * Note that any particular CPU may be using a placed vcpu structure,
 * but we can only optimise if they all are.
 *
 * 0: not available, 1: available
 */
static int have_vcpu_info_placement = 1;
86
/*
 * Initialize the vcpu_info pointer for @cpu.  Always falls back to the
 * slot embedded in the shared_info page; if placement is still believed
 * available, additionally ask the hypervisor to place the vcpu_info
 * inside our percpu copy so it can be accessed directly.
 */
static void __init xen_vcpu_setup(int cpu)
{
	struct vcpu_register_vcpu_info info;
	int err;
	struct vcpu_info *vcpup;

	/* Safe default: point at the shared_info page's slot. */
	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];

	if (!have_vcpu_info_placement)
		return;		/* already tested, not available */

	vcpup = &per_cpu(xen_vcpu_info, cpu);

	info.mfn = virt_to_mfn(vcpup);
	info.offset = offset_in_page(vcpup);

	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
	       cpu, vcpup, info.mfn, info.offset);

	/* Check to see if the hypervisor will put the vcpu_info
	   structure where we want it, which allows direct access via
	   a percpu-variable. */
	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);

	if (err) {
		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
		/* one failure disables placement everywhere; see the
		   comment on have_vcpu_info_placement */
		have_vcpu_info_placement = 0;
	} else {
		/* This cpu is using the registered vcpu info, even if
		   later ones fail to. */
		per_cpu(xen_vcpu, cpu) = vcpup;

		printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
		       cpu, vcpup);
	}
}
123
124static void __init xen_banner(void)
125{
126 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
127 paravirt_ops.name);
128 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
129}
130
/*
 * cpuid for a Xen guest: issue the hypervisor-intercepted cpuid (the
 * XEN_EMULATE_PREFIX escape sequence) and filter the leaf-1 feature
 * bits so unsupported kernel subsystems stay disabled.
 */
static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
		      unsigned int *ecx, unsigned int *edx)
{
	unsigned maskedx = ~0;

	/*
	 * Mask out inconvenient features, to try and disable as many
	 * unsupported kernel subsystems as possible.
	 */
	if (*eax == 1)
		maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
			    (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
			    (1 << X86_FEATURE_ACC));   /* thermal monitoring */

	asm(XEN_EMULATE_PREFIX "cpuid"
	    : "=a" (*eax),
	      "=b" (*ebx),
	      "=c" (*ecx),
	      "=d" (*edx)
	    : "0" (*eax), "2" (*ecx));
	*edx &= maskedx;	/* only EDX is filtered */
}
153
/* Debug registers are privileged; route accesses through hypercalls. */
static void xen_set_debugreg(int reg, unsigned long val)
{
	HYPERVISOR_set_debugreg(reg, val);
}

static unsigned long xen_get_debugreg(int reg)
{
	return HYPERVISOR_get_debugreg(reg);
}
163
/*
 * Return the interrupt-enable state in X86_EFLAGS_IF form, derived
 * from this vcpu's event channel mask (which has the opposite sense).
 */
static unsigned long xen_save_fl(void)
{
	struct vcpu_info *vcpu;
	unsigned long flags;

	vcpu = x86_read_percpu(xen_vcpu);

	/* flag has opposite sense of mask */
	flags = !vcpu->evtchn_upcall_mask;

	/* convert to IF type flag
	   -0 -> 0x00000000
	   -1 -> 0xffffffff
	*/
	return (-flags) & X86_EFLAGS_IF;
}
180
/*
 * Restore the interrupt-enable state from an X86_EFLAGS_IF-style
 * flags word by updating the vcpu's event channel mask, then deliver
 * any event that arrived while events were masked.
 */
static void xen_restore_fl(unsigned long flags)
{
	struct vcpu_info *vcpu;

	/* convert from IF type flag */
	flags = !(flags & X86_EFLAGS_IF);

	/* There's a one instruction preempt window here.  We need to
	   make sure we don't switch CPUs between getting the vcpu
	   pointer and updating the mask. */
	preempt_disable();
	vcpu = x86_read_percpu(xen_vcpu);
	vcpu->evtchn_upcall_mask = flags;
	preempt_enable_no_resched();

	/* Doesn't matter if we get preempted here, because any
	   pending event will get dealt with anyway. */

	if (flags == 0) {
		preempt_check_resched();
		barrier(); /* unmask then check (avoid races) */
		if (unlikely(vcpu->evtchn_upcall_pending))
			force_evtchn_callback();
	}
}
206
/* Mask event delivery for this vcpu (the Xen analogue of cli). */
static void xen_irq_disable(void)
{
	/* There's a one instruction preempt window here.  We need to
	   make sure we don't switch CPUs between getting the vcpu
	   pointer and updating the mask. */
	preempt_disable();
	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
	preempt_enable_no_resched();
}
216
/* Unmask event delivery for this vcpu (the Xen analogue of sti),
   then deliver any event that became pending while masked. */
static void xen_irq_enable(void)
{
	struct vcpu_info *vcpu;

	/* There's a one instruction preempt window here.  We need to
	   make sure we don't switch CPUs between getting the vcpu
	   pointer and updating the mask. */
	preempt_disable();
	vcpu = x86_read_percpu(xen_vcpu);
	vcpu->evtchn_upcall_mask = 0;
	preempt_enable_no_resched();

	/* Doesn't matter if we get preempted here, because any
	   pending event will get dealt with anyway. */

	barrier(); /* unmask then check (avoid races) */
	if (unlikely(vcpu->evtchn_upcall_pending))
		force_evtchn_callback();
}
236
237static void xen_safe_halt(void)
238{
239 /* Blocking includes an implicit local_irq_enable(). */
240 if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
241 BUG();
242}
243
244static void xen_halt(void)
245{
246 if (irqs_disabled())
247 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
248 else
249 xen_safe_halt();
250}
251
/*
 * Switch the per-cpu multicall batching mode.  Entering or leaving a
 * lazy mode flushes the pending multicall queue; PARAVIRT_LAZY_FLUSH
 * is a pure flush request and leaves the current mode unchanged.
 */
static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
{
	BUG_ON(preemptible());

	switch (mode) {
	case PARAVIRT_LAZY_NONE:
		/* leaving lazy mode: must currently be in one */
		BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
		break;

	case PARAVIRT_LAZY_MMU:
	case PARAVIRT_LAZY_CPU:
		/* entering lazy mode: must not already be in one */
		BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
		break;

	case PARAVIRT_LAZY_FLUSH:
		/* flush if necessary, but don't change state */
		if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
			xen_mc_flush();
		return;
	}

	xen_mc_flush();
	x86_write_percpu(xen_lazy_mode, mode);
}
276
static unsigned long xen_store_tr(void)
{
	/* Nothing to store: Xen manages the task register, so the
	   kernel-visible value is simply a null selector. */
	return 0UL;
}
281
/*
 * Install an LDT via a batched MMUEXT_SET_LDT operation.  A NULL/0
 * address clears the LDT.
 */
static void xen_set_ldt(const void *addr, unsigned entries)
{
	unsigned long linear_addr = (unsigned long)addr;
	struct mmuext_op *op;
	struct multicall_space mcs = xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = MMUEXT_SET_LDT;
	if (linear_addr) {
		/* ldt may be vmalloced, use arbitrary_virt_to_machine */
		xmaddr_t maddr;
		maddr = arbitrary_virt_to_machine((unsigned long)addr);
		linear_addr = (unsigned long)maddr.maddr;
	}
	op->arg1.linear_addr = linear_addr;
	op->arg2.nr_ents = entries;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	/* issue immediately unless batching in CPU lazy mode */
	xen_mc_issue(PARAVIRT_LAZY_CPU);
}
303
/*
 * Load a new GDT.  Xen requires GDT frames to be read-only and
 * registered by machine frame number, so mark each page RO and hand
 * the frame list to the hypervisor.
 */
static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
{
	unsigned long *frames;
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	int f;
	struct multicall_space mcs;

	/* A GDT can be up to 64k in size, which corresponds to 8192
	   8-byte entries, or 16 4k pages.. */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);	/* must be page-aligned */

	mcs = xen_mc_entry(sizeof(*frames) * pages);
	frames = mcs.args;

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		frames[f] = virt_to_mfn(va);
		make_lowmem_page_readonly((void *)va);
	}

	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}
331
/* Queue an update of one TLS slot in @cpu's GDT (no issue here; the
   caller batches all three slots into one multicall flush). */
static void load_TLS_descriptor(struct thread_struct *t,
				unsigned int cpu, unsigned int i)
{
	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
	xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
	struct multicall_space mc = __xen_mc_entry(0);

	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
341
/* Load all three TLS descriptors for @cpu as a single batch. */
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
	xen_mc_batch();

	load_TLS_descriptor(t, cpu, 0);
	load_TLS_descriptor(t, cpu, 1);
	load_TLS_descriptor(t, cpu, 2);

	xen_mc_issue(PARAVIRT_LAZY_CPU);

	/*
	 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
	 * it means we're in a context switch, and %gs has just been
	 * saved.  This means we can zero it out to prevent faults on
	 * exit from the hypervisor if the next process has no %gs.
	 * Either way, it has been saved, and the new value will get
	 * loaded properly.  This will go away as soon as Xen has been
	 * modified to not save/restore %gs for normal hypercalls.
	 */
	if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
		loadsegment(gs, 0);
}
364
/*
 * Write one LDT entry via an immediate (non-batched) hypercall.  Any
 * pending multicalls are flushed first so ordering is preserved.
 */
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				u32 low, u32 high)
{
	unsigned long lp = (unsigned long)&dt[entrynum];
	xmaddr_t mach_lp = virt_to_machine(lp);
	u64 entry = (u64)high << 32 | low;	/* reassemble the descriptor */

	preempt_disable();

	xen_mc_flush();
	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
		BUG();

	preempt_enable();
}
380
381static int cvt_gate_to_trap(int vector, u32 low, u32 high,
382 struct trap_info *info)
383{
384 u8 type, dpl;
385
386 type = (high >> 8) & 0x1f;
387 dpl = (high >> 13) & 3;
388
389 if (type != 0xf && type != 0xe)
390 return 0;
391
392 info->vector = vector;
393 info->address = (high & 0xffff0000) | (low & 0x0000ffff);
394 info->cs = low >> 16;
395 info->flags = dpl;
396 /* interrupt gates clear IF */
397 if (type == 0xe)
398 info->flags |= 4;
399
400 return 1;
401}
402
/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);

/* Set an IDT entry.  If the entry is part of the current IDT, then
   also update Xen. */
static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
				u32 low, u32 high)
{
	unsigned long p = (unsigned long)&dt[entrynum];
	unsigned long start, end;

	preempt_disable();

	/* bounds of the IDT this cpu last loaded */
	start = __get_cpu_var(idt_desc).address;
	end = start + __get_cpu_var(idt_desc).size + 1;

	xen_mc_flush();

	write_dt_entry(dt, entrynum, low, high);

	if (p >= start && (p + 8) <= end) {
		/* entry lives in the active IDT: mirror it to Xen as a
		   one-entry, zero-terminated trap table */
		struct trap_info info[2];

		info[1].address = 0;

		if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
			if (HYPERVISOR_set_trap_table(info))
				BUG();
	}

	preempt_enable();
}
435
/*
 * Convert a whole IDT into a zero-terminated Xen trap table.
 * @traps must have room for one entry per gate plus the terminator.
 */
static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
				  struct trap_info *traps)
{
	unsigned in, out, count;

	count = (desc->size+1) / 8;	/* 8 bytes per IDT gate */
	BUG_ON(count > 256);

	for (in = out = 0; in < count; in++) {
		const u32 *entry = (u32 *)(desc->address + in * 8);

		/* inconvertible gates are simply skipped */
		if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
			out++;
	}
	traps[out].address = 0;		/* terminator */
}
452
453void xen_copy_trap_info(struct trap_info *traps)
454{
455 const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
456
457 xen_convert_trap_info(desc, traps);
458}
459
/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct Xgt_desc_struct *desc)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];	/* 256 gates + terminator */

	spin_lock(&lock);

	/* remember the bounds so xen_write_idt_entry can test against them */
	__get_cpu_var(idt_desc) = *desc;

	xen_convert_trap_info(desc, traps);

	xen_mc_flush();
	if (HYPERVISOR_set_trap_table(traps))
		BUG();

	spin_unlock(&lock);
}
480
/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
   they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				u32 low, u32 high)
{
	preempt_disable();

	switch ((high >> 8) & 0xff) {	/* descriptor type field */
	case DESCTYPE_LDT:
	case DESCTYPE_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = virt_to_machine(&dt[entry]);
		u64 desc = (u64)high << 32 | low;

		/* flush queued multicalls before the immediate update */
		xen_mc_flush();
		if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
			BUG();
	}

	}

	preempt_enable();
}
507
/* Tell Xen the kernel stack to switch to on ring transitions
   (replaces writing tss->esp0 directly). */
static void xen_load_esp0(struct tss_struct *tss,
			  struct thread_struct *thread)
{
	struct multicall_space mcs = xen_mc_entry(0);
	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
	xen_mc_issue(PARAVIRT_LAZY_CPU);
}
515
/* Set the I/O privilege level via the physdev interface.  @mask is
   the EFLAGS.IOPL field (bits 12-13). */
static void xen_set_iopl_mask(unsigned mask)
{
	struct physdev_set_iopl set_iopl;

	/* Force the change at ring 0. */
	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}
524
/* Port I/O delay: deliberately a no-op under Xen. */
static void xen_io_delay(void)
{
}
528
#ifdef CONFIG_X86_LOCAL_APIC
/* There is no local APIC in a Xen PV guest: reads yield 0... */
static unsigned long xen_apic_read(unsigned long reg)
{
	return 0;
}

/* ...and writes should never happen at all. */
static void xen_apic_write(unsigned long reg, unsigned long val)
{
	/* Warn to see if there's any stray references */
	WARN_ON(1);
}
#endif
541
542static void xen_flush_tlb(void)
543{
544 struct mmuext_op *op;
545 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
546
547 op = mcs.args;
548 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
549 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
550
551 xen_mc_issue(PARAVIRT_LAZY_MMU);
552}
553
554static void xen_flush_tlb_single(unsigned long addr)
555{
556 struct mmuext_op *op;
557 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
558
559 op = mcs.args;
560 op->cmd = MMUEXT_INVLPG_LOCAL;
561 op->arg1.linear_addr = addr & PAGE_MASK;
562 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
563
564 xen_mc_issue(PARAVIRT_LAZY_MMU);
565}
566
/*
 * Cross-cpu TLB shootdown: ask Xen to flush (or invlpg) on every
 * online vcpu in @cpus.  @va == TLB_FLUSH_ALL means a full flush.
 */
static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
				 unsigned long va)
{
	struct {
		struct mmuext_op op;
		cpumask_t mask;		/* must outlive the hypercall */
	} *args;
	cpumask_t cpumask = *cpus;
	struct multicall_space mcs;

	/*
	 * A couple of (to be removed) sanity checks:
	 *
	 * - current CPU must not be in mask
	 * - mask must exist :)
	 */
	BUG_ON(cpus_empty(cpumask));
	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
	BUG_ON(!mm);

	/* If a CPU which we ran on has gone down, OK. */
	cpus_and(cpumask, cpumask, cpu_online_map);
	if (cpus_empty(cpumask))
		return;

	mcs = xen_mc_entry(sizeof(*args));
	args = mcs.args;
	args->mask = cpumask;
	args->op.arg2.vcpumask = &args->mask;

	if (va == TLB_FLUSH_ALL) {
		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	} else {
		args->op.cmd = MMUEXT_INVLPG_MULTI;
		args->op.arg1.linear_addr = va;
	}

	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
608
/* cr2 (fault address) lives in the vcpu_info structure, not in the
   hardware register, for a Xen PV guest. */
static void xen_write_cr2(unsigned long cr2)
{
	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
}

static unsigned long xen_read_cr2(void)
{
	return x86_read_percpu(xen_vcpu)->arch.cr2;
}

/* Fast path installed only when vcpu_info placement succeeded: read
   the field straight out of the percpu vcpu_info copy. */
static unsigned long xen_read_cr2_direct(void)
{
	return x86_read_percpu(xen_vcpu_info.arch.cr2);
}
623
static void xen_write_cr4(unsigned long cr4)
{
	/* Just ignore cr4 changes; Xen doesn't allow us to do
	   anything anyway. */
}

/* Return the shadowed cr3 value (kept in percpu xen_cr3 by
   xen_write_cr3, since reading the real register would trap). */
static unsigned long xen_read_cr3(void)
{
	return x86_read_percpu(xen_cr3);
}
634
/*
 * Switch pagetables: update the percpu cr3 shadow, then ask Xen for a
 * new base pointer via MMUEXT_NEW_BASEPTR.  A write of the current
 * value degenerates into a local TLB flush (matching native cr3
 * rewrite semantics).
 */
static void xen_write_cr3(unsigned long cr3)
{
	BUG_ON(preemptible());

	if (cr3 == x86_read_percpu(xen_cr3)) {
		/* just a simple tlb flush */
		xen_flush_tlb();
		return;
	}

	x86_write_percpu(xen_cr3, cr3);


	{
		struct mmuext_op *op;
		struct multicall_space mcs = xen_mc_entry(sizeof(*op));
		/* Xen wants the machine frame, not the physical frame */
		unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));

		op = mcs.args;
		op->cmd = MMUEXT_NEW_BASEPTR;
		op->arg1.mfn = mfn;

		MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

		xen_mc_issue(PARAVIRT_LAZY_CPU);
	}
}
662
/* Early in boot, while setting up the initial pagetable, assume
   everything is pinned. */
static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
{
	BUG_ON(mem_map);	/* should only be used early */
	/* pinned pagetable pages must be read-only to the guest */
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}
670
/* This needs to make sure the new pte page is pinned iff it's being
   attached to a pinned pagetable. */
static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
{
	struct page *page = pfn_to_page(pfn);

	/* only pin if the owning pgd is itself pinned */
	if (PagePinned(virt_to_page(mm->pgd))) {
		SetPagePinned(page);

		if (!PageHighMem(page))
			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
		else
			/* make sure there are no stray mappings of
			   this page */
			kmap_flush_unused();
	}
}
688
/* This should never happen until we're OK to use struct page */
static void xen_release_pt(u32 pfn)
{
	struct page *page = pfn_to_page(pfn);

	/* a pinned pte page was made RO in xen_alloc_pt; restore RW
	   before the page is freed back to the allocator */
	if (PagePinned(page)) {
		if (!PageHighMem(page))
			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
	}
}
699
#ifdef CONFIG_HIGHPTE
/*
 * kmap a highmem pte page.  Pinned pagetable pages are owned
 * read-only by Xen, so map them read-only; everything else gets a
 * normal kernel mapping.
 *
 * (A compiled-out "if (0 && PageHighMem(page)) printk(...)" debug
 * hook was removed here — it was dead code.)
 */
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
	pgprot_t prot = PAGE_KERNEL;

	if (PagePinned(page))
		prot = PAGE_KERNEL_RO;

	return kmap_atomic_prot(page, type, prot);
}
#endif
716
/*
 * Return @pte with _PAGE_RW cleared whenever the existing entry at
 * @ptep is present and read-only, so init-time remapping can never
 * upgrade a Xen-owned RO pagetable page to RW.
 */
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
		/* keep all bits of the new pte except RW, which is
		   AND-ed with the old entry's RW bit */
		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
			       pte_val_ma(pte));

	return pte;
}
726
/* Init-time set_pte while constructing initial pagetables, which
   doesn't allow RO pagetable pages to be remapped RW */
static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
{
	pte = mask_rw_pte(ptep, pte);

	xen_set_pte(ptep, pte);
}
735
/*
 * Take over pagetable construction from the Xen-provided boot
 * pagetable: copy its top level into @base (allocating fresh pmds
 * for PAE, which Xen won't share between address spaces), mark the
 * new pages read-only, and switch to the new table.
 */
static __init void xen_pagetable_setup_start(pgd_t *base)
{
	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;

	/* special set_pte for pagetable initialization */
	paravirt_ops.set_pte = xen_set_pte_init;

	init_mm.pgd = base;
	/*
	 * copy top-level of Xen-supplied pagetable into place.	 For
	 * !PAE we can use this as-is, but for PAE it is a stand-in
	 * while we copy the pmd pages.
	 */
	memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));

	if (PTRS_PER_PMD > 1) {
		int i;
		/*
		 * For PAE, need to allocate new pmds, rather than
		 * share Xen's, since Xen doesn't like pmd's being
		 * shared between address spaces.
		 */
		for (i = 0; i < PTRS_PER_PGD; i++) {
			if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
				pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);

				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
				       PAGE_SIZE);

				make_lowmem_page_readonly(pmd);

				/* +1 sets the PGD present bit — NOTE(review):
				   presumably _PAGE_PRESENT; confirm */
				set_pgd(&base[i], __pgd(1 + __pa(pmd)));
			} else
				pgd_clear(&base[i]);
		}
	}

	/* make sure zero_page is mapped RO so we can use it in pagetables */
	make_lowmem_page_readonly(empty_zero_page);
	make_lowmem_page_readonly(base);
	/*
	 * Switch to new pagetable.  This is done before
	 * pagetable_init has done anything so that the new pages
	 * added to the table can be prepared properly for Xen.
	 */
	xen_write_cr3(__pa(base));
}
783
784static __init void xen_pagetable_setup_done(pgd_t *base)
785{
786 /* This will work as long as patching hasn't happened yet
787 (which it hasn't) */
788 paravirt_ops.alloc_pt = xen_alloc_pt;
789 paravirt_ops.set_pte = xen_set_pte;
790
791 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
792 /*
793 * Create a mapping for the shared info page.
794 * Should be set_fixmap(), but shared_info is a machine
795 * address with no corresponding pseudo-phys address.
796 */
797 set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
798 PFN_DOWN(xen_start_info->shared_info),
799 PAGE_KERNEL);
800
801 HYPERVISOR_shared_info =
802 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
803
804 } else
805 HYPERVISOR_shared_info =
806 (struct shared_info *)__va(xen_start_info->shared_info);
807
808 /* Actually pin the pagetable down, but we can't set PG_pinned
809 yet because the page structures don't exist yet. */
810 {
811 struct mmuext_op op;
812#ifdef CONFIG_X86_PAE
813 op.cmd = MMUEXT_PIN_L3_TABLE;
814#else
815 op.cmd = MMUEXT_PIN_L3_TABLE;
816#endif
817 op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
818 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
819 BUG();
820 }
821}
822
/* This is called once we have the cpu_possible_map */
void __init xen_setup_vcpu_info_placement(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		xen_vcpu_setup(cpu);

	/* xen_vcpu_setup managed to place the vcpu_info within the
	   percpu area for all cpus, so make use of it */
	if (have_vcpu_info_placement) {
		printk(KERN_INFO "Xen: using vcpu_info placement\n");

		/* swap in the _direct variants, which access the
		   percpu vcpu_info copy without an indirection */
		paravirt_ops.save_fl = xen_save_fl_direct;
		paravirt_ops.restore_fl = xen_restore_fl_direct;
		paravirt_ops.irq_disable = xen_irq_disable_direct;
		paravirt_ops.irq_enable = xen_irq_enable_direct;
		paravirt_ops.read_cr2 = xen_read_cr2_direct;
		paravirt_ops.iret = xen_iret_direct;
	}
}
844
/*
 * Paravirt patch callback: for the irq-flag operations, inline the
 * assembly _direct stubs (when vcpu_info placement is in use) and
 * fix up their embedded relocation; everything else falls back to
 * the generic patcher.
 */
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
			  unsigned long addr, unsigned len)
{
	char *start, *end, *reloc;
	unsigned ret;

	start = end = reloc = NULL;

#define SITE(x)								\
	case PARAVIRT_PATCH(x):						\
	if (have_vcpu_info_placement) {					\
		start = (char *)xen_##x##_direct;			\
		end = xen_##x##_direct_end;				\
		reloc = xen_##x##_direct_reloc;				\
	}								\
	goto patch_site

	switch (type) {
		SITE(irq_enable);
		SITE(irq_disable);
		SITE(save_fl);
		SITE(restore_fl);
#undef SITE

	patch_site:
		/* too big to inline, or placement unavailable */
		if (start == NULL || (end-start) > len)
			goto default_patch;

		ret = paravirt_patch_insns(insnbuf, len, start, end);

		/* Note: because reloc is assigned from something that
		   appears to be an array, gcc assumes it's non-null,
		   but doesn't know its relationship with start and
		   end. */
		if (reloc > start && reloc < end) {
			int reloc_off = reloc - start;
			long *relocp = (long *)(insnbuf + reloc_off);
			long delta = start - (char *)addr;

			*relocp += delta;	/* rebase to patch site */
		}
		break;

	default_patch:
	default:
		ret = paravirt_patch_default(type, clobbers, insnbuf,
					     addr, len);
		break;
	}

	return ret;
}
897
/*
 * The Xen paravirt_ops template, copied over the live paravirt_ops
 * in xen_start_kernel.  Hooks Xen cannot support are wired to the
 * native or nop implementations.
 */
static const struct paravirt_ops xen_paravirt_ops __initdata = {
	.paravirt_enabled = 1,
	.shared_kernel_pmd = 0,

	.name = "Xen",
	.banner = xen_banner,

	.patch = xen_patch,

	.memory_setup = xen_memory_setup,
	.arch_setup = xen_arch_setup,
	.init_IRQ = xen_init_IRQ,
	.post_allocator_init = xen_mark_init_mm_pinned,

	/* clocksource/clockevents are Xen's, see time.c */
	.time_init = xen_time_init,
	.set_wallclock = xen_set_wallclock,
	.get_wallclock = xen_get_wallclock,
	.get_cpu_khz = xen_cpu_khz,
	.sched_clock = xen_sched_clock,

	.cpuid = xen_cpuid,

	.set_debugreg = xen_set_debugreg,
	.get_debugreg = xen_get_debugreg,

	.clts = native_clts,

	.read_cr0 = native_read_cr0,
	.write_cr0 = native_write_cr0,

	.read_cr2 = xen_read_cr2,
	.write_cr2 = xen_write_cr2,

	.read_cr3 = xen_read_cr3,
	.write_cr3 = xen_write_cr3,

	.read_cr4 = native_read_cr4,
	.read_cr4_safe = native_read_cr4_safe,
	.write_cr4 = xen_write_cr4,

	/* interrupt flags are modelled on the event channel mask;
	   _direct variants may be swapped in later, see
	   xen_setup_vcpu_info_placement */
	.save_fl = xen_save_fl,
	.restore_fl = xen_restore_fl,
	.irq_disable = xen_irq_disable,
	.irq_enable = xen_irq_enable,
	.safe_halt = xen_safe_halt,
	.halt = xen_halt,
	.wbinvd = native_wbinvd,

	.read_msr = native_read_msr_safe,
	.write_msr = native_write_msr_safe,
	.read_tsc = native_read_tsc,
	.read_pmc = native_read_pmc,

	.iret = (void *)&hypercall_page[__HYPERVISOR_iret],
	.irq_enable_sysexit = NULL,  /* never called */

	.load_tr_desc = paravirt_nop,
	.set_ldt = xen_set_ldt,
	.load_gdt = xen_load_gdt,
	.load_idt = xen_load_idt,
	.load_tls = xen_load_tls,

	.store_gdt = native_store_gdt,
	.store_idt = native_store_idt,
	.store_tr = xen_store_tr,

	.write_ldt_entry = xen_write_ldt_entry,
	.write_gdt_entry = xen_write_gdt_entry,
	.write_idt_entry = xen_write_idt_entry,
	.load_esp0 = xen_load_esp0,

	.set_iopl_mask = xen_set_iopl_mask,
	.io_delay = xen_io_delay,

#ifdef CONFIG_X86_LOCAL_APIC
	/* no local APIC under Xen; stub everything out */
	.apic_write = xen_apic_write,
	.apic_write_atomic = xen_apic_write,
	.apic_read = xen_apic_read,
	.setup_boot_clock = paravirt_nop,
	.setup_secondary_clock = paravirt_nop,
	.startup_ipi_hook = paravirt_nop,
#endif

	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,
	.pte_update_defer = paravirt_nop,

	.pagetable_setup_start = xen_pagetable_setup_start,
	.pagetable_setup_done = xen_pagetable_setup_done,

	/* boot-time variants; replaced by xen_pagetable_setup_done */
	.alloc_pt = xen_alloc_pt_init,
	.release_pt = xen_release_pt,
	.alloc_pd = paravirt_nop,
	.alloc_pd_clone = paravirt_nop,
	.release_pd = paravirt_nop,

#ifdef CONFIG_HIGHPTE
	.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif

	.set_pte = NULL,	/* see xen_pagetable_setup_* */
	.set_pte_at = xen_set_pte_at,
	.set_pmd = xen_set_pmd,

	.pte_val = xen_pte_val,
	.pgd_val = xen_pgd_val,

	.make_pte = xen_make_pte,
	.make_pgd = xen_make_pgd,

#ifdef CONFIG_X86_PAE
	.set_pte_atomic = xen_set_pte_atomic,
	.set_pte_present = xen_set_pte_at,
	.set_pud = xen_set_pud,
	.pte_clear = xen_pte_clear,
	.pmd_clear = xen_pmd_clear,

	.make_pmd = xen_make_pmd,
	.pmd_val = xen_pmd_val,
#endif	/* PAE */

	.activate_mm = xen_activate_mm,
	.dup_mmap = xen_dup_mmap,
	.exit_mmap = xen_exit_mmap,

	.set_lazy_mode = xen_set_lazy_mode,
};
1029
#ifdef CONFIG_SMP
/* SMP bring-up and IPI operations backed by Xen vcpus; see smp.c. */
static const struct smp_ops xen_smp_ops __initdata = {
	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
	.smp_prepare_cpus = xen_smp_prepare_cpus,
	.cpu_up = xen_cpu_up,
	.smp_cpus_done = xen_smp_cpus_done,

	.smp_send_stop = xen_smp_send_stop,
	.smp_send_reschedule = xen_smp_send_reschedule,
	.smp_call_function_mask = xen_smp_call_function_mask,
};
#endif	/* CONFIG_SMP */
1042
/* Common shutdown path: stop the other cpus, then ask the hypervisor
   to shut this domain down with @reason (a SHUTDOWN_* code). */
static void xen_reboot(int reason)
{
#ifdef CONFIG_SMP
	smp_send_stop();
#endif

	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
		BUG();
}
1052
/* machine_ops entry points — each maps to a SHUTDOWN_* reason. */
static void xen_restart(char *msg)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_emergency_restart(void)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_machine_halt(void)
{
	/* halt and power-off are the same under Xen: the domain goes away */
	xen_reboot(SHUTDOWN_poweroff);
}

static void xen_crash_shutdown(struct pt_regs *regs)
{
	xen_reboot(SHUTDOWN_crash);
}
1072
/* Reboot/halt hooks, copied over machine_ops in xen_start_kernel. */
static const struct machine_ops __initdata xen_machine_ops = {
	.restart = xen_restart,
	.halt = xen_machine_halt,
	.power_off = xen_machine_halt,
	.shutdown = xen_machine_halt,
	.crash_shutdown = xen_crash_shutdown,
	.emergency_restart = xen_emergency_restart,
};
1081
1082
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
	pgd_t *pgd;

	/* not booted via the Xen entry path — nothing to do */
	if (!xen_start_info)
		return;

	BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);

	/* Install Xen paravirt ops */
	paravirt_ops = xen_paravirt_ops;
	machine_ops = xen_machine_ops;

#ifdef CONFIG_SMP
	smp_ops = xen_smp_ops;
#endif

	xen_setup_features();

	/* Get mfn list */
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;

	pgd = (pgd_t *)xen_start_info->pt_base;

	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;

	init_mm.pgd = pgd; /* use the Xen pagetables to start */

	/* keep using Xen gdt for now; no urgent need to change it */

	x86_write_percpu(xen_cr3, __pa(pgd));

#ifdef CONFIG_SMP
	/* Don't do the full vcpu_info placement stuff until we have a
	   possible map. */
	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
#else
	/* May as well do it now, since there's no good time to call
	   it later on UP. */
	xen_setup_vcpu_info_placement();
#endif

	/* kernel runs in ring 1 under Xen, unless the hypervisor
	   supports supervisor-mode kernels */
	paravirt_ops.kernel_rpl = 1;
	if (xen_feature(XENFEAT_supervisor_mode_kernel))
		paravirt_ops.kernel_rpl = 0;

	/* set the limit of our address space */
	reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);

	/* set up basic CPUID stuff */
	cpu_detect(&new_cpu_data);
	new_cpu_data.hard_math = 1;
	new_cpu_data.x86_capability[0] = cpuid_edx(1);

	/* Poke various useful things into boot_params */
	LOADER_TYPE = (9 << 4) | 0;
	INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
	INITRD_SIZE = xen_start_info->mod_len;

	/* Start the world */
	start_kernel();
}
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
new file mode 100644
index 000000000000..da1b173547a1
--- /dev/null
+++ b/arch/x86/xen/events.c
@@ -0,0 +1,591 @@
1/*
2 * Xen event channels
3 *
4 * Xen models interrupts with abstract event channels. Because each
5 * domain gets 1024 event channels, but NR_IRQ is not that large, we
6 * must dynamically map irqs<->event channels. The event channels
7 * interface with the rest of the kernel by defining a xen interrupt
 8 * chip. When an event is received, it is mapped to an irq and sent
9 * through the normal interrupt processing path.
10 *
11 * There are four kinds of events which can be mapped to an event
12 * channel:
13 *
14 * 1. Inter-domain notifications. This includes all the virtual
15 * device events, since they're driven by front-ends in another domain
16 * (typically dom0).
17 * 2. VIRQs, typically used for timers. These are per-cpu events.
18 * 3. IPIs.
19 * 4. Hardware interrupts. Not supported at present.
20 *
21 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
22 */
23
24#include <linux/linkage.h>
25#include <linux/interrupt.h>
26#include <linux/irq.h>
27#include <linux/module.h>
28#include <linux/string.h>
29
30#include <asm/ptrace.h>
31#include <asm/irq.h>
32#include <asm/sync_bitops.h>
33#include <asm/xen/hypercall.h>
34#include <asm/xen/hypervisor.h>
35
36#include <xen/events.h>
37#include <xen/interface/xen.h>
38#include <xen/interface/event_channel.h>
39
40#include "xen-ops.h"
41
42/*
43 * This lock protects updates to the following mapping and reference-count
44 * arrays. The lock does not need to be acquired to read the mapping tables.
45 */
46static DEFINE_SPINLOCK(irq_mapping_update_lock);
47
48/* IRQ <-> VIRQ mapping. */
49static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
50
51/* IRQ <-> IPI mapping */
52static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
53
54/* Packed IRQ information: binding type, sub-type index, and event channel. */
55struct packed_irq
56{
57 unsigned short evtchn;
58 unsigned char index;
59 unsigned char type;
60};
61
62static struct packed_irq irq_info[NR_IRQS];
63
64/* Binding types. */
65enum {
66 IRQT_UNBOUND,
67 IRQT_PIRQ,
68 IRQT_VIRQ,
69 IRQT_IPI,
70 IRQT_EVTCHN
71};
72
73/* Convenient shorthand for packed representation of an unbound IRQ. */
74#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
75
76static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
77 [0 ... NR_EVENT_CHANNELS-1] = -1
78};
79static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
80static u8 cpu_evtchn[NR_EVENT_CHANNELS];
81
82/* Reference counts for bindings to IRQs. */
83static int irq_bindcount[NR_IRQS];
84
85/* Xen will never allocate port zero for any purpose. */
86#define VALID_EVTCHN(chn) ((chn) != 0)
87
88/*
89 * Force a proper event-channel callback from Xen after clearing the
90 * callback mask. We do this in a very simple manner, by making a call
91 * down into Xen. The pending flag will be checked by Xen on return.
92 */
93void force_evtchn_callback(void)
94{
95 (void)HYPERVISOR_xen_version(0, NULL);
96}
97EXPORT_SYMBOL_GPL(force_evtchn_callback);
98
99static struct irq_chip xen_dynamic_chip;
100
101/* Constructor for packed IRQ information. */
102static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
103{
104 return (struct packed_irq) { evtchn, index, type };
105}
106
107/*
108 * Accessors for packed IRQ information.
109 */
110static inline unsigned int evtchn_from_irq(int irq)
111{
112 return irq_info[irq].evtchn;
113}
114
115static inline unsigned int index_from_irq(int irq)
116{
117 return irq_info[irq].index;
118}
119
120static inline unsigned int type_from_irq(int irq)
121{
122 return irq_info[irq].type;
123}
124
125static inline unsigned long active_evtchns(unsigned int cpu,
126 struct shared_info *sh,
127 unsigned int idx)
128{
129 return (sh->evtchn_pending[idx] &
130 cpu_evtchn_mask[cpu][idx] &
131 ~sh->evtchn_mask[idx]);
132}
133
134static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
135{
136 int irq = evtchn_to_irq[chn];
137
138 BUG_ON(irq == -1);
139#ifdef CONFIG_SMP
140 irq_desc[irq].affinity = cpumask_of_cpu(cpu);
141#endif
142
143 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
144 __set_bit(chn, cpu_evtchn_mask[cpu]);
145
146 cpu_evtchn[chn] = cpu;
147}
148
149static void init_evtchn_cpu_bindings(void)
150{
151#ifdef CONFIG_SMP
152 int i;
153 /* By default all event channels notify CPU#0. */
154 for (i = 0; i < NR_IRQS; i++)
155 irq_desc[i].affinity = cpumask_of_cpu(0);
156#endif
157
158 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
159 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
160}
161
162static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
163{
164 return cpu_evtchn[evtchn];
165}
166
167static inline void clear_evtchn(int port)
168{
169 struct shared_info *s = HYPERVISOR_shared_info;
170 sync_clear_bit(port, &s->evtchn_pending[0]);
171}
172
173static inline void set_evtchn(int port)
174{
175 struct shared_info *s = HYPERVISOR_shared_info;
176 sync_set_bit(port, &s->evtchn_pending[0]);
177}
178
179
180/**
181 * notify_remote_via_irq - send event to remote end of event channel via irq
182 * @irq: irq of event channel to send event to
183 *
184 * Unlike notify_remote_via_evtchn(), this is safe to use across
185 * save/restore. Notifications on a broken connection are silently
186 * dropped.
187 */
188void notify_remote_via_irq(int irq)
189{
190 int evtchn = evtchn_from_irq(irq);
191
192 if (VALID_EVTCHN(evtchn))
193 notify_remote_via_evtchn(evtchn);
194}
195EXPORT_SYMBOL_GPL(notify_remote_via_irq);
196
197static void mask_evtchn(int port)
198{
199 struct shared_info *s = HYPERVISOR_shared_info;
200 sync_set_bit(port, &s->evtchn_mask[0]);
201}
202
/*
 * Unmask an event channel port.  If the port is bound to a different
 * vcpu, the unmask must be performed by the hypervisor; otherwise we
 * clear the mask bit locally and, if the port was already pending,
 * re-raise the upcall so the (edge-like) event is not lost.
 */
203static void unmask_evtchn(int port)
204{
205	struct shared_info *s = HYPERVISOR_shared_info;
206	unsigned int cpu = get_cpu();
207
208	BUG_ON(!irqs_disabled());
209
210	/* Slow path (hypercall) if this is a non-local port. */
211	if (unlikely(cpu != cpu_from_evtchn(port))) {
212		struct evtchn_unmask unmask = { .port = port };
213		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
214	} else {
215		struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
216
		/* Fast path: the port is bound to this cpu, so we may
		   clear the mask bit directly in the shared page. */
217		sync_clear_bit(port, &s->evtchn_mask[0]);
218
219		/*
220		 * The following is basically the equivalent of
221		 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
222		 * the interrupt edge' if the channel is masked.
223		 */
224		if (sync_test_bit(port, &s->evtchn_pending[0]) &&
225		    !sync_test_and_set_bit(port / BITS_PER_LONG,
226					   &vcpu_info->evtchn_pending_sel))
227			vcpu_info->evtchn_upcall_pending = 1;
228	}
229
230	put_cpu();
231}
232
233static int find_unbound_irq(void)
234{
235 int irq;
236
237 /* Only allocate from dynirq range */
238 for (irq = 0; irq < NR_IRQS; irq++)
239 if (irq_bindcount[irq] == 0)
240 break;
241
242 if (irq == NR_IRQS)
243 panic("No available IRQ to bind to: increase NR_IRQS!\n");
244
245 return irq;
246}
247
/*
 * Map an event channel onto an irq, allocating and initializing a new
 * dynamic irq the first time the channel is seen.  Each successful call
 * takes a reference on the binding, to be dropped by unbind_from_irq().
 * Returns the irq number.
 */
248int bind_evtchn_to_irq(unsigned int evtchn)
249{
250	int irq;
251
252	spin_lock(&irq_mapping_update_lock);
253
254	irq = evtchn_to_irq[evtchn];
255
	/* First binding for this channel: set up a fresh dynamic irq and
	   record the channel<->irq mapping in both directions. */
256	if (irq == -1) {
257		irq = find_unbound_irq();
258
259		dynamic_irq_init(irq);
260		set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
261					      handle_level_irq, "event");
262
263		evtchn_to_irq[evtchn] = irq;
264		irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
265	}
266
267	irq_bindcount[irq]++;
268
269	spin_unlock(&irq_mapping_update_lock);
270
271	return irq;
272}
274
275static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
276{
277 struct evtchn_bind_ipi bind_ipi;
278 int evtchn, irq;
279
280 spin_lock(&irq_mapping_update_lock);
281
282 irq = per_cpu(ipi_to_irq, cpu)[ipi];
283 if (irq == -1) {
284 irq = find_unbound_irq();
285 if (irq < 0)
286 goto out;
287
288 dynamic_irq_init(irq);
289 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
290 handle_level_irq, "ipi");
291
292 bind_ipi.vcpu = cpu;
293 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
294 &bind_ipi) != 0)
295 BUG();
296 evtchn = bind_ipi.port;
297
298 evtchn_to_irq[evtchn] = irq;
299 irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
300
301 per_cpu(ipi_to_irq, cpu)[ipi] = irq;
302
303 bind_evtchn_to_cpu(evtchn, cpu);
304 }
305
306 irq_bindcount[irq]++;
307
308 out:
309 spin_unlock(&irq_mapping_update_lock);
310 return irq;
311}
312
313
314static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
315{
316 struct evtchn_bind_virq bind_virq;
317 int evtchn, irq;
318
319 spin_lock(&irq_mapping_update_lock);
320
321 irq = per_cpu(virq_to_irq, cpu)[virq];
322
323 if (irq == -1) {
324 bind_virq.virq = virq;
325 bind_virq.vcpu = cpu;
326 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
327 &bind_virq) != 0)
328 BUG();
329 evtchn = bind_virq.port;
330
331 irq = find_unbound_irq();
332
333 dynamic_irq_init(irq);
334 set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
335 handle_level_irq, "virq");
336
337 evtchn_to_irq[evtchn] = irq;
338 irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
339
340 per_cpu(virq_to_irq, cpu)[virq] = irq;
341
342 bind_evtchn_to_cpu(evtchn, cpu);
343 }
344
345 irq_bindcount[irq]++;
346
347 spin_unlock(&irq_mapping_update_lock);
348
349 return irq;
350}
351
352static void unbind_from_irq(unsigned int irq)
353{
354 struct evtchn_close close;
355 int evtchn = evtchn_from_irq(irq);
356
357 spin_lock(&irq_mapping_update_lock);
358
359 if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
360 close.port = evtchn;
361 if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
362 BUG();
363
364 switch (type_from_irq(irq)) {
365 case IRQT_VIRQ:
366 per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
367 [index_from_irq(irq)] = -1;
368 break;
369 default:
370 break;
371 }
372
373 /* Closed ports are implicitly re-bound to VCPU0. */
374 bind_evtchn_to_cpu(evtchn, 0);
375
376 evtchn_to_irq[evtchn] = -1;
377 irq_info[irq] = IRQ_UNBOUND;
378
379 dynamic_irq_init(irq);
380 }
381
382 spin_unlock(&irq_mapping_update_lock);
383}
384
385int bind_evtchn_to_irqhandler(unsigned int evtchn,
386 irqreturn_t (*handler)(int, void *),
387 unsigned long irqflags,
388 const char *devname, void *dev_id)
389{
390 unsigned int irq;
391 int retval;
392
393 irq = bind_evtchn_to_irq(evtchn);
394 retval = request_irq(irq, handler, irqflags, devname, dev_id);
395 if (retval != 0) {
396 unbind_from_irq(irq);
397 return retval;
398 }
399
400 return irq;
401}
402EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
403
404int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
405 irqreturn_t (*handler)(int, void *),
406 unsigned long irqflags, const char *devname, void *dev_id)
407{
408 unsigned int irq;
409 int retval;
410
411 irq = bind_virq_to_irq(virq, cpu);
412 retval = request_irq(irq, handler, irqflags, devname, dev_id);
413 if (retval != 0) {
414 unbind_from_irq(irq);
415 return retval;
416 }
417
418 return irq;
419}
420EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
421
422int bind_ipi_to_irqhandler(enum ipi_vector ipi,
423 unsigned int cpu,
424 irq_handler_t handler,
425 unsigned long irqflags,
426 const char *devname,
427 void *dev_id)
428{
429 int irq, retval;
430
431 irq = bind_ipi_to_irq(ipi, cpu);
432 if (irq < 0)
433 return irq;
434
435 retval = request_irq(irq, handler, irqflags, devname, dev_id);
436 if (retval != 0) {
437 unbind_from_irq(irq);
438 return retval;
439 }
440
441 return irq;
442}
443
444void unbind_from_irqhandler(unsigned int irq, void *dev_id)
445{
446 free_irq(irq, dev_id);
447 unbind_from_irq(irq);
448}
449EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
450
451void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
452{
453 int irq = per_cpu(ipi_to_irq, cpu)[vector];
454 BUG_ON(irq < 0);
455 notify_remote_via_irq(irq);
456}
457
458
459/*
460 * Search the CPUs pending events bitmasks. For each one found, map
461 * the event number to an irq, and feed it into do_IRQ() for
462 * handling.
463 *
464 * Xen uses a two-level bitmap to speed searching. The first level is
465 * a bitset of words which contain pending event bits. The second
466 * level is a bitset of pending events themselves.
467 */
468fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
469{
470	int cpu = get_cpu();
471	struct shared_info *s = HYPERVISOR_shared_info;
472	struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
473	unsigned long pending_words;
474
	/* Acknowledge the upcall before scanning -- presumably so events
	   that arrive during the scan re-raise it; confirm against the
	   hypervisor's upcall delivery rules. */
475	vcpu_info->evtchn_upcall_pending = 0;
476
477	/* NB. No need for a barrier here -- XCHG is a barrier on x86. */
	/* First level: atomically grab-and-clear the selector bitmap of
	   words that contain pending events. */
478	pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
479	while (pending_words != 0) {
480		unsigned long pending_bits;
481		int word_idx = __ffs(pending_words);
482		pending_words &= ~(1UL << word_idx);
483
		/* Second level: events in this word that are pending,
		   unmasked, and routed to this cpu. */
484		while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
485			int bit_idx = __ffs(pending_bits);
486			int port = (word_idx * BITS_PER_LONG) + bit_idx;
487			int irq = evtchn_to_irq[port];
488
489			if (irq != -1) {
				/* Feed the one's-complemented irq number to
				   do_IRQ() via orig_eax. */
490				regs->orig_eax = ~irq;
491				do_IRQ(regs);
492			}
493		}
494	}
495
496	put_cpu();
497}
498
499/* Rebind an evtchn so that it gets delivered to a specific cpu */
500static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
501{
502 struct evtchn_bind_vcpu bind_vcpu;
503 int evtchn = evtchn_from_irq(irq);
504
505 if (!VALID_EVTCHN(evtchn))
506 return;
507
508 /* Send future instances of this interrupt to other vcpu. */
509 bind_vcpu.port = evtchn;
510 bind_vcpu.vcpu = tcpu;
511
512 /*
513 * If this fails, it usually just indicates that we're dealing with a
514 * virq or IPI channel, which don't actually need to be rebound. Ignore
515 * it, but don't do the xenlinux-level rebind in that case.
516 */
517 if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
518 bind_evtchn_to_cpu(evtchn, tcpu);
519}
520
521
522static void set_affinity_irq(unsigned irq, cpumask_t dest)
523{
524 unsigned tcpu = first_cpu(dest);
525 rebind_irq_to_cpu(irq, tcpu);
526}
527
528static void enable_dynirq(unsigned int irq)
529{
530 int evtchn = evtchn_from_irq(irq);
531
532 if (VALID_EVTCHN(evtchn))
533 unmask_evtchn(evtchn);
534}
535
536static void disable_dynirq(unsigned int irq)
537{
538 int evtchn = evtchn_from_irq(irq);
539
540 if (VALID_EVTCHN(evtchn))
541 mask_evtchn(evtchn);
542}
543
544static void ack_dynirq(unsigned int irq)
545{
546 int evtchn = evtchn_from_irq(irq);
547
548 move_native_irq(irq);
549
550 if (VALID_EVTCHN(evtchn))
551 clear_evtchn(evtchn);
552}
553
/*
 * Re-assert a (bound) irq's event channel by setting its pending bit.
 * Returns 1 if the retrigger was performed, 0 if the irq has no valid
 * event channel.
 */
static int retrigger_dynirq(unsigned int irq)
{
	int evtchn = evtchn_from_irq(irq);

	if (!VALID_EVTCHN(evtchn))
		return 0;

	set_evtchn(evtchn);
	return 1;
}
566
567static struct irq_chip xen_dynamic_chip __read_mostly = {
568 .name = "xen-dyn",
569 .mask = disable_dynirq,
570 .unmask = enable_dynirq,
571 .ack = ack_dynirq,
572 .set_affinity = set_affinity_irq,
573 .retrigger = retrigger_dynirq,
574};
575
576void __init xen_init_IRQ(void)
577{
578 int i;
579
580 init_evtchn_cpu_bindings();
581
582 /* No event channels are 'live' right now. */
583 for (i = 0; i < NR_EVENT_CHANNELS; i++)
584 mask_evtchn(i);
585
586 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
587 for (i = 0; i < NR_IRQS; i++)
588 irq_bindcount[i] = 0;
589
590 irq_ctx_init(smp_processor_id());
591}
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c
new file mode 100644
index 000000000000..0707714e40d6
--- /dev/null
+++ b/arch/x86/xen/features.c
@@ -0,0 +1,29 @@
1/******************************************************************************
2 * features.c
3 *
4 * Xen feature flags.
5 *
6 * Copyright (c) 2006, Ian Campbell, XenSource Inc.
7 */
8#include <linux/types.h>
9#include <linux/cache.h>
10#include <linux/module.h>
11#include <asm/xen/hypervisor.h>
12#include <xen/features.h>
13
14u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
15EXPORT_SYMBOL_GPL(xen_features);
16
17void xen_setup_features(void)
18{
19 struct xen_feature_info fi;
20 int i, j;
21
22 for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
23 fi.submap_idx = i;
24 if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
25 break;
26 for (j = 0; j < 32; j++)
27 xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
28 }
29}
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
new file mode 100644
index 000000000000..aa7af9e6abc0
--- /dev/null
+++ b/arch/x86/xen/manage.c
@@ -0,0 +1,143 @@
1/*
2 * Handle extern requests for shutdown, reboot and sysrq
3 */
4#include <linux/kernel.h>
5#include <linux/err.h>
6#include <linux/reboot.h>
7#include <linux/sysrq.h>
8
9#include <xen/xenbus.h>
10
11#define SHUTDOWN_INVALID -1
12#define SHUTDOWN_POWEROFF 0
13#define SHUTDOWN_SUSPEND 2
14/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
15 * report a crash, not be instructed to crash!
16 * HALT is the same as POWEROFF, as far as we're concerned. The tools use
17 * the distinction when we return the reason code to them.
18 */
19#define SHUTDOWN_HALT 4
20
21/* Ignore multiple shutdown requests. */
22static int shutting_down = SHUTDOWN_INVALID;
23
24static void shutdown_handler(struct xenbus_watch *watch,
25 const char **vec, unsigned int len)
26{
27 char *str;
28 struct xenbus_transaction xbt;
29 int err;
30
31 if (shutting_down != SHUTDOWN_INVALID)
32 return;
33
34 again:
35 err = xenbus_transaction_start(&xbt);
36 if (err)
37 return;
38
39 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
40 /* Ignore read errors and empty reads. */
41 if (XENBUS_IS_ERR_READ(str)) {
42 xenbus_transaction_end(xbt, 1);
43 return;
44 }
45
46 xenbus_write(xbt, "control", "shutdown", "");
47
48 err = xenbus_transaction_end(xbt, 0);
49 if (err == -EAGAIN) {
50 kfree(str);
51 goto again;
52 }
53
54 if (strcmp(str, "poweroff") == 0 ||
55 strcmp(str, "halt") == 0)
56 orderly_poweroff(false);
57 else if (strcmp(str, "reboot") == 0)
58 ctrl_alt_del();
59 else {
60 printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
61 shutting_down = SHUTDOWN_INVALID;
62 }
63
64 kfree(str);
65}
66
67static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
68 unsigned int len)
69{
70 char sysrq_key = '\0';
71 struct xenbus_transaction xbt;
72 int err;
73
74 again:
75 err = xenbus_transaction_start(&xbt);
76 if (err)
77 return;
78 if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
79 printk(KERN_ERR "Unable to read sysrq code in "
80 "control/sysrq\n");
81 xenbus_transaction_end(xbt, 1);
82 return;
83 }
84
85 if (sysrq_key != '\0')
86 xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
87
88 err = xenbus_transaction_end(xbt, 0);
89 if (err == -EAGAIN)
90 goto again;
91
92 if (sysrq_key != '\0')
93 handle_sysrq(sysrq_key, NULL);
94}
95
96static struct xenbus_watch shutdown_watch = {
97 .node = "control/shutdown",
98 .callback = shutdown_handler
99};
100
101static struct xenbus_watch sysrq_watch = {
102 .node = "control/sysrq",
103 .callback = sysrq_handler
104};
105
106static int setup_shutdown_watcher(void)
107{
108 int err;
109
110 err = register_xenbus_watch(&shutdown_watch);
111 if (err) {
112 printk(KERN_ERR "Failed to set shutdown watcher\n");
113 return err;
114 }
115
116 err = register_xenbus_watch(&sysrq_watch);
117 if (err) {
118 printk(KERN_ERR "Failed to set sysrq watcher\n");
119 return err;
120 }
121
122 return 0;
123}
124
125static int shutdown_event(struct notifier_block *notifier,
126 unsigned long event,
127 void *data)
128{
129 setup_shutdown_watcher();
130 return NOTIFY_DONE;
131}
132
133static int __init setup_shutdown_event(void)
134{
135 static struct notifier_block xenstore_notifier = {
136 .notifier_call = shutdown_event
137 };
138 register_xenstore_notifier(&xenstore_notifier);
139
140 return 0;
141}
142
143subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
new file mode 100644
index 000000000000..874db0cd1d2a
--- /dev/null
+++ b/arch/x86/xen/mmu.c
@@ -0,0 +1,567 @@
1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
 35 * when it is not actively in use.  This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
41#include <linux/sched.h>
42#include <linux/highmem.h>
43#include <linux/bug.h>
44#include <linux/sched.h>
45
46#include <asm/pgtable.h>
47#include <asm/tlbflush.h>
48#include <asm/mmu_context.h>
49#include <asm/paravirt.h>
50
51#include <asm/xen/hypercall.h>
52#include <asm/xen/hypervisor.h>
53
54#include <xen/page.h>
55#include <xen/interface/xen.h>
56
57#include "multicalls.h"
58#include "mmu.h"
59
60xmaddr_t arbitrary_virt_to_machine(unsigned long address)
61{
62 pte_t *pte = lookup_address(address);
63 unsigned offset = address & PAGE_MASK;
64
65 BUG_ON(pte == NULL);
66
67 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
68}
69
70void make_lowmem_page_readonly(void *vaddr)
71{
72 pte_t *pte, ptev;
73 unsigned long address = (unsigned long)vaddr;
74
75 pte = lookup_address(address);
76 BUG_ON(pte == NULL);
77
78 ptev = pte_wrprotect(*pte);
79
80 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
81 BUG();
82}
83
84void make_lowmem_page_readwrite(void *vaddr)
85{
86 pte_t *pte, ptev;
87 unsigned long address = (unsigned long)vaddr;
88
89 pte = lookup_address(address);
90 BUG_ON(pte == NULL);
91
92 ptev = pte_mkwrite(*pte);
93
94 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
95 BUG();
96}
97
98
99void xen_set_pmd(pmd_t *ptr, pmd_t val)
100{
101 struct multicall_space mcs;
102 struct mmu_update *u;
103
104 preempt_disable();
105
106 mcs = xen_mc_entry(sizeof(*u));
107 u = mcs.args;
108 u->ptr = virt_to_machine(ptr).maddr;
109 u->val = pmd_val_ma(val);
110 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
111
112 xen_mc_issue(PARAVIRT_LAZY_MMU);
113
114 preempt_enable();
115}
116
117/*
118 * Associate a virtual page frame with a given physical page frame
119 * and protection flags for that frame.
120 */
121void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = swapper_pg_dir + pgd_index(vaddr);
129 if (pgd_none(*pgd)) {
130 BUG();
131 return;
132 }
133 pud = pud_offset(pgd, vaddr);
134 if (pud_none(*pud)) {
135 BUG();
136 return;
137 }
138 pmd = pmd_offset(pud, vaddr);
139 if (pmd_none(*pmd)) {
140 BUG();
141 return;
142 }
143 pte = pte_offset_kernel(pmd, vaddr);
144 /* <mfn,flags> stored as-is, to permit clearing entries */
145 xen_set_pte(pte, mfn_pte(mfn, flags));
146
147 /*
148 * It's enough to flush this one mapping.
149 * (PGE mappings get flushed as well)
150 */
151 __flush_tlb_one(vaddr);
152}
153
/*
 * Set a pte.  For the current or kernel address space, use the
 * update_va_mapping hypercall -- queued on the multicall batch when in
 * lazy MMU mode, issued directly otherwise.  Fall back to a plain pte
 * store via xen_set_pte() for foreign address spaces or if the direct
 * hypercall fails.
 */
154void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
155		    pte_t *ptep, pte_t pteval)
156{
157	if (mm == current->mm || mm == &init_mm) {
158		if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
159			struct multicall_space mcs;
160			mcs = xen_mc_entry(0);
161
162			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
163			xen_mc_issue(PARAVIRT_LAZY_MMU);
164			return;
165		} else
			/* Non-lazy: do the hypercall now; fall through to
			   the plain store only if it fails. */
166			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
167				return;
168	}
169	xen_set_pte(ptep, pteval);
170}
171
172#ifdef CONFIG_X86_PAE
173void xen_set_pud(pud_t *ptr, pud_t val)
174{
175 struct multicall_space mcs;
176 struct mmu_update *u;
177
178 preempt_disable();
179
180 mcs = xen_mc_entry(sizeof(*u));
181 u = mcs.args;
182 u->ptr = virt_to_machine(ptr).maddr;
183 u->val = pud_val_ma(val);
184 MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
185
186 xen_mc_issue(PARAVIRT_LAZY_MMU);
187
188 preempt_enable();
189}
190
191void xen_set_pte(pte_t *ptep, pte_t pte)
192{
193 ptep->pte_high = pte.pte_high;
194 smp_wmb();
195 ptep->pte_low = pte.pte_low;
196}
197
198void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
199{
200 set_64bit((u64 *)ptep, pte_val_ma(pte));
201}
202
203void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
204{
205 ptep->pte_low = 0;
206 smp_wmb(); /* make sure low gets written first */
207 ptep->pte_high = 0;
208}
209
210void xen_pmd_clear(pmd_t *pmdp)
211{
212 xen_set_pmd(pmdp, __pmd(0));
213}
214
215unsigned long long xen_pte_val(pte_t pte)
216{
217 unsigned long long ret = 0;
218
219 if (pte.pte_low) {
220 ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
221 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
222 }
223
224 return ret;
225}
226
227unsigned long long xen_pmd_val(pmd_t pmd)
228{
229 unsigned long long ret = pmd.pmd;
230 if (ret)
231 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
232 return ret;
233}
234
235unsigned long long xen_pgd_val(pgd_t pgd)
236{
237 unsigned long long ret = pgd.pgd;
238 if (ret)
239 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
240 return ret;
241}
242
243pte_t xen_make_pte(unsigned long long pte)
244{
245 if (pte & 1)
246 pte = phys_to_machine(XPADDR(pte)).maddr;
247
248 return (pte_t){ pte, pte >> 32 };
249}
250
251pmd_t xen_make_pmd(unsigned long long pmd)
252{
253 if (pmd & 1)
254 pmd = phys_to_machine(XPADDR(pmd)).maddr;
255
256 return (pmd_t){ pmd };
257}
258
259pgd_t xen_make_pgd(unsigned long long pgd)
260{
261 if (pgd & _PAGE_PRESENT)
262 pgd = phys_to_machine(XPADDR(pgd)).maddr;
263
264 return (pgd_t){ pgd };
265}
266#else /* !PAE */
267void xen_set_pte(pte_t *ptep, pte_t pte)
268{
269 *ptep = pte;
270}
271
272unsigned long xen_pte_val(pte_t pte)
273{
274 unsigned long ret = pte.pte_low;
275
276 if (ret & _PAGE_PRESENT)
277 ret = machine_to_phys(XMADDR(ret)).paddr;
278
279 return ret;
280}
281
282unsigned long xen_pgd_val(pgd_t pgd)
283{
284 unsigned long ret = pgd.pgd;
285 if (ret)
286 ret = machine_to_phys(XMADDR(ret)).paddr | 1;
287 return ret;
288}
289
290pte_t xen_make_pte(unsigned long pte)
291{
292 if (pte & _PAGE_PRESENT)
293 pte = phys_to_machine(XPADDR(pte)).maddr;
294
295 return (pte_t){ pte };
296}
297
298pgd_t xen_make_pgd(unsigned long pgd)
299{
300 if (pgd & _PAGE_PRESENT)
301 pgd = phys_to_machine(XPADDR(pgd)).maddr;
302
303 return (pgd_t){ pgd };
304}
305#endif /* CONFIG_X86_PAE */
306
307
308
309/*
310 (Yet another) pagetable walker. This one is intended for pinning a
311 pagetable. This means that it walks a pagetable and calls the
312 callback function on each page it finds making up the page table,
313 at every level. It walks the entire pagetable, but it only bothers
314 pinning pte pages which are below pte_limit. In the normal case
315 this will be TASK_SIZE, but at boot we need to pin up to
316 FIXADDR_TOP. But the important bit is that we don't pin beyond
317 there, because then we start getting into Xen's ptes.
318*/
319static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
320 unsigned long limit)
321{
322 pgd_t *pgd = pgd_base;
323 int flush = 0;
324 unsigned long addr = 0;
325 unsigned long pgd_next;
326
327 BUG_ON(limit > FIXADDR_TOP);
328
329 if (xen_feature(XENFEAT_auto_translated_physmap))
330 return 0;
331
332 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
333 pud_t *pud;
334 unsigned long pud_limit, pud_next;
335
336 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
337
338 if (!pgd_val(*pgd))
339 continue;
340
341 pud = pud_offset(pgd, 0);
342
343 if (PTRS_PER_PUD > 1) /* not folded */
344 flush |= (*func)(virt_to_page(pud), 0);
345
346 for (; addr != pud_limit; pud++, addr = pud_next) {
347 pmd_t *pmd;
348 unsigned long pmd_limit;
349
350 pud_next = pud_addr_end(addr, pud_limit);
351
352 if (pud_next < limit)
353 pmd_limit = pud_next;
354 else
355 pmd_limit = limit;
356
357 if (pud_none(*pud))
358 continue;
359
360 pmd = pmd_offset(pud, 0);
361
362 if (PTRS_PER_PMD > 1) /* not folded */
363 flush |= (*func)(virt_to_page(pmd), 0);
364
365 for (; addr != pmd_limit; pmd++) {
366 addr += (PAGE_SIZE * PTRS_PER_PTE);
367 if ((pmd_limit-1) < (addr-1)) {
368 addr = pmd_limit;
369 break;
370 }
371
372 if (pmd_none(*pmd))
373 continue;
374
375 flush |= (*func)(pmd_page(*pmd), 0);
376 }
377 }
378 }
379
380 flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
381
382 return flush;
383}
384
385static int pin_page(struct page *page, unsigned flags)
386{
387 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
388 int flush;
389
390 if (pgfl)
391 flush = 0; /* already pinned */
392 else if (PageHighMem(page))
393 /* kmaps need flushing if we found an unpinned
394 highpage */
395 flush = 1;
396 else {
397 void *pt = lowmem_page_address(page);
398 unsigned long pfn = page_to_pfn(page);
399 struct multicall_space mcs = __xen_mc_entry(0);
400
401 flush = 0;
402
403 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
404 pfn_pte(pfn, PAGE_KERNEL_RO),
405 flags);
406 }
407
408 return flush;
409}
410
411/* This is called just after a mm has been created, but it has not
412 been used yet. We need to make sure that its pagetable is all
413 read-only, and can be pinned. */
414void xen_pgd_pin(pgd_t *pgd)
415{
416	struct multicall_space mcs;
417	struct mmuext_op *op;
418
	/* Batch the per-page RO remappings done by pin_page(). */
419	xen_mc_batch();
420
	/* pgd_walk returns nonzero if it found unpinned highmem pages,
	   in which case kmaps must be flushed before pinning. */
421	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
422		/* re-enable interrupts for kmap_flush_unused */
423		xen_mc_issue(0);
424		kmap_flush_unused();
425		xen_mc_batch();
426	}
427
	/* Finally ask Xen to pin the top-level table (L3 under PAE,
	   L2 otherwise), as part of the same batch. */
428	mcs = __xen_mc_entry(sizeof(*op));
429	op = mcs.args;
430
431#ifdef CONFIG_X86_PAE
432	op->cmd = MMUEXT_PIN_L3_TABLE;
433#else
434	op->cmd = MMUEXT_PIN_L2_TABLE;
435#endif
436	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
437	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
438
439	xen_mc_issue(0);
440}
441
442/* The init_mm pagetable is really pinned as soon as it's created, but
443 that's before we have page structures to store the bits. So do all
444 the book-keeping now. */
445static __init int mark_pinned(struct page *page, unsigned flags)
446{
447 SetPagePinned(page);
448 return 0;
449}
450
451void __init xen_mark_init_mm_pinned(void)
452{
453 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
454}
455
456static int unpin_page(struct page *page, unsigned flags)
457{
458 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
459
460 if (pgfl && !PageHighMem(page)) {
461 void *pt = lowmem_page_address(page);
462 unsigned long pfn = page_to_pfn(page);
463 struct multicall_space mcs = __xen_mc_entry(0);
464
465 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
466 pfn_pte(pfn, PAGE_KERNEL),
467 flags);
468 }
469
470 return 0; /* never need to flush on unpin */
471}
472
473/* Release a pagetable's pages back as normal RW */
474static void xen_pgd_unpin(pgd_t *pgd)
475{
476 struct mmuext_op *op;
477 struct multicall_space mcs;
478
479 xen_mc_batch();
480
481 mcs = __xen_mc_entry(sizeof(*op));
482
483 op = mcs.args;
484 op->cmd = MMUEXT_UNPIN_TABLE;
485 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
486
487 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
488
489 pgd_walk(pgd, unpin_page, TASK_SIZE);
490
491 xen_mc_issue(0);
492}
493
494void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
495{
496 spin_lock(&next->page_table_lock);
497 xen_pgd_pin(next->pgd);
498 spin_unlock(&next->page_table_lock);
499}
500
501void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
502{
503 spin_lock(&mm->page_table_lock);
504 xen_pgd_pin(mm->pgd);
505 spin_unlock(&mm->page_table_lock);
506}
507
508
509#ifdef CONFIG_SMP
510/* Another cpu may still have their %cr3 pointing at the pagetable, so
511 we need to repoint it somewhere else before we can unpin it. */
512static void drop_other_mm_ref(void *info)
513{
514 struct mm_struct *mm = info;
515
516 if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
517 leave_mm(smp_processor_id());
518}
519
520static void drop_mm_ref(struct mm_struct *mm)
521{
522 if (current->active_mm == mm) {
523 if (current->mm == mm)
524 load_cr3(swapper_pg_dir);
525 else
526 leave_mm(smp_processor_id());
527 }
528
529 if (!cpus_empty(mm->cpu_vm_mask))
530 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
531 mm, 1);
532}
533#else
534static void drop_mm_ref(struct mm_struct *mm)
535{
536 if (current->active_mm == mm)
537 load_cr3(swapper_pg_dir);
538}
539#endif
540
541/*
542 * While a process runs, Xen pins its pagetables, which means that the
543 * hypervisor forces it to be read-only, and it controls all updates
544 * to it. This means that all pagetable updates have to go via the
545 * hypervisor, which is moderately expensive.
546 *
547 * Since we're pulling the pagetable down, we switch to use init_mm,
548 * unpin old process pagetable and mark it all read-write, which
549 * allows further operations on it to be simple memory accesses.
550 *
551 * The only subtle point is that another CPU may be still using the
 552 * pagetable because of lazy tlb flushing.  This means we need to
553 * switch all CPUs off this pagetable before we can unpin it.
554 */
555void xen_exit_mmap(struct mm_struct *mm)
556{
	/* First make sure no CPU still has this pagetable loaded (lazy
	   tlb users included); pin to this cpu while doing so. */
557	get_cpu();		/* make sure we don't move around */
558	drop_mm_ref(mm);
559	put_cpu();
560
561	spin_lock(&mm->page_table_lock);
562
563	/* pgd may not be pinned in the error exit path of execve */
564	if (PagePinned(virt_to_page(mm->pgd)))
565		xen_pgd_unpin(mm->pgd);
566	spin_unlock(&mm->page_table_lock);
567}
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
new file mode 100644
index 000000000000..c9ff27f3ac3a
--- /dev/null
+++ b/arch/x86/xen/mmu.h
@@ -0,0 +1,60 @@
1#ifndef _XEN_MMU_H
2
3#include <linux/linkage.h>
4#include <asm/page.h>
5
6/*
7 * Page-directory addresses above 4GB do not fit into architectural %cr3.
8 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
9 * must use the following accessor macros to pack/unpack valid MFNs.
10 *
11 * Note that Xen is using the fact that the pagetable base is always
12 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
13 * of cr3.
14 */
15#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
16#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
17
18
19void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
20
21void xen_set_pte(pte_t *ptep, pte_t pteval);
22void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
23 pte_t *ptep, pte_t pteval);
24void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
25
26void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
27void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
28void xen_exit_mmap(struct mm_struct *mm);
29
30void xen_pgd_pin(pgd_t *pgd);
31//void xen_pgd_unpin(pgd_t *pgd);
32
33#ifdef CONFIG_X86_PAE
34unsigned long long xen_pte_val(pte_t);
35unsigned long long xen_pmd_val(pmd_t);
36unsigned long long xen_pgd_val(pgd_t);
37
38pte_t xen_make_pte(unsigned long long);
39pmd_t xen_make_pmd(unsigned long long);
40pgd_t xen_make_pgd(unsigned long long);
41
42void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
43 pte_t *ptep, pte_t pteval);
44void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
45void xen_set_pud(pud_t *ptr, pud_t val);
46void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
47void xen_pmd_clear(pmd_t *pmdp);
48
49
50#else
51unsigned long xen_pte_val(pte_t);
52unsigned long xen_pmd_val(pmd_t);
53unsigned long xen_pgd_val(pgd_t);
54
55pte_t xen_make_pte(unsigned long);
56pmd_t xen_make_pmd(unsigned long);
57pgd_t xen_make_pgd(unsigned long);
58#endif
59
60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
new file mode 100644
index 000000000000..c837e8e463db
--- /dev/null
+++ b/arch/x86/xen/multicalls.c
@@ -0,0 +1,90 @@
1/*
2 * Xen hypercall batching.
3 *
4 * Xen allows multiple hypercalls to be issued at once, using the
5 * multicall interface. This allows the cost of trapping into the
6 * hypervisor to be amortized over several calls.
7 *
8 * This file implements a simple interface for multicalls. There's a
9 * per-cpu buffer of outstanding multicalls. When you want to queue a
10 * multicall for issuing, you can allocate a multicall slot for the
11 * call and its arguments, along with storage for space which is
12 * pointed to by the arguments (for passing pointers to structures,
13 * etc). When the multicall is actually issued, all the space for the
14 * commands and allocated memory is freed for reuse.
15 *
16 * Multicalls are flushed whenever any of the buffers get full, or
17 * when explicitly requested. There's no way to get per-multicall
18 * return results back. It will BUG if any of the multicalls fail.
19 *
20 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
21 */
22#include <linux/percpu.h>
23#include <linux/hardirq.h>
24
25#include <asm/xen/hypercall.h>
26
27#include "multicalls.h"
28
/* Maximum number of multicalls batched per cpu before a forced flush. */
#define MC_BATCH 32
/* Argument scratch space: 16 bytes per batched call, in u64 units. */
#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))

/* Per-cpu batch buffer: queued entries plus scratch argument storage. */
struct mc_buffer {
	struct multicall_entry entries[MC_BATCH];
	u64 args[MC_ARGS];
	unsigned mcidx, argidx;	/* next free entry / next free arg word */
};

static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
/* irq flags saved by xen_mc_batch(), restored by xen_mc_issue(). */
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
40
/* Issue all batched multicalls for this cpu.  BUGs if the multicall
   hypercall itself fails or if any individual call returned an error;
   there is no way to report per-call results to the callers. */
void xen_mc_flush(void)
{
	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
	int ret = 0;
	unsigned long flags;

	BUG_ON(preemptible());

	/* Disable interrupts in case someone comes in and queues
	   something in the middle */
	local_irq_save(flags);

	if (b->mcidx) {
		int i;

		if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
			BUG();
		/* count individual failures; reported (via BUG_ON)
		   only after interrupts are back on */
		for (i = 0; i < b->mcidx; i++)
			if (b->entries[i].result < 0)
				ret++;
		b->mcidx = 0;
		b->argidx = 0;
	} else
		/* no entries queued implies no argument space in use */
		BUG_ON(b->argidx != 0);

	local_irq_restore(flags);

	BUG_ON(ret);
}
70
71struct multicall_space __xen_mc_entry(size_t args)
72{
73 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
74 struct multicall_space ret;
75 unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
76
77 BUG_ON(preemptible());
78 BUG_ON(argspace > MC_ARGS);
79
80 if (b->mcidx == MC_BATCH ||
81 (b->argidx + argspace) > MC_ARGS)
82 xen_mc_flush();
83
84 ret.mc = &b->entries[b->mcidx];
85 b->mcidx++;
86 ret.args = &b->args[b->argidx];
87 b->argidx += argspace;
88
89 return ret;
90}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
new file mode 100644
index 000000000000..e6f7530b156c
--- /dev/null
+++ b/arch/x86/xen/multicalls.h
@@ -0,0 +1,45 @@
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H

#include "xen-ops.h"

/* Multicalls */
/* A slot handed out by __xen_mc_entry(): the multicall_entry to fill
   in, plus scratch space the entry's arguments may point at. */
struct multicall_space
{
	struct multicall_entry *mc;
	void *args;
};

/* Allocate room for a multicall and its args */
struct multicall_space __xen_mc_entry(size_t args);

/* irq flags saved by xen_mc_batch(), restored by xen_mc_issue(). */
DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);

/* Call to start a batch of multiple __xen_mc_entry()s.  Must be
   paired with xen_mc_issue() */
static inline void xen_mc_batch(void)
{
	/* need to disable interrupts until this entry is complete */
	local_irq_save(__get_cpu_var(xen_mc_irq_flags));
}

/* Start a batch and allocate its first entry in one step. */
static inline struct multicall_space xen_mc_entry(size_t args)
{
	xen_mc_batch();
	return __xen_mc_entry(args);
}

/* Flush all pending multicalls */
void xen_mc_flush(void);

/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
	/* In a matching lazy mode the calls stay batched; otherwise
	   push them to the hypervisor now. */
	if ((xen_get_lazy_mode() & mode) == 0)
		xen_mc_flush();

	/* restore flags saved in xen_mc_batch */
	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
}

#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
new file mode 100644
index 000000000000..f84e77226646
--- /dev/null
+++ b/arch/x86/xen/setup.c
@@ -0,0 +1,111 @@
1/*
2 * Machine specific setup for xen
3 *
4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
5 */
6
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/pm.h>
11
12#include <asm/elf.h>
13#include <asm/e820.h>
14#include <asm/setup.h>
15#include <asm/xen/hypervisor.h>
16#include <asm/xen/hypercall.h>
17
18#include <xen/interface/physdev.h>
19#include <xen/features.h>
20
21#include "xen-ops.h"
22#include "vdso.h"
23
/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];

/* pfn -> mfn translation table; NOTE(review): not initialised here,
   presumably set up from xen_start_info during early boot — confirm. */
unsigned long *phys_to_machine_mapping;
EXPORT_SYMBOL(phys_to_machine_mapping);
30
31/**
32 * machine_specific_memory_setup - Hook for machine specific memory setup.
33 **/
34
char * __init xen_memory_setup(void)
{
	unsigned long max_pfn = xen_start_info->nr_pages;

	/* Discard any inherited e820 map: the domain just gets one flat
	   RAM region covering the nr_pages the hypervisor gave us. */
	e820.nr_map = 0;
	add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);

	return "Xen";
}
44
/* Idle loop body: block in the hypervisor until the next event. */
static void xen_idle(void)
{
	local_irq_disable();

	if (need_resched())
		local_irq_enable();
	else {
		/* Drop TS_POLLING before really halting, so a waker
		   knows it must interrupt us rather than just set
		   need_resched; the barrier orders the clear against
		   the halt. */
		current_thread_info()->status &= ~TS_POLLING;
		smp_mb__after_clear_bit();
		/* re-enables interrupts and blocks until an event arrives */
		safe_halt();
		current_thread_info()->status |= TS_POLLING;
	}
}
58
59/*
60 * Set the bit indicating "nosegneg" library variants should be used.
61 */
static void fiddle_vdso(void)
{
	extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */
	extern char vsyscall_int80_start;
	/* Locate the note mask word inside the in-memory vdso image
	   and set the "nosegneg" bit in place. */
	u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
			     &vsyscall_int80_start);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
}
70
/* One-time Xen-specific boot setup: vm_assists, hypervisor callbacks,
   iopl, command line, idle loop and vdso tweak. */
void __init xen_arch_setup(void)
{
	struct physdev_set_iopl set_iopl;
	int rc;

	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	if (!xen_feature(XENFEAT_auto_translated_physmap))
		HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);

	/* Register the event and failsafe callbacks (code in entry.S). */
	HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
				 __KERNEL_CS, (unsigned long)xen_failsafe_callback);

	set_iopl.iopl = 1;
	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	if (rc != 0)
		printk(KERN_INFO "physdev_op failed %d\n", rc);

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	/* Copy the smaller of the two sizes so neither buffer overruns. */
	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	pm_idle = xen_idle;

#ifdef CONFIG_SMP
	/* fill cpus_possible with all available cpus */
	xen_fill_possible_map();
#endif

	paravirt_disable_iospace();

	fiddle_vdso();
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
new file mode 100644
index 000000000000..557b8e24706a
--- /dev/null
+++ b/arch/x86/xen/smp.c
@@ -0,0 +1,404 @@
1/*
2 * Xen SMP support
3 *
4 * This file implements the Xen versions of smp_ops. SMP under Xen is
5 * very straightforward. Bringing a CPU up is simply a matter of
6 * loading its initial context and setting it running.
7 *
8 * IPIs are handled through the Xen event mechanism.
9 *
10 * Because virtual CPUs can be scheduled onto any real CPU, there's no
11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */
17#include <linux/sched.h>
18#include <linux/err.h>
19#include <linux/smp.h>
20
21#include <asm/paravirt.h>
22#include <asm/desc.h>
23#include <asm/pgtable.h>
24#include <asm/cpu.h>
25
26#include <xen/interface/xen.h>
27#include <xen/interface/vcpu.h>
28
29#include <asm/xen/interface.h>
30#include <asm/xen/hypercall.h>
31
32#include <xen/page.h>
33#include <xen/events.h>
34
35#include "xen-ops.h"
36#include "mmu.h"
37
/* Cpus whose Xen vcpu context has been handed to VCPUOP_initialise;
   guards cpu_initialize_context against double-init. */
static cpumask_t cpu_initialized_map;
/* Per-cpu irq numbers of the bound resched/callfunc IPI channels. */
static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);

/*
 * Structure and data for smp_call_function(). This is designed to minimise
 * static memory requirements. It also looks cleaner.
 */
static DEFINE_SPINLOCK(call_lock);

struct call_data_struct {
	void (*func) (void *info);
	void *info;
	atomic_t started;	/* cpus that have picked up the call */
	atomic_t finished;	/* cpus done executing it (wait mode only) */
	int wait;
};

static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);

/* The in-flight cross-call descriptor; published under call_lock. */
static struct call_data_struct *call_data;
59
60/*
61 * Reschedule call back. Nothing to do,
62 * all the work is done automatically when
63 * we return from the interrupt.
64 */
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
	/* no work here: returning from the interrupt is the point */
	return IRQ_HANDLED;
}
69
/* First code a secondary vcpu runs (entry point installed by
   cpu_initialize_context); finishes bringup then idles forever. */
static __cpuinit void cpu_bringup_and_idle(void)
{
	int cpu = smp_processor_id();

	cpu_init();

	preempt_disable();
	per_cpu(cpu_state, cpu) = CPU_ONLINE;

	xen_setup_cpu_clockevents();

	/* We can take interrupts now: we're officially "up". */
	local_irq_enable();

	wmb();			/* make sure everything is out */
	cpu_idle();
}
87
/* Bind the reschedule and call-function IPI event channels for @cpu.
   Returns 0 or a negative errno; on failure any irq already bound is
   torn down again.
   NOTE(review): kasprintf() can return NULL, and the names are never
   kfree'd, so they leak on the failure path — confirm and fix. */
static int xen_smp_intr_init(unsigned int cpu)
{
	int rc;
	const char *resched_name, *callfunc_name;

	/* -1 means "not bound" for the cleanup path below */
	per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;

	resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
	rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
				    cpu,
				    xen_reschedule_interrupt,
				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				    resched_name,
				    NULL);
	if (rc < 0)
		goto fail;
	per_cpu(resched_irq, cpu) = rc;

	callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
	rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
				    cpu,
				    xen_call_function_interrupt,
				    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				    callfunc_name,
				    NULL);
	if (rc < 0)
		goto fail;
	per_cpu(callfunc_irq, cpu) = rc;

	return 0;

 fail:
	if (per_cpu(resched_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
	if (per_cpu(callfunc_irq, cpu) >= 0)
		unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
	return rc;
}
126
127void __init xen_fill_possible_map(void)
128{
129 int i, rc;
130
131 for (i = 0; i < NR_CPUS; i++) {
132 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
133 if (rc >= 0)
134 cpu_set(i, cpu_possible_map);
135 }
136}
137
void __init xen_smp_prepare_boot_cpu(void)
{
	int cpu;

	BUG_ON(smp_processor_id() != 0);
	native_smp_prepare_boot_cpu();

	/* We've switched to the "real" per-cpu gdt, so make sure the
	   old memory can be recycled */
	make_lowmem_page_readwrite(&per_cpu__gdt_page);

	/* Xen exposes no cpu topology (see file header), so every
	   sibling/core map is emptied. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		cpus_clear(cpu_sibling_map[cpu]);
		cpus_clear(cpu_core_map[cpu]);
	}

	xen_setup_vcpu_info_placement();
}
156
void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
	unsigned cpu;

	/* No topology under Xen: clear all sibling/core maps. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		cpus_clear(cpu_sibling_map[cpu]);
		cpus_clear(cpu_core_map[cpu]);
	}

	smp_store_cpu_info(0);
	set_cpu_sibling_map(0);

	if (xen_smp_intr_init(0))
		BUG();

	/* the boot cpu's vcpu context already exists */
	cpu_initialized_map = cpumask_of_cpu(0);

	/* Restrict the possible_map according to max_cpus. */
	while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
		/* drop the highest-numbered possible cpu each pass */
		for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
			continue;
		cpu_clear(cpu, cpu_possible_map);
	}

	for_each_possible_cpu (cpu) {
		struct task_struct *idle;

		if (cpu == 0)
			continue;

		/* create the idle task now; xen_cpu_up fetches it later */
		idle = fork_idle(cpu);
		if (IS_ERR(idle))
			panic("failed fork for CPU %d", cpu);

		cpu_set(cpu, cpu_present_map);
	}

	//init_xenbus_allowed_cpumask();
}
196
/* Build and register the initial register/segment/pagetable state for
   a new vcpu via VCPUOP_initialise.  Runs at most once per cpu:
   cpu_initialized_map guards re-initialisation.  Returns 0 or -ENOMEM. */
static __cpuinit int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
	struct vcpu_guest_context *ctxt;
	struct gdt_page *gdt = &per_cpu(gdt_page, cpu);

	if (cpu_test_and_set(cpu, cpu_initialized_map))
		return 0;

	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
	if (ctxt == NULL)
		return -ENOMEM;

	ctxt->flags = VGCF_IN_KERNEL;
	ctxt->user_regs.ds = __USER_DS;
	ctxt->user_regs.es = __USER_DS;
	ctxt->user_regs.fs = __KERNEL_PERCPU;
	ctxt->user_regs.gs = 0;
	ctxt->user_regs.ss = __KERNEL_DS;
	/* the vcpu starts executing at cpu_bringup_and_idle */
	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */

	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));

	xen_copy_trap_info(ctxt->trap_ctxt);

	ctxt->ldt_ents = 0;

	/* the gdt must be page-aligned; it is also mapped read-only here */
	BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
	make_lowmem_page_readonly(gdt->gdt);

	ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
	ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);

	ctxt->user_regs.cs = __KERNEL_CS;
	/* initial stack: just below the pt_regs area of the idle task */
	ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);

	ctxt->kernel_ss = __KERNEL_DS;
	ctxt->kernel_sp = idle->thread.esp0;

	ctxt->event_callback_cs = __KERNEL_CS;
	ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
	ctxt->failsafe_callback_cs = __KERNEL_CS;
	ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;

	/* start on the swapper pagetable */
	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));

	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
		BUG();

	kfree(ctxt);
	return 0;
}
251
/* Bring vcpu @cpu online: per-cpu state, timer, IPI irqs, then kick
   it running with VCPUOP_up.  Returns 0 or a negative errno. */
int __cpuinit xen_cpu_up(unsigned int cpu)
{
	struct task_struct *idle = idle_task(cpu);
	int rc;

#if 0
	rc = cpu_up_check(cpu);
	if (rc)
		return rc;
#endif

	init_gdt(cpu);
	per_cpu(current_task, cpu) = idle;
	irq_ctx_init(cpu);
	xen_setup_timer(cpu);

	/* make sure interrupts start blocked */
	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;

	rc = cpu_initialize_context(cpu, idle);
	if (rc)
		return rc;

	/* first secondary cpu: switch alternatives to SMP variants */
	if (num_online_cpus() == 1)
		alternatives_smp_switch(1);

	rc = xen_smp_intr_init(cpu);
	if (rc)
		return rc;

	smp_store_cpu_info(cpu);
	set_cpu_sibling_map(cpu);
	/* This must be done before setting cpu_online_map */
	wmb();

	cpu_set(cpu, cpu_online_map);

	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
	BUG_ON(rc);

	return 0;
}
294
/* Nothing more to do once all cpus are up. */
void xen_smp_cpus_done(unsigned int max_cpus)
{
}
298
/* Runs on each cpu being stopped: detach from any pinned pagetable
   and take this vcpu down.  Never returns. */
static void stop_self(void *v)
{
	int cpu = smp_processor_id();

	/* make sure we're not pinning something down */
	load_cr3(swapper_pg_dir);
	/* should set up a minimal gdt */

	HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
	BUG();
}
310
/* Ask every other cpu to take itself down (no wait: they never ack). */
void xen_smp_send_stop(void)
{
	smp_call_function(stop_self, NULL, 0, 0);
}
315
void xen_smp_send_reschedule(int cpu)
{
	/* handler is a no-op; the resched happens on interrupt return */
	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
320
321
/* Deliver @vector to every online cpu in @mask. */
static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
{
	unsigned cpu;

	/* never signal offline cpus */
	cpus_and(mask, mask, cpu_online_map);

	for_each_cpu_mask(cpu, mask)
		xen_send_IPI_one(cpu, vector);
}
331
/* IPI handler for cross-cpu function calls: run the function published
   in call_data, acking via started/finished with careful barrier
   ordering (see xen_smp_call_function_mask). */
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
	void (*func) (void *info) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;

	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(&call_data->started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	irq_enter();
	(*func)(info);
	irq_exit();

	if (wait) {
		mb();		/* commit everything before setting finished */
		atomic_inc(&call_data->finished);
	}

	return IRQ_HANDLED;
}
358
/* Run func(info) on every cpu in @mask except the caller, returning
   once all targets have started (and, if @wait, finished) executing.
   Serialised by call_lock; must not be called with irqs disabled. */
int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
			       void *info, int wait)
{
	struct call_data_struct data;
	int cpus;

	/* Holding any lock stops cpus from going down. */
	spin_lock(&call_lock);

	/* never IPI ourselves */
	cpu_clear(smp_processor_id(), mask);

	cpus = cpus_weight(mask);
	if (!cpus) {
		spin_unlock(&call_lock);
		return 0;
	}

	/* Can deadlock when called with interrupts disabled */
	WARN_ON(irqs_disabled());

	data.func = func;
	data.info = info;
	atomic_set(&data.started, 0);
	data.wait = wait;
	if (wait)
		atomic_set(&data.finished, 0);

	call_data = &data;
	mb();			/* write everything before IPI */

	/* Send a message to other CPUs and wait for them to respond */
	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);

	/* Make sure other vcpus get a chance to run.
	   XXX too severe?  Maybe we should check the other CPU's states? */
	HYPERVISOR_sched_op(SCHEDOP_yield, 0);

	/* Wait for response */
	while (atomic_read(&data.started) != cpus ||
	       (wait && atomic_read(&data.finished) != cpus))
		cpu_relax();

	spin_unlock(&call_lock);

	return 0;
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
new file mode 100644
index 000000000000..dfd6db69ead5
--- /dev/null
+++ b/arch/x86/xen/time.c
@@ -0,0 +1,593 @@
1/*
2 * Xen time implementation.
3 *
4 * This is implemented in terms of a clocksource driver which uses
5 * the hypervisor clock as a nanosecond timebase, and a clockevent
6 * driver which uses the hypervisor's timer mechanism.
7 *
8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
9 */
10#include <linux/kernel.h>
11#include <linux/interrupt.h>
12#include <linux/clocksource.h>
13#include <linux/clockchips.h>
14#include <linux/kernel_stat.h>
15
16#include <asm/xen/hypervisor.h>
17#include <asm/xen/hypercall.h>
18
19#include <xen/events.h>
20#include <xen/interface/xen.h>
21#include <xen/interface/vcpu.h>
22
23#include "xen-ops.h"
24
/* clocksource mult/shift: readings are already nanoseconds, so
   mult is exactly 1 << XEN_SHIFT (no scaling). */
#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)

static cycle_t xen_clocksource_read(void);

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;   /* 32.32 fixed-point tsc->ns fraction */
	int tsc_shift;
	u32 version;           /* Xen's update counter; odd = in flux */
};

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);
53
54/* return an consistent snapshot of 64-bit time/counter value */
55static u64 get64(const u64 *p)
56{
57 u64 ret;
58
59 if (BITS_PER_LONG < 64) {
60 u32 *p32 = (u32 *)p;
61 u32 h, l;
62
63 /*
64 * Read high then low, and then make sure high is
65 * still the same; this will only loop if low wraps
66 * and carries into high.
67 * XXX some clean way to make this endian-proof?
68 */
69 do {
70 h = p32[1];
71 barrier();
72 l = p32[0];
73 barrier();
74 } while (p32[1] != h);
75
76 ret = (((u64)h) << 32) | l;
77 } else
78 ret = *p;
79
80 return ret;
81}
82
83/*
84 * Runstate accounting
85 */
/* Copy this cpu's runstate info into *res, seqlock-style: retry if
   Xen updated state_entry_time while we were copying. */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}
107
/* Ask Xen to keep @cpu's runstate info updated in our per-cpu
   'runstate' area. */
static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}
118
/* Credit stolen (runnable+offline) and blocked time to the scheduler
   statistics in whole ticks; sub-tick remainders are carried forward
   in residual_stolen/residual_blocked. */
static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has not been runn*ing* */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	/* subtract-in-a-loop, presumably to avoid a 64-bit divide;
	   stolen is normally only a few ticks */
	ticks = 0;
	while (stolen >= NS_PER_TICK) {
		ticks++;
		stolen -= NS_PER_TICK;
	}
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = 0;
	while (blocked >= NS_PER_TICK) {
		ticks++;
		blocked -= NS_PER_TICK;
	}
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}
171
172/*
173 * Xen sched_clock implementation. Returns the number of unstolen
174 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
175 * states.
176 */
unsigned long long xen_sched_clock(void)
{
	struct vcpu_runstate_info state;
	cycle_t now;
	u64 ret;
	s64 offset;

	/*
	 * Ideally sched_clock should be called on a per-cpu basis
	 * anyway, so preempt should already be disabled, but that's
	 * not current practice at the moment.
	 */
	preempt_disable();

	now = xen_clocksource_read();

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	/* time since we entered RUNNING; clamp in case the clocksource
	   and state_entry_time disagree slightly */
	offset = now - state.state_entry_time;
	if (offset < 0)
		offset = 0;

	/* unstolen time = total RUNNING + BLOCKED time */
	ret = state.time[RUNSTATE_blocked] +
		state.time[RUNSTATE_running] +
		offset;

	preempt_enable();

	return ret;
}
209
210
211/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
	/* Invert Xen's tsc->ns scaling (tsc_to_system_mul is a 32.32
	   fixed-point fraction applied after tsc_shift) to recover the
	   TSC frequency in kHz. */
	u64 cpu_khz = 1000000ULL << 32;
	const struct vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	do_div(cpu_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		cpu_khz <<= -info->tsc_shift;
	else
		cpu_khz >>= info->tsc_shift;

	return cpu_khz;
}
226
227/*
228 * Reads a consistent set of time-base values from Xen, into a shadow data
229 * area.
230 */
static unsigned get_time_values_from_xen(void)
{
	struct vcpu_time_info *src;
	struct shadow_time_info *dst;

	/* src is shared memory with the hypervisor, so we need to
	   make sure we get a consistent snapshot, even in the face of
	   being preempted. */
	src = &__get_cpu_var(xen_vcpu)->time;
	dst = &__get_cpu_var(shadow_time);

	/* Xen's version counter is odd while an update is in flight;
	   retry until we have copied a stable, even version. */
	do {
		dst->version = src->version;
		rmb();		/* fetch version before data */
		dst->tsc_timestamp = src->tsc_timestamp;
		dst->system_timestamp = src->system_time;
		dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
		dst->tsc_shift = src->tsc_shift;
		rmb();		/* test version after fetching data */
	} while ((src->version & 1) | (dst->version ^ src->version));

	return dst->version;
}
254
255/*
256 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
257 * yielding a 64-bit result.
258 */
/* Computes ((delta << shift) * mul_frac) >> 32, where mul_frac is a
   32.32 fixed-point fraction: a 64x32->96 bit multiply keeping the
   middle 64 bits. */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	/* two 32x32 multiplies, combining the overlapping partial
	   products into edx:eax */
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
	/* single 64x64 multiply; shrd extracts bits 32..95 */
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}
292
293static u64 get_nsec_offset(struct shadow_time_info *shadow)
294{
295 u64 now, delta;
296 now = native_read_tsc();
297 delta = now - shadow->tsc_timestamp;
298 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
299}
300
/* Clocksource read: nanoseconds since boot, from Xen's shared time
   info plus a scaled TSC offset. */
static cycle_t xen_clocksource_read(void)
{
	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
	cycle_t ret;
	unsigned version;

	/* retry if Xen bumped the version while we were computing, so
	   the timestamp/scale pair used is consistent */
	do {
		version = get_time_values_from_xen();
		barrier();
		ret = shadow->system_timestamp + get_nsec_offset(shadow);
		barrier();
	} while (version != __get_cpu_var(xen_vcpu)->time.version);

	put_cpu_var(shadow_time);

	return ret;
}
318
/* Current wall time = Xen's wallclock-at-boot + time since boot. */
static void xen_read_wallclock(struct timespec *ts)
{
	const struct shared_info *s = HYPERVISOR_shared_info;
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = s->wc_version;
		rmb();		/* fetch version before time */
		now.tv_sec = s->wc_sec;
		now.tv_nsec = s->wc_nsec;
		rmb();		/* fetch time before checking version */
	} while ((s->wc_version & 1) | (version ^ s->wc_version));

	delta = xen_clocksource_read();	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	/* split total ns back into sec/nsec */
	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
343
/* Wall time in whole seconds, via xen_read_wallclock(). */
unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);

	return ts.tv_sec;
}
352
/* Setting the hypervisor wallclock is not supported here. */
int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}
358
/* Free-running nanosecond clocksource; readings pass through unscaled
   since mult == 1 << shift. */
static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,		/* -1 converts to an all-ones 64-bit mask */
	.mult = 1<<XEN_SHIFT,	/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
368
369/*
370 Xen clockevent implementation
371
372 Xen has two clockevent implementations:
373
374 The old timer_op one works with all released versions of Xen prior
375 to version 3.0.4. This version of the hypervisor provides a
376 single-shot timer with nanosecond resolution. However, sharing the
377 same event channel is a 100Hz tick which is delivered while the
378 vcpu is running. We don't care about or use this tick, but it will
379 cause the core time code to think the timer fired too soon, and
380 will end up resetting it each time. It could be filtered, but
381 doing so has complications when the ktime clocksource is not yet
382 the xen clocksource (ie, at boot time).
383
384 The new vcpu_op-based timer interface allows the tick timer period
385 to be changed or turned off. The tick timer is not useful as a
386 periodic timer because events are only delivered to running vcpus.
387 The one-shot timer can report when a timeout is in the past, so
388 set_next_event is capable of returning -ETIME when appropriate.
389 This interface is used when available.
390*/
391
392
393/*
394 Get a hypervisor absolute time. In theory we could maintain an
395 offset between the kernel's time and the hypervisor's time, and
396 apply that to a kernel's absolute timeout. Unfortunately the
397 hypervisor and kernel times can drift even if the kernel is using
398 the Xen clocksource, because ntp can warp the kernel's clocksource.
399*/
/* Convert a relative timeout (ns) into the hypervisor's absolute
   timebase; see the caveat about clock drift in the comment above. */
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}
404
/* timer_op backend mode switch: only oneshot is supported; shutdown
   cancels any pending timeout. */
static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);	/* cancel timeout */
		break;
	}
}
424
/* Program the next oneshot expiry via the legacy timer_op hypercall. */
static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}
439
/* timer_op-based clockevent; deltas are already nanoseconds
   (mult=1, shift=0), min_delta accounts for TIMER_SLOP. */
static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};
454
455
456
/* vcpu_op backend mode switch: stop the hypervisor's periodic tick
   for oneshot mode, and both timers on shutdown. */
static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}
482
/* Program the next oneshot expiry via VCPUOP_set_singleshot_timer. */
static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	/* VCPU_SSHOTTMR_future: fail with -ETIME instead of firing if
	   the timeout is already in the past */
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	/* -ETIME (deadline already passed) is a valid return here;
	   anything else unexpected is fatal */
	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}
501
/* vcpu_op-based clockevent (preferred when available); deltas are
   already nanoseconds (mult=1, shift=0). */
static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};
516
/* Defaults to the timer_op variant; xen_time_init switches it to the
   vcpuop variant when the hypervisor supports it. */
static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
520
/* VIRQ_TIMER handler: feed the event into the clockevent layer and
   update stolen-time accounting. */
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}
536
/* Bind VIRQ_TIMER for @cpu and initialise its per-cpu clockevent
   (registered later, on the cpu itself, by xen_setup_cpu_clockevents). */
void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	/* NOTE(review): bind_virq_to_irqhandler can fail and return a
	   negative irq, which is stored unchecked below — confirm. */
	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;

	setup_runstate_info(cpu);
}
561
/* Register this cpu's clockevent device; must run on the cpu itself,
   hence the preemption check. */
void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}
568
/* Boot-time clock init on the boot cpu: register the clocksource,
   pick the best clockevent backend, and seed xtime from Xen. */
__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	get_time_values_from_xen();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	/* keep rdtsc usable: xen_clocksource_read scales raw TSC reads */
	tsc_disable = 0;

	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}
diff --git a/arch/x86/xen/vdso.h b/arch/x86/xen/vdso.h
new file mode 100644
index 000000000000..861fedfe5230
--- /dev/null
+++ b/arch/x86/xen/vdso.h
@@ -0,0 +1,4 @@
/* Bit used for the pseudo-hwcap for non-negative segments.  We use
   bit 1 (value 2) to avoid bugs in some versions of glibc when bit 0
   is used; the choice is otherwise arbitrary. */
#define VDSO_NOTE_NONEGSEG_BIT 1
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..1a43b60c0c62
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,291 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h>
20#include <asm/segment.h>
21
22#include <xen/interface/xen.h>
23
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
/*
   Enable events.  This clears the event mask and tests the pending
   event status with one "and" operation.  If there are pending
   events, then enter the hypervisor to get them handled.
 */
ENTRY(xen_irq_enable_direct)
	/* Clear mask and test pending.  This relies on the mask byte
	   (XEN_vcpu_info_mask) immediately following the pending byte:
	   the word "and" zeroes the mask (high byte) while setting ZF
	   from the pending (low) byte — NOTE(review): layout assumption,
	   confirm against struct vcpu_info. */
	andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
	/* Preempt here doesn't matter because that will deal with
	   any pending interrupts.  The pending check may end up being
	   run on the wrong CPU, but that doesn't hurt. */
	jz 1f
2:	call check_events
1:
ENDPATCH(xen_irq_enable_direct)
	ret
	ENDPROC(xen_irq_enable_direct)
	/* Relocation points at the call's target word (2b+1) so the
	   patcher can fix it up when this code is inlined elsewhere. */
	RELOC(xen_irq_enable_direct, 2b+1)
48
49
/*
   Disabling events is simply a matter of making the event mask
   non-zero.  No hypercall is needed: a non-zero mask byte is enough
   to stop event delivery.
 */
ENTRY(xen_irq_disable_direct)
	movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
	ret
	ENDPROC(xen_irq_disable_direct)
	/* No call instruction inside, so no relocation needed. */
	RELOC(xen_irq_disable_direct, 0)
60
/*
   (xen_)save_fl is used to get the current interrupt enable status.
   Callers expect the status to be in X86_EFLAGS_IF, and other bits
   may be set in the return value.  We take advantage of this by
   making sure that X86_EFLAGS_IF has the right value (and other bits
   in that byte are 0), but other bits in the return value are
   undefined.  We need to toggle the state of the bit, because
   Xen and x86 use opposite senses (mask vs enable).
 */
ENTRY(xen_save_fl_direct)
	/* ZF := (event mask == 0), ie ZF set when events are enabled */
	testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
	/* %ah := 1 if enabled, 0 if masked */
	setz %ah
	/* %ah := %ah*2, so an enabled state yields 0x02 in the second
	   byte of %eax — exactly X86_EFLAGS_IF (0x200) in the result */
	addb %ah,%ah
ENDPATCH(xen_save_fl_direct)
	ret
	ENDPROC(xen_save_fl_direct)
	RELOC(xen_save_fl_direct, 0)
78
79
/*
   In principle the caller should be passing us a value return
   from xen_save_fl_direct, but for robustness sake we test only
   the X86_EFLAGS_IF flag rather than the whole byte.  After
   setting the interrupt mask state, it checks for unmasked
   pending events and enters the hypervisor to get them delivered
   if so.
 */
ENTRY(xen_restore_fl_direct)
	/* IF lives in the second byte of the saved flags, ie %ah */
	testb $X86_EFLAGS_IF>>8, %ah
	/* Xen's mask has the opposite sense of IF: set the mask when
	   IF is clear, clear it when IF is set */
	setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
	/* Preempt here doesn't matter because that will deal with
	   any pending interrupts.  The pending check may end up being
	   run on the wrong CPU, but that doesn't hurt. */

	/* check for unmasked and pending: pending byte == 1 with the
	   adjacent mask byte == 0 makes the word read exactly 0x0001 */
	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
	/* Fix: call check_events only when there IS an unmasked pending
	   event (word == 0x0001), so skip on inequality.  The previous
	   'jz 1f' was inverted — it skipped the hypercall exactly when
	   an event was pending and made a pointless one otherwise.
	   Compare xen_irq_enable_direct, which likewise calls
	   check_events only in the pending case. */
	jnz 1f
2:	call check_events
1:
ENDPATCH(xen_restore_fl_direct)
	ret
	ENDPROC(xen_restore_fl_direct)
	RELOC(xen_restore_fl_direct, 2b+1)
104
/*
   This is run where a normal iret would be run, with the same stack setup:
	8: eflags
	4: cs
   esp-> 0: eip

   This attempts to make sure that any pending events are dealt
   with on return to usermode, but there is a small window in
   which an event can happen just before entering usermode.  If
   the nested interrupt ends up setting one of the TIF_WORK_MASK
   pending work flags, they will not be tested again before
   returning to usermode.  This means that a process can end up
   with pending work, which will be unprocessed until the process
   enters and leaves the kernel again, which could be an
   unbounded amount of time.  This means that a pending signal or
   reschedule event could be indefinitely delayed.

   The fix is to notice a nested interrupt in the critical
   window, and if one occurs, then fold the nested interrupt into
   the current interrupt stack frame, and re-process it
   iteratively rather than recursively.  This means that it will
   exit via the normal path, and all pending work will be dealt
   with appropriately.

   Because the nested interrupt handler needs to deal with the
   current stack state in whatever form its in, we keep things
   simple by only using a single register which is pushed/popped
   on the stack.

   Non-direct iret could be done in the same way, but it would
   require an annoying amount of code duplication.  We'll assume
   that direct mode will be the common case once the hypervisor
   support becomes commonplace.
 */
ENTRY(xen_iret_direct)
	/* test eflags for special cases: vm86 mode or a (pseudo) NMI
	   must go through the hypervisor's iret hypercall */
	testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
	jnz hyper_iret

	push %eax
	ESP_OFFSET=4	# bytes pushed onto stack

	/* Store vcpu_info pointer for easy access.  Do it this
	   way to avoid having to reload %fs */
#ifdef CONFIG_SMP
	GET_THREAD_INFO(%eax)
	movl TI_cpu(%eax),%eax
	movl __per_cpu_offset(,%eax,4),%eax
	lea per_cpu__xen_vcpu_info(%eax),%eax
#else
	movl $per_cpu__xen_vcpu_info, %eax
#endif

	/* check IF state we're restoring (IF is in the second byte of
	   the saved eflags; +1 selects it, +ESP_OFFSET skips our push) */
	testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)

	/* Maybe enable events.  Once this happens we could get a
	   recursive event, so the critical region starts immediately
	   afterwards.  However, if that happens we don't end up
	   resuming the code, so we don't have to be worried about
	   being preempted to another CPU. */
	setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:

	/* check for unmasked and pending (word 0x0001: pending set,
	   mask clear) */
	cmpw $0x0001, XEN_vcpu_info_pending(%eax)

	/* If there's something pending, mask events again so we
	   can jump back into xen_hypervisor_callback */
	sete XEN_vcpu_info_mask(%eax)

	popl %eax

	/* From this point on the registers are restored and the stack
	   updated, so we don't need to worry about it if we're preempted */
iret_restore_end:

	/* Jump to hypervisor_callback after fixing up the stack.
	   Events are masked, so jumping out of the critical
	   region is OK.  (ZF is still the result of the cmpw above:
	   sete and popl do not modify flags.) */
	je xen_hypervisor_callback

	iret
xen_iret_end_crit:

hyper_iret:
	/* put this out of line since its very rarely used */
	jmp hypercall_page + __HYPERVISOR_iret * 32

	.globl xen_iret_start_crit, xen_iret_end_crit
195
/*
   This is called by xen_hypervisor_callback in entry.S when it sees
   that the EIP at the time of interrupt was between xen_iret_start_crit
   and xen_iret_end_crit.  We're passed the EIP in %eax so we can do
   a more refined determination of what to do.

   The stack format at this point is:
	----------------
	 ss		: (ss/esp may be present if we came from usermode)
	 esp		:
	 eflags		}  outer exception info
	 cs		}
	 eip		}
	---------------- <- edi (copy dest)
	 eax		:  outer eax if it hasn't been restored
	----------------
	 eflags		}  nested exception info
	 cs		}   (no ss/esp because we're nested
	 eip		}    from the same ring)
	 orig_eax	}<- esi (copy src)
	 - - - - - - - -
	 fs		}
	 es		}
	 ds		}  SAVE_ALL state
	 eax		}
	  :		:
	 ebx		}
	----------------
	 return addr	 <- esp
	----------------

   In order to deliver the nested exception properly, we need to shift
   everything from the return addr up to the error code so it
   sits just under the outer exception info.  This means that when we
   handle the exception, we do it in the context of the outer exception
   rather than starting a new one.

   The only caveat is that if the outer eax hasn't been
   restored yet (ie, it's still on stack), we need to insert
   its value into the SAVE_ALL state before going on, since
   it's usermode state which we eventually need to restore.
 */
ENTRY(xen_iret_crit_fixup)
	/* offsets +4 for return address */

	/*
	   Paranoia: Make sure we're really coming from userspace.
	   One could imagine a case where userspace jumps into the
	   critical range address, but just before the CPU delivers a GP,
	   it decides to deliver an interrupt instead.  Unlikely?
	   Definitely.  Easy to avoid?  Yes.  The Intel documents
	   explicitly say that the reported EIP for a bad jump is the
	   jump instruction itself, not the destination, but some virtual
	   environments get this wrong.
	 */
	movl PT_CS+4(%esp), %ecx
	andl $SEGMENT_RPL_MASK, %ecx
	cmpl $USER_RPL, %ecx
	/* interrupted code was userspace: it cannot really have been in
	   the kernel critical region, so there is nothing to fix up */
	je 2f

	lea PT_ORIG_EAX+4(%esp), %esi
	lea PT_EFLAGS+4(%esp), %edi

	/* If eip is before iret_restore_end then stack
	   hasn't been restored yet. */
	cmp $iret_restore_end, %eax
	jae 1f

	movl 0+4(%edi),%eax		/* copy EAX */
	movl %eax, PT_EAX+4(%esp)

	lea ESP_OFFSET(%edi),%edi	/* move dest up over saved regs */

	/* set up the copy: direction flag set so movsl walks the two
	   frames downward in memory, from orig_eax back to the return
	   address */
1:	std
	mov $(PT_EIP+4) / 4, %ecx	/* copy ret+saved regs up to orig_eax */
	rep movsl
	cld

	lea 4(%edi),%esp		/* point esp to new frame */
2:	ret
277
278
/*
   Force an event check by making a hypercall,
   but preserve regs before making the call.
   Only %eax/%ecx/%edx need saving: they are the caller-clobbered
   registers under the 32-bit C calling convention, and the callers
   here (the *_direct stubs) rely on all registers surviving.
 */
check_events:
	push %eax
	push %ecx
	push %edx
	call force_evtchn_callback
	pop %edx
	pop %ecx
	pop %eax
	ret
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
new file mode 100644
index 000000000000..f8d6937db2ec
--- /dev/null
+++ b/arch/x86/xen/xen-head.S
@@ -0,0 +1,38 @@
/* Xen-specific pieces of head.S, intended to be included in the right
   place in head.S */

#ifdef CONFIG_XEN

#include <linux/elfnote.h>
#include <asm/boot.h>
#include <xen/interface/elfnote.h>

.pushsection .init.text
/* Kernel entry point when booted by the Xen domain builder: %esi
   carries the start_info pointer (saved for C code), then we set up
   an initial stack and jump into xen_start_kernel. */
ENTRY(startup_xen)
	movl %esi,xen_start_info
	cld
	movl $(init_thread_union+THREAD_SIZE),%esp
	jmp xen_start_kernel
.popsection

/* One page, page-aligned, that the hypervisor fills with hypercall
   stubs (advertised via the HYPERCALL_PAGE ELF note below). */
.pushsection .bss.page_aligned
	.align PAGE_SIZE_asm
ENTRY(hypercall_page)
	.skip 0x1000
.popsection

	/* ELF notes read by the Xen domain builder to identify the
	   guest and decide how to load it. */
	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,	.asciz "linux")
	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,	.asciz "2.6")
	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,	.asciz "xen-3.0")
	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,	.long __PAGE_OFFSET)
	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,		.long startup_xen)
	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,	.asciz "!writable_page_tables|pae_pgdir_above_4gb")
#ifdef CONFIG_X86_PAE
	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,	.asciz "yes")
#else
	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,	.asciz "no")
#endif
	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,	.asciz "generic")

#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
new file mode 100644
index 000000000000..b9aaea45f07f
--- /dev/null
+++ b/arch/x86/xen/xen-ops.h
@@ -0,0 +1,71 @@
#ifndef XEN_OPS_H
#define XEN_OPS_H

#include <linux/init.h>

/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];

void xen_copy_trap_info(struct trap_info *traps);

/* Per-cpu pointer to this vcpu's vcpu_info, and its current cr3. */
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
DECLARE_PER_CPU(unsigned long, xen_cr3);

extern struct start_info *xen_start_info;
extern struct shared_info *HYPERVISOR_shared_info;

/* Boot-time memory/IRQ setup. */
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);

/* Time, clocksource and clockevent interface. */
void xen_setup_timer(int cpu);
void xen_setup_cpu_clockevents(void);
unsigned long xen_cpu_khz(void);
void __init xen_time_init(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);

void xen_mark_init_mm_pinned(void);

/* Per-cpu lazy-mode state for batched mmu/cpu updates. */
DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);

/* Return this CPU's current paravirt lazy mode. */
static inline unsigned xen_get_lazy_mode(void)
{
	return x86_read_percpu(xen_lazy_mode);
}

void __init xen_fill_possible_map(void);

void __init xen_setup_vcpu_info_placement(void);
void xen_smp_prepare_boot_cpu(void);
void xen_smp_prepare_cpus(unsigned int max_cpus);
int xen_cpu_up(unsigned int cpu);
void xen_smp_cpus_done(unsigned int max_cpus);

void xen_smp_send_stop(void);
void xen_smp_send_reschedule(int cpu);
int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic,
			   int wait);
int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
				 int nonatomic, int wait);

int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
			       void *info, int wait);


/* Declare an asm function, along with symbols needed to make it
   inlineable.  The use site supplies the final semicolon.
   NOTE(review): the trailing backslash continues the macro onto the
   following (blank) line — harmless, but intentional-looking only by
   accident. */
#define DECL_ASM(ret, name, ...)		\
	ret name(__VA_ARGS__);			\
	extern char name##_end[];		\
	extern char name##_reloc[]		\

DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);

void xen_iret_direct(void);
#endif /* XEN_OPS_H */