Diffstat (limited to 'arch/ia64/kernel')
58 files changed, 35362 insertions, 0 deletions
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
new file mode 100644
index 000000000000..c1a02bbc252c
--- /dev/null
+++ b/arch/ia64/kernel/Makefile
@@ -0,0 +1,52 @@
#
# Makefile for the linux kernel.
#

extra-y := head.o init_task.o vmlinux.lds

obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \
	 irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \
	 salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
	 unwind.o mca.o mca_asm.o topology.o

obj-$(CONFIG_IA64_BRL_EMU)	+= brl_emu.o
obj-$(CONFIG_IA64_GENERIC)	+= acpi-ext.o
obj-$(CONFIG_IA64_HP_ZX1)	+= acpi-ext.o
obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o
obj-$(CONFIG_IA64_PALINFO)	+= palinfo.o
obj-$(CONFIG_IOSAPIC)		+= iosapic.o
obj-$(CONFIG_MODULES)		+= module.o
obj-$(CONFIG_SMP)		+= smp.o smpboot.o domain.o
obj-$(CONFIG_PERFMON)		+= perfmon_default_smpl.o
obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
mca_recovery-y			+= mca_drv.o mca_drv_asm.o

# The gate DSO image is built using a special linker script.
targets += gate.so gate-syms.o

extra-y += gate.so gate-syms.o gate.lds gate.o

# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31

CPPFLAGS_gate.lds := -P -C -U$(ARCH)

quiet_cmd_gate = GATE $@
      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@

GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1
$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
	$(call if_changed,gate)

$(obj)/built-in.o: $(obj)/gate-syms.o
$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o

GATECFLAGS_gate-syms.o = -r
$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
	$(call if_changed,gate)

# gate-data.o contains the gate DSO image as data in section .data.gate.
# We must build gate.so before we can assemble it.
# Note: kbuild does not track this dependency due to usage of .incbin
$(obj)/gate-data.o: $(obj)/gate.so
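
The .data.gate mechanism above embeds the built gate.so image into the kernel as plain data. As a hedged sketch (the linker-script symbol name below is an assumption, not something this Makefile defines), C code elsewhere can then locate the embedded image like this:

extern char __start_gate_section[];	/* assumed to bracket .data.gate in vmlinux.lds */

static inline void *gate_image(void)
{
	return (void *) __start_gate_section;	/* first byte of the embedded DSO */
}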
diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c
new file mode 100644
index 000000000000..2623df5e2633
--- /dev/null
+++ b/arch/ia64/kernel/acpi-ext.c
@@ -0,0 +1,100 @@
/*
 * arch/ia64/kernel/acpi-ext.c
 *
 * Copyright (C) 2003 Hewlett-Packard
 * Copyright (C) Alex Williamson
 * Copyright (C) Bjorn Helgaas
 *
 * Vendor specific extensions to ACPI.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/acpi.h>
#include <linux/efi.h>

#include <asm/acpi-ext.h>

struct acpi_vendor_descriptor {
	u8				guid_id;
	efi_guid_t			guid;
};

struct acpi_vendor_info {
	struct acpi_vendor_descriptor	*descriptor;
	u8				*data;
	u32				length;
};

acpi_status
acpi_vendor_resource_match(struct acpi_resource *resource, void *context)
{
	struct acpi_vendor_info *info = (struct acpi_vendor_info *) context;
	struct acpi_resource_vendor *vendor;
	struct acpi_vendor_descriptor *descriptor;
	u32 length;

	if (resource->id != ACPI_RSTYPE_VENDOR)
		return AE_OK;

	vendor = (struct acpi_resource_vendor *) &resource->data;
	descriptor = (struct acpi_vendor_descriptor *) vendor->reserved;
	if (vendor->length <= sizeof(*info->descriptor) ||
	    descriptor->guid_id != info->descriptor->guid_id ||
	    efi_guidcmp(descriptor->guid, info->descriptor->guid))
		return AE_OK;

	length = vendor->length - sizeof(struct acpi_vendor_descriptor);
	info->data = acpi_os_allocate(length);
	if (!info->data)
		return AE_NO_MEMORY;

	memcpy(info->data, vendor->reserved + sizeof(struct acpi_vendor_descriptor), length);
	info->length = length;
	return AE_CTRL_TERMINATE;
}

acpi_status
acpi_find_vendor_resource(acpi_handle obj, struct acpi_vendor_descriptor *id,
		u8 **data, u32 *length)
{
	struct acpi_vendor_info info;

	info.descriptor = id;
	info.data = NULL;

	acpi_walk_resources(obj, METHOD_NAME__CRS, acpi_vendor_resource_match, &info);
	if (!info.data)
		return AE_NOT_FOUND;

	*data = info.data;
	*length = info.length;
	return AE_OK;
}

struct acpi_vendor_descriptor hp_ccsr_descriptor = {
	.guid_id = 2,
	.guid = EFI_GUID(0x69e9adf9, 0x924f, 0xab5f, 0xf6, 0x4a, 0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad)
};

acpi_status
hp_acpi_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length)
{
	acpi_status status;
	u8 *data;
	u32 length;

	status = acpi_find_vendor_resource(obj, &hp_ccsr_descriptor, &data, &length);

	if (ACPI_FAILURE(status) || length != 16)
		return AE_NOT_FOUND;

	memcpy(csr_base, data, sizeof(*csr_base));
	memcpy(csr_length, data + 8, sizeof(*csr_length));
	acpi_os_free(data);

	return AE_OK;
}

EXPORT_SYMBOL(hp_acpi_csr_space);
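
A hedged usage sketch for the exported helper above; dev_handle stands in for an ACPI device handle obtained elsewhere (the names here are illustrative, not from this file). The vendor resource carries the 8-byte CSR base followed by the 8-byte length, which is why the function insists on length == 16:

	u64 csr_base, csr_length;

	if (hp_acpi_csr_space(dev_handle, &csr_base, &csr_length) == AE_OK)
		printk(KERN_INFO "CCSR space at 0x%lx, length 0x%lx\n",
		       csr_base, csr_length);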
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
new file mode 100644
index 000000000000..a8e99c56a768
--- /dev/null
+++ b/arch/ia64/kernel/acpi.c
@@ -0,0 +1,841 @@
/*
 * acpi.c - Architecture-Specific Low-Level ACPI Support
 *
 * Copyright (C) 1999 VA Linux Systems
 * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com>
 * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co.
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 * Copyright (C) 2000 Intel Corp.
 * Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com>
 * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
 * Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com>
 * Copyright (C) 2001 Takayoshi Kochi <t-kochi@bq.jp.nec.com>
 * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de>
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/irq.h>
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <asm/io.h>
#include <asm/iosapic.h>
#include <asm/machvec.h>
#include <asm/page.h>
#include <asm/system.h>
#include <asm/numa.h>
#include <asm/sal.h>
#include <asm/cyclone.h>

#define BAD_MADT_ENTRY(entry, end) (                                        \
		(!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
		((acpi_table_entry_header *)entry)->length != sizeof(*entry))

#define PREFIX			"ACPI: "

void (*pm_idle) (void);
EXPORT_SYMBOL(pm_idle);
void (*pm_power_off) (void);
EXPORT_SYMBOL(pm_power_off);

unsigned char acpi_kbd_controller_present = 1;
unsigned char acpi_legacy_devices;

#define MAX_SAPICS 256
u16 ia64_acpiid_to_sapicid[MAX_SAPICS] =
	{ [0 ... MAX_SAPICS - 1] = -1 };
EXPORT_SYMBOL(ia64_acpiid_to_sapicid);

const char *
acpi_get_sysname (void)
{
#ifdef CONFIG_IA64_GENERIC
	unsigned long rsdp_phys;
	struct acpi20_table_rsdp *rsdp;
	struct acpi_table_xsdt *xsdt;
	struct acpi_table_header *hdr;

	rsdp_phys = acpi_find_rsdp();
	if (!rsdp_phys) {
		printk(KERN_ERR "ACPI 2.0 RSDP not found, default to \"dig\"\n");
		return "dig";
	}

	rsdp = (struct acpi20_table_rsdp *) __va(rsdp_phys);
	if (strncmp(rsdp->signature, RSDP_SIG, sizeof(RSDP_SIG) - 1)) {
		printk(KERN_ERR "ACPI 2.0 RSDP signature incorrect, default to \"dig\"\n");
		return "dig";
	}

	xsdt = (struct acpi_table_xsdt *) __va(rsdp->xsdt_address);
	hdr = &xsdt->header;
	if (strncmp(hdr->signature, XSDT_SIG, sizeof(XSDT_SIG) - 1)) {
		printk(KERN_ERR "ACPI 2.0 XSDT signature incorrect, default to \"dig\"\n");
		return "dig";
	}

	if (!strcmp(hdr->oem_id, "HP")) {
		return "hpzx1";
	}
	else if (!strcmp(hdr->oem_id, "SGI")) {
		return "sn2";
	}

	return "dig";
#else
# if defined (CONFIG_IA64_HP_SIM)
	return "hpsim";
# elif defined (CONFIG_IA64_HP_ZX1)
	return "hpzx1";
# elif defined (CONFIG_IA64_HP_ZX1_SWIOTLB)
	return "hpzx1_swiotlb";
# elif defined (CONFIG_IA64_SGI_SN2)
	return "sn2";
# elif defined (CONFIG_IA64_DIG)
	return "dig";
# else
#	error Unknown platform.  Fix acpi.c.
# endif
#endif
}

#ifdef CONFIG_ACPI_BOOT

#define ACPI_MAX_PLATFORM_INTERRUPTS	256

/* Array to record platform interrupt vectors for generic interrupt routing. */
int platform_intr_list[ACPI_MAX_PLATFORM_INTERRUPTS] = {
	[0 ... ACPI_MAX_PLATFORM_INTERRUPTS - 1] = -1
};

enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_IOSAPIC;

/*
 * Interrupt routing API for device drivers.  Provides interrupt vector for
 * a generic platform event.  Currently only CPEI is implemented.
 */
int
acpi_request_vector (u32 int_type)
{
	int vector = -1;

	if (int_type < ACPI_MAX_PLATFORM_INTERRUPTS) {
		/* corrected platform error interrupt */
		vector = platform_intr_list[int_type];
	} else
		printk(KERN_ERR "acpi_request_vector(): invalid interrupt type\n");
	return vector;
}

char *
__acpi_map_table (unsigned long phys_addr, unsigned long size)
{
	return __va(phys_addr);
}

/* --------------------------------------------------------------------------
                            Boot-time Table Parsing
   -------------------------------------------------------------------------- */

static int total_cpus __initdata;
static int available_cpus __initdata;
struct acpi_table_madt *acpi_madt __initdata;
static u8 has_8259;


static int __init
acpi_parse_lapic_addr_ovr (
	acpi_table_entry_header *header, const unsigned long end)
{
	struct acpi_table_lapic_addr_ovr *lapic;

	lapic = (struct acpi_table_lapic_addr_ovr *) header;

	if (BAD_MADT_ENTRY(lapic, end))
		return -EINVAL;

	if (lapic->address) {
		iounmap(ipi_base_addr);
		ipi_base_addr = ioremap(lapic->address, 0);
	}
	return 0;
}


static int __init
acpi_parse_lsapic (acpi_table_entry_header *header, const unsigned long end)
{
	struct acpi_table_lsapic *lsapic;

	lsapic = (struct acpi_table_lsapic *) header;

	if (BAD_MADT_ENTRY(lsapic, end))
		return -EINVAL;

	if (lsapic->flags.enabled) {
#ifdef CONFIG_SMP
		smp_boot_data.cpu_phys_id[available_cpus] = (lsapic->id << 8) | lsapic->eid;
#endif
		ia64_acpiid_to_sapicid[lsapic->acpi_id] = (lsapic->id << 8) | lsapic->eid;
		++available_cpus;
	}

	total_cpus++;
	return 0;
}
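
Aside on the (id << 8) | eid packing used twice above: the physical SAPIC ID is simply the 8-bit EID in the low byte with the 8-bit ID above it. A standalone sketch with hypothetical values:

#include <stdio.h>

int main(void)
{
	unsigned char id = 0x12, eid = 0x34;	/* hypothetical MADT fields */
	unsigned int sapicid = (id << 8) | eid;

	printf("sapicid = 0x%04x\n", sapicid);	/* prints sapicid = 0x1234 */
	return 0;
}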


static int __init
acpi_parse_lapic_nmi (acpi_table_entry_header *header, const unsigned long end)
{
	struct acpi_table_lapic_nmi *lacpi_nmi;

	lacpi_nmi = (struct acpi_table_lapic_nmi*) header;

	if (BAD_MADT_ENTRY(lacpi_nmi, end))
		return -EINVAL;

	/* TBD: Support lapic_nmi entries */
	return 0;
}


static int __init
acpi_parse_iosapic (acpi_table_entry_header *header, const unsigned long end)
{
	struct acpi_table_iosapic *iosapic;

	iosapic = (struct acpi_table_iosapic *) header;

	if (BAD_MADT_ENTRY(iosapic, end))
		return -EINVAL;

	iosapic_init(iosapic->address, iosapic->global_irq_base);

	return 0;
}


static int __init
acpi_parse_plat_int_src (
	acpi_table_entry_header *header, const unsigned long end)
{
	struct acpi_table_plat_int_src *plintsrc;
	int vector;

	plintsrc = (struct acpi_table_plat_int_src *) header;

	if (BAD_MADT_ENTRY(plintsrc, end))
		return -EINVAL;

	/*
	 * Get vector assignment for this interrupt, set attributes,
	 * and program the IOSAPIC routing table.
	 */
	vector = iosapic_register_platform_intr(plintsrc->type,
						plintsrc->global_irq,
						plintsrc->iosapic_vector,
						plintsrc->eid,
						plintsrc->id,
						(plintsrc->flags.polarity == 1) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
						(plintsrc->flags.trigger == 1) ? IOSAPIC_EDGE : IOSAPIC_LEVEL);

	platform_intr_list[plintsrc->type] = vector;
	return 0;
}


static int __init
acpi_parse_int_src_ovr (
	acpi_table_entry_header *header, const unsigned long end)
{
	struct acpi_table_int_src_ovr *p;

	p = (struct acpi_table_int_src_ovr *) header;

	if (BAD_MADT_ENTRY(p, end))
		return -EINVAL;

	iosapic_override_isa_irq(p->bus_irq, p->global_irq,
				 (p->flags.polarity == 1) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
				 (p->flags.trigger == 1) ? IOSAPIC_EDGE : IOSAPIC_LEVEL);
	return 0;
}


static int __init
acpi_parse_nmi_src (acpi_table_entry_header *header, const unsigned long end)
{
	struct acpi_table_nmi_src *nmi_src;

	nmi_src = (struct acpi_table_nmi_src*) header;

	if (BAD_MADT_ENTRY(nmi_src, end))
		return -EINVAL;

	/* TBD: Support nmi_src entries */
	return 0;
}

static void __init
acpi_madt_oem_check (char *oem_id, char *oem_table_id)
{
	if (!strncmp(oem_id, "IBM", 3) &&
	    (!strncmp(oem_table_id, "SERMOW", 6))) {

		/*
		 * Unfortunately ITC_DRIFT is not yet part of the
		 * official SAL spec, so the ITC_DRIFT bit is not
		 * set by the BIOS on this hardware.
		 */
		sal_platform_features |= IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT;

		cyclone_setup();
	}
}

static int __init
acpi_parse_madt (unsigned long phys_addr, unsigned long size)
{
	if (!phys_addr || !size)
		return -EINVAL;

	acpi_madt = (struct acpi_table_madt *) __va(phys_addr);

	/* remember the value for reference after free_initmem() */
#ifdef CONFIG_ITANIUM
	has_8259 = 1; /* Firmware on old Itanium systems is broken */
#else
	has_8259 = acpi_madt->flags.pcat_compat;
#endif
	iosapic_system_init(has_8259);

	/* Get base address of IPI Message Block */

	if (acpi_madt->lapic_address)
		ipi_base_addr = ioremap(acpi_madt->lapic_address, 0);

	printk(KERN_INFO PREFIX "Local APIC address %p\n", ipi_base_addr);

	acpi_madt_oem_check(acpi_madt->header.oem_id,
		acpi_madt->header.oem_table_id);

	return 0;
}


#ifdef CONFIG_ACPI_NUMA

#undef SLIT_DEBUG

#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)

static int __initdata srat_num_cpus;			/* number of cpus */
static u32 __devinitdata pxm_flag[PXM_FLAG_LEN];
#define pxm_bit_set(bit)	(set_bit(bit,(void *)pxm_flag))
#define pxm_bit_test(bit)	(test_bit(bit,(void *)pxm_flag))
/* maps to convert between proximity domain and logical node ID */
int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
int __initdata nid_to_pxm_map[MAX_NUMNODES];
static struct acpi_table_slit __initdata *slit_table;

/*
 * ACPI 2.0 SLIT (System Locality Information Table)
 * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
 */
void __init
acpi_numa_slit_init (struct acpi_table_slit *slit)
{
	u32 len;

	len = sizeof(struct acpi_table_header) + 8
		+ slit->localities * slit->localities;
	if (slit->header.length != len) {
		printk(KERN_ERR "ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n",
		       len, slit->header.length);
		memset(numa_slit, 10, sizeof(numa_slit));
		return;
	}
	slit_table = slit;
}

void __init
acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *pa)
{
	/* record this node in proximity bitmap */
	pxm_bit_set(pa->proximity_domain);

	node_cpuid[srat_num_cpus].phys_id = (pa->apic_id << 8) | (pa->lsapic_eid);
	/* nid should be overridden as logical node id later */
	node_cpuid[srat_num_cpus].nid = pa->proximity_domain;
	srat_num_cpus++;
}

void __init
acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma)
{
	unsigned long paddr, size;
	u8 pxm;
	struct node_memblk_s *p, *q, *pend;

	pxm = ma->proximity_domain;

	/* fill node memory chunk structure */
	paddr = ma->base_addr_hi;
	paddr = (paddr << 32) | ma->base_addr_lo;
	size = ma->length_hi;
	size = (size << 32) | ma->length_lo;

	/* Ignore disabled entries */
	if (!ma->flags.enabled)
		return;

	/* record this node in proximity bitmap */
	pxm_bit_set(pxm);

	/* Insertion sort based on base address */
	pend = &node_memblk[num_node_memblks];
	for (p = &node_memblk[0]; p < pend; p++) {
		if (paddr < p->start_paddr)
			break;
	}
	if (p < pend) {
		for (q = pend - 1; q >= p; q--)
			*(q + 1) = *q;
	}
	p->start_paddr = paddr;
	p->size = size;
	p->nid = pxm;
	num_node_memblks++;
}
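
The insertion sort above keeps node_memblk[] ordered by start_paddr as SRAT entries arrive: find the first entry with a larger base, shift the tail up one slot, and drop the new entry in. A minimal standalone sketch of the same step, with the kernel types pared down:

#include <stdio.h>

struct memblk { unsigned long start, size; };

static struct memblk blk[8];
static int nblks;

static void insert_sorted(unsigned long start, unsigned long size)
{
	struct memblk *p, *q, *pend = &blk[nblks];

	/* find the first entry with a larger base address */
	for (p = &blk[0]; p < pend; p++)
		if (start < p->start)
			break;
	/* shift the tail up one slot (no-op when appending) */
	for (q = pend - 1; q >= p; q--)
		*(q + 1) = *q;
	p->start = start;
	p->size = size;
	nblks++;
}

int main(void)
{
	int i;

	insert_sorted(0x4000, 0x1000);
	insert_sorted(0x1000, 0x1000);
	insert_sorted(0x2000, 0x1000);
	for (i = 0; i < nblks; i++)
		printf("0x%lx\n", blk[i].start);	/* 0x1000, 0x2000, 0x4000 */
	return 0;
}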

void __init
acpi_numa_arch_fixup (void)
{
	int i, j, node_from, node_to;

	/* If there's no SRAT, fix the phys_id and mark node 0 online */
	if (srat_num_cpus == 0) {
		node_set_online(0);
		node_cpuid[0].phys_id = hard_smp_processor_id();
		return;
	}

	/*
	 * MCD - This can probably be dropped now.  No need for pxm ID to node ID
	 * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES.
	 */
	/* calculate total number of nodes in system from PXM bitmap */
	memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
	memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
	nodes_clear(node_online_map);
	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
		if (pxm_bit_test(i)) {
			int nid = num_online_nodes();
			pxm_to_nid_map[i] = nid;
			nid_to_pxm_map[nid] = i;
			node_set_online(nid);
		}
	}

	/* set logical node id in memory chunk structure */
	for (i = 0; i < num_node_memblks; i++)
		node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];

	/* assign memory bank numbers for each chunk on each node */
	for_each_online_node(i) {
		int bank;

		bank = 0;
		for (j = 0; j < num_node_memblks; j++)
			if (node_memblk[j].nid == i)
				node_memblk[j].bank = bank++;
	}

	/* set logical node id in cpu structure */
	for (i = 0; i < srat_num_cpus; i++)
		node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid];

	printk(KERN_INFO "Number of logical nodes in system = %d\n", num_online_nodes());
	printk(KERN_INFO "Number of memory chunks in system = %d\n", num_node_memblks);

	if (!slit_table) return;
	memset(numa_slit, -1, sizeof(numa_slit));
	for (i=0; i<slit_table->localities; i++) {
		if (!pxm_bit_test(i))
			continue;
		node_from = pxm_to_nid_map[i];
		for (j=0; j<slit_table->localities; j++) {
			if (!pxm_bit_test(j))
				continue;
			node_to = pxm_to_nid_map[j];
			node_distance(node_from, node_to) =
				slit_table->entry[i*slit_table->localities + j];
		}
	}

#ifdef SLIT_DEBUG
	printk("ACPI 2.0 SLIT locality table:\n");
	for_each_online_node(i) {
		for_each_online_node(j)
			printk("%03d ", node_distance(i,j));
		printk("\n");
	}
#endif
}
#endif /* CONFIG_ACPI_NUMA */

unsigned int
acpi_register_gsi (u32 gsi, int edge_level, int active_high_low)
{
	if (has_8259 && gsi < 16)
		return isa_irq_to_vector(gsi);

	return iosapic_register_intr(gsi,
			(active_high_low == ACPI_ACTIVE_HIGH) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
			(edge_level == ACPI_EDGE_SENSITIVE) ? IOSAPIC_EDGE : IOSAPIC_LEVEL);
}
EXPORT_SYMBOL(acpi_register_gsi);

#ifdef CONFIG_ACPI_DEALLOCATE_IRQ
void
acpi_unregister_gsi (u32 gsi)
{
	iosapic_unregister_intr(gsi);
}
EXPORT_SYMBOL(acpi_unregister_gsi);
#endif /* CONFIG_ACPI_DEALLOCATE_IRQ */

static int __init
acpi_parse_fadt (unsigned long phys_addr, unsigned long size)
{
	struct acpi_table_header *fadt_header;
	struct fadt_descriptor_rev2 *fadt;

	if (!phys_addr || !size)
		return -EINVAL;

	fadt_header = (struct acpi_table_header *) __va(phys_addr);
	if (fadt_header->revision != 3)
		return -ENODEV;		/* Only deal with ACPI 2.0 FADT */

	fadt = (struct fadt_descriptor_rev2 *) fadt_header;

	if (!(fadt->iapc_boot_arch & BAF_8042_KEYBOARD_CONTROLLER))
		acpi_kbd_controller_present = 0;

	if (fadt->iapc_boot_arch & BAF_LEGACY_DEVICES)
		acpi_legacy_devices = 1;

	acpi_register_gsi(fadt->sci_int, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);
	return 0;
}


unsigned long __init
acpi_find_rsdp (void)
{
	unsigned long rsdp_phys = 0;

	if (efi.acpi20)
		rsdp_phys = __pa(efi.acpi20);
	else if (efi.acpi)
		printk(KERN_WARNING PREFIX "v1.0/r0.71 tables no longer supported\n");
	return rsdp_phys;
}

int __init
acpi_boot_init (void)
{

	/*
	 * MADT
	 * ----
	 * Parse the Multiple APIC Description Table (MADT), if it exists.
	 * Note that this table provides platform SMP configuration
	 * information -- the successor to MPS tables.
	 */

	if (acpi_table_parse(ACPI_APIC, acpi_parse_madt) < 1) {
		printk(KERN_ERR PREFIX "Can't find MADT\n");
		goto skip_madt;
	}

	/* Local APIC */

	if (acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0) < 0)
		printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n");

	if (acpi_table_parse_madt(ACPI_MADT_LSAPIC, acpi_parse_lsapic, NR_CPUS) < 1)
		printk(KERN_ERR PREFIX "Error parsing MADT - no LAPIC entries\n");

	if (acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0) < 0)
		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");

	/* I/O APIC */

	if (acpi_table_parse_madt(ACPI_MADT_IOSAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1)
		printk(KERN_ERR PREFIX "Error parsing MADT - no IOSAPIC entries\n");

	/* System-Level Interrupt Routing */

	if (acpi_table_parse_madt(ACPI_MADT_PLAT_INT_SRC, acpi_parse_plat_int_src, ACPI_MAX_PLATFORM_INTERRUPTS) < 0)
		printk(KERN_ERR PREFIX "Error parsing platform interrupt source entry\n");

	if (acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, 0) < 0)
		printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n");

	if (acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, 0) < 0)
		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
  skip_madt:

	/*
	 * FADT says whether a legacy keyboard controller is present.
	 * The FADT also contains an SCI_INT line, by which the system
	 * gets interrupts such as power and sleep buttons.  If it's not
	 * on a Legacy interrupt, it needs to be set up.
	 */
	if (acpi_table_parse(ACPI_FADT, acpi_parse_fadt) < 1)
		printk(KERN_ERR PREFIX "Can't find FADT\n");

#ifdef CONFIG_SMP
	if (available_cpus == 0) {
		printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n");
		printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id());
		smp_boot_data.cpu_phys_id[available_cpus] = hard_smp_processor_id();
		available_cpus = 1; /* We've got at least one of these, no? */
	}
	smp_boot_data.cpu_count = available_cpus;

	smp_build_cpu_map();
# ifdef CONFIG_ACPI_NUMA
	if (srat_num_cpus == 0) {
		int cpu, i = 1;
		for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++)
			if (smp_boot_data.cpu_phys_id[cpu] != hard_smp_processor_id())
				node_cpuid[i++].phys_id = smp_boot_data.cpu_phys_id[cpu];
	}
	build_cpu_to_node_map();
# endif
#endif
	/* Make boot-up look pretty */
	printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus, total_cpus);
	return 0;
}

int
acpi_gsi_to_irq (u32 gsi, unsigned int *irq)
{
	int vector;

	if (has_8259 && gsi < 16)
		*irq = isa_irq_to_vector(gsi);
	else {
		vector = gsi_to_vector(gsi);
		if (vector == -1)
			return -1;

		*irq = vector;
	}
	return 0;
}
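
A hedged usage sketch for acpi_gsi_to_irq() as a caller might use it; the GSI value here is hypothetical:

	unsigned int irq;

	if (acpi_gsi_to_irq(16, &irq) == 0)
		printk(KERN_INFO "GSI 16 routed to vector %u\n", irq);
	else
		printk(KERN_ERR "no vector for GSI 16\n");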

/*
 * ACPI based hotplug CPU support
 */
#ifdef CONFIG_ACPI_HOTPLUG_CPU
static
int
acpi_map_cpu2node(acpi_handle handle, int cpu, long physid)
{
#ifdef CONFIG_ACPI_NUMA
	int pxm_id;

	pxm_id = acpi_get_pxm(handle);

	/*
	 * Assuming that the container driver would have set the proximity
	 * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag
	 */
	node_cpuid[cpu].nid = (pxm_id < 0) ? 0:
			pxm_to_nid_map[pxm_id];

	node_cpuid[cpu].phys_id = physid;
#endif
	return(0);
}


int
acpi_map_lsapic(acpi_handle handle, int *pcpu)
{
	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
	union acpi_object *obj;
	struct acpi_table_lsapic *lsapic;
	cpumask_t tmp_map;
	long physid;
	int cpu;

	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
		return -EINVAL;

	if (!buffer.length || !buffer.pointer)
		return -EINVAL;

	obj = buffer.pointer;
	if (obj->type != ACPI_TYPE_BUFFER ||
	    obj->buffer.length < sizeof(*lsapic)) {
		acpi_os_free(buffer.pointer);
		return -EINVAL;
	}

	lsapic = (struct acpi_table_lsapic *)obj->buffer.pointer;

	if ((lsapic->header.type != ACPI_MADT_LSAPIC) ||
	    (!lsapic->flags.enabled)) {
		acpi_os_free(buffer.pointer);
		return -EINVAL;
	}

	physid = ((lsapic->id <<8) | (lsapic->eid));

	acpi_os_free(buffer.pointer);
	buffer.length = ACPI_ALLOCATE_BUFFER;
	buffer.pointer = NULL;

	cpus_complement(tmp_map, cpu_present_map);
	cpu = first_cpu(tmp_map);
	if(cpu >= NR_CPUS)
		return -EINVAL;

	acpi_map_cpu2node(handle, cpu, physid);

	cpu_set(cpu, cpu_present_map);
	ia64_cpu_to_sapicid[cpu] = physid;
	ia64_acpiid_to_sapicid[lsapic->acpi_id] = ia64_cpu_to_sapicid[cpu];

	*pcpu = cpu;
	return(0);
}
EXPORT_SYMBOL(acpi_map_lsapic);


int
acpi_unmap_lsapic(int cpu)
{
	int i;

	for (i=0; i<MAX_SAPICS; i++) {
		if (ia64_acpiid_to_sapicid[i] == ia64_cpu_to_sapicid[cpu]) {
			ia64_acpiid_to_sapicid[i] = -1;
			break;
		}
	}
	ia64_cpu_to_sapicid[cpu] = -1;
	cpu_clear(cpu,cpu_present_map);

#ifdef CONFIG_ACPI_NUMA
	/* NUMA specific cleanups */
#endif

	return(0);
}
EXPORT_SYMBOL(acpi_unmap_lsapic);
#endif /* CONFIG_ACPI_HOTPLUG_CPU */


#ifdef CONFIG_ACPI_NUMA
acpi_status __init
acpi_map_iosapic (acpi_handle handle, u32 depth, void *context, void **ret)
{
	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
	union acpi_object *obj;
	struct acpi_table_iosapic *iosapic;
	unsigned int gsi_base;
	int node;

	/* Only care about objects w/ a method that returns the MADT */
	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
		return AE_OK;

	if (!buffer.length || !buffer.pointer)
		return AE_OK;

	obj = buffer.pointer;
	if (obj->type != ACPI_TYPE_BUFFER ||
	    obj->buffer.length < sizeof(*iosapic)) {
		acpi_os_free(buffer.pointer);
		return AE_OK;
	}

	iosapic = (struct acpi_table_iosapic *)obj->buffer.pointer;

	if (iosapic->header.type != ACPI_MADT_IOSAPIC) {
		acpi_os_free(buffer.pointer);
		return AE_OK;
	}

	gsi_base = iosapic->global_irq_base;

	acpi_os_free(buffer.pointer);
	buffer.length = ACPI_ALLOCATE_BUFFER;
	buffer.pointer = NULL;

	/*
	 * OK, it's an IOSAPIC MADT entry, look for a _PXM method to tell
	 * us which node to associate this with.
	 */
	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PXM", NULL, &buffer)))
		return AE_OK;

	if (!buffer.length || !buffer.pointer)
		return AE_OK;

	obj = buffer.pointer;

	if (obj->type != ACPI_TYPE_INTEGER ||
	    obj->integer.value >= MAX_PXM_DOMAINS) {
		acpi_os_free(buffer.pointer);
		return AE_OK;
	}

	node = pxm_to_nid_map[obj->integer.value];
	acpi_os_free(buffer.pointer);

	if (node >= MAX_NUMNODES || !node_online(node) ||
	    cpus_empty(node_to_cpumask(node)))
		return AE_OK;

	/* We know a gsi to node mapping! */
	map_iosapic_to_node(gsi_base, node);
	return AE_OK;
}
#endif /* CONFIG_ACPI_NUMA */
#endif /* CONFIG_ACPI_BOOT */
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
new file mode 100644
index 000000000000..7d1ae2982c53
--- /dev/null
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -0,0 +1,239 @@
/*
 * Generate definitions needed by assembly language modules.
 * This code generates raw asm output which is post-processed
 * to extract and format the required data.
 */

#include <linux/config.h>

#include <linux/sched.h>

#include <asm-ia64/processor.h>
#include <asm-ia64/ptrace.h>
#include <asm-ia64/siginfo.h>
#include <asm-ia64/sigcontext.h>
#include <asm-ia64/mca.h>

#include "../kernel/sigframe.h"

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

#define BLANK() asm volatile("\n->" : : )

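For context, a minimal standalone sketch of the trick DEFINE() implements: compiling a file like this with gcc -S leaves "->SYM value" marker lines in the assembly output, which a kbuild sed script then turns into #define lines in a generated header. The struct and symbol names below are illustrative only, not taken from this file:

#include <stddef.h>

struct example_regs { unsigned long r8, r9; };

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

void example(void)
{
	/* emits the marker line "->EXAMPLE_R9_OFFSET 8 ..." into the .s file */
	DEFINE(EXAMPLE_R9_OFFSET, offsetof(struct example_regs, r9));
}
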
void foo(void)
{
	DEFINE(IA64_TASK_SIZE, sizeof (struct task_struct));
	DEFINE(IA64_THREAD_INFO_SIZE, sizeof (struct thread_info));
	DEFINE(IA64_PT_REGS_SIZE, sizeof (struct pt_regs));
	DEFINE(IA64_SWITCH_STACK_SIZE, sizeof (struct switch_stack));
	DEFINE(IA64_SIGINFO_SIZE, sizeof (struct siginfo));
	DEFINE(IA64_CPU_SIZE, sizeof (struct cpuinfo_ia64));
	DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe));
	DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info));

	BLANK();

	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
	DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));

	BLANK();

	DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked));
	DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid));
	DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader));
	DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending));
	DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid));
	DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent));
	DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand));
	DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal));
	DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid));
	DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp));
	DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack));

	BLANK();

	DEFINE(IA64_SIGHAND_SIGLOCK_OFFSET,offsetof (struct sighand_struct, siglock));

	BLANK();

	DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct,
							     group_stop_count));
	DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending));

	BLANK();

	DEFINE(IA64_PT_REGS_B6_OFFSET, offsetof (struct pt_regs, b6));
	DEFINE(IA64_PT_REGS_B7_OFFSET, offsetof (struct pt_regs, b7));
	DEFINE(IA64_PT_REGS_AR_CSD_OFFSET, offsetof (struct pt_regs, ar_csd));
	DEFINE(IA64_PT_REGS_AR_SSD_OFFSET, offsetof (struct pt_regs, ar_ssd));
	DEFINE(IA64_PT_REGS_R8_OFFSET, offsetof (struct pt_regs, r8));
	DEFINE(IA64_PT_REGS_R9_OFFSET, offsetof (struct pt_regs, r9));
	DEFINE(IA64_PT_REGS_R10_OFFSET, offsetof (struct pt_regs, r10));
	DEFINE(IA64_PT_REGS_R11_OFFSET, offsetof (struct pt_regs, r11));
	DEFINE(IA64_PT_REGS_CR_IPSR_OFFSET, offsetof (struct pt_regs, cr_ipsr));
	DEFINE(IA64_PT_REGS_CR_IIP_OFFSET, offsetof (struct pt_regs, cr_iip));
	DEFINE(IA64_PT_REGS_CR_IFS_OFFSET, offsetof (struct pt_regs, cr_ifs));
	DEFINE(IA64_PT_REGS_AR_UNAT_OFFSET, offsetof (struct pt_regs, ar_unat));
	DEFINE(IA64_PT_REGS_AR_PFS_OFFSET, offsetof (struct pt_regs, ar_pfs));
	DEFINE(IA64_PT_REGS_AR_RSC_OFFSET, offsetof (struct pt_regs, ar_rsc));
	DEFINE(IA64_PT_REGS_AR_RNAT_OFFSET, offsetof (struct pt_regs, ar_rnat));

	DEFINE(IA64_PT_REGS_AR_BSPSTORE_OFFSET, offsetof (struct pt_regs, ar_bspstore));
	DEFINE(IA64_PT_REGS_PR_OFFSET, offsetof (struct pt_regs, pr));
	DEFINE(IA64_PT_REGS_B0_OFFSET, offsetof (struct pt_regs, b0));
	DEFINE(IA64_PT_REGS_LOADRS_OFFSET, offsetof (struct pt_regs, loadrs));
	DEFINE(IA64_PT_REGS_R1_OFFSET, offsetof (struct pt_regs, r1));
	DEFINE(IA64_PT_REGS_R12_OFFSET, offsetof (struct pt_regs, r12));
	DEFINE(IA64_PT_REGS_R13_OFFSET, offsetof (struct pt_regs, r13));
	DEFINE(IA64_PT_REGS_AR_FPSR_OFFSET, offsetof (struct pt_regs, ar_fpsr));
	DEFINE(IA64_PT_REGS_R15_OFFSET, offsetof (struct pt_regs, r15));
	DEFINE(IA64_PT_REGS_R14_OFFSET, offsetof (struct pt_regs, r14));
	DEFINE(IA64_PT_REGS_R2_OFFSET, offsetof (struct pt_regs, r2));
	DEFINE(IA64_PT_REGS_R3_OFFSET, offsetof (struct pt_regs, r3));
	DEFINE(IA64_PT_REGS_R16_OFFSET, offsetof (struct pt_regs, r16));
	DEFINE(IA64_PT_REGS_R17_OFFSET, offsetof (struct pt_regs, r17));
	DEFINE(IA64_PT_REGS_R18_OFFSET, offsetof (struct pt_regs, r18));
	DEFINE(IA64_PT_REGS_R19_OFFSET, offsetof (struct pt_regs, r19));
	DEFINE(IA64_PT_REGS_R20_OFFSET, offsetof (struct pt_regs, r20));
	DEFINE(IA64_PT_REGS_R21_OFFSET, offsetof (struct pt_regs, r21));
	DEFINE(IA64_PT_REGS_R22_OFFSET, offsetof (struct pt_regs, r22));
	DEFINE(IA64_PT_REGS_R23_OFFSET, offsetof (struct pt_regs, r23));
	DEFINE(IA64_PT_REGS_R24_OFFSET, offsetof (struct pt_regs, r24));
	DEFINE(IA64_PT_REGS_R25_OFFSET, offsetof (struct pt_regs, r25));
	DEFINE(IA64_PT_REGS_R26_OFFSET, offsetof (struct pt_regs, r26));
	DEFINE(IA64_PT_REGS_R27_OFFSET, offsetof (struct pt_regs, r27));
	DEFINE(IA64_PT_REGS_R28_OFFSET, offsetof (struct pt_regs, r28));
	DEFINE(IA64_PT_REGS_R29_OFFSET, offsetof (struct pt_regs, r29));
	DEFINE(IA64_PT_REGS_R30_OFFSET, offsetof (struct pt_regs, r30));
	DEFINE(IA64_PT_REGS_R31_OFFSET, offsetof (struct pt_regs, r31));
	DEFINE(IA64_PT_REGS_AR_CCV_OFFSET, offsetof (struct pt_regs, ar_ccv));
	DEFINE(IA64_PT_REGS_F6_OFFSET, offsetof (struct pt_regs, f6));
	DEFINE(IA64_PT_REGS_F7_OFFSET, offsetof (struct pt_regs, f7));
	DEFINE(IA64_PT_REGS_F8_OFFSET, offsetof (struct pt_regs, f8));
	DEFINE(IA64_PT_REGS_F9_OFFSET, offsetof (struct pt_regs, f9));
	DEFINE(IA64_PT_REGS_F10_OFFSET, offsetof (struct pt_regs, f10));
	DEFINE(IA64_PT_REGS_F11_OFFSET, offsetof (struct pt_regs, f11));

	BLANK();

	DEFINE(IA64_SWITCH_STACK_CALLER_UNAT_OFFSET, offsetof (struct switch_stack, caller_unat));
	DEFINE(IA64_SWITCH_STACK_AR_FPSR_OFFSET, offsetof (struct switch_stack, ar_fpsr));
	DEFINE(IA64_SWITCH_STACK_F2_OFFSET, offsetof (struct switch_stack, f2));
	DEFINE(IA64_SWITCH_STACK_F3_OFFSET, offsetof (struct switch_stack, f3));
	DEFINE(IA64_SWITCH_STACK_F4_OFFSET, offsetof (struct switch_stack, f4));
	DEFINE(IA64_SWITCH_STACK_F5_OFFSET, offsetof (struct switch_stack, f5));
	DEFINE(IA64_SWITCH_STACK_F12_OFFSET, offsetof (struct switch_stack, f12));
	DEFINE(IA64_SWITCH_STACK_F13_OFFSET, offsetof (struct switch_stack, f13));
	DEFINE(IA64_SWITCH_STACK_F14_OFFSET, offsetof (struct switch_stack, f14));
	DEFINE(IA64_SWITCH_STACK_F15_OFFSET, offsetof (struct switch_stack, f15));
	DEFINE(IA64_SWITCH_STACK_F16_OFFSET, offsetof (struct switch_stack, f16));
	DEFINE(IA64_SWITCH_STACK_F17_OFFSET, offsetof (struct switch_stack, f17));
	DEFINE(IA64_SWITCH_STACK_F18_OFFSET, offsetof (struct switch_stack, f18));
	DEFINE(IA64_SWITCH_STACK_F19_OFFSET, offsetof (struct switch_stack, f19));
	DEFINE(IA64_SWITCH_STACK_F20_OFFSET, offsetof (struct switch_stack, f20));
	DEFINE(IA64_SWITCH_STACK_F21_OFFSET, offsetof (struct switch_stack, f21));
	DEFINE(IA64_SWITCH_STACK_F22_OFFSET, offsetof (struct switch_stack, f22));
	DEFINE(IA64_SWITCH_STACK_F23_OFFSET, offsetof (struct switch_stack, f23));
	DEFINE(IA64_SWITCH_STACK_F24_OFFSET, offsetof (struct switch_stack, f24));
	DEFINE(IA64_SWITCH_STACK_F25_OFFSET, offsetof (struct switch_stack, f25));
	DEFINE(IA64_SWITCH_STACK_F26_OFFSET, offsetof (struct switch_stack, f26));
	DEFINE(IA64_SWITCH_STACK_F27_OFFSET, offsetof (struct switch_stack, f27));
	DEFINE(IA64_SWITCH_STACK_F28_OFFSET, offsetof (struct switch_stack, f28));
	DEFINE(IA64_SWITCH_STACK_F29_OFFSET, offsetof (struct switch_stack, f29));
	DEFINE(IA64_SWITCH_STACK_F30_OFFSET, offsetof (struct switch_stack, f30));
	DEFINE(IA64_SWITCH_STACK_F31_OFFSET, offsetof (struct switch_stack, f31));
	DEFINE(IA64_SWITCH_STACK_R4_OFFSET, offsetof (struct switch_stack, r4));
	DEFINE(IA64_SWITCH_STACK_R5_OFFSET, offsetof (struct switch_stack, r5));
	DEFINE(IA64_SWITCH_STACK_R6_OFFSET, offsetof (struct switch_stack, r6));
	DEFINE(IA64_SWITCH_STACK_R7_OFFSET, offsetof (struct switch_stack, r7));
	DEFINE(IA64_SWITCH_STACK_B0_OFFSET, offsetof (struct switch_stack, b0));
	DEFINE(IA64_SWITCH_STACK_B1_OFFSET, offsetof (struct switch_stack, b1));
	DEFINE(IA64_SWITCH_STACK_B2_OFFSET, offsetof (struct switch_stack, b2));
	DEFINE(IA64_SWITCH_STACK_B3_OFFSET, offsetof (struct switch_stack, b3));
	DEFINE(IA64_SWITCH_STACK_B4_OFFSET, offsetof (struct switch_stack, b4));
	DEFINE(IA64_SWITCH_STACK_B5_OFFSET, offsetof (struct switch_stack, b5));
	DEFINE(IA64_SWITCH_STACK_AR_PFS_OFFSET, offsetof (struct switch_stack, ar_pfs));
	DEFINE(IA64_SWITCH_STACK_AR_LC_OFFSET, offsetof (struct switch_stack, ar_lc));
	DEFINE(IA64_SWITCH_STACK_AR_UNAT_OFFSET, offsetof (struct switch_stack, ar_unat));
	DEFINE(IA64_SWITCH_STACK_AR_RNAT_OFFSET, offsetof (struct switch_stack, ar_rnat));
	DEFINE(IA64_SWITCH_STACK_AR_BSPSTORE_OFFSET, offsetof (struct switch_stack, ar_bspstore));
	DEFINE(IA64_SWITCH_STACK_PR_OFFSET, offsetof (struct switch_stack, pr));

	BLANK();

	DEFINE(IA64_SIGCONTEXT_IP_OFFSET, offsetof (struct sigcontext, sc_ip));
	DEFINE(IA64_SIGCONTEXT_AR_BSP_OFFSET, offsetof (struct sigcontext, sc_ar_bsp));
	DEFINE(IA64_SIGCONTEXT_AR_FPSR_OFFSET, offsetof (struct sigcontext, sc_ar_fpsr));
	DEFINE(IA64_SIGCONTEXT_AR_RNAT_OFFSET, offsetof (struct sigcontext, sc_ar_rnat));
	DEFINE(IA64_SIGCONTEXT_AR_UNAT_OFFSET, offsetof (struct sigcontext, sc_ar_unat));
	DEFINE(IA64_SIGCONTEXT_B0_OFFSET, offsetof (struct sigcontext, sc_br[0]));
	DEFINE(IA64_SIGCONTEXT_CFM_OFFSET, offsetof (struct sigcontext, sc_cfm));
	DEFINE(IA64_SIGCONTEXT_FLAGS_OFFSET, offsetof (struct sigcontext, sc_flags));
	DEFINE(IA64_SIGCONTEXT_FR6_OFFSET, offsetof (struct sigcontext, sc_fr[6]));
	DEFINE(IA64_SIGCONTEXT_PR_OFFSET, offsetof (struct sigcontext, sc_pr));
	DEFINE(IA64_SIGCONTEXT_R12_OFFSET, offsetof (struct sigcontext, sc_gr[12]));
	DEFINE(IA64_SIGCONTEXT_RBS_BASE_OFFSET,offsetof (struct sigcontext, sc_rbs_base));
	DEFINE(IA64_SIGCONTEXT_LOADRS_OFFSET, offsetof (struct sigcontext, sc_loadrs));

	BLANK();

	DEFINE(IA64_SIGPENDING_SIGNAL_OFFSET, offsetof (struct sigpending, signal));

	BLANK();

	DEFINE(IA64_SIGFRAME_ARG0_OFFSET, offsetof (struct sigframe, arg0));
	DEFINE(IA64_SIGFRAME_ARG1_OFFSET, offsetof (struct sigframe, arg1));
	DEFINE(IA64_SIGFRAME_ARG2_OFFSET, offsetof (struct sigframe, arg2));
	DEFINE(IA64_SIGFRAME_HANDLER_OFFSET, offsetof (struct sigframe, handler));
	DEFINE(IA64_SIGFRAME_SIGCONTEXT_OFFSET, offsetof (struct sigframe, sc));
	BLANK();
	/* for assembly files which can't include sched.h: */
	DEFINE(IA64_CLONE_VFORK, CLONE_VFORK);
	DEFINE(IA64_CLONE_VM, CLONE_VM);

	BLANK();
	DEFINE(IA64_CPUINFO_NSEC_PER_CYC_OFFSET,
	       offsetof (struct cpuinfo_ia64, nsec_per_cyc));
	DEFINE(IA64_CPUINFO_PTCE_BASE_OFFSET,
	       offsetof (struct cpuinfo_ia64, ptce_base));
	DEFINE(IA64_CPUINFO_PTCE_COUNT_OFFSET,
	       offsetof (struct cpuinfo_ia64, ptce_count));
	DEFINE(IA64_CPUINFO_PTCE_STRIDE_OFFSET,
	       offsetof (struct cpuinfo_ia64, ptce_stride));
	BLANK();
	DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET,
	       offsetof (struct timespec, tv_nsec));

	DEFINE(CLONE_SETTLS_BIT, 19);
#if CLONE_SETTLS != (1<<19)
# error "CLONE_SETTLS_BIT incorrect, please fix"
#endif

	BLANK();
	DEFINE(IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET,
	       offsetof (struct ia64_mca_cpu, proc_state_dump));
	DEFINE(IA64_MCA_CPU_STACK_OFFSET,
	       offsetof (struct ia64_mca_cpu, stack));
	DEFINE(IA64_MCA_CPU_STACKFRAME_OFFSET,
	       offsetof (struct ia64_mca_cpu, stackframe));
	DEFINE(IA64_MCA_CPU_RBSTORE_OFFSET,
	       offsetof (struct ia64_mca_cpu, rbstore));
	DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET,
	       offsetof (struct ia64_mca_cpu, init_stack));
	BLANK();
	/* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */
	DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr));
	DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source));
	DEFINE(IA64_TIME_INTERPOLATOR_SHIFT_OFFSET, offsetof (struct time_interpolator, shift));
	DEFINE(IA64_TIME_INTERPOLATOR_NSEC_OFFSET, offsetof (struct time_interpolator, nsec_per_cyc));
	DEFINE(IA64_TIME_INTERPOLATOR_OFFSET_OFFSET, offsetof (struct time_interpolator, offset));
	DEFINE(IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET, offsetof (struct time_interpolator, last_cycle));
	DEFINE(IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET, offsetof (struct time_interpolator, last_counter));
	DEFINE(IA64_TIME_INTERPOLATOR_JITTER_OFFSET, offsetof (struct time_interpolator, jitter));
	DEFINE(IA64_TIME_INTERPOLATOR_MASK_OFFSET, offsetof (struct time_interpolator, mask));
	DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU);
	DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64);
	DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32);
	DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec));
}
diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c
new file mode 100644
index 000000000000..0b286ca164f9
--- /dev/null
+++ b/arch/ia64/kernel/brl_emu.c
@@ -0,0 +1,234 @@
/*
 *  Emulation of the "brl" instruction for IA64 processors that
 *  don't support it in hardware.
 *  Author: Stephan Zeisset, Intel Corp. <Stephan.Zeisset@intel.com>
 *
 *    02/22/02	D. Mosberger	Clear si_flags, si_isr, and si_imm to avoid
 *				leaking kernel bits.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/processor.h>

extern char ia64_set_b1, ia64_set_b2, ia64_set_b3, ia64_set_b4, ia64_set_b5;

struct illegal_op_return {
	unsigned long fkt, arg1, arg2, arg3;
};

/*
 *  The unimplemented bits of a virtual address must be set
 *  to the value of the most significant implemented bit.
 *  unimpl_va_mask includes all unimplemented bits and
 *  the most significant implemented bit, so the result
 *  of an and operation with the mask must be all 0's
 *  or all 1's for the address to be valid.
 */
#define unimplemented_virtual_address(va) (					\
	((va) & local_cpu_data->unimpl_va_mask) != 0 &&				\
	((va) & local_cpu_data->unimpl_va_mask) != local_cpu_data->unimpl_va_mask \
)

/*
 *  The unimplemented bits of a physical address must be 0.
 *  unimpl_pa_mask includes all unimplemented bits, so the result
 *  of an and operation with the mask must be all 0's for the
 *  address to be valid.
 */
#define unimplemented_physical_address(pa) (		\
	((pa) & local_cpu_data->unimpl_pa_mask) != 0	\
)

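A standalone sketch of the canonical-address test that unimplemented_virtual_address() encodes, assuming (hypothetically) that the mask covers bits 50..63; a VA is valid only when the masked bits are all 0 or all 1:

#include <stdio.h>

int main(void)
{
	unsigned long mask = ~0UL << 50;		/* stand-in for unimpl_va_mask */
	unsigned long ok  = 0x0000000000004000UL;	/* masked bits all 0: valid */
	unsigned long bad = 0x0004000000004000UL;	/* masked bits mixed: invalid */

	printf("%d %d\n",
	       (ok  & mask) != 0 && (ok  & mask) != mask,	/* prints 0 */
	       (bad & mask) != 0 && (bad & mask) != mask);	/* prints 1 */
	return 0;
}
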
/*
 *  Handle an illegal operation fault that was caused by an
 *  unimplemented "brl" instruction.
 *  If we are not successful (e.g. because the illegal operation
 *  wasn't caused by a "brl" after all), we return -1.
 *  If we are successful, we return either 0 or the address
 *  of a "fixup" function for manipulating preserved register
 *  state.
 */

54 | struct illegal_op_return | ||
55 | ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec) | ||
56 | { | ||
57 | unsigned long bundle[2]; | ||
58 | unsigned long opcode, btype, qp, offset, cpl; | ||
59 | unsigned long next_ip; | ||
60 | struct siginfo siginfo; | ||
61 | struct illegal_op_return rv; | ||
62 | long tmp_taken, unimplemented_address; | ||
63 | |||
64 | rv.fkt = (unsigned long) -1; | ||
65 | |||
66 | /* | ||
67 | * Decode the instruction bundle. | ||
68 | */ | ||
69 | |||
70 | if (copy_from_user(bundle, (void *) (regs->cr_iip), sizeof(bundle))) | ||
71 | return rv; | ||
72 | |||
73 | next_ip = (unsigned long) regs->cr_iip + 16; | ||
74 | |||
75 | /* "brl" must be in slot 2. */ | ||
76 | if (ia64_psr(regs)->ri != 1) return rv; | ||
77 | |||
78 | /* Must be "mlx" template */ | ||
79 | if ((bundle[0] & 0x1e) != 0x4) return rv; | ||
80 | |||
81 | opcode = (bundle[1] >> 60); | ||
82 | btype = ((bundle[1] >> 29) & 0x7); | ||
83 | qp = ((bundle[1] >> 23) & 0x3f); | ||
84 | offset = ((bundle[1] & 0x0800000000000000L) << 4) | ||
85 | | ((bundle[1] & 0x00fffff000000000L) >> 32) | ||
86 | | ((bundle[1] & 0x00000000007fffffL) << 40) | ||
87 | | ((bundle[0] & 0xffff000000000000L) >> 24); | ||
88 | |||
89 | tmp_taken = regs->pr & (1L << qp); | ||
90 | |||
91 | switch(opcode) { | ||
92 | |||
93 | case 0xC: | ||
94 | /* | ||
95 | * Long Branch. | ||
96 | */ | ||
97 | if (btype != 0) return rv; | ||
98 | rv.fkt = 0; | ||
99 | if (!(tmp_taken)) { | ||
100 | /* | ||
101 | * Qualifying predicate is 0. | ||
102 | * Skip instruction. | ||
103 | */ | ||
104 | regs->cr_iip = next_ip; | ||
105 | ia64_psr(regs)->ri = 0; | ||
106 | return rv; | ||
107 | } | ||
108 | break; | ||
109 | |||
110 | case 0xD: | ||
111 | /* | ||
112 | * Long Call. | ||
113 | */ | ||
114 | rv.fkt = 0; | ||
115 | if (!(tmp_taken)) { | ||
116 | /* | ||
117 | * Qualifying predicate is 0. | ||
118 | * Skip instruction. | ||
119 | */ | ||
120 | regs->cr_iip = next_ip; | ||
121 | ia64_psr(regs)->ri = 0; | ||
122 | return rv; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * BR[btype] = IP+16 | ||
127 | */ | ||
128 | switch(btype) { | ||
129 | case 0: | ||
130 | regs->b0 = next_ip; | ||
131 | break; | ||
132 | case 1: | ||
133 | rv.fkt = (unsigned long) &ia64_set_b1; | ||
134 | break; | ||
135 | case 2: | ||
136 | rv.fkt = (unsigned long) &ia64_set_b2; | ||
137 | break; | ||
138 | case 3: | ||
139 | rv.fkt = (unsigned long) &ia64_set_b3; | ||
140 | break; | ||
141 | case 4: | ||
142 | rv.fkt = (unsigned long) &ia64_set_b4; | ||
143 | break; | ||
144 | case 5: | ||
145 | rv.fkt = (unsigned long) &ia64_set_b5; | ||
146 | break; | ||
147 | case 6: | ||
148 | regs->b6 = next_ip; | ||
149 | break; | ||
150 | case 7: | ||
151 | regs->b7 = next_ip; | ||
152 | break; | ||
153 | } | ||
154 | rv.arg1 = next_ip; | ||
155 | |||
156 | /* | ||
157 | * AR[PFS].pfm = CFM | ||
158 | * AR[PFS].pec = AR[EC] | ||
159 | * AR[PFS].ppl = PSR.cpl | ||
160 | */ | ||
161 | cpl = ia64_psr(regs)->cpl; | ||
162 | regs->ar_pfs = ((regs->cr_ifs & 0x3fffffffff) | ||
163 | | (ar_ec << 52) | (cpl << 62)); | ||
164 | |||
165 | /* | ||
166 | * CFM.sof -= CFM.sol | ||
167 | * CFM.sol = 0 | ||
168 | * CFM.sor = 0 | ||
169 | * CFM.rrb.gr = 0 | ||
170 | * CFM.rrb.fr = 0 | ||
171 | * CFM.rrb.pr = 0 | ||
172 | */ | ||
173 | regs->cr_ifs = ((regs->cr_ifs & 0xffffffc00000007f) | ||
174 | - ((regs->cr_ifs >> 7) & 0x7f)); | ||
175 | |||
176 | break; | ||
177 | |||
178 | default: | ||
179 | /* | ||
180 | * Unknown opcode. | ||
181 | */ | ||
182 | return rv; | ||
183 | |||
184 | } | ||
185 | |||
186 | regs->cr_iip += offset; | ||
187 | ia64_psr(regs)->ri = 0; | ||
188 | |||
189 | if (ia64_psr(regs)->it == 0) | ||
190 | unimplemented_address = unimplemented_physical_address(regs->cr_iip); | ||
191 | else | ||
192 | unimplemented_address = unimplemented_virtual_address(regs->cr_iip); | ||
193 | |||
194 | if (unimplemented_address) { | ||
195 | /* | ||
196 | * The target address contains unimplemented bits. | ||
197 | */ | ||
198 | printk(KERN_DEBUG "Whoa! Unimplemented Instruction Address Trap!\n"); | ||
199 | siginfo.si_signo = SIGILL; | ||
200 | siginfo.si_errno = 0; | ||
201 | siginfo.si_flags = 0; | ||
202 | siginfo.si_isr = 0; | ||
203 | siginfo.si_imm = 0; | ||
204 | siginfo.si_code = ILL_BADIADDR; | ||
205 | force_sig_info(SIGILL, &siginfo, current); | ||
206 | } else if (ia64_psr(regs)->tb) { | ||
207 | /* | ||
208 | * Branch Tracing is enabled. | ||
209 | * Force a taken branch signal. | ||
210 | */ | ||
211 | siginfo.si_signo = SIGTRAP; | ||
212 | siginfo.si_errno = 0; | ||
213 | siginfo.si_code = TRAP_BRANCH; | ||
214 | siginfo.si_flags = 0; | ||
215 | siginfo.si_isr = 0; | ||
216 | siginfo.si_addr = 0; | ||
217 | siginfo.si_imm = 0; | ||
218 | force_sig_info(SIGTRAP, &siginfo, current); | ||
219 | } else if (ia64_psr(regs)->ss) { | ||
220 | /* | ||
221 | * Single Step is enabled. | ||
222 | * Force a trace signal. | ||
223 | */ | ||
224 | siginfo.si_signo = SIGTRAP; | ||
225 | siginfo.si_errno = 0; | ||
226 | siginfo.si_code = TRAP_TRACE; | ||
227 | siginfo.si_flags = 0; | ||
228 | siginfo.si_isr = 0; | ||
229 | siginfo.si_addr = 0; | ||
230 | siginfo.si_imm = 0; | ||
231 | force_sig_info(SIGTRAP, &siginfo, current); | ||
232 | } | ||
233 | return rv; | ||
234 | } | ||
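The offset computed above is the brl byte displacement, reassembled from four disjoint bit fields spread across the two 64-bit halves of the MLX bundle; since it is added directly to cr_iip, the 16-byte instruction-slot scaling is already folded in, with the sign bit landing at bit 63. As a standalone illustration (a user-space sketch reusing the emulator's masks and shifts verbatim; the sample bundle contents are hypothetical):

    #include <stdio.h>
    #include <stdint.h>

    /* Reassemble the brl byte displacement from an MLX bundle, using the
     * same masks and shifts as ia64_emulate_brl() above. */
    static uint64_t brl_offset(const uint64_t bundle[2])
    {
            return ((bundle[1] & 0x0800000000000000UL) << 4)
                 | ((bundle[1] & 0x00fffff000000000UL) >> 32)
                 | ((bundle[1] & 0x00000000007fffffUL) << 40)
                 | ((bundle[0] & 0xffff000000000000UL) >> 24);
    }

    int main(void)
    {
            uint64_t bundle[2] = { 0x0010000000000004UL,       /* hypothetical */
                                   0xc000000001000000UL };

            printf("offset = 0x%016llx\n", (unsigned long long)brl_offset(bundle));
            return 0;
    }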
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c new file mode 100644 index 000000000000..768c7e46957c --- /dev/null +++ b/arch/ia64/kernel/cyclone.c | |||
@@ -0,0 +1,109 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/smp.h> | ||
3 | #include <linux/time.h> | ||
4 | #include <linux/errno.h> | ||
5 | #include <asm/io.h> | ||
6 | |||
7 | /* IBM Summit (EXA) Cyclone counter code */ | ||
8 | #define CYCLONE_CBAR_ADDR 0xFEB00CD0 | ||
9 | #define CYCLONE_PMCC_OFFSET 0x51A0 | ||
10 | #define CYCLONE_MPMC_OFFSET 0x51D0 | ||
11 | #define CYCLONE_MPCS_OFFSET 0x51A8 | ||
12 | #define CYCLONE_TIMER_FREQ 100000000 | ||
13 | |||
14 | int use_cyclone; | ||
15 | void __init cyclone_setup(void) | ||
16 | { | ||
17 | use_cyclone = 1; | ||
18 | } | ||
19 | |||
20 | |||
21 | struct time_interpolator cyclone_interpolator = { | ||
22 | .source = TIME_SOURCE_MMIO64, | ||
23 | .shift = 16, | ||
24 | .frequency = CYCLONE_TIMER_FREQ, | ||
25 | .drift = -100, | ||
26 | .mask = (1LL << 40) - 1 | ||
27 | }; | ||
28 | |||
29 | int __init init_cyclone_clock(void) | ||
30 | { | ||
31 | u64* reg; | ||
32 | u64 base; /* saved cyclone base address */ | ||
33 | u64 offset; /* offset from pageaddr to cyclone_timer register */ | ||
34 | int i; | ||
35 | u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ | ||
36 | |||
37 | if (!use_cyclone) | ||
38 | return -ENODEV; | ||
39 | |||
40 | printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); | ||
41 | |||
42 | /* find base address */ | ||
43 | offset = (CYCLONE_CBAR_ADDR); | ||
44 | reg = (u64*)ioremap_nocache(offset, sizeof(u64)); | ||
45 | if(!reg){ | ||
46 | printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); | ||
47 | use_cyclone = 0; | ||
48 | return -ENODEV; | ||
49 | } | ||
50 | base = readq(reg); | ||
51 | if(!base){ | ||
52 | printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); | ||
53 | use_cyclone = 0; | ||
54 | return -ENODEV; | ||
55 | } | ||
56 | iounmap(reg); | ||
57 | |||
58 | /* setup PMCC */ | ||
59 | offset = (base + CYCLONE_PMCC_OFFSET); | ||
60 | reg = (u64*)ioremap_nocache(offset, sizeof(u64)); | ||
61 | if(!reg){ | ||
62 | printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); | ||
63 | use_cyclone = 0; | ||
64 | return -ENODEV; | ||
65 | } | ||
66 | writel(0x00000001,reg); | ||
67 | iounmap(reg); | ||
68 | |||
69 | /* setup MPCS */ | ||
70 | offset = (base + CYCLONE_MPCS_OFFSET); | ||
71 | reg = (u64*)ioremap_nocache(offset, sizeof(u64)); | ||
72 | if(!reg){ | ||
73 | printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); | ||
74 | use_cyclone = 0; | ||
75 | return -ENODEV; | ||
76 | } | ||
77 | writel(0x00000001,reg); | ||
78 | iounmap(reg); | ||
79 | |||
80 | /* map in cyclone_timer */ | ||
81 | offset = (base + CYCLONE_MPMC_OFFSET); | ||
82 | cyclone_timer = (u32*)ioremap_nocache(offset, sizeof(u32)); | ||
83 | if(!cyclone_timer){ | ||
84 | printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); | ||
85 | use_cyclone = 0; | ||
86 | return -ENODEV; | ||
87 | } | ||
88 | |||
89 | /* quick test to make sure it's ticking */ | ||
90 | for(i=0; i<3; i++){ | ||
91 | u32 old = readl(cyclone_timer); | ||
92 | int stall = 100; | ||
93 | while(stall--) barrier(); | ||
94 | if(readl(cyclone_timer) == old){ | ||
95 | printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); | ||
96 | iounmap(cyclone_timer); | ||
97 | cyclone_timer = NULL; | ||
98 | use_cyclone = 0; | ||
99 | return -ENODEV; | ||
100 | } | ||
101 | } | ||
102 | /* initialize last tick */ | ||
103 | cyclone_interpolator.addr = cyclone_timer; | ||
104 | register_time_interpolator(&cyclone_interpolator); | ||
105 | |||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | __initcall(init_cyclone_clock); | ||
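The interpolator registered above advertises a 40-bit counter through .mask, so elapsed time must be computed modulo 2^40. A minimal sketch of that wraparound-safe delta arithmetic (illustrative user-space C, not part of the driver):

    #include <stdio.h>
    #include <stdint.h>

    #define CYCLONE_MASK (((uint64_t)1 << 40) - 1)     /* matches .mask above */

    /* Ticks elapsed between two reads, tolerating one 40-bit rollover. */
    static uint64_t cyclone_delta(uint64_t last, uint64_t now)
    {
            return (now - last) & CYCLONE_MASK;
    }

    int main(void)
    {
            /* hypothetical readings straddling the rollover */
            printf("%llu\n",
                   (unsigned long long)cyclone_delta(CYCLONE_MASK - 5, 10));  /* prints 16 */
            return 0;
    }

At CYCLONE_TIMER_FREQ (100 MHz), 2^40 ticks correspond to roughly three hours between rollovers.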
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c new file mode 100644 index 000000000000..fe532c970438 --- /dev/null +++ b/arch/ia64/kernel/domain.c | |||
@@ -0,0 +1,382 @@ | |||
1 | /* | ||
2 | * arch/ia64/kernel/domain.c | ||
3 | * Architecture specific sched-domains builder. | ||
4 | * | ||
5 | * Copyright (C) 2004 Jesse Barnes | ||
6 | * Copyright (C) 2004 Silicon Graphics, Inc. | ||
7 | */ | ||
8 | |||
9 | #include <linux/sched.h> | ||
10 | #include <linux/percpu.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/cpumask.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/topology.h> | ||
15 | #include <linux/nodemask.h> | ||
16 | |||
17 | #define SD_NODES_PER_DOMAIN 6 | ||
18 | |||
19 | #ifdef CONFIG_NUMA | ||
20 | /** | ||
21 | * find_next_best_node - find the next node to include in a sched_domain | ||
22 | * @node: node whose sched_domain we're building | ||
23 | * @used_nodes: nodes already in the sched_domain | ||
24 | * | ||
25 | * Find the next node to include in a given scheduling domain. Simply | ||
26 | * finds the closest node not already in the @used_nodes map. | ||
27 | * | ||
28 | * Should use nodemask_t. | ||
29 | */ | ||
30 | static int __devinit find_next_best_node(int node, unsigned long *used_nodes) | ||
31 | { | ||
32 | int i, n, val, min_val, best_node = 0; | ||
33 | |||
34 | min_val = INT_MAX; | ||
35 | |||
36 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
37 | /* Start at @node */ | ||
38 | n = (node + i) % MAX_NUMNODES; | ||
39 | |||
40 | if (!nr_cpus_node(n)) | ||
41 | continue; | ||
42 | |||
43 | /* Skip already used nodes */ | ||
44 | if (test_bit(n, used_nodes)) | ||
45 | continue; | ||
46 | |||
47 | /* Simple min distance search */ | ||
48 | val = node_distance(node, n); | ||
49 | |||
50 | if (val < min_val) { | ||
51 | min_val = val; | ||
52 | best_node = n; | ||
53 | } | ||
54 | } | ||
55 | |||
56 | set_bit(best_node, used_nodes); | ||
57 | return best_node; | ||
58 | } | ||
59 | |||
60 | /** | ||
61 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
62 | * @node: node whose cpumask we're constructing | ||
63 | * (the span covers at most SD_NODES_PER_DOMAIN nodes) | ||
64 | * | ||
65 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
66 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
67 | * out optimally. | ||
68 | */ | ||
69 | static cpumask_t __devinit sched_domain_node_span(int node) | ||
70 | { | ||
71 | int i; | ||
72 | cpumask_t span, nodemask; | ||
73 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | ||
74 | |||
75 | cpus_clear(span); | ||
76 | bitmap_zero(used_nodes, MAX_NUMNODES); | ||
77 | |||
78 | nodemask = node_to_cpumask(node); | ||
79 | cpus_or(span, span, nodemask); | ||
80 | set_bit(node, used_nodes); | ||
81 | |||
82 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
83 | int next_node = find_next_best_node(node, used_nodes); | ||
84 | nodemask = node_to_cpumask(next_node); | ||
85 | cpus_or(span, span, nodemask); | ||
86 | } | ||
87 | |||
88 | return span; | ||
89 | } | ||
90 | #endif | ||
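To see the greedy selection concretely, here is a user-space sketch of the same rule with a hypothetical four-node SLIT-style distance matrix (the kernel version consults node_distance(), skips CPU-less nodes, and works on real bitmaps):

    #include <stdio.h>
    #include <limits.h>

    #define NNODES 4        /* hypothetical machine */

    /* dist[i][j] plays the role of node_distance(i, j). */
    static const int dist[NNODES][NNODES] = {
            { 10, 20, 40, 60 },
            { 20, 10, 40, 60 },
            { 40, 40, 10, 20 },
            { 60, 60, 20, 10 },
    };

    /* Same greedy rule as find_next_best_node(), on a plain flag array. */
    static int next_best(int node, int used[NNODES])
    {
            int best = 0, min_val = INT_MAX;

            for (int i = 0; i < NNODES; i++) {
                    int n = (node + i) % NNODES;

                    if (used[n] || dist[node][n] >= min_val)
                            continue;
                    min_val = dist[node][n];
                    best = n;
            }
            used[best] = 1;
            return best;
    }

    int main(void)
    {
            int used[NNODES] = { 1, 0, 0, 0 };      /* building node 0's domain */

            for (int i = 1; i < NNODES; i++)
                    printf("pick node %d\n", next_best(0, used));  /* 1, then 2, then 3 */
            return 0;
    }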
91 | |||
92 | /* | ||
93 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | ||
94 | * can switch it on easily if needed. | ||
95 | */ | ||
96 | #ifdef CONFIG_SCHED_SMT | ||
97 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | ||
98 | static struct sched_group sched_group_cpus[NR_CPUS]; | ||
99 | static int __devinit cpu_to_cpu_group(int cpu) | ||
100 | { | ||
101 | return cpu; | ||
102 | } | ||
103 | #endif | ||
104 | |||
105 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | ||
106 | static struct sched_group sched_group_phys[NR_CPUS]; | ||
107 | static int __devinit cpu_to_phys_group(int cpu) | ||
108 | { | ||
109 | #ifdef CONFIG_SCHED_SMT | ||
110 | return first_cpu(cpu_sibling_map[cpu]); | ||
111 | #else | ||
112 | return cpu; | ||
113 | #endif | ||
114 | } | ||
115 | |||
116 | #ifdef CONFIG_NUMA | ||
117 | /* | ||
118 | * The init_sched_build_groups can't handle what we want to do with node | ||
119 | * groups, so roll our own. Now each node has its own list of groups which | ||
120 | * gets dynamically allocated. | ||
121 | */ | ||
122 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | ||
123 | static struct sched_group *sched_group_nodes[MAX_NUMNODES]; | ||
124 | |||
125 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | ||
126 | static struct sched_group sched_group_allnodes[MAX_NUMNODES]; | ||
127 | |||
128 | static int __devinit cpu_to_allnodes_group(int cpu) | ||
129 | { | ||
130 | return cpu_to_node(cpu); | ||
131 | } | ||
132 | #endif | ||
133 | |||
134 | /* | ||
135 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
136 | */ | ||
137 | void __devinit arch_init_sched_domains(void) | ||
138 | { | ||
139 | int i; | ||
140 | cpumask_t cpu_default_map; | ||
141 | |||
142 | /* | ||
143 | * Setup mask for cpus without special case scheduling requirements. | ||
144 | * For now this just excludes isolated cpus, but could be used to | ||
145 | * exclude other special cases in the future. | ||
146 | */ | ||
147 | cpus_complement(cpu_default_map, cpu_isolated_map); | ||
148 | cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); | ||
149 | |||
150 | /* | ||
151 | * Set up domains. Isolated domains just stay on the dummy domain. | ||
152 | */ | ||
153 | for_each_cpu_mask(i, cpu_default_map) { | ||
154 | int group; | ||
155 | struct sched_domain *sd = NULL, *p; | ||
156 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | ||
157 | |||
158 | cpus_and(nodemask, nodemask, cpu_default_map); | ||
159 | |||
160 | #ifdef CONFIG_NUMA | ||
161 | if (num_online_cpus() | ||
162 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | ||
163 | sd = &per_cpu(allnodes_domains, i); | ||
164 | *sd = SD_ALLNODES_INIT; | ||
165 | sd->span = cpu_default_map; | ||
166 | group = cpu_to_allnodes_group(i); | ||
167 | sd->groups = &sched_group_allnodes[group]; | ||
168 | p = sd; | ||
169 | } else | ||
170 | p = NULL; | ||
171 | |||
172 | sd = &per_cpu(node_domains, i); | ||
173 | *sd = SD_NODE_INIT; | ||
174 | sd->span = sched_domain_node_span(cpu_to_node(i)); | ||
175 | sd->parent = p; | ||
176 | cpus_and(sd->span, sd->span, cpu_default_map); | ||
177 | #endif | ||
178 | |||
179 | p = sd; | ||
180 | sd = &per_cpu(phys_domains, i); | ||
181 | group = cpu_to_phys_group(i); | ||
182 | *sd = SD_CPU_INIT; | ||
183 | sd->span = nodemask; | ||
184 | sd->parent = p; | ||
185 | sd->groups = &sched_group_phys[group]; | ||
186 | |||
187 | #ifdef CONFIG_SCHED_SMT | ||
188 | p = sd; | ||
189 | sd = &per_cpu(cpu_domains, i); | ||
190 | group = cpu_to_cpu_group(i); | ||
191 | *sd = SD_SIBLING_INIT; | ||
192 | sd->span = cpu_sibling_map[i]; | ||
193 | cpus_and(sd->span, sd->span, cpu_default_map); | ||
194 | sd->parent = p; | ||
195 | sd->groups = &sched_group_cpus[group]; | ||
196 | #endif | ||
197 | } | ||
198 | |||
199 | #ifdef CONFIG_SCHED_SMT | ||
200 | /* Set up CPU (sibling) groups */ | ||
201 | for_each_cpu_mask(i, cpu_default_map) { | ||
202 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | ||
203 | cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); | ||
204 | if (i != first_cpu(this_sibling_map)) | ||
205 | continue; | ||
206 | |||
207 | init_sched_build_groups(sched_group_cpus, this_sibling_map, | ||
208 | &cpu_to_cpu_group); | ||
209 | } | ||
210 | #endif | ||
211 | |||
212 | /* Set up physical groups */ | ||
213 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
214 | cpumask_t nodemask = node_to_cpumask(i); | ||
215 | |||
216 | cpus_and(nodemask, nodemask, cpu_default_map); | ||
217 | if (cpus_empty(nodemask)) | ||
218 | continue; | ||
219 | |||
220 | init_sched_build_groups(sched_group_phys, nodemask, | ||
221 | &cpu_to_phys_group); | ||
222 | } | ||
223 | |||
224 | #ifdef CONFIG_NUMA | ||
225 | init_sched_build_groups(sched_group_allnodes, cpu_default_map, | ||
226 | &cpu_to_allnodes_group); | ||
227 | |||
228 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
229 | /* Set up node groups */ | ||
230 | struct sched_group *sg, *prev; | ||
231 | cpumask_t nodemask = node_to_cpumask(i); | ||
232 | cpumask_t domainspan; | ||
233 | cpumask_t covered = CPU_MASK_NONE; | ||
234 | int j; | ||
235 | |||
236 | cpus_and(nodemask, nodemask, cpu_default_map); | ||
237 | if (cpus_empty(nodemask)) | ||
238 | continue; | ||
239 | |||
240 | domainspan = sched_domain_node_span(i); | ||
241 | cpus_and(domainspan, domainspan, cpu_default_map); | ||
242 | |||
243 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | ||
244 | sched_group_nodes[i] = sg; | ||
245 | for_each_cpu_mask(j, nodemask) { | ||
246 | struct sched_domain *sd; | ||
247 | sd = &per_cpu(node_domains, j); | ||
248 | sd->groups = sg; | ||
249 | if (sd->groups == NULL) { | ||
250 | /* Turn off balancing if we have no groups */ | ||
251 | sd->flags = 0; | ||
252 | } | ||
253 | } | ||
254 | if (!sg) { | ||
255 | printk(KERN_WARNING | ||
256 | "Can not alloc domain group for node %d\n", i); | ||
257 | continue; | ||
258 | } | ||
259 | sg->cpu_power = 0; | ||
260 | sg->cpumask = nodemask; | ||
261 | cpus_or(covered, covered, nodemask); | ||
262 | prev = sg; | ||
263 | |||
264 | for (j = 0; j < MAX_NUMNODES; j++) { | ||
265 | cpumask_t tmp, notcovered; | ||
266 | int n = (i + j) % MAX_NUMNODES; | ||
267 | |||
268 | cpus_complement(notcovered, covered); | ||
269 | cpus_and(tmp, notcovered, cpu_default_map); | ||
270 | cpus_and(tmp, tmp, domainspan); | ||
271 | if (cpus_empty(tmp)) | ||
272 | break; | ||
273 | |||
274 | nodemask = node_to_cpumask(n); | ||
275 | cpus_and(tmp, tmp, nodemask); | ||
276 | if (cpus_empty(tmp)) | ||
277 | continue; | ||
278 | |||
279 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | ||
280 | if (!sg) { | ||
281 | printk(KERN_WARNING | ||
282 | "Can not alloc domain group for node %d\n", j); | ||
283 | break; | ||
284 | } | ||
285 | sg->cpu_power = 0; | ||
286 | sg->cpumask = tmp; | ||
287 | cpus_or(covered, covered, tmp); | ||
288 | prev->next = sg; | ||
289 | prev = sg; | ||
290 | } | ||
291 | prev->next = sched_group_nodes[i]; | ||
292 | } | ||
293 | #endif | ||
294 | |||
295 | /* Calculate CPU power for physical packages and nodes */ | ||
296 | for_each_cpu_mask(i, cpu_default_map) { | ||
297 | int power; | ||
298 | struct sched_domain *sd; | ||
299 | #ifdef CONFIG_SCHED_SMT | ||
300 | sd = &per_cpu(cpu_domains, i); | ||
301 | power = SCHED_LOAD_SCALE; | ||
302 | sd->groups->cpu_power = power; | ||
303 | #endif | ||
304 | |||
305 | sd = &per_cpu(phys_domains, i); | ||
306 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | ||
307 | (cpus_weight(sd->groups->cpumask)-1) / 10; | ||
308 | sd->groups->cpu_power = power; | ||
309 | |||
310 | #ifdef CONFIG_NUMA | ||
311 | sd = &per_cpu(allnodes_domains, i); | ||
312 | if (sd->groups) { | ||
313 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | ||
314 | (cpus_weight(sd->groups->cpumask)-1) / 10; | ||
315 | sd->groups->cpu_power = power; | ||
316 | } | ||
317 | #endif | ||
318 | } | ||
319 | |||
320 | #ifdef CONFIG_NUMA | ||
321 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
322 | struct sched_group *sg = sched_group_nodes[i]; | ||
323 | int j; | ||
324 | |||
325 | if (sg == NULL) | ||
326 | continue; | ||
327 | next_sg: | ||
328 | for_each_cpu_mask(j, sg->cpumask) { | ||
329 | struct sched_domain *sd; | ||
330 | int power; | ||
331 | |||
332 | sd = &per_cpu(phys_domains, j); | ||
333 | if (j != first_cpu(sd->groups->cpumask)) { | ||
334 | /* | ||
335 | * Only add "power" once for each | ||
336 | * physical package. | ||
337 | */ | ||
338 | continue; | ||
339 | } | ||
340 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | ||
341 | (cpus_weight(sd->groups->cpumask)-1) / 10; | ||
342 | |||
343 | sg->cpu_power += power; | ||
344 | } | ||
345 | sg = sg->next; | ||
346 | if (sg != sched_group_nodes[i]) | ||
347 | goto next_sg; | ||
348 | } | ||
349 | #endif | ||
350 | |||
351 | /* Attach the domains */ | ||
352 | for_each_online_cpu(i) { | ||
353 | struct sched_domain *sd; | ||
354 | #ifdef CONFIG_SCHED_SMT | ||
355 | sd = &per_cpu(cpu_domains, i); | ||
356 | #else | ||
357 | sd = &per_cpu(phys_domains, i); | ||
358 | #endif | ||
359 | cpu_attach_domain(sd, i); | ||
360 | } | ||
361 | } | ||
362 | |||
363 | void __devinit arch_destroy_sched_domains(void) | ||
364 | { | ||
365 | #ifdef CONFIG_NUMA | ||
366 | int i; | ||
367 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
368 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
369 | if (sg == NULL) | ||
370 | continue; | ||
371 | sg = sg->next; | ||
372 | next_sg: | ||
373 | oldsg = sg; | ||
374 | sg = sg->next; | ||
375 | kfree(oldsg); | ||
376 | if (oldsg != sched_group_nodes[i]) | ||
377 | goto next_sg; | ||
378 | sched_group_nodes[i] = NULL; | ||
379 | } | ||
380 | #endif | ||
381 | } | ||
382 | |||
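The cpu_power assignments above all use the same formula: SCHED_LOAD_SCALE plus one tenth of SCHED_LOAD_SCALE per additional CPU in the group, with integer truncation. Assuming SCHED_LOAD_SCALE is 128 (its usual value in kernels of this vintage; see include/linux/sched.h), the progression looks like this:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 128UL  /* assumed value, for illustration */

    int main(void)
    {
            for (unsigned long n = 1; n <= 4; n++)
                    printf("%lu cpus -> cpu_power %lu\n", n,
                           SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (n - 1) / 10);
            /* prints 128, 140, 153, 166: each extra CPU adds only ~10% of scale */
            return 0;
    }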
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c new file mode 100644 index 000000000000..4a3b1aac43e7 --- /dev/null +++ b/arch/ia64/kernel/efi.c | |||
@@ -0,0 +1,832 @@ | |||
1 | /* | ||
2 | * Extensible Firmware Interface | ||
3 | * | ||
4 | * Based on Extensible Firmware Interface Specification version 0.9 April 30, 1999 | ||
5 | * | ||
6 | * Copyright (C) 1999 VA Linux Systems | ||
7 | * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> | ||
8 | * Copyright (C) 1999-2003 Hewlett-Packard Co. | ||
9 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
10 | * Stephane Eranian <eranian@hpl.hp.com> | ||
11 | * | ||
12 | * Not all EFI Runtime Services are implemented yet, as EFI only | ||
13 | * supports physical mode addressing on SoftSDV. This is to be fixed | ||
14 | * in a future version. --drummond 1999-07-20 | ||
15 | * | ||
16 | * Implemented EFI runtime services and virtual mode calls. --davidm | ||
17 | * | ||
18 | * Goutham Rao: <goutham.rao@intel.com> | ||
19 | * Skip non-WB memory and ignore empty memory ranges. | ||
20 | */ | ||
21 | #include <linux/config.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <linux/types.h> | ||
26 | #include <linux/time.h> | ||
27 | #include <linux/efi.h> | ||
28 | |||
29 | #include <asm/io.h> | ||
30 | #include <asm/kregs.h> | ||
31 | #include <asm/meminit.h> | ||
32 | #include <asm/pgtable.h> | ||
33 | #include <asm/processor.h> | ||
34 | #include <asm/mca.h> | ||
35 | |||
36 | #define EFI_DEBUG 0 | ||
37 | |||
38 | extern efi_status_t efi_call_phys (void *, ...); | ||
39 | |||
40 | struct efi efi; | ||
41 | EXPORT_SYMBOL(efi); | ||
42 | static efi_runtime_services_t *runtime; | ||
43 | static unsigned long mem_limit = ~0UL, max_addr = ~0UL; | ||
44 | |||
45 | #define efi_call_virt(f, args...) (*(f))(args) | ||
46 | |||
47 | #define STUB_GET_TIME(prefix, adjust_arg) \ | ||
48 | static efi_status_t \ | ||
49 | prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc) \ | ||
50 | { \ | ||
51 | struct ia64_fpreg fr[6]; \ | ||
52 | efi_time_cap_t *atc = NULL; \ | ||
53 | efi_status_t ret; \ | ||
54 | \ | ||
55 | if (tc) \ | ||
56 | atc = adjust_arg(tc); \ | ||
57 | ia64_save_scratch_fpregs(fr); \ | ||
58 | ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), adjust_arg(tm), atc); \ | ||
59 | ia64_load_scratch_fpregs(fr); \ | ||
60 | return ret; \ | ||
61 | } | ||
62 | |||
63 | #define STUB_SET_TIME(prefix, adjust_arg) \ | ||
64 | static efi_status_t \ | ||
65 | prefix##_set_time (efi_time_t *tm) \ | ||
66 | { \ | ||
67 | struct ia64_fpreg fr[6]; \ | ||
68 | efi_status_t ret; \ | ||
69 | \ | ||
70 | ia64_save_scratch_fpregs(fr); \ | ||
71 | ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time), adjust_arg(tm)); \ | ||
72 | ia64_load_scratch_fpregs(fr); \ | ||
73 | return ret; \ | ||
74 | } | ||
75 | |||
76 | #define STUB_GET_WAKEUP_TIME(prefix, adjust_arg) \ | ||
77 | static efi_status_t \ | ||
78 | prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending, efi_time_t *tm) \ | ||
79 | { \ | ||
80 | struct ia64_fpreg fr[6]; \ | ||
81 | efi_status_t ret; \ | ||
82 | \ | ||
83 | ia64_save_scratch_fpregs(fr); \ | ||
84 | ret = efi_call_##prefix((efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time), \ | ||
85 | adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm)); \ | ||
86 | ia64_load_scratch_fpregs(fr); \ | ||
87 | return ret; \ | ||
88 | } | ||
89 | |||
90 | #define STUB_SET_WAKEUP_TIME(prefix, adjust_arg) \ | ||
91 | static efi_status_t \ | ||
92 | prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm) \ | ||
93 | { \ | ||
94 | struct ia64_fpreg fr[6]; \ | ||
95 | efi_time_t *atm = NULL; \ | ||
96 | efi_status_t ret; \ | ||
97 | \ | ||
98 | if (tm) \ | ||
99 | atm = adjust_arg(tm); \ | ||
100 | ia64_save_scratch_fpregs(fr); \ | ||
101 | ret = efi_call_##prefix((efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time), \ | ||
102 | enabled, atm); \ | ||
103 | ia64_load_scratch_fpregs(fr); \ | ||
104 | return ret; \ | ||
105 | } | ||
106 | |||
107 | #define STUB_GET_VARIABLE(prefix, adjust_arg) \ | ||
108 | static efi_status_t \ | ||
109 | prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr, \ | ||
110 | unsigned long *data_size, void *data) \ | ||
111 | { \ | ||
112 | struct ia64_fpreg fr[6]; \ | ||
113 | u32 *aattr = NULL; \ | ||
114 | efi_status_t ret; \ | ||
115 | \ | ||
116 | if (attr) \ | ||
117 | aattr = adjust_arg(attr); \ | ||
118 | ia64_save_scratch_fpregs(fr); \ | ||
119 | ret = efi_call_##prefix((efi_get_variable_t *) __va(runtime->get_variable), \ | ||
120 | adjust_arg(name), adjust_arg(vendor), aattr, \ | ||
121 | adjust_arg(data_size), adjust_arg(data)); \ | ||
122 | ia64_load_scratch_fpregs(fr); \ | ||
123 | return ret; \ | ||
124 | } | ||
125 | |||
126 | #define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg) \ | ||
127 | static efi_status_t \ | ||
128 | prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) \ | ||
129 | { \ | ||
130 | struct ia64_fpreg fr[6]; \ | ||
131 | efi_status_t ret; \ | ||
132 | \ | ||
133 | ia64_save_scratch_fpregs(fr); \ | ||
134 | ret = efi_call_##prefix((efi_get_next_variable_t *) __va(runtime->get_next_variable), \ | ||
135 | adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor)); \ | ||
136 | ia64_load_scratch_fpregs(fr); \ | ||
137 | return ret; \ | ||
138 | } | ||
139 | |||
140 | #define STUB_SET_VARIABLE(prefix, adjust_arg) \ | ||
141 | static efi_status_t \ | ||
142 | prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor, unsigned long attr, \ | ||
143 | unsigned long data_size, void *data) \ | ||
144 | { \ | ||
145 | struct ia64_fpreg fr[6]; \ | ||
146 | efi_status_t ret; \ | ||
147 | \ | ||
148 | ia64_save_scratch_fpregs(fr); \ | ||
149 | ret = efi_call_##prefix((efi_set_variable_t *) __va(runtime->set_variable), \ | ||
150 | adjust_arg(name), adjust_arg(vendor), attr, data_size, \ | ||
151 | adjust_arg(data)); \ | ||
152 | ia64_load_scratch_fpregs(fr); \ | ||
153 | return ret; \ | ||
154 | } | ||
155 | |||
156 | #define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg) \ | ||
157 | static efi_status_t \ | ||
158 | prefix##_get_next_high_mono_count (u32 *count) \ | ||
159 | { \ | ||
160 | struct ia64_fpreg fr[6]; \ | ||
161 | efi_status_t ret; \ | ||
162 | \ | ||
163 | ia64_save_scratch_fpregs(fr); \ | ||
164 | ret = efi_call_##prefix((efi_get_next_high_mono_count_t *) \ | ||
165 | __va(runtime->get_next_high_mono_count), adjust_arg(count)); \ | ||
166 | ia64_load_scratch_fpregs(fr); \ | ||
167 | return ret; \ | ||
168 | } | ||
169 | |||
170 | #define STUB_RESET_SYSTEM(prefix, adjust_arg) \ | ||
171 | static void \ | ||
172 | prefix##_reset_system (int reset_type, efi_status_t status, \ | ||
173 | unsigned long data_size, efi_char16_t *data) \ | ||
174 | { \ | ||
175 | struct ia64_fpreg fr[6]; \ | ||
176 | efi_char16_t *adata = NULL; \ | ||
177 | \ | ||
178 | if (data) \ | ||
179 | adata = adjust_arg(data); \ | ||
180 | \ | ||
181 | ia64_save_scratch_fpregs(fr); \ | ||
182 | efi_call_##prefix((efi_reset_system_t *) __va(runtime->reset_system), \ | ||
183 | reset_type, status, data_size, adata); \ | ||
184 | /* should not return, but just in case... */ \ | ||
185 | ia64_load_scratch_fpregs(fr); \ | ||
186 | } | ||
187 | |||
188 | #define phys_ptr(arg) ((__typeof__(arg)) ia64_tpa(arg)) | ||
189 | |||
190 | STUB_GET_TIME(phys, phys_ptr) | ||
191 | STUB_SET_TIME(phys, phys_ptr) | ||
192 | STUB_GET_WAKEUP_TIME(phys, phys_ptr) | ||
193 | STUB_SET_WAKEUP_TIME(phys, phys_ptr) | ||
194 | STUB_GET_VARIABLE(phys, phys_ptr) | ||
195 | STUB_GET_NEXT_VARIABLE(phys, phys_ptr) | ||
196 | STUB_SET_VARIABLE(phys, phys_ptr) | ||
197 | STUB_GET_NEXT_HIGH_MONO_COUNT(phys, phys_ptr) | ||
198 | STUB_RESET_SYSTEM(phys, phys_ptr) | ||
199 | |||
200 | #define id(arg) arg | ||
201 | |||
202 | STUB_GET_TIME(virt, id) | ||
203 | STUB_SET_TIME(virt, id) | ||
204 | STUB_GET_WAKEUP_TIME(virt, id) | ||
205 | STUB_SET_WAKEUP_TIME(virt, id) | ||
206 | STUB_GET_VARIABLE(virt, id) | ||
207 | STUB_GET_NEXT_VARIABLE(virt, id) | ||
208 | STUB_SET_VARIABLE(virt, id) | ||
209 | STUB_GET_NEXT_HIGH_MONO_COUNT(virt, id) | ||
210 | STUB_RESET_SYSTEM(virt, id) | ||
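To make the stub generation above concrete, STUB_GET_TIME(phys, phys_ptr) expands (modulo whitespace) to the following function, which is what efi.get_time points at until virtual mode is entered:

    static efi_status_t
    phys_get_time (efi_time_t *tm, efi_time_cap_t *tc)
    {
            struct ia64_fpreg fr[6];
            efi_time_cap_t *atc = NULL;
            efi_status_t ret;

            if (tc)
                    atc = phys_ptr(tc);     /* i.e., (efi_time_cap_t *) ia64_tpa(tc) */
            ia64_save_scratch_fpregs(fr);
            ret = efi_call_phys((efi_get_time_t *) __va(runtime->get_time),
                                phys_ptr(tm), atc);
            ia64_load_scratch_fpregs(fr);
            return ret;
    }

The virt variants differ only in going through efi_call_virt() and passing pointers unchanged (adjust_arg is the identity macro id).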
211 | |||
212 | void | ||
213 | efi_gettimeofday (struct timespec *ts) | ||
214 | { | ||
215 | efi_time_t tm; | ||
216 | |||
217 | memset(ts, 0, sizeof(*ts)); | ||
218 | if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS) | ||
219 | return; | ||
220 | |||
221 | ts->tv_sec = mktime(tm.year, tm.month, tm.day, tm.hour, tm.minute, tm.second); | ||
222 | ts->tv_nsec = tm.nanosecond; | ||
223 | } | ||
224 | |||
225 | static int | ||
226 | is_available_memory (efi_memory_desc_t *md) | ||
227 | { | ||
228 | if (!(md->attribute & EFI_MEMORY_WB)) | ||
229 | return 0; | ||
230 | |||
231 | switch (md->type) { | ||
232 | case EFI_LOADER_CODE: | ||
233 | case EFI_LOADER_DATA: | ||
234 | case EFI_BOOT_SERVICES_CODE: | ||
235 | case EFI_BOOT_SERVICES_DATA: | ||
236 | case EFI_CONVENTIONAL_MEMORY: | ||
237 | return 1; | ||
238 | } | ||
239 | return 0; | ||
240 | } | ||
241 | |||
242 | /* | ||
243 | * Trim descriptor MD so it starts at address START_ADDR. If the descriptor covers | ||
244 | * memory that is normally available to the kernel, issue a warning that some memory | ||
245 | * is being ignored. | ||
246 | */ | ||
247 | static void | ||
248 | trim_bottom (efi_memory_desc_t *md, u64 start_addr) | ||
249 | { | ||
250 | u64 num_skipped_pages; | ||
251 | |||
252 | if (md->phys_addr >= start_addr || !md->num_pages) | ||
253 | return; | ||
254 | |||
255 | num_skipped_pages = (start_addr - md->phys_addr) >> EFI_PAGE_SHIFT; | ||
256 | if (num_skipped_pages > md->num_pages) | ||
257 | num_skipped_pages = md->num_pages; | ||
258 | |||
259 | if (is_available_memory(md)) | ||
260 | printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole " | ||
261 | "at 0x%lx\n", __FUNCTION__, | ||
262 | (num_skipped_pages << EFI_PAGE_SHIFT) >> 10, | ||
263 | md->phys_addr, start_addr - IA64_GRANULE_SIZE); | ||
264 | /* | ||
265 | * NOTE: Don't set md->phys_addr to START_ADDR because that could cause the memory | ||
266 | * descriptor list to become unsorted. In such a case, md->num_pages will be | ||
267 | * zero, so the Right Thing will happen. | ||
268 | */ | ||
269 | md->phys_addr += num_skipped_pages << EFI_PAGE_SHIFT; | ||
270 | md->num_pages -= num_skipped_pages; | ||
271 | } | ||
272 | |||
273 | static void | ||
274 | trim_top (efi_memory_desc_t *md, u64 end_addr) | ||
275 | { | ||
276 | u64 num_dropped_pages, md_end_addr; | ||
277 | |||
278 | md_end_addr = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); | ||
279 | |||
280 | if (md_end_addr <= end_addr || !md->num_pages) | ||
281 | return; | ||
282 | |||
283 | num_dropped_pages = (md_end_addr - end_addr) >> EFI_PAGE_SHIFT; | ||
284 | if (num_dropped_pages > md->num_pages) | ||
285 | num_dropped_pages = md->num_pages; | ||
286 | |||
287 | if (is_available_memory(md)) | ||
288 | printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole " | ||
289 | "at 0x%lx\n", __FUNCTION__, | ||
290 | (num_dropped_pages << EFI_PAGE_SHIFT) >> 10, | ||
291 | md->phys_addr, end_addr); | ||
292 | md->num_pages -= num_dropped_pages; | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that | ||
297 | * has memory that is available for OS use. | ||
298 | */ | ||
299 | void | ||
300 | efi_memmap_walk (efi_freemem_callback_t callback, void *arg) | ||
301 | { | ||
302 | int prev_valid = 0; | ||
303 | struct range { | ||
304 | u64 start; | ||
305 | u64 end; | ||
306 | } prev, curr; | ||
307 | void *efi_map_start, *efi_map_end, *p, *q; | ||
308 | efi_memory_desc_t *md, *check_md; | ||
309 | u64 efi_desc_size, start, end, granule_addr, last_granule_addr, first_non_wb_addr = 0; | ||
310 | unsigned long total_mem = 0; | ||
311 | |||
312 | efi_map_start = __va(ia64_boot_param->efi_memmap); | ||
313 | efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; | ||
314 | efi_desc_size = ia64_boot_param->efi_memdesc_size; | ||
315 | |||
316 | for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { | ||
317 | md = p; | ||
318 | |||
319 | /* skip over non-WB memory descriptors; WB memory is all we're interested in... */ | ||
320 | if (!(md->attribute & EFI_MEMORY_WB)) | ||
321 | continue; | ||
322 | |||
323 | /* | ||
324 | * granule_addr is the base of md's first granule. | ||
325 | * [granule_addr - first_non_wb_addr) is guaranteed to | ||
326 | * be contiguous WB memory. | ||
327 | */ | ||
328 | granule_addr = GRANULEROUNDDOWN(md->phys_addr); | ||
329 | first_non_wb_addr = max(first_non_wb_addr, granule_addr); | ||
330 | |||
331 | if (first_non_wb_addr < md->phys_addr) { | ||
332 | trim_bottom(md, granule_addr + IA64_GRANULE_SIZE); | ||
333 | granule_addr = GRANULEROUNDDOWN(md->phys_addr); | ||
334 | first_non_wb_addr = max(first_non_wb_addr, granule_addr); | ||
335 | } | ||
336 | |||
337 | for (q = p; q < efi_map_end; q += efi_desc_size) { | ||
338 | check_md = q; | ||
339 | |||
340 | if ((check_md->attribute & EFI_MEMORY_WB) && | ||
341 | (check_md->phys_addr == first_non_wb_addr)) | ||
342 | first_non_wb_addr += check_md->num_pages << EFI_PAGE_SHIFT; | ||
343 | else | ||
344 | break; /* non-WB or hole */ | ||
345 | } | ||
346 | |||
347 | last_granule_addr = GRANULEROUNDDOWN(first_non_wb_addr); | ||
348 | if (last_granule_addr < md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) | ||
349 | trim_top(md, last_granule_addr); | ||
350 | |||
351 | if (is_available_memory(md)) { | ||
352 | if (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) >= max_addr) { | ||
353 | if (md->phys_addr >= max_addr) | ||
354 | continue; | ||
355 | md->num_pages = (max_addr - md->phys_addr) >> EFI_PAGE_SHIFT; | ||
356 | first_non_wb_addr = max_addr; | ||
357 | } | ||
358 | |||
359 | if (total_mem >= mem_limit) | ||
360 | continue; | ||
361 | |||
362 | if (total_mem + (md->num_pages << EFI_PAGE_SHIFT) > mem_limit) { | ||
363 | unsigned long limit_addr = md->phys_addr; | ||
364 | |||
365 | limit_addr += mem_limit - total_mem; | ||
366 | limit_addr = GRANULEROUNDDOWN(limit_addr); | ||
367 | |||
368 | if (md->phys_addr > limit_addr) | ||
369 | continue; | ||
370 | |||
371 | md->num_pages = (limit_addr - md->phys_addr) >> | ||
372 | EFI_PAGE_SHIFT; | ||
373 | first_non_wb_addr = max_addr = md->phys_addr + | ||
374 | (md->num_pages << EFI_PAGE_SHIFT); | ||
375 | } | ||
376 | total_mem += (md->num_pages << EFI_PAGE_SHIFT); | ||
377 | |||
378 | if (md->num_pages == 0) | ||
379 | continue; | ||
380 | |||
381 | curr.start = PAGE_OFFSET + md->phys_addr; | ||
382 | curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT); | ||
383 | |||
384 | if (!prev_valid) { | ||
385 | prev = curr; | ||
386 | prev_valid = 1; | ||
387 | } else { | ||
388 | if (curr.start < prev.start) | ||
389 | printk(KERN_ERR "Oops: EFI memory table not ordered!\n"); | ||
390 | |||
391 | if (prev.end == curr.start) { | ||
392 | /* merge two consecutive memory ranges */ | ||
393 | prev.end = curr.end; | ||
394 | } else { | ||
395 | start = PAGE_ALIGN(prev.start); | ||
396 | end = prev.end & PAGE_MASK; | ||
397 | if ((end > start) && (*callback)(start, end, arg) < 0) | ||
398 | return; | ||
399 | prev = curr; | ||
400 | } | ||
401 | } | ||
402 | } | ||
403 | } | ||
404 | if (prev_valid) { | ||
405 | start = PAGE_ALIGN(prev.start); | ||
406 | end = prev.end & PAGE_MASK; | ||
407 | if (end > start) | ||
408 | (*callback)(start, end, arg); | ||
409 | } | ||
410 | } | ||
411 | |||
412 | /* | ||
413 | * Looks for the PAL_CODE region reported by EFI and maps it using an | ||
414 | * ITR to enable safe PAL calls in virtual mode. See IA-64 Processor | ||
415 | * Abstraction Layer, chapter 11 in the ADAG. | ||
416 | */ | ||
417 | |||
418 | void * | ||
419 | efi_get_pal_addr (void) | ||
420 | { | ||
421 | void *efi_map_start, *efi_map_end, *p; | ||
422 | efi_memory_desc_t *md; | ||
423 | u64 efi_desc_size; | ||
424 | int pal_code_count = 0; | ||
425 | u64 vaddr, mask; | ||
426 | |||
427 | efi_map_start = __va(ia64_boot_param->efi_memmap); | ||
428 | efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; | ||
429 | efi_desc_size = ia64_boot_param->efi_memdesc_size; | ||
430 | |||
431 | for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { | ||
432 | md = p; | ||
433 | if (md->type != EFI_PAL_CODE) | ||
434 | continue; | ||
435 | |||
436 | if (++pal_code_count > 1) { | ||
437 | printk(KERN_ERR "Too many EFI Pal Code memory ranges, dropped @ %lx\n", | ||
438 | md->phys_addr); | ||
439 | continue; | ||
440 | } | ||
441 | /* | ||
442 | * The only ITLB entry in region 7 that is used is the one installed by | ||
443 | * __start(). That entry covers a 64MB range. | ||
444 | */ | ||
445 | mask = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1); | ||
446 | vaddr = PAGE_OFFSET + md->phys_addr; | ||
447 | |||
448 | /* | ||
449 | * We must check that the PAL mapping won't overlap with the kernel | ||
450 | * mapping. | ||
451 | * | ||
452 | * PAL code is guaranteed to be aligned on a power of 2 between 4k and | ||
453 | * 256KB, so only one ITR is needed to map it. This implies that the | ||
454 | * PAL code is always aligned on its size, i.e., the closest matching page | ||
455 | * size supported by the TLB. Therefore PAL code is guaranteed never to | ||
456 | * cross a 64MB boundary unless it is bigger than 64MB (very unlikely!). So for | ||
457 | * now the following test is enough to determine whether or not we need a | ||
458 | * dedicated ITR for the PAL code. | ||
459 | */ | ||
460 | if ((vaddr & mask) == (KERNEL_START & mask)) { | ||
461 | printk(KERN_INFO "%s: no need to install ITR for PAL code\n", | ||
462 | __FUNCTION__); | ||
463 | continue; | ||
464 | } | ||
465 | |||
466 | if (md->num_pages << EFI_PAGE_SHIFT > IA64_GRANULE_SIZE) | ||
467 | panic("Woah! PAL code size bigger than a granule!"); | ||
468 | |||
469 | #if EFI_DEBUG | ||
470 | mask = ~((1 << IA64_GRANULE_SHIFT) - 1); | ||
471 | |||
472 | printk(KERN_INFO "CPU %d: mapping PAL code [0x%lx-0x%lx) into [0x%lx-0x%lx)\n", | ||
473 | smp_processor_id(), md->phys_addr, | ||
474 | md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), | ||
475 | vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE); | ||
476 | #endif | ||
477 | return __va(md->phys_addr); | ||
478 | } | ||
479 | printk(KERN_WARNING "%s: no PAL-code memory-descriptor found\n", | ||
480 | __FUNCTION__); | ||
481 | return NULL; | ||
482 | } | ||
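The overlap test above compares 64MB-aligned bases: with the kernel ITR covering a 64MB page (so KERNEL_TR_PAGE_SHIFT is taken to be 26 here, per the comment), (vaddr & mask) == (KERNEL_START & mask) holds exactly when the PAL code falls inside the 64MB region the kernel mapping already covers. A small sketch of the arithmetic, with hypothetical addresses:

    #include <stdio.h>
    #include <stdint.h>

    #define KERNEL_TR_PAGE_SHIFT 26        /* 64MB ITR page; assumed from the comment above */

    /* Same-64MB-region test used by efi_get_pal_addr(). */
    static int shares_kernel_itr(uint64_t vaddr, uint64_t kernel_start)
    {
            uint64_t mask = ~(((uint64_t)1 << KERNEL_TR_PAGE_SHIFT) - 1);

            return (vaddr & mask) == (kernel_start & mask);
    }

    int main(void)
    {
            /* both hypothetical addresses fall in the same 64MB region: prints 1 */
            printf("%d\n", shares_kernel_itr(0xe000000001000000ULL,
                                             0xe000000000500000ULL));
            return 0;
    }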
483 | |||
484 | void | ||
485 | efi_map_pal_code (void) | ||
486 | { | ||
487 | void *pal_vaddr = efi_get_pal_addr (); | ||
488 | u64 psr; | ||
489 | |||
490 | if (!pal_vaddr) | ||
491 | return; | ||
492 | |||
493 | /* | ||
494 | * Cannot write to CRx with PSR.ic=1 | ||
495 | */ | ||
496 | psr = ia64_clear_ic(); | ||
497 | ia64_itr(0x1, IA64_TR_PALCODE, GRANULEROUNDDOWN((unsigned long) pal_vaddr), | ||
498 | pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)), | ||
499 | IA64_GRANULE_SHIFT); | ||
500 | ia64_set_psr(psr); /* restore psr */ | ||
501 | ia64_srlz_i(); | ||
502 | } | ||
503 | |||
504 | void __init | ||
505 | efi_init (void) | ||
506 | { | ||
507 | void *efi_map_start, *efi_map_end; | ||
508 | efi_config_table_t *config_tables; | ||
509 | efi_char16_t *c16; | ||
510 | u64 efi_desc_size; | ||
511 | char *cp, *end, vendor[100] = "unknown"; | ||
512 | extern char saved_command_line[]; | ||
513 | int i; | ||
514 | |||
515 | /* it's too early to be able to use the standard kernel command line support... */ | ||
516 | for (cp = saved_command_line; *cp; ) { | ||
517 | if (memcmp(cp, "mem=", 4) == 0) { | ||
518 | cp += 4; | ||
519 | mem_limit = memparse(cp, &end); | ||
520 | if (end != cp) | ||
521 | break; | ||
522 | cp = end; | ||
523 | } else if (memcmp(cp, "max_addr=", 9) == 0) { | ||
524 | cp += 9; | ||
525 | max_addr = GRANULEROUNDDOWN(memparse(cp, &end)); | ||
526 | if (end != cp) | ||
527 | break; | ||
528 | cp = end; | ||
529 | } else { | ||
530 | while (*cp != ' ' && *cp) | ||
531 | ++cp; | ||
532 | while (*cp == ' ') | ||
533 | ++cp; | ||
534 | } | ||
535 | } | ||
536 | if (max_addr != ~0UL) | ||
537 | printk(KERN_INFO "Ignoring memory above %luMB\n", max_addr >> 20); | ||
538 | |||
539 | efi.systab = __va(ia64_boot_param->efi_systab); | ||
540 | |||
541 | /* | ||
542 | * Verify the EFI Table | ||
543 | */ | ||
544 | if (efi.systab == NULL) | ||
545 | panic("Woah! Can't find EFI system table.\n"); | ||
546 | if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) | ||
547 | panic("Woah! EFI system table signature incorrect\n"); | ||
548 | if ((efi.systab->hdr.revision ^ EFI_SYSTEM_TABLE_REVISION) >> 16 != 0) | ||
549 | printk(KERN_WARNING "Warning: EFI system table major version mismatch: " | ||
550 | "got %d.%02d, expected %d.%02d\n", | ||
551 | efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, | ||
552 | EFI_SYSTEM_TABLE_REVISION >> 16, EFI_SYSTEM_TABLE_REVISION & 0xffff); | ||
553 | |||
554 | config_tables = __va(efi.systab->tables); | ||
555 | |||
556 | /* Show what we know for posterity */ | ||
557 | c16 = __va(efi.systab->fw_vendor); | ||
558 | if (c16) { | ||
559 | for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) | ||
560 | vendor[i] = *c16++; | ||
561 | vendor[i] = '\0'; | ||
562 | } | ||
563 | |||
564 | printk(KERN_INFO "EFI v%u.%.02u by %s:", | ||
565 | efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); | ||
566 | |||
567 | for (i = 0; i < (int) efi.systab->nr_tables; i++) { | ||
568 | if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { | ||
569 | efi.mps = __va(config_tables[i].table); | ||
570 | printk(" MPS=0x%lx", config_tables[i].table); | ||
571 | } else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { | ||
572 | efi.acpi20 = __va(config_tables[i].table); | ||
573 | printk(" ACPI 2.0=0x%lx", config_tables[i].table); | ||
574 | } else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { | ||
575 | efi.acpi = __va(config_tables[i].table); | ||
576 | printk(" ACPI=0x%lx", config_tables[i].table); | ||
577 | } else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) { | ||
578 | efi.smbios = __va(config_tables[i].table); | ||
579 | printk(" SMBIOS=0x%lx", config_tables[i].table); | ||
580 | } else if (efi_guidcmp(config_tables[i].guid, SAL_SYSTEM_TABLE_GUID) == 0) { | ||
581 | efi.sal_systab = __va(config_tables[i].table); | ||
582 | printk(" SALsystab=0x%lx", config_tables[i].table); | ||
583 | } else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { | ||
584 | efi.hcdp = __va(config_tables[i].table); | ||
585 | printk(" HCDP=0x%lx", config_tables[i].table); | ||
586 | } | ||
587 | } | ||
588 | printk("\n"); | ||
589 | |||
590 | runtime = __va(efi.systab->runtime); | ||
591 | efi.get_time = phys_get_time; | ||
592 | efi.set_time = phys_set_time; | ||
593 | efi.get_wakeup_time = phys_get_wakeup_time; | ||
594 | efi.set_wakeup_time = phys_set_wakeup_time; | ||
595 | efi.get_variable = phys_get_variable; | ||
596 | efi.get_next_variable = phys_get_next_variable; | ||
597 | efi.set_variable = phys_set_variable; | ||
598 | efi.get_next_high_mono_count = phys_get_next_high_mono_count; | ||
599 | efi.reset_system = phys_reset_system; | ||
600 | |||
601 | efi_map_start = __va(ia64_boot_param->efi_memmap); | ||
602 | efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; | ||
603 | efi_desc_size = ia64_boot_param->efi_memdesc_size; | ||
604 | |||
605 | #if EFI_DEBUG | ||
606 | /* print EFI memory map: */ | ||
607 | { | ||
608 | efi_memory_desc_t *md; | ||
609 | void *p; | ||
610 | |||
611 | for (i = 0, p = efi_map_start; p < efi_map_end; ++i, p += efi_desc_size) { | ||
612 | md = p; | ||
613 | printk("mem%02u: type=%u, attr=0x%lx, range=[0x%016lx-0x%016lx) (%luMB)\n", | ||
614 | i, md->type, md->attribute, md->phys_addr, | ||
615 | md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), | ||
616 | md->num_pages >> (20 - EFI_PAGE_SHIFT)); | ||
617 | } | ||
618 | } | ||
619 | #endif | ||
620 | |||
621 | efi_map_pal_code(); | ||
622 | efi_enter_virtual_mode(); | ||
623 | } | ||
624 | |||
625 | void | ||
626 | efi_enter_virtual_mode (void) | ||
627 | { | ||
628 | void *efi_map_start, *efi_map_end, *p; | ||
629 | efi_memory_desc_t *md; | ||
630 | efi_status_t status; | ||
631 | u64 efi_desc_size; | ||
632 | |||
633 | efi_map_start = __va(ia64_boot_param->efi_memmap); | ||
634 | efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; | ||
635 | efi_desc_size = ia64_boot_param->efi_memdesc_size; | ||
636 | |||
637 | for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { | ||
638 | md = p; | ||
639 | if (md->attribute & EFI_MEMORY_RUNTIME) { | ||
640 | /* | ||
641 | * Some descriptors have multiple bits set, so the order of | ||
642 | * the tests is relevant. | ||
643 | */ | ||
644 | if (md->attribute & EFI_MEMORY_WB) { | ||
645 | md->virt_addr = (u64) __va(md->phys_addr); | ||
646 | } else if (md->attribute & EFI_MEMORY_UC) { | ||
647 | md->virt_addr = (u64) ioremap(md->phys_addr, 0); | ||
648 | } else if (md->attribute & EFI_MEMORY_WC) { | ||
649 | #if 0 | ||
650 | md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P | ||
651 | | _PAGE_D | ||
652 | | _PAGE_MA_WC | ||
653 | | _PAGE_PL_0 | ||
654 | | _PAGE_AR_RW)); | ||
655 | #else | ||
656 | printk(KERN_INFO "EFI_MEMORY_WC mapping\n"); | ||
657 | md->virt_addr = (u64) ioremap(md->phys_addr, 0); | ||
658 | #endif | ||
659 | } else if (md->attribute & EFI_MEMORY_WT) { | ||
660 | #if 0 | ||
661 | md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P | ||
662 | | _PAGE_D | _PAGE_MA_WT | ||
663 | | _PAGE_PL_0 | ||
664 | | _PAGE_AR_RW)); | ||
665 | #else | ||
666 | printk(KERN_INFO "EFI_MEMORY_WT mapping\n"); | ||
667 | md->virt_addr = (u64) ioremap(md->phys_addr, 0); | ||
668 | #endif | ||
669 | } | ||
670 | } | ||
671 | } | ||
672 | |||
673 | status = efi_call_phys(__va(runtime->set_virtual_address_map), | ||
674 | ia64_boot_param->efi_memmap_size, | ||
675 | efi_desc_size, ia64_boot_param->efi_memdesc_version, | ||
676 | ia64_boot_param->efi_memmap); | ||
677 | if (status != EFI_SUCCESS) { | ||
678 | printk(KERN_WARNING "warning: unable to switch EFI into virtual mode " | ||
679 | "(status=%lu)\n", status); | ||
680 | return; | ||
681 | } | ||
682 | |||
683 | /* | ||
684 | * Now that EFI is in virtual mode, we call the EFI functions more efficiently: | ||
685 | */ | ||
686 | efi.get_time = virt_get_time; | ||
687 | efi.set_time = virt_set_time; | ||
688 | efi.get_wakeup_time = virt_get_wakeup_time; | ||
689 | efi.set_wakeup_time = virt_set_wakeup_time; | ||
690 | efi.get_variable = virt_get_variable; | ||
691 | efi.get_next_variable = virt_get_next_variable; | ||
692 | efi.set_variable = virt_set_variable; | ||
693 | efi.get_next_high_mono_count = virt_get_next_high_mono_count; | ||
694 | efi.reset_system = virt_reset_system; | ||
695 | } | ||
696 | |||
697 | /* | ||
698 | * Walk the EFI memory map looking for the I/O port range. There can only be one entry of | ||
699 | * this type; other I/O port ranges should be described via ACPI. | ||
700 | */ | ||
701 | u64 | ||
702 | efi_get_iobase (void) | ||
703 | { | ||
704 | void *efi_map_start, *efi_map_end, *p; | ||
705 | efi_memory_desc_t *md; | ||
706 | u64 efi_desc_size; | ||
707 | |||
708 | efi_map_start = __va(ia64_boot_param->efi_memmap); | ||
709 | efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; | ||
710 | efi_desc_size = ia64_boot_param->efi_memdesc_size; | ||
711 | |||
712 | for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { | ||
713 | md = p; | ||
714 | if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) { | ||
715 | if (md->attribute & EFI_MEMORY_UC) | ||
716 | return md->phys_addr; | ||
717 | } | ||
718 | } | ||
719 | return 0; | ||
720 | } | ||
721 | |||
722 | u32 | ||
723 | efi_mem_type (unsigned long phys_addr) | ||
724 | { | ||
725 | void *efi_map_start, *efi_map_end, *p; | ||
726 | efi_memory_desc_t *md; | ||
727 | u64 efi_desc_size; | ||
728 | |||
729 | efi_map_start = __va(ia64_boot_param->efi_memmap); | ||
730 | efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; | ||
731 | efi_desc_size = ia64_boot_param->efi_memdesc_size; | ||
732 | |||
733 | for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { | ||
734 | md = p; | ||
735 | |||
736 | if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) | ||
737 | return md->type; | ||
738 | } | ||
739 | return 0; | ||
740 | } | ||
741 | |||
742 | u64 | ||
743 | efi_mem_attributes (unsigned long phys_addr) | ||
744 | { | ||
745 | void *efi_map_start, *efi_map_end, *p; | ||
746 | efi_memory_desc_t *md; | ||
747 | u64 efi_desc_size; | ||
748 | |||
749 | efi_map_start = __va(ia64_boot_param->efi_memmap); | ||
750 | efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; | ||
751 | efi_desc_size = ia64_boot_param->efi_memdesc_size; | ||
752 | |||
753 | for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { | ||
754 | md = p; | ||
755 | |||
756 | if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) | ||
757 | return md->attribute; | ||
758 | } | ||
759 | return 0; | ||
760 | } | ||
761 | EXPORT_SYMBOL(efi_mem_attributes); | ||
762 | |||
763 | int | ||
764 | valid_phys_addr_range (unsigned long phys_addr, unsigned long *size) | ||
765 | { | ||
766 | void *efi_map_start, *efi_map_end, *p; | ||
767 | efi_memory_desc_t *md; | ||
768 | u64 efi_desc_size; | ||
769 | |||
770 | efi_map_start = __va(ia64_boot_param->efi_memmap); | ||
771 | efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; | ||
772 | efi_desc_size = ia64_boot_param->efi_memdesc_size; | ||
773 | |||
774 | for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { | ||
775 | md = p; | ||
776 | |||
777 | if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) { | ||
778 | if (!(md->attribute & EFI_MEMORY_WB)) | ||
779 | return 0; | ||
780 | |||
781 | if (*size > md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr) | ||
782 | *size = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr; | ||
783 | return 1; | ||
784 | } | ||
785 | } | ||
786 | return 0; | ||
787 | } | ||
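All three lookups above rely on the same unsigned-arithmetic idiom: because the operands are unsigned, phys_addr - md->phys_addr < size tests md->phys_addr <= phys_addr < md->phys_addr + size in a single comparison (an address below the descriptor base wraps around to a huge value and fails the test). A standalone sketch:

    #include <stdio.h>
    #include <stdint.h>

    /* One-comparison range check, as used by efi_mem_type() and friends. */
    static int in_range(uint64_t addr, uint64_t base, uint64_t size)
    {
            return addr - base < size;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   in_range(0x1000, 0x1000, 0x100),     /* 1: at base */
                   in_range(0x10ff, 0x1000, 0x100),     /* 1: last byte */
                   in_range(0x0fff, 0x1000, 0x100));    /* 0: below base wraps */
            return 0;
    }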
788 | |||
789 | int __init | ||
790 | efi_uart_console_only(void) | ||
791 | { | ||
792 | efi_status_t status; | ||
793 | char *s, name[] = "ConOut"; | ||
794 | efi_guid_t guid = EFI_GLOBAL_VARIABLE_GUID; | ||
795 | efi_char16_t *utf16, name_utf16[32]; | ||
796 | unsigned char data[1024]; | ||
797 | unsigned long size = sizeof(data); | ||
798 | struct efi_generic_dev_path *hdr, *end_addr; | ||
799 | int uart = 0; | ||
800 | |||
801 | /* Convert to UTF-16 */ | ||
802 | utf16 = name_utf16; | ||
803 | s = name; | ||
804 | while (*s) | ||
805 | *utf16++ = *s++ & 0x7f; | ||
806 | *utf16 = 0; | ||
807 | |||
808 | status = efi.get_variable(name_utf16, &guid, NULL, &size, data); | ||
809 | if (status != EFI_SUCCESS) { | ||
810 | printk(KERN_ERR "No EFI %s variable?\n", name); | ||
811 | return 0; | ||
812 | } | ||
813 | |||
814 | hdr = (struct efi_generic_dev_path *) data; | ||
815 | end_addr = (struct efi_generic_dev_path *) ((u8 *) data + size); | ||
816 | while (hdr < end_addr) { | ||
817 | if (hdr->type == EFI_DEV_MSG && | ||
818 | hdr->sub_type == EFI_DEV_MSG_UART) | ||
819 | uart = 1; | ||
820 | else if (hdr->type == EFI_DEV_END_PATH || | ||
821 | hdr->type == EFI_DEV_END_PATH2) { | ||
822 | if (!uart) | ||
823 | return 0; | ||
824 | if (hdr->sub_type == EFI_DEV_END_ENTIRE) | ||
825 | return 1; | ||
826 | uart = 0; | ||
827 | } | ||
828 | hdr = (struct efi_generic_dev_path *) ((u8 *) hdr + hdr->length); | ||
829 | } | ||
830 | printk(KERN_ERR "Malformed %s value\n", name); | ||
831 | return 0; | ||
832 | } | ||
diff --git a/arch/ia64/kernel/efi_stub.S b/arch/ia64/kernel/efi_stub.S new file mode 100644 index 000000000000..5a7fe70212a9 --- /dev/null +++ b/arch/ia64/kernel/efi_stub.S | |||
@@ -0,0 +1,86 @@ | |||
1 | /* | ||
2 | * EFI call stub. | ||
3 | * | ||
4 | * Copyright (C) 1999-2001 Hewlett-Packard Co | ||
5 | * David Mosberger <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * This stub allows us to make EFI calls in physical mode with interrupts | ||
8 | * turned off. We need this because we can't call SetVirtualAddressMap() until | ||
9 | * the kernel has booted far enough to allow allocation of struct vm_area_struct | ||
10 | * entries (which we would need to map stuff with memory attributes other | ||
11 | * than uncached or writeback...). Since the GetTime() service gets called | ||
12 | * earlier than that, we need to be able to make physical mode EFI calls from | ||
13 | * the kernel. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System | ||
18 | * Abstraction Layer Specification", revision 2.6e). Note that | ||
19 | * psr.dfl and psr.dfh MUST be cleared, despite what this manual says. | ||
20 | * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call | ||
21 | * (the br.ia instruction fails unless psr.dfl and psr.dfh are | ||
22 | * cleared). Fortunately, SAL promises not to touch the floating | ||
23 | * point regs, so at least we don't have to save f2-f127. | ||
24 | */ | ||
25 | #define PSR_BITS_TO_CLEAR \ | ||
26 | (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT | \ | ||
27 | IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \ | ||
28 | IA64_PSR_DFL | IA64_PSR_DFH) | ||
29 | |||
30 | #define PSR_BITS_TO_SET \ | ||
31 | (IA64_PSR_BN) | ||
32 | |||
33 | #include <asm/processor.h> | ||
34 | #include <asm/asmmacro.h> | ||
35 | |||
36 | /* | ||
37 | * Inputs: | ||
38 | * in0 = address of function descriptor of EFI routine to call | ||
39 | * in1..in7 = arguments to routine | ||
40 | * | ||
41 | * Outputs: | ||
42 | * r8 = EFI_STATUS returned by called function | ||
43 | */ | ||
44 | |||
45 | GLOBAL_ENTRY(efi_call_phys) | ||
46 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) | ||
47 | alloc loc1=ar.pfs,8,7,7,0 | ||
48 | ld8 r2=[in0],8 // load EFI function's entry point | ||
49 | mov loc0=rp | ||
50 | .body | ||
51 | ;; | ||
52 | mov loc2=gp // save global pointer | ||
53 | mov loc4=ar.rsc // save RSE configuration | ||
54 | mov ar.rsc=0 // put RSE in enforced lazy, LE mode | ||
55 | ;; | ||
56 | ld8 gp=[in0] // load EFI function's global pointer | ||
57 | movl r16=PSR_BITS_TO_CLEAR | ||
58 | mov loc3=psr // save processor status word | ||
59 | movl r17=PSR_BITS_TO_SET | ||
60 | ;; | ||
61 | or loc3=loc3,r17 | ||
62 | mov b6=r2 | ||
63 | ;; | ||
64 | andcm r16=loc3,r16 // get psr with IT, DT, and RT bits cleared | ||
65 | br.call.sptk.many rp=ia64_switch_mode_phys | ||
66 | .ret0: mov out4=in5 | ||
67 | mov out0=in1 | ||
68 | mov out1=in2 | ||
69 | mov out2=in3 | ||
70 | mov out3=in4 | ||
71 | mov out5=in6 | ||
72 | mov out6=in7 | ||
73 | mov loc5=r19 | ||
74 | mov loc6=r20 | ||
75 | br.call.sptk.many rp=b6 // call the EFI function | ||
76 | .ret1: mov ar.rsc=0 // put RSE in enforced lazy, LE mode | ||
77 | mov r16=loc3 | ||
78 | mov r19=loc5 | ||
79 | mov r20=loc6 | ||
80 | br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode | ||
81 | .ret2: mov ar.rsc=loc4 // restore RSE configuration | ||
82 | mov ar.pfs=loc1 | ||
83 | mov rp=loc0 | ||
84 | mov gp=loc2 | ||
85 | br.ret.sptk.many rp | ||
86 | END(efi_call_phys) | ||
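From the C side, the stub is declared variadic (extern efi_status_t efi_call_phys (void *, ...)), with in0 receiving the address of the EFI function descriptor; the two ld8s in the prologue then pick up the entry point and the global pointer from that descriptor. efi_enter_virtual_mode() in efi.c above uses it like so:

    status = efi_call_phys(__va(runtime->set_virtual_address_map),
                           ia64_boot_param->efi_memmap_size,
                           efi_desc_size, ia64_boot_param->efi_memdesc_version,
                           ia64_boot_param->efi_memmap);

Up to seven arguments (in1..in7) are forwarded to the EFI routine through out0..out6.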
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S new file mode 100644 index 000000000000..0272c010a3ba --- /dev/null +++ b/arch/ia64/kernel/entry.S | |||
@@ -0,0 +1,1587 @@ | |||
1 | /* | ||
2 | * ia64/kernel/entry.S | ||
3 | * | ||
4 | * Kernel entry points. | ||
5 | * | ||
6 | * Copyright (C) 1998-2003, 2005 Hewlett-Packard Co | ||
7 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
8 | * Copyright (C) 1999, 2002-2003 | ||
9 | * Asit Mallick <Asit.K.Mallick@intel.com> | ||
10 | * Don Dugger <Don.Dugger@intel.com> | ||
11 | * Suresh Siddha <suresh.b.siddha@intel.com> | ||
12 | * Fenghua Yu <fenghua.yu@intel.com> | ||
13 | * Copyright (C) 1999 VA Linux Systems | ||
14 | * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> | ||
15 | */ | ||
16 | /* | ||
17 | * ia64_switch_to now places correct virtual mapping in TR2 for | ||
18 | * kernel stack. This allows us to handle interrupts without changing | ||
19 | * to physical mode. | ||
20 | * | ||
21 | * Jonathan Nicklin <nicklin@missioncriticallinux.com> | ||
22 | * Patrick O'Rourke <orourke@missioncriticallinux.com> | ||
23 | * 11/07/2000 | ||
24 | */ | ||
25 | /* | ||
26 | * Global (preserved) predicate usage on syscall entry/exit path: | ||
27 | * | ||
28 | * pKStk: See entry.h. | ||
29 | * pUStk: See entry.h. | ||
30 | * pSys: See entry.h. | ||
31 | * pNonSys: !pSys | ||
32 | */ | ||
33 | |||
34 | #include <linux/config.h> | ||
35 | |||
36 | #include <asm/asmmacro.h> | ||
37 | #include <asm/cache.h> | ||
38 | #include <asm/errno.h> | ||
39 | #include <asm/kregs.h> | ||
40 | #include <asm/offsets.h> | ||
41 | #include <asm/pgtable.h> | ||
42 | #include <asm/percpu.h> | ||
43 | #include <asm/processor.h> | ||
44 | #include <asm/thread_info.h> | ||
45 | #include <asm/unistd.h> | ||
46 | |||
47 | #include "minstate.h" | ||
48 | |||
49 | /* | ||
50 | * execve() is special because in case of success, we need to | ||
51 | * set up a null register window frame. | ||
52 | */ | ||
53 | ENTRY(ia64_execve) | ||
54 | /* | ||
55 | * Allocate 8 input registers since ptrace() may clobber them | ||
56 | */ | ||
57 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) | ||
58 | alloc loc1=ar.pfs,8,2,4,0 | ||
59 | mov loc0=rp | ||
60 | .body | ||
61 | mov out0=in0 // filename | ||
62 | ;; // stop bit between alloc and call | ||
63 | mov out1=in1 // argv | ||
64 | mov out2=in2 // envp | ||
65 | add out3=16,sp // regs | ||
66 | br.call.sptk.many rp=sys_execve | ||
67 | .ret0: | ||
68 | #ifdef CONFIG_IA32_SUPPORT | ||
69 | /* | ||
70 | * Check if we're returning to ia32 mode. If so, we need to restore ia32 registers | ||
71 | * from pt_regs. | ||
72 | */ | ||
73 | adds r16=PT(CR_IPSR)+16,sp | ||
74 | ;; | ||
75 | ld8 r16=[r16] | ||
76 | #endif | ||
77 | cmp4.ge p6,p7=r8,r0 | ||
78 | mov ar.pfs=loc1 // restore ar.pfs | ||
79 | sxt4 r8=r8 // return 64-bit result | ||
80 | ;; | ||
81 | stf.spill [sp]=f0 | ||
82 | (p6) cmp.ne pKStk,pUStk=r0,r0 // a successful execve() lands us in user-mode... | ||
83 | mov rp=loc0 | ||
84 | (p6) mov ar.pfs=r0 // clear ar.pfs on success | ||
85 | (p7) br.ret.sptk.many rp | ||
86 | |||
87 | /* | ||
88 | * In theory, we'd have to zap this state only to prevent leaking of | ||
89 | * security-sensitive state (e.g., if current->mm->dumpable is zero). However, | ||
90 | * this executes in less than 20 cycles even on Itanium, so it's not worth | ||
91 | * optimizing for. | ||
92 | */ | ||
93 | mov ar.unat=0; mov ar.lc=0 | ||
94 | mov r4=0; mov f2=f0; mov b1=r0 | ||
95 | mov r5=0; mov f3=f0; mov b2=r0 | ||
96 | mov r6=0; mov f4=f0; mov b3=r0 | ||
97 | mov r7=0; mov f5=f0; mov b4=r0 | ||
98 | ldf.fill f12=[sp]; mov f13=f0; mov b5=r0 | ||
99 | ldf.fill f14=[sp]; ldf.fill f15=[sp]; mov f16=f0 | ||
100 | ldf.fill f17=[sp]; ldf.fill f18=[sp]; mov f19=f0 | ||
101 | ldf.fill f20=[sp]; ldf.fill f21=[sp]; mov f22=f0 | ||
102 | ldf.fill f23=[sp]; ldf.fill f24=[sp]; mov f25=f0 | ||
103 | ldf.fill f26=[sp]; ldf.fill f27=[sp]; mov f28=f0 | ||
104 | ldf.fill f29=[sp]; ldf.fill f30=[sp]; mov f31=f0 | ||
105 | #ifdef CONFIG_IA32_SUPPORT | ||
106 | tbit.nz p6,p0=r16, IA64_PSR_IS_BIT | ||
107 | movl loc0=ia64_ret_from_ia32_execve | ||
108 | ;; | ||
109 | (p6) mov rp=loc0 | ||
110 | #endif | ||
111 | br.ret.sptk.many rp | ||
112 | END(ia64_execve) | ||
113 | |||
114 | /* | ||
115 | * sys_clone2(u64 flags, u64 ustack_base, u64 ustack_size, u64 parent_tidptr, u64 child_tidptr, | ||
116 | * u64 tls) | ||
117 | */ | ||
118 | GLOBAL_ENTRY(sys_clone2) | ||
119 | /* | ||
120 | * Allocate 8 input registers since ptrace() may clobber them | ||
121 | */ | ||
122 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) | ||
123 | alloc r16=ar.pfs,8,2,6,0 | ||
124 | DO_SAVE_SWITCH_STACK | ||
125 | adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp | ||
126 | mov loc0=rp | ||
127 | mov loc1=r16 // save ar.pfs across do_fork | ||
128 | .body | ||
129 | mov out1=in1 | ||
130 | mov out3=in2 | ||
131 | tbit.nz p6,p0=in0,CLONE_SETTLS_BIT | ||
132 | mov out4=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID | ||
133 | ;; | ||
134 | (p6) st8 [r2]=in5 // store TLS in r16 for copy_thread() | ||
135 | mov out5=in4 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID | ||
136 | adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = &regs | ||
137 | mov out0=in0 // out0 = clone_flags | ||
138 | br.call.sptk.many rp=do_fork | ||
139 | .ret1: .restore sp | ||
140 | adds sp=IA64_SWITCH_STACK_SIZE,sp // pop the switch stack | ||
141 | mov ar.pfs=loc1 | ||
142 | mov rp=loc0 | ||
143 | br.ret.sptk.many rp | ||
144 | END(sys_clone2) | ||
145 | |||
146 | /* | ||
147 | * sys_clone(u64 flags, u64 ustack_base, u64 parent_tidptr, u64 child_tidptr, u64 tls) | ||
148 | * Deprecated. Use sys_clone2() instead. | ||
149 | */ | ||
150 | GLOBAL_ENTRY(sys_clone) | ||
151 | /* | ||
152 | * Allocate 8 input registers since ptrace() may clobber them | ||
153 | */ | ||
154 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) | ||
155 | alloc r16=ar.pfs,8,2,6,0 | ||
156 | DO_SAVE_SWITCH_STACK | ||
157 | adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp | ||
158 | mov loc0=rp | ||
159 | mov loc1=r16 // save ar.pfs across do_fork | ||
160 | .body | ||
161 | mov out1=in1 | ||
162 | mov out3=16 // stacksize (compensates for 16-byte scratch area) | ||
163 | tbit.nz p6,p0=in0,CLONE_SETTLS_BIT | ||
164 | mov out4=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID | ||
165 | ;; | ||
166 | (p6) st8 [r2]=in4 // store TLS in r16 for copy_thread() (it ends up in r13/tp) | ||
167 | mov out5=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID | ||
168 | adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = &regs | ||
169 | mov out0=in0 // out0 = clone_flags | ||
170 | br.call.sptk.many rp=do_fork | ||
171 | .ret2: .restore sp | ||
172 | adds sp=IA64_SWITCH_STACK_SIZE,sp // pop the switch stack | ||
173 | mov ar.pfs=loc1 | ||
174 | mov rp=loc0 | ||
175 | br.ret.sptk.many rp | ||
176 | END(sys_clone) | ||
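
Both clone paths above park the TLS pointer in the pt_regs slot for r16 when CLONE_SETTLS is requested, so copy_thread() can later install it as the child's thread pointer (r13). A minimal C model of that hand-off, with a stand-in struct since the real pt_regs layout lives in asm/ptrace.h:

#include <stdint.h>

#define CLONE_SETTLS 0x00080000          /* matches CLONE_SETTLS_BIT = 19 */

struct pt_regs_model { uint64_t r16; };  /* stand-in for the real pt_regs */

/* Model of "(p6) st8 [r2]=tls_arg" guarded by "tbit.nz p6,p0=in0,CLONE_SETTLS_BIT". */
static void stash_tls(struct pt_regs_model *regs, uint64_t clone_flags,
                      uint64_t tls)
{
	if (clone_flags & CLONE_SETTLS)
		regs->r16 = tls;         /* copy_thread() moves this into r13 (tp) */
}
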
177 | |||
178 | /* | ||
179 | * prev_task <- ia64_switch_to(struct task_struct *next) | ||
180 | * With Ingo's new scheduler, interrupts are disabled when this routine gets | ||
181 | * called. The code starting at .map relies on this. The rest of the code | ||
182 | * doesn't care about the interrupt masking status. | ||
183 | */ | ||
184 | GLOBAL_ENTRY(ia64_switch_to) | ||
185 | .prologue | ||
186 | alloc r16=ar.pfs,1,0,0,0 | ||
187 | DO_SAVE_SWITCH_STACK | ||
188 | .body | ||
189 | |||
190 | adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13 | ||
191 | movl r25=init_task | ||
192 | mov r27=IA64_KR(CURRENT_STACK) | ||
193 | adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0 | ||
194 | dep r20=0,in0,61,3 // physical address of "next" | ||
195 | ;; | ||
196 | st8 [r22]=sp // save kernel stack pointer of old task | ||
197 | shr.u r26=r20,IA64_GRANULE_SHIFT | ||
198 | cmp.eq p7,p6=r25,in0 | ||
199 | ;; | ||
200 | /* | ||
201 | * If we've already mapped this task's page, we can skip doing it again. | ||
202 | */ | ||
203 | (p6) cmp.eq p7,p6=r26,r27 | ||
204 | (p6) br.cond.dpnt .map | ||
205 | ;; | ||
206 | .done: | ||
207 | (p6) ssm psr.ic // if we had to map, reenable the psr.ic bit FIRST!!! | ||
208 | ;; | ||
209 | (p6) srlz.d | ||
210 | ld8 sp=[r21] // load kernel stack pointer of new task | ||
211 | mov IA64_KR(CURRENT)=in0 // update "current" application register | ||
212 | mov r8=r13 // return pointer to previously running task | ||
213 | mov r13=in0 // set "current" pointer | ||
214 | ;; | ||
215 | DO_LOAD_SWITCH_STACK | ||
216 | |||
217 | #ifdef CONFIG_SMP | ||
218 | sync.i // ensure "fc"s done by this CPU are visible on other CPUs | ||
219 | #endif | ||
220 | br.ret.sptk.many rp // boogie on out in new context | ||
221 | |||
222 | .map: | ||
223 | rsm psr.ic // interrupts (psr.i) are already disabled here | ||
224 | movl r25=PAGE_KERNEL | ||
225 | ;; | ||
226 | srlz.d | ||
227 | or r23=r25,r20 // construct PA | page properties | ||
228 | mov r25=IA64_GRANULE_SHIFT<<2 | ||
229 | ;; | ||
230 | mov cr.itir=r25 | ||
231 | mov cr.ifa=in0 // VA of next task... | ||
232 | ;; | ||
233 | mov r25=IA64_TR_CURRENT_STACK | ||
234 | mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped... | ||
235 | ;; | ||
236 | itr.d dtr[r25]=r23 // wire in new mapping... | ||
237 | br.cond.sptk .done | ||
238 | END(ia64_switch_to) | ||
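
The .map/.done dance above boils down to one decision: does the incoming task's stack already sit in the granule wired into the TLB? A hedged C model of that check (the granule shift value and the 61-bit physical mask are assumptions mirroring the dep/shr.u instructions above):

#include <stdint.h>

#define GRANULE_SHIFT 16ULL   /* assumption: IA64_GRANULE_SHIFT is config-dependent */

/* Model of the test before ".map": init_task lives in the wired kernel
 * mapping, and a task whose stack falls in the already-mapped granule
 * (IA64_KR(CURRENT_STACK)) needs no new translation register entry. */
static int need_remap(uint64_t next_va, uint64_t mapped_granule, int is_init_task)
{
	uint64_t phys = next_va & ((1ULL << 61) - 1);  /* dep r20=0,in0,61,3 */
	return !is_init_task && (phys >> GRANULE_SHIFT) != mapped_granule;
}
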
239 | |||
240 | /* | ||
241 | * Note that interrupts are enabled during save_switch_stack and load_switch_stack. This | ||
242 | * means that we may get an interrupt with "sp" pointing to the new kernel stack while | ||
243 | * ar.bspstore is still pointing to the old kernel backing store area. Since ar.rsc, | ||
244 | * ar.rnat, ar.bsp, and ar.bspstore are all preserved by interrupts, this is not a | ||
245 | * problem. Also, we don't need to specify unwind information for preserved registers | ||
246 | * that are not modified in save_switch_stack as the right unwind information is already | ||
247 | * specified at the call-site of save_switch_stack. | ||
248 | */ | ||
249 | |||
250 | /* | ||
251 | * save_switch_stack: | ||
252 | * - r16 holds ar.pfs | ||
253 | * - b7 holds address to return to | ||
254 | * - rp (b0) holds return address to save | ||
255 | */ | ||
256 | GLOBAL_ENTRY(save_switch_stack) | ||
257 | .prologue | ||
258 | .altrp b7 | ||
259 | flushrs // flush dirty regs to backing store (must be first in insn group) | ||
260 | .save @priunat,r17 | ||
261 | mov r17=ar.unat // preserve caller's | ||
262 | .body | ||
263 | #ifdef CONFIG_ITANIUM | ||
264 | adds r2=16+128,sp | ||
265 | adds r3=16+64,sp | ||
266 | adds r14=SW(R4)+16,sp | ||
267 | ;; | ||
268 | st8.spill [r14]=r4,16 // spill r4 | ||
269 | lfetch.fault.excl.nt1 [r3],128 | ||
270 | ;; | ||
271 | lfetch.fault.excl.nt1 [r2],128 | ||
272 | lfetch.fault.excl.nt1 [r3],128 | ||
273 | ;; | ||
274 | lfetch.fault.excl [r2] | ||
275 | lfetch.fault.excl [r3] | ||
276 | adds r15=SW(R5)+16,sp | ||
277 | #else | ||
278 | add r2=16+3*128,sp | ||
279 | add r3=16,sp | ||
280 | add r14=SW(R4)+16,sp | ||
281 | ;; | ||
282 | st8.spill [r14]=r4,SW(R6)-SW(R4) // spill r4 and prefetch offset 0x1c0 | ||
283 | lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x010 | ||
284 | ;; | ||
285 | lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x090 | ||
286 | lfetch.fault.excl.nt1 [r2],128 // prefetch offset 0x190 | ||
287 | ;; | ||
288 | lfetch.fault.excl.nt1 [r3] // prefetch offset 0x110 | ||
289 | lfetch.fault.excl.nt1 [r2] // prefetch offset 0x210 | ||
290 | adds r15=SW(R5)+16,sp | ||
291 | #endif | ||
292 | ;; | ||
293 | st8.spill [r15]=r5,SW(R7)-SW(R5) // spill r5 | ||
294 | mov.m ar.rsc=0 // put RSE in mode: enforced lazy, little endian, pl 0 | ||
295 | add r2=SW(F2)+16,sp // r2 = &sw->f2 | ||
296 | ;; | ||
297 | st8.spill [r14]=r6,SW(B0)-SW(R6) // spill r6 | ||
298 | mov.m r18=ar.fpsr // preserve fpsr | ||
299 | add r3=SW(F3)+16,sp // r3 = &sw->f3 | ||
300 | ;; | ||
301 | stf.spill [r2]=f2,32 | ||
302 | mov.m r19=ar.rnat | ||
303 | mov r21=b0 | ||
304 | |||
305 | stf.spill [r3]=f3,32 | ||
306 | st8.spill [r15]=r7,SW(B2)-SW(R7) // spill r7 | ||
307 | mov r22=b1 | ||
308 | ;; | ||
309 | // since we're done with the spills, read and save ar.unat: | ||
310 | mov.m r29=ar.unat | ||
311 | mov.m r20=ar.bspstore | ||
312 | mov r23=b2 | ||
313 | stf.spill [r2]=f4,32 | ||
314 | stf.spill [r3]=f5,32 | ||
315 | mov r24=b3 | ||
316 | ;; | ||
317 | st8 [r14]=r21,SW(B1)-SW(B0) // save b0 | ||
318 | st8 [r15]=r23,SW(B3)-SW(B2) // save b2 | ||
319 | mov r25=b4 | ||
320 | mov r26=b5 | ||
321 | ;; | ||
322 | st8 [r14]=r22,SW(B4)-SW(B1) // save b1 | ||
323 | st8 [r15]=r24,SW(AR_PFS)-SW(B3) // save b3 | ||
324 | mov r21=ar.lc // I-unit | ||
325 | stf.spill [r2]=f12,32 | ||
326 | stf.spill [r3]=f13,32 | ||
327 | ;; | ||
328 | st8 [r14]=r25,SW(B5)-SW(B4) // save b4 | ||
329 | st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS) // save ar.pfs | ||
330 | stf.spill [r2]=f14,32 | ||
331 | stf.spill [r3]=f15,32 | ||
332 | ;; | ||
333 | st8 [r14]=r26 // save b5 | ||
334 | st8 [r15]=r21 // save ar.lc | ||
335 | stf.spill [r2]=f16,32 | ||
336 | stf.spill [r3]=f17,32 | ||
337 | ;; | ||
338 | stf.spill [r2]=f18,32 | ||
339 | stf.spill [r3]=f19,32 | ||
340 | ;; | ||
341 | stf.spill [r2]=f20,32 | ||
342 | stf.spill [r3]=f21,32 | ||
343 | ;; | ||
344 | stf.spill [r2]=f22,32 | ||
345 | stf.spill [r3]=f23,32 | ||
346 | ;; | ||
347 | stf.spill [r2]=f24,32 | ||
348 | stf.spill [r3]=f25,32 | ||
349 | ;; | ||
350 | stf.spill [r2]=f26,32 | ||
351 | stf.spill [r3]=f27,32 | ||
352 | ;; | ||
353 | stf.spill [r2]=f28,32 | ||
354 | stf.spill [r3]=f29,32 | ||
355 | ;; | ||
356 | stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30) | ||
357 | stf.spill [r3]=f31,SW(PR)-SW(F31) | ||
358 | add r14=SW(CALLER_UNAT)+16,sp | ||
359 | ;; | ||
360 | st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT) // save ar.unat | ||
361 | st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT) // save caller_unat | ||
362 | mov r21=pr | ||
363 | ;; | ||
364 | st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT) // save ar.rnat | ||
365 | st8 [r3]=r21 // save predicate registers | ||
366 | ;; | ||
367 | st8 [r2]=r20 // save ar.bspstore | ||
368 | st8 [r14]=r18 // save fpsr | ||
369 | mov ar.rsc=3 // put RSE back into eager mode, pl 0 | ||
370 | br.cond.sptk.many b7 | ||
371 | END(save_switch_stack) | ||
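
save_switch_stack toggles ar.rsc between 0 (enforced lazy) and 3 (eager), and code further down shifts a byte count up to bit 16 before "loadrs". A sketch of the ar.rsc field layout those constants rely on, as I recall it from the IA-64 architecture manual; treat the field positions as illustrative:

#include <stdint.h>

/* ar.rsc: mode in bits 1:0 (0 = enforced lazy, 3 = eager), privilege
 * level in bits 3:2, byte order in bit 4, "loadrs" byte count in bits 29:16. */
static uint64_t make_rsc(unsigned mode, unsigned pl, uint64_t loadrs_bytes)
{
	return (uint64_t)(mode & 3)
	     | ((uint64_t)(pl & 3) << 2)
	     | ((loadrs_bytes & 0x3fffULL) << 16);
}
/* make_rsc(0, 0, 0) == 0 and make_rsc(3, 0, 0) == 3, the two values used above. */
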
372 | |||
373 | /* | ||
374 | * load_switch_stack: | ||
375 | * - "invala" MUST be done at call site (normally in DO_LOAD_SWITCH_STACK) | ||
376 | * - b7 holds address to return to | ||
377 | * - must not touch r8-r11 | ||
378 | */ | ||
379 | ENTRY(load_switch_stack) | ||
380 | .prologue | ||
381 | .altrp b7 | ||
382 | |||
383 | .body | ||
384 | lfetch.fault.nt1 [sp] | ||
385 | adds r2=SW(AR_BSPSTORE)+16,sp | ||
386 | adds r3=SW(AR_UNAT)+16,sp | ||
387 | mov ar.rsc=0 // put RSE into enforced lazy mode | ||
388 | adds r14=SW(CALLER_UNAT)+16,sp | ||
389 | adds r15=SW(AR_FPSR)+16,sp | ||
390 | ;; | ||
391 | ld8 r27=[r2],(SW(B0)-SW(AR_BSPSTORE)) // bspstore | ||
392 | ld8 r29=[r3],(SW(B1)-SW(AR_UNAT)) // unat | ||
393 | ;; | ||
394 | ld8 r21=[r2],16 // restore b0 | ||
395 | ld8 r22=[r3],16 // restore b1 | ||
396 | ;; | ||
397 | ld8 r23=[r2],16 // restore b2 | ||
398 | ld8 r24=[r3],16 // restore b3 | ||
399 | ;; | ||
400 | ld8 r25=[r2],16 // restore b4 | ||
401 | ld8 r26=[r3],16 // restore b5 | ||
402 | ;; | ||
403 | ld8 r16=[r2],(SW(PR)-SW(AR_PFS)) // ar.pfs | ||
404 | ld8 r17=[r3],(SW(AR_RNAT)-SW(AR_LC)) // ar.lc | ||
405 | ;; | ||
406 | ld8 r28=[r2] // restore pr | ||
407 | ld8 r30=[r3] // restore rnat | ||
408 | ;; | ||
409 | ld8 r18=[r14],16 // restore caller's unat | ||
410 | ld8 r19=[r15],24 // restore fpsr | ||
411 | ;; | ||
412 | ldf.fill f2=[r14],32 | ||
413 | ldf.fill f3=[r15],32 | ||
414 | ;; | ||
415 | ldf.fill f4=[r14],32 | ||
416 | ldf.fill f5=[r15],32 | ||
417 | ;; | ||
418 | ldf.fill f12=[r14],32 | ||
419 | ldf.fill f13=[r15],32 | ||
420 | ;; | ||
421 | ldf.fill f14=[r14],32 | ||
422 | ldf.fill f15=[r15],32 | ||
423 | ;; | ||
424 | ldf.fill f16=[r14],32 | ||
425 | ldf.fill f17=[r15],32 | ||
426 | ;; | ||
427 | ldf.fill f18=[r14],32 | ||
428 | ldf.fill f19=[r15],32 | ||
429 | mov b0=r21 | ||
430 | ;; | ||
431 | ldf.fill f20=[r14],32 | ||
432 | ldf.fill f21=[r15],32 | ||
433 | mov b1=r22 | ||
434 | ;; | ||
435 | ldf.fill f22=[r14],32 | ||
436 | ldf.fill f23=[r15],32 | ||
437 | mov b2=r23 | ||
438 | ;; | ||
439 | mov ar.bspstore=r27 | ||
440 | mov ar.unat=r29 // establish unat holding the NaT bits for r4-r7 | ||
441 | mov b3=r24 | ||
442 | ;; | ||
443 | ldf.fill f24=[r14],32 | ||
444 | ldf.fill f25=[r15],32 | ||
445 | mov b4=r25 | ||
446 | ;; | ||
447 | ldf.fill f26=[r14],32 | ||
448 | ldf.fill f27=[r15],32 | ||
449 | mov b5=r26 | ||
450 | ;; | ||
451 | ldf.fill f28=[r14],32 | ||
452 | ldf.fill f29=[r15],32 | ||
453 | mov ar.pfs=r16 | ||
454 | ;; | ||
455 | ldf.fill f30=[r14],32 | ||
456 | ldf.fill f31=[r15],24 | ||
457 | mov ar.lc=r17 | ||
458 | ;; | ||
459 | ld8.fill r4=[r14],16 | ||
460 | ld8.fill r5=[r15],16 | ||
461 | mov pr=r28,-1 | ||
462 | ;; | ||
463 | ld8.fill r6=[r14],16 | ||
464 | ld8.fill r7=[r15],16 | ||
465 | |||
466 | mov ar.unat=r18 // restore caller's unat | ||
467 | mov ar.rnat=r30 // must restore after bspstore but before rsc! | ||
468 | mov ar.fpsr=r19 // restore fpsr | ||
469 | mov ar.rsc=3 // put RSE back into eager mode, pl 0 | ||
470 | br.cond.sptk.many b7 | ||
471 | END(load_switch_stack) | ||
472 | |||
473 | GLOBAL_ENTRY(__ia64_syscall) | ||
474 | .regstk 6,0,0,0 | ||
475 | mov r15=in5 // put syscall number in place | ||
476 | break __BREAK_SYSCALL | ||
477 | movl r2=errno | ||
478 | cmp.eq p6,p7=-1,r10 | ||
479 | ;; | ||
480 | (p6) st4 [r2]=r8 | ||
481 | (p6) mov r8=-1 | ||
482 | br.ret.sptk.many rp | ||
483 | END(__ia64_syscall) | ||
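
__ia64_syscall implements the user-level half of the ia64 syscall return convention: the kernel reports the result in r8 and flags failure with r10 == -1, in which case r8 carries the errno value. The same logic in portable C (a sketch of a libc-style wrapper, not actual libc source):

#include <errno.h>

static long syscall_return(long r8, long r10)
{
	if (r10 == -1) {     /* kernel flagged an error */
		errno = r8;  /* r8 holds the positive errno value */
		return -1;
	}
	return r8;           /* success: r8 is the result */
}
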
484 | |||
485 | GLOBAL_ENTRY(execve) | ||
486 | mov r15=__NR_execve // put syscall number in place | ||
487 | break __BREAK_SYSCALL | ||
488 | br.ret.sptk.many rp | ||
489 | END(execve) | ||
490 | |||
491 | GLOBAL_ENTRY(clone) | ||
492 | mov r15=__NR_clone // put syscall number in place | ||
493 | break __BREAK_SYSCALL | ||
494 | br.ret.sptk.many rp | ||
495 | END(clone) | ||
496 | |||
497 | /* | ||
498 | * Invoke a system call, but do some tracing before and after the call. | ||
499 | * We MUST preserve the current register frame throughout this routine | ||
500 | * because some system calls (such as ia64_execve) directly | ||
501 | * manipulate ar.pfs. | ||
502 | */ | ||
503 | GLOBAL_ENTRY(ia64_trace_syscall) | ||
504 | PT_REGS_UNWIND_INFO(0) | ||
505 | /* | ||
506 | * We need to preserve the scratch registers f6-f11 in case the system | ||
507 | * call is sigreturn. | ||
508 | */ | ||
509 | adds r16=PT(F6)+16,sp | ||
510 | adds r17=PT(F7)+16,sp | ||
511 | ;; | ||
512 | stf.spill [r16]=f6,32 | ||
513 | stf.spill [r17]=f7,32 | ||
514 | ;; | ||
515 | stf.spill [r16]=f8,32 | ||
516 | stf.spill [r17]=f9,32 | ||
517 | ;; | ||
518 | stf.spill [r16]=f10 | ||
519 | stf.spill [r17]=f11 | ||
520 | br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args | ||
521 | adds r16=PT(F6)+16,sp | ||
522 | adds r17=PT(F7)+16,sp | ||
523 | ;; | ||
524 | ldf.fill f6=[r16],32 | ||
525 | ldf.fill f7=[r17],32 | ||
526 | ;; | ||
527 | ldf.fill f8=[r16],32 | ||
528 | ldf.fill f9=[r17],32 | ||
529 | ;; | ||
530 | ldf.fill f10=[r16] | ||
531 | ldf.fill f11=[r17] | ||
532 | // the syscall number may have changed, so re-load it and re-calculate the | ||
533 | // syscall entry-point: | ||
534 | adds r15=PT(R15)+16,sp // r15 = &pt_regs.r15 (syscall #) | ||
535 | ;; | ||
536 | ld8 r15=[r15] | ||
537 | mov r3=NR_syscalls - 1 | ||
538 | ;; | ||
539 | adds r15=-1024,r15 | ||
540 | movl r16=sys_call_table | ||
541 | ;; | ||
542 | shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024) | ||
543 | cmp.leu p6,p7=r15,r3 | ||
544 | ;; | ||
545 | (p6) ld8 r20=[r20] // load address of syscall entry point | ||
546 | (p7) movl r20=sys_ni_syscall | ||
547 | ;; | ||
548 | mov b6=r20 | ||
549 | br.call.sptk.many rp=b6 // do the syscall | ||
550 | .strace_check_retval: | ||
551 | cmp.lt p6,p0=r8,r0 // syscall failed? | ||
552 | adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 | ||
553 | adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10 | ||
554 | mov r10=0 | ||
555 | (p6) br.cond.sptk strace_error // syscall failed -> | ||
556 | ;; // avoid RAW on r10 | ||
557 | .strace_save_retval: | ||
558 | .mem.offset 0,0; st8.spill [r2]=r8 // store return value in slot for r8 | ||
559 | .mem.offset 8,0; st8.spill [r3]=r10 // clear error indication in slot for r10 | ||
560 | br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value | ||
561 | .ret3: br.cond.sptk .work_pending_syscall_end | ||
562 | |||
563 | strace_error: | ||
564 | ld8 r3=[r2] // load pt_regs.r8 | ||
565 | sub r9=0,r8 // negate return value to get errno value | ||
566 | ;; | ||
567 | cmp.ne p6,p0=r3,r0 // is pt_regs.r8!=0? | ||
568 | adds r3=16,r2 // r3=&pt_regs.r10 | ||
569 | ;; | ||
570 | (p6) mov r10=-1 | ||
571 | (p6) mov r8=r9 | ||
572 | br.cond.sptk .strace_save_retval | ||
573 | END(ia64_trace_syscall) | ||
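
The re-dispatch after syscall_trace_enter is worth spelling out: ia64 syscall numbers start at 1024, so the table index is r15 - 1024, and a single unsigned comparison against NR_syscalls - 1 routes both too-small and too-large numbers to sys_ni_syscall. A hedged C rendering (real syscalls take arguments; this no-arg model only shows the bounds-checked dispatch):

typedef long (*syscall_fn_t)(void);

/* Model of "adds r15=-1024,r15; shladd; cmp.leu" above: with unsigned
 * arithmetic, numbers below 1024 wrap to huge indices and fail the check too. */
static long dispatch(long r15, syscall_fn_t *table, unsigned long nr_syscalls,
                     syscall_fn_t sys_ni_syscall)
{
	unsigned long idx = (unsigned long)r15 - 1024;
	syscall_fn_t fn = (idx <= nr_syscalls - 1) ? table[idx] : sys_ni_syscall;
	return fn();
}
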
574 | |||
575 | /* | ||
576 | * When traced and returning from sigreturn, we invoke syscall_trace but then | ||
577 | * go straight to ia64_leave_kernel rather than ia64_leave_syscall. | ||
578 | */ | ||
579 | GLOBAL_ENTRY(ia64_strace_leave_kernel) | ||
580 | PT_REGS_UNWIND_INFO(0) | ||
581 | { /* | ||
582 | * Some versions of gas generate bad unwind info if the first instruction of a | ||
583 | * procedure doesn't go into the first slot of a bundle. This is a workaround. | ||
584 | */ | ||
585 | nop.m 0 | ||
586 | nop.i 0 | ||
587 | br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value | ||
588 | } | ||
589 | .ret4: br.cond.sptk ia64_leave_kernel | ||
590 | END(ia64_strace_leave_kernel) | ||
591 | |||
592 | GLOBAL_ENTRY(ia64_ret_from_clone) | ||
593 | PT_REGS_UNWIND_INFO(0) | ||
594 | { /* | ||
595 | * Some versions of gas generate bad unwind info if the first instruction of a | ||
596 | * procedure doesn't go into the first slot of a bundle. This is a workaround. | ||
597 | */ | ||
598 | nop.m 0 | ||
599 | nop.i 0 | ||
600 | /* | ||
601 | * We need to call schedule_tail() to complete the scheduling process. | ||
602 | * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the | ||
603 | * address of the previously executing task. | ||
604 | */ | ||
605 | br.call.sptk.many rp=ia64_invoke_schedule_tail | ||
606 | } | ||
607 | .ret8: | ||
608 | adds r2=TI_FLAGS+IA64_TASK_SIZE,r13 | ||
609 | ;; | ||
610 | ld4 r2=[r2] | ||
611 | ;; | ||
612 | mov r8=0 | ||
613 | and r2=_TIF_SYSCALL_TRACEAUDIT,r2 | ||
614 | ;; | ||
615 | cmp.ne p6,p0=r2,r0 | ||
616 | (p6) br.cond.spnt .strace_check_retval | ||
617 | ;; // added stop bits to prevent r8 dependency | ||
618 | END(ia64_ret_from_clone) | ||
619 | // fall through | ||
620 | GLOBAL_ENTRY(ia64_ret_from_syscall) | ||
621 | PT_REGS_UNWIND_INFO(0) | ||
622 | cmp.ge p6,p7=r8,r0 // syscall executed successfully? | ||
623 | adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 | ||
624 | mov r10=r0 // clear error indication in r10 | ||
625 | (p7) br.cond.spnt handle_syscall_error // handle potential syscall failure | ||
626 | END(ia64_ret_from_syscall) | ||
627 | // fall through | ||
628 | /* | ||
629 | * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't | ||
630 | * need to switch to bank 0 and doesn't restore the scratch registers. | ||
631 | * To avoid leaking kernel bits, the scratch registers are set to | ||
632 | * the following known-to-be-safe values: | ||
633 | * | ||
634 | * r1: restored (global pointer) | ||
635 | * r2: cleared | ||
636 | * r3: 1 (when returning to user-level) | ||
637 | * r8-r11: restored (syscall return value(s)) | ||
638 | * r12: restored (user-level stack pointer) | ||
639 | * r13: restored (user-level thread pointer) | ||
640 | * r14: cleared | ||
641 | * r15: restored (syscall #) | ||
642 | * r16-r17: cleared | ||
643 | * r18: user-level b6 | ||
644 | * r19: cleared | ||
645 | * r20: user-level ar.fpsr | ||
646 | * r21: user-level b0 | ||
647 | * r22: cleared | ||
648 | * r23: user-level ar.bspstore | ||
649 | * r24: user-level ar.rnat | ||
650 | * r25: user-level ar.unat | ||
651 | * r26: user-level ar.pfs | ||
652 | * r27: user-level ar.rsc | ||
653 | * r28: user-level ip | ||
654 | * r29: user-level psr | ||
655 | * r30: user-level cfm | ||
656 | * r31: user-level pr | ||
657 | * f6-f11: cleared | ||
658 | * pr: restored (user-level pr) | ||
659 | * b0: restored (user-level rp) | ||
660 | * b6: restored | ||
661 | * b7: cleared | ||
662 | * ar.unat: restored (user-level ar.unat) | ||
663 | * ar.pfs: restored (user-level ar.pfs) | ||
664 | * ar.rsc: restored (user-level ar.rsc) | ||
665 | * ar.rnat: restored (user-level ar.rnat) | ||
666 | * ar.bspstore: restored (user-level ar.bspstore) | ||
667 | * ar.fpsr: restored (user-level ar.fpsr) | ||
668 | * ar.ccv: cleared | ||
669 | * ar.csd: cleared | ||
670 | * ar.ssd: cleared | ||
671 | */ | ||
672 | ENTRY(ia64_leave_syscall) | ||
673 | PT_REGS_UNWIND_INFO(0) | ||
674 | /* | ||
675 | * work.need_resched etc. mustn't get changed by this CPU before it returns to | ||
676 | * user- or fsys-mode, hence we disable interrupts early on. | ||
677 | * | ||
678 | * p6 controls whether current_thread_info()->flags needs to be checked for | ||
679 | * extra work. We always check for extra work when returning to user-level. | ||
680 | * With CONFIG_PREEMPT, we also check for extra work when the preempt_count | ||
681 | * is 0. After extra work processing has been completed, execution | ||
682 | * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check | ||
683 | * needs to be redone. | ||
684 | */ | ||
685 | #ifdef CONFIG_PREEMPT | ||
686 | rsm psr.i // disable interrupts | ||
687 | cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall | ||
688 | (pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 | ||
689 | ;; | ||
690 | .pred.rel.mutex pUStk,pKStk | ||
691 | (pKStk) ld4 r21=[r20] // r21 <- preempt_count | ||
692 | (pUStk) mov r21=0 // r21 <- 0 | ||
693 | ;; | ||
694 | cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0) | ||
695 | #else /* !CONFIG_PREEMPT */ | ||
696 | (pUStk) rsm psr.i | ||
697 | cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall | ||
698 | (pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk | ||
699 | #endif | ||
700 | .work_processed_syscall: | ||
701 | adds r2=PT(LOADRS)+16,r12 | ||
702 | adds r3=PT(AR_BSPSTORE)+16,r12 | ||
703 | adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 | ||
704 | ;; | ||
705 | (p6) ld4 r31=[r18] // load current_thread_info()->flags | ||
706 | ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" | ||
707 | mov b7=r0 // clear b7 | ||
708 | ;; | ||
709 | ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage) | ||
710 | ld8 r18=[r2],PT(R9)-PT(B6) // load b6 | ||
711 | (p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? | ||
712 | ;; | ||
713 | mov r16=ar.bsp // M2 get existing backing store pointer | ||
714 | (p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending? | ||
715 | (p6) br.cond.spnt .work_pending_syscall | ||
716 | ;; | ||
717 | // start restoring the state saved on the kernel stack (struct pt_regs): | ||
718 | ld8 r9=[r2],PT(CR_IPSR)-PT(R9) | ||
719 | ld8 r11=[r3],PT(CR_IIP)-PT(R11) | ||
720 | mov f6=f0 // clear f6 | ||
721 | ;; | ||
722 | invala // M0|1 invalidate ALAT | ||
723 | rsm psr.i | psr.ic // M2 initiate turning off of interrupt and interruption collection | ||
724 | mov f9=f0 // clear f9 | ||
725 | |||
726 | ld8 r29=[r2],16 // load cr.ipsr | ||
727 | ld8 r28=[r3],16 // load cr.iip | ||
728 | mov f8=f0 // clear f8 | ||
729 | ;; | ||
730 | ld8 r30=[r2],16 // M0|1 load cr.ifs | ||
731 | mov.m ar.ssd=r0 // M2 clear ar.ssd | ||
732 | cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs | ||
733 | ;; | ||
734 | ld8 r25=[r3],16 // M0|1 load ar.unat | ||
735 | mov.m ar.csd=r0 // M2 clear ar.csd | ||
736 | mov r22=r0 // clear r22 | ||
737 | ;; | ||
738 | ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs | ||
739 | (pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled | ||
740 | mov f10=f0 // clear f10 | ||
741 | ;; | ||
742 | ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0 | ||
743 | ld8 r27=[r3],PT(PR)-PT(AR_RSC) // load ar.rsc | ||
744 | mov f11=f0 // clear f11 | ||
745 | ;; | ||
746 | ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // load ar.rnat (may be garbage) | ||
747 | ld8 r31=[r3],PT(R1)-PT(PR) // load predicates | ||
748 | (pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 | ||
749 | ;; | ||
750 | ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // load ar.fpsr | ||
751 | ld8.fill r1=[r3],16 // load r1 | ||
752 | (pUStk) mov r17=1 | ||
753 | ;; | ||
754 | srlz.d // M0 ensure interruption collection is off | ||
755 | ld8.fill r13=[r3],16 | ||
756 | mov f7=f0 // clear f7 | ||
757 | ;; | ||
758 | ld8.fill r12=[r2] // restore r12 (sp) | ||
759 | ld8.fill r15=[r3] // restore r15 | ||
760 | addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0 | ||
761 | ;; | ||
762 | (pUStk) ld4 r3=[r3] // r3 = cpu_data->phys_stacked_size_p8 | ||
763 | (pUStk) st1 [r14]=r17 | ||
764 | mov b6=r18 // I0 restore b6 | ||
765 | ;; | ||
766 | mov r14=r0 // clear r14 | ||
767 | shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition | ||
768 | (pKStk) br.cond.dpnt.many skip_rbs_switch | ||
769 | |||
770 | mov.m ar.ccv=r0 // clear ar.ccv | ||
771 | (pNonSys) br.cond.dpnt.many dont_preserve_current_frame | ||
772 | br.cond.sptk.many rbs_switch | ||
773 | END(ia64_leave_syscall) | ||
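
The p6 computation in the prologue above reduces to a small boolean function; a C model of it (preempt_count is only consulted on the kernel-stack path, mirroring the pKStk/pUStk predicates):

/* Model of the CONFIG_PREEMPT and !CONFIG_PREEMPT prologues above. */
static int need_extra_work_check(int returning_to_user, int preempt_count)
{
#ifdef CONFIG_PREEMPT
	/* pUStk forces r21 = 0, so p6 <- pUStk || (preempt_count == 0) */
	return returning_to_user || preempt_count == 0;
#else
	(void)preempt_count;       /* only consulted when preemption is configured */
	return returning_to_user;  /* p6 <- pUStk */
#endif
}
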
774 | |||
775 | #ifdef CONFIG_IA32_SUPPORT | ||
776 | GLOBAL_ENTRY(ia64_ret_from_ia32_execve) | ||
777 | PT_REGS_UNWIND_INFO(0) | ||
778 | adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 | ||
779 | adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10 | ||
780 | ;; | ||
781 | .mem.offset 0,0 | ||
782 | st8.spill [r2]=r8 // store return value in slot for r8 and set unat bit | ||
783 | .mem.offset 8,0 | ||
784 | st8.spill [r3]=r0 // clear error indication in slot for r10 and set unat bit | ||
785 | END(ia64_ret_from_ia32_execve) | ||
786 | // fall through | ||
787 | #endif /* CONFIG_IA32_SUPPORT */ | ||
788 | GLOBAL_ENTRY(ia64_leave_kernel) | ||
789 | PT_REGS_UNWIND_INFO(0) | ||
790 | /* | ||
791 | * work.need_resched etc. mustn't get changed by this CPU before it returns to | ||
792 | * user- or fsys-mode, hence we disable interrupts early on. | ||
793 | * | ||
794 | * p6 controls whether current_thread_info()->flags needs to be checked for | ||
795 | * extra work. We always check for extra work when returning to user-level. | ||
796 | * With CONFIG_PREEMPT, we also check for extra work when the preempt_count | ||
797 | * is 0. After extra work processing has been completed, execution | ||
798 | * resumes at .work_processed_kernel with p6 set to 1 if the extra-work-check | ||
799 | * needs to be redone. | ||
800 | */ | ||
801 | #ifdef CONFIG_PREEMPT | ||
802 | rsm psr.i // disable interrupts | ||
803 | cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel | ||
804 | (pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 | ||
805 | ;; | ||
806 | .pred.rel.mutex pUStk,pKStk | ||
807 | (pKStk) ld4 r21=[r20] // r21 <- preempt_count | ||
808 | (pUStk) mov r21=0 // r21 <- 0 | ||
809 | ;; | ||
810 | cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0) | ||
811 | #else | ||
812 | (pUStk) rsm psr.i | ||
813 | cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel | ||
814 | (pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk | ||
815 | #endif | ||
816 | .work_processed_kernel: | ||
817 | adds r17=TI_FLAGS+IA64_TASK_SIZE,r13 | ||
818 | ;; | ||
819 | (p6) ld4 r31=[r17] // load current_thread_info()->flags | ||
820 | adds r21=PT(PR)+16,r12 | ||
821 | ;; | ||
822 | |||
823 | lfetch [r21],PT(CR_IPSR)-PT(PR) | ||
824 | adds r2=PT(B6)+16,r12 | ||
825 | adds r3=PT(R16)+16,r12 | ||
826 | ;; | ||
827 | lfetch [r21] | ||
828 | ld8 r28=[r2],8 // load b6 | ||
829 | adds r29=PT(R24)+16,r12 | ||
830 | |||
831 | ld8.fill r16=[r3],PT(AR_CSD)-PT(R16) | ||
832 | adds r30=PT(AR_CCV)+16,r12 | ||
833 | (p6) and r19=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? | ||
834 | ;; | ||
835 | ld8.fill r24=[r29] | ||
836 | ld8 r15=[r30] // load ar.ccv | ||
837 | (p6) cmp4.ne.unc p6,p0=r19, r0 // any special work pending? | ||
838 | ;; | ||
839 | ld8 r29=[r2],16 // load b7 | ||
840 | ld8 r30=[r3],16 // load ar.csd | ||
841 | (p6) br.cond.spnt .work_pending | ||
842 | ;; | ||
843 | ld8 r31=[r2],16 // load ar.ssd | ||
844 | ld8.fill r8=[r3],16 | ||
845 | ;; | ||
846 | ld8.fill r9=[r2],16 | ||
847 | ld8.fill r10=[r3],PT(R17)-PT(R10) | ||
848 | ;; | ||
849 | ld8.fill r11=[r2],PT(R18)-PT(R11) | ||
850 | ld8.fill r17=[r3],16 | ||
851 | ;; | ||
852 | ld8.fill r18=[r2],16 | ||
853 | ld8.fill r19=[r3],16 | ||
854 | ;; | ||
855 | ld8.fill r20=[r2],16 | ||
856 | ld8.fill r21=[r3],16 | ||
857 | mov ar.csd=r30 | ||
858 | mov ar.ssd=r31 | ||
859 | ;; | ||
860 | rsm psr.i | psr.ic // initiate turning off of interrupt and interruption collection | ||
861 | invala // invalidate ALAT | ||
862 | ;; | ||
863 | ld8.fill r22=[r2],24 | ||
864 | ld8.fill r23=[r3],24 | ||
865 | mov b6=r28 | ||
866 | ;; | ||
867 | ld8.fill r25=[r2],16 | ||
868 | ld8.fill r26=[r3],16 | ||
869 | mov b7=r29 | ||
870 | ;; | ||
871 | ld8.fill r27=[r2],16 | ||
872 | ld8.fill r28=[r3],16 | ||
873 | ;; | ||
874 | ld8.fill r29=[r2],16 | ||
875 | ld8.fill r30=[r3],24 | ||
876 | ;; | ||
877 | ld8.fill r31=[r2],PT(F9)-PT(R31) | ||
878 | adds r3=PT(F10)-PT(F6),r3 | ||
879 | ;; | ||
880 | ldf.fill f9=[r2],PT(F6)-PT(F9) | ||
881 | ldf.fill f10=[r3],PT(F8)-PT(F10) | ||
882 | ;; | ||
883 | ldf.fill f6=[r2],PT(F7)-PT(F6) | ||
884 | ;; | ||
885 | ldf.fill f7=[r2],PT(F11)-PT(F7) | ||
886 | ldf.fill f8=[r3],32 | ||
887 | ;; | ||
888 | srlz.i // ensure interruption collection is off | ||
889 | mov ar.ccv=r15 | ||
890 | ;; | ||
891 | ldf.fill f11=[r2] | ||
892 | bsw.0 // switch back to bank 0 (no stop bit required beforehand...) | ||
893 | ;; | ||
894 | (pUStk) mov r18=IA64_KR(CURRENT) // M2 (12 cycle read latency) | ||
895 | adds r16=PT(CR_IPSR)+16,r12 | ||
896 | adds r17=PT(CR_IIP)+16,r12 | ||
897 | |||
898 | (pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled | ||
899 | nop.i 0 | ||
900 | nop.i 0 | ||
901 | ;; | ||
902 | ld8 r29=[r16],16 // load cr.ipsr | ||
903 | ld8 r28=[r17],16 // load cr.iip | ||
904 | ;; | ||
905 | ld8 r30=[r16],16 // load cr.ifs | ||
906 | ld8 r25=[r17],16 // load ar.unat | ||
907 | ;; | ||
908 | ld8 r26=[r16],16 // load ar.pfs | ||
909 | ld8 r27=[r17],16 // load ar.rsc | ||
910 | cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs | ||
911 | ;; | ||
912 | ld8 r24=[r16],16 // load ar.rnat (may be garbage) | ||
913 | ld8 r23=[r17],16 // load ar.bspstore (may be garbage) | ||
914 | ;; | ||
915 | ld8 r31=[r16],16 // load predicates | ||
916 | ld8 r21=[r17],16 // load b0 | ||
917 | ;; | ||
918 | ld8 r19=[r16],16 // load ar.rsc value for "loadrs" | ||
919 | ld8.fill r1=[r17],16 // load r1 | ||
920 | ;; | ||
921 | ld8.fill r12=[r16],16 | ||
922 | ld8.fill r13=[r17],16 | ||
923 | (pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 | ||
924 | ;; | ||
925 | ld8 r20=[r16],16 // ar.fpsr | ||
926 | ld8.fill r15=[r17],16 | ||
927 | ;; | ||
928 | ld8.fill r14=[r16],16 | ||
929 | ld8.fill r2=[r17] | ||
930 | (pUStk) mov r17=1 | ||
931 | ;; | ||
932 | ld8.fill r3=[r16] | ||
933 | (pUStk) st1 [r18]=r17 // restore current->thread.on_ustack | ||
934 | shr.u r18=r19,16 // get byte size of existing "dirty" partition | ||
935 | ;; | ||
936 | mov r16=ar.bsp // get existing backing store pointer | ||
937 | addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 | ||
938 | ;; | ||
939 | ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8 | ||
940 | (pKStk) br.cond.dpnt skip_rbs_switch | ||
941 | |||
942 | /* | ||
943 | * Restore user backing store. | ||
944 | * | ||
945 | * NOTE: alloc, loadrs, and cover can't be predicated. | ||
946 | */ | ||
947 | (pNonSys) br.cond.dpnt dont_preserve_current_frame | ||
948 | |||
949 | rbs_switch: | ||
950 | cover // add current frame into dirty partition and set cr.ifs | ||
951 | ;; | ||
952 | mov r19=ar.bsp // get new backing store pointer | ||
953 | sub r16=r16,r18 // krbs = old bsp - size of dirty partition | ||
954 | cmp.ne p9,p0=r0,r0 // clear p9 to skip restore of cr.ifs | ||
955 | ;; | ||
956 | sub r19=r19,r16 // calculate total byte size of dirty partition | ||
957 | add r18=64,r18 // don't force in0-in7 into memory... | ||
958 | ;; | ||
959 | shl r19=r19,16 // shift size of dirty partition into loadrs position | ||
960 | ;; | ||
961 | dont_preserve_current_frame: | ||
962 | /* | ||
963 | * To prevent leaking bits between the kernel and user-space, | ||
964 | * we must clear the stacked registers in the "invalid" partition here. | ||
965 | * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium, | ||
966 | * 5 registers/cycle on McKinley). | ||
967 | */ | ||
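
The clearing loop that follows is easier to see as recursion: each alloc opens a register frame whose locals are zeroed, the routine "calls" itself while more of the invalid partition remains, and the predicated br.ret chain unwinds the frames again. A C model, under the assumption Nregs = 14 (the non-Itanium case below):

enum { NREGS = 14 };   /* Nregs: 10 on Itanium, 14 otherwise (see #ifdef below) */

/* Each recursion level models one "alloc" frame whose NREGS-2 locals are
 * zeroed; the (pReturn) br.ret chain then pops the frames so "loadrs"
 * finds only cleared registers in the invalid partition. */
static void clear_invalid_model(long bytes_left)
{
	if (bytes_left > NREGS * 8)              /* cmp.lt pRecurse,p0=Nregs*8,in0 */
		clear_invalid_model(bytes_left - NREGS * 8);
}
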
968 | # define pRecurse p6 | ||
969 | # define pReturn p7 | ||
970 | #ifdef CONFIG_ITANIUM | ||
971 | # define Nregs 10 | ||
972 | #else | ||
973 | # define Nregs 14 | ||
974 | #endif | ||
975 | alloc loc0=ar.pfs,2,Nregs-2,2,0 | ||
976 | shr.u loc1=r18,9 // RNaTslots <= floor(dirtySize / (64*8)) | ||
977 | sub r17=r17,r18 // r17 = (physStackedSize + 8) - dirtySize | ||
978 | ;; | ||
979 | mov ar.rsc=r19 // load ar.rsc to be used for "loadrs" | ||
980 | shladd in0=loc1,3,r17 | ||
981 | mov in1=0 | ||
982 | ;; | ||
983 | TEXT_ALIGN(32) | ||
984 | rse_clear_invalid: | ||
985 | #ifdef CONFIG_ITANIUM | ||
986 | // cycle 0 | ||
987 | { .mii | ||
988 | alloc loc0=ar.pfs,2,Nregs-2,2,0 | ||
989 | cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse | ||
990 | add out0=-Nregs*8,in0 | ||
991 | }{ .mfb | ||
992 | add out1=1,in1 // increment recursion count | ||
993 | nop.f 0 | ||
994 | nop.b 0 // can't do br.call here because of alloc (WAW on CFM) | ||
995 | ;; | ||
996 | }{ .mfi // cycle 1 | ||
997 | mov loc1=0 | ||
998 | nop.f 0 | ||
999 | mov loc2=0 | ||
1000 | }{ .mib | ||
1001 | mov loc3=0 | ||
1002 | mov loc4=0 | ||
1003 | (pRecurse) br.call.sptk.many b0=rse_clear_invalid | ||
1004 | |||
1005 | }{ .mfi // cycle 2 | ||
1006 | mov loc5=0 | ||
1007 | nop.f 0 | ||
1008 | cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret | ||
1009 | }{ .mib | ||
1010 | mov loc6=0 | ||
1011 | mov loc7=0 | ||
1012 | (pReturn) br.ret.sptk.many b0 | ||
1013 | } | ||
1014 | #else /* !CONFIG_ITANIUM */ | ||
1015 | alloc loc0=ar.pfs,2,Nregs-2,2,0 | ||
1016 | cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse | ||
1017 | add out0=-Nregs*8,in0 | ||
1018 | add out1=1,in1 // increment recursion count | ||
1019 | mov loc1=0 | ||
1020 | mov loc2=0 | ||
1021 | ;; | ||
1022 | mov loc3=0 | ||
1023 | mov loc4=0 | ||
1024 | mov loc5=0 | ||
1025 | mov loc6=0 | ||
1026 | mov loc7=0 | ||
1027 | (pRecurse) br.call.sptk.few b0=rse_clear_invalid | ||
1028 | ;; | ||
1029 | mov loc8=0 | ||
1030 | mov loc9=0 | ||
1031 | cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret | ||
1032 | mov loc10=0 | ||
1033 | mov loc11=0 | ||
1034 | (pReturn) br.ret.sptk.many b0 | ||
1035 | #endif /* !CONFIG_ITANIUM */ | ||
1036 | # undef pRecurse | ||
1037 | # undef pReturn | ||
1038 | ;; | ||
1039 | alloc r17=ar.pfs,0,0,0,0 // drop current register frame | ||
1040 | ;; | ||
1041 | loadrs | ||
1042 | ;; | ||
1043 | skip_rbs_switch: | ||
1044 | mov ar.unat=r25 // M2 | ||
1045 | (pKStk) extr.u r22=r22,21,1 // I0 extract current value of psr.pp from r22 | ||
1046 | (pLvSys)mov r19=r0 // A clear r19 for leave_syscall, no-op otherwise | ||
1047 | ;; | ||
1048 | (pUStk) mov ar.bspstore=r23 // M2 | ||
1049 | (pKStk) dep r29=r22,r29,21,1 // I0 update ipsr.pp with psr.pp | ||
1050 | (pLvSys)mov r16=r0 // A clear r16 for leave_syscall, no-op otherwise | ||
1051 | ;; | ||
1052 | mov cr.ipsr=r29 // M2 | ||
1053 | mov ar.pfs=r26 // I0 | ||
1054 | (pLvSys)mov r17=r0 // A clear r17 for leave_syscall, no-op otherwise | ||
1055 | |||
1056 | (p9) mov cr.ifs=r30 // M2 | ||
1057 | mov b0=r21 // I0 | ||
1058 | (pLvSys)mov r18=r0 // A clear r18 for leave_syscall, no-op otherwise | ||
1059 | |||
1060 | mov ar.fpsr=r20 // M2 | ||
1061 | mov cr.iip=r28 // M2 | ||
1062 | nop 0 | ||
1063 | ;; | ||
1064 | (pUStk) mov ar.rnat=r24 // M2 must happen with RSE in lazy mode | ||
1065 | nop 0 | ||
1066 | (pLvSys)mov r2=r0 | ||
1067 | |||
1068 | mov ar.rsc=r27 // M2 | ||
1069 | mov pr=r31,-1 // I0 | ||
1070 | rfi // B | ||
1071 | |||
1072 | /* | ||
1073 | * On entry: | ||
1074 | * r20 = &current->thread_info->pre_count (if CONFIG_PREEMPT) | ||
1075 | * r31 = current->thread_info->flags | ||
1076 | * On exit: | ||
1077 | * p6 = TRUE if work-pending-check needs to be redone | ||
1078 | */ | ||
1079 | .work_pending_syscall: | ||
1080 | add r2=-8,r2 | ||
1081 | add r3=-8,r3 | ||
1082 | ;; | ||
1083 | st8 [r2]=r8 | ||
1084 | st8 [r3]=r10 | ||
1085 | .work_pending: | ||
1086 | tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context? | ||
1087 | (p6) br.cond.sptk.few .sigdelayed | ||
1088 | ;; | ||
1089 | tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? | ||
1090 | (p6) br.cond.sptk.few .notify | ||
1091 | #ifdef CONFIG_PREEMPT | ||
1092 | (pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 | ||
1093 | ;; | ||
1094 | (pKStk) st4 [r20]=r21 | ||
1095 | ssm psr.i // enable interrupts | ||
1096 | #endif | ||
1097 | br.call.spnt.many rp=schedule | ||
1098 | .ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 | ||
1099 | rsm psr.i // disable interrupts | ||
1100 | ;; | ||
1101 | #ifdef CONFIG_PREEMPT | ||
1102 | (pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 | ||
1103 | ;; | ||
1104 | (pKStk) st4 [r20]=r0 // preempt_count() <- 0 | ||
1105 | #endif | ||
1106 | (pLvSys)br.cond.sptk.few .work_pending_syscall_end | ||
1107 | br.cond.sptk.many .work_processed_kernel // re-check | ||
1108 | |||
1109 | .notify: | ||
1110 | (pUStk) br.call.spnt.many rp=notify_resume_user | ||
1111 | .ret10: cmp.ne p6,p0=r0,r0 // p6 <- 0 | ||
1112 | (pLvSys)br.cond.sptk.few .work_pending_syscall_end | ||
1113 | br.cond.sptk.many .work_processed_kernel // don't re-check | ||
1114 | |||
1115 | // There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where | ||
1116 | // it could not be delivered. Deliver it now. The signal might be for us and | ||
1117 | // may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed | ||
1118 | // signal. | ||
1119 | |||
1120 | .sigdelayed: | ||
1121 | br.call.sptk.many rp=do_sigdelayed | ||
1122 | cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check | ||
1123 | (pLvSys)br.cond.sptk.few .work_pending_syscall_end | ||
1124 | br.cond.sptk.many .work_processed_kernel // re-check | ||
1125 | |||
1126 | .work_pending_syscall_end: | ||
1127 | adds r2=PT(R8)+16,r12 | ||
1128 | adds r3=PT(R10)+16,r12 | ||
1129 | ;; | ||
1130 | ld8 r8=[r2] | ||
1131 | ld8 r10=[r3] | ||
1132 | br.cond.sptk.many .work_processed_syscall // re-check | ||
1133 | |||
1134 | END(ia64_leave_kernel) | ||
1135 | |||
1136 | ENTRY(handle_syscall_error) | ||
1137 | /* | ||
1138 | * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could | ||
1139 | * lead us to mistake a negative return value for a failed syscall. Those syscalls | ||
1140 | * must deposit a non-zero value in pt_regs.r8 to indicate an error. If | ||
1141 | * pt_regs.r8 is zero, we assume that the call completed successfully. | ||
1142 | */ | ||
1143 | PT_REGS_UNWIND_INFO(0) | ||
1144 | ld8 r3=[r2] // load pt_regs.r8 | ||
1145 | ;; | ||
1146 | cmp.eq p6,p7=r3,r0 // is pt_regs.r8==0? | ||
1147 | ;; | ||
1148 | (p7) mov r10=-1 | ||
1149 | (p7) sub r8=0,r8 // negate return value to get errno | ||
1150 | br.cond.sptk ia64_leave_syscall | ||
1151 | END(handle_syscall_error) | ||
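
In C terms, handle_syscall_error only converts the return value when the handler really flagged an error: syscalls like mmap or ptrace can legally return "negative-looking" values while leaving pt_regs.r8 at zero. A sketch:

/* Model of the predicated fixup above: p7 fires when pt_regs.r8 != 0. */
static void fixup_syscall_error(long *r8, long *r10, long pt_regs_r8)
{
	if (pt_regs_r8 != 0) {
		*r10 = -1;       /* tell user level the call failed */
		*r8 = -*r8;      /* negate to recover the positive errno */
	}
}
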
1152 | |||
1153 | /* | ||
1154 | * Invoke schedule_tail(task) while preserving in0-in7, which may be needed | ||
1155 | * in case a system call gets restarted. | ||
1156 | */ | ||
1157 | GLOBAL_ENTRY(ia64_invoke_schedule_tail) | ||
1158 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) | ||
1159 | alloc loc1=ar.pfs,8,2,1,0 | ||
1160 | mov loc0=rp | ||
1161 | mov out0=r8 // Address of previous task | ||
1162 | ;; | ||
1163 | br.call.sptk.many rp=schedule_tail | ||
1164 | .ret11: mov ar.pfs=loc1 | ||
1165 | mov rp=loc0 | ||
1166 | br.ret.sptk.many rp | ||
1167 | END(ia64_invoke_schedule_tail) | ||
1168 | |||
1169 | /* | ||
1170 | * Setup stack and call do_notify_resume_user(). Note that pSys and pNonSys need to | ||
1171 | * be set up by the caller. We declare 8 input registers so the system call | ||
1172 | * args get preserved, in case we need to restart a system call. | ||
1173 | */ | ||
1174 | ENTRY(notify_resume_user) | ||
1175 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) | ||
1176 | alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart! | ||
1177 | mov r9=ar.unat | ||
1178 | mov loc0=rp // save return address | ||
1179 | mov out0=0 // there is no "oldset" | ||
1180 | adds out1=8,sp // out1=&sigscratch->ar_pfs | ||
1181 | (pSys) mov out2=1 // out2==1 => we're in a syscall | ||
1182 | ;; | ||
1183 | (pNonSys) mov out2=0 // out2==0 => not a syscall | ||
1184 | .fframe 16 | ||
1185 | .spillpsp ar.unat, 16 // (note that offset is relative to psp+0x10!) | ||
1186 | st8 [sp]=r9,-16 // allocate space for ar.unat and save it | ||
1187 | st8 [out1]=loc1,-8 // save ar.pfs, out1=&sigscratch | ||
1188 | .body | ||
1189 | br.call.sptk.many rp=do_notify_resume_user | ||
1190 | .ret15: .restore sp | ||
1191 | adds sp=16,sp // pop scratch stack space | ||
1192 | ;; | ||
1193 | ld8 r9=[sp] // load new unat from sigscratch->scratch_unat | ||
1194 | mov rp=loc0 | ||
1195 | ;; | ||
1196 | mov ar.unat=r9 | ||
1197 | mov ar.pfs=loc1 | ||
1198 | br.ret.sptk.many rp | ||
1199 | END(notify_resume_user) | ||
1200 | |||
1201 | GLOBAL_ENTRY(sys_rt_sigsuspend) | ||
1202 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) | ||
1203 | alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart! | ||
1204 | mov r9=ar.unat | ||
1205 | mov loc0=rp // save return address | ||
1206 | mov out0=in0 // mask | ||
1207 | mov out1=in1 // sigsetsize | ||
1208 | adds out2=8,sp // out2=&sigscratch->ar_pfs | ||
1209 | ;; | ||
1210 | .fframe 16 | ||
1211 | .spillpsp ar.unat, 16 // (note that offset is relative to psp+0x10!) | ||
1212 | st8 [sp]=r9,-16 // allocate space for ar.unat and save it | ||
1213 | st8 [out2]=loc1,-8 // save ar.pfs, out2=&sigscratch | ||
1214 | .body | ||
1215 | br.call.sptk.many rp=ia64_rt_sigsuspend | ||
1216 | .ret17: .restore sp | ||
1217 | adds sp=16,sp // pop scratch stack space | ||
1218 | ;; | ||
1219 | ld8 r9=[sp] // load new unat from sw->caller_unat | ||
1220 | mov rp=loc0 | ||
1221 | ;; | ||
1222 | mov ar.unat=r9 | ||
1223 | mov ar.pfs=loc1 | ||
1224 | br.ret.sptk.many rp | ||
1225 | END(sys_rt_sigsuspend) | ||
1226 | |||
1227 | ENTRY(sys_rt_sigreturn) | ||
1228 | PT_REGS_UNWIND_INFO(0) | ||
1229 | /* | ||
1230 | * Allocate 8 input registers since ptrace() may clobber them | ||
1231 | */ | ||
1232 | alloc r2=ar.pfs,8,0,1,0 | ||
1233 | .prologue | ||
1234 | PT_REGS_SAVES(16) | ||
1235 | adds sp=-16,sp | ||
1236 | .body | ||
1237 | cmp.eq pNonSys,pSys=r0,r0 // sigreturn isn't a normal syscall... | ||
1238 | ;; | ||
1239 | /* | ||
1240 | * leave_kernel() restores f6-f11 from pt_regs, but since the streamlined | ||
1241 | * syscall-entry path does not save them, we save them here instead. Note: we | ||
1242 | * don't need to save any other registers that are not saved by the streamlined | ||
1243 | * syscall path, because restore_sigcontext() restores them. | ||
1244 | */ | ||
1245 | adds r16=PT(F6)+32,sp | ||
1246 | adds r17=PT(F7)+32,sp | ||
1247 | ;; | ||
1248 | stf.spill [r16]=f6,32 | ||
1249 | stf.spill [r17]=f7,32 | ||
1250 | ;; | ||
1251 | stf.spill [r16]=f8,32 | ||
1252 | stf.spill [r17]=f9,32 | ||
1253 | ;; | ||
1254 | stf.spill [r16]=f10 | ||
1255 | stf.spill [r17]=f11 | ||
1256 | adds out0=16,sp // out0 = &sigscratch | ||
1257 | br.call.sptk.many rp=ia64_rt_sigreturn | ||
1258 | .ret19: .restore sp 0 | ||
1259 | adds sp=16,sp | ||
1260 | ;; | ||
1261 | ld8 r9=[sp] // load new ar.unat | ||
1262 | mov.sptk b7=r8,ia64_leave_kernel | ||
1263 | ;; | ||
1264 | mov ar.unat=r9 | ||
1265 | br.many b7 | ||
1266 | END(sys_rt_sigreturn) | ||
1267 | |||
1268 | GLOBAL_ENTRY(ia64_prepare_handle_unaligned) | ||
1269 | .prologue | ||
1270 | /* | ||
1271 | * r16 = fake ar.pfs, we simply need to make sure privilege is still 0 | ||
1272 | */ | ||
1273 | mov r16=r0 | ||
1274 | DO_SAVE_SWITCH_STACK | ||
1275 | br.call.sptk.many rp=ia64_handle_unaligned // stack frame setup in ivt | ||
1276 | .ret21: .body | ||
1277 | DO_LOAD_SWITCH_STACK | ||
1278 | br.cond.sptk.many rp // goes to ia64_leave_kernel | ||
1279 | END(ia64_prepare_handle_unaligned) | ||
1280 | |||
1281 | // | ||
1282 | // unw_init_running(void (*callback)(info, arg), void *arg) | ||
1283 | // | ||
1284 | # define EXTRA_FRAME_SIZE ((UNW_FRAME_INFO_SIZE+15)&~15) | ||
1285 | |||
1286 | GLOBAL_ENTRY(unw_init_running) | ||
1287 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) | ||
1288 | alloc loc1=ar.pfs,2,3,3,0 | ||
1289 | ;; | ||
1290 | ld8 loc2=[in0],8 | ||
1291 | mov loc0=rp | ||
1292 | mov r16=loc1 | ||
1293 | DO_SAVE_SWITCH_STACK | ||
1294 | .body | ||
1295 | |||
1296 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) | ||
1297 | .fframe IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE | ||
1298 | SWITCH_STACK_SAVES(EXTRA_FRAME_SIZE) | ||
1299 | adds sp=-EXTRA_FRAME_SIZE,sp | ||
1300 | .body | ||
1301 | ;; | ||
1302 | adds out0=16,sp // &info | ||
1303 | mov out1=r13 // current | ||
1304 | adds out2=16+EXTRA_FRAME_SIZE,sp // &switch_stack | ||
1305 | br.call.sptk.many rp=unw_init_frame_info | ||
1306 | 1: adds out0=16,sp // &info | ||
1307 | mov b6=loc2 | ||
1308 | mov loc2=gp // save gp across indirect function call | ||
1309 | ;; | ||
1310 | ld8 gp=[in0] | ||
1311 | mov out1=in1 // arg | ||
1312 | br.call.sptk.many rp=b6 // invoke the callback function | ||
1313 | 1: mov gp=loc2 // restore gp | ||
1314 | |||
1315 | // For now, we don't allow changing registers from within | ||
1316 | // unw_init_running; if we ever want to allow that, we'd | ||
1317 | // have to do a load_switch_stack here: | ||
1318 | .restore sp | ||
1319 | adds sp=IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE,sp | ||
1320 | |||
1321 | mov ar.pfs=loc1 | ||
1322 | mov rp=loc0 | ||
1323 | br.ret.sptk.many rp | ||
1324 | END(unw_init_running) | ||
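
A hedged usage sketch for unw_init_running(): the callback receives an unwind handle primed at the current frame plus the opaque arg. The accessor names below (unw_get_ip, unw_unwind) match the ia64 unwinder's API as declared in asm/unwind.h to the best of my recollection; verify before relying on them:

#include <linux/kernel.h>    /* printk */
#include <asm/unwind.h>      /* unw_frame_info, unw_get_ip, unw_unwind */

/* Walk the current kernel stack and print each return address. */
static void dump_frames(struct unw_frame_info *info, void *arg)
{
	unsigned long ip;

	do {
		unw_get_ip(info, &ip);           /* current frame's return address */
		if (ip == 0)
			break;
		printk("  ip=%016lx\n", ip);
	} while (unw_unwind(info) >= 0);         /* step to the caller's frame */
}

/* Caller side (e.g., from a panic path): unw_init_running(dump_frames, NULL); */
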
1325 | |||
1326 | .rodata | ||
1327 | .align 8 | ||
1328 | .globl sys_call_table | ||
1329 | sys_call_table: | ||
1330 | data8 sys_ni_syscall // This must be sys_ni_syscall! See ivt.S. | ||
1331 | data8 sys_exit // 1025 | ||
1332 | data8 sys_read | ||
1333 | data8 sys_write | ||
1334 | data8 sys_open | ||
1335 | data8 sys_close | ||
1336 | data8 sys_creat // 1030 | ||
1337 | data8 sys_link | ||
1338 | data8 sys_unlink | ||
1339 | data8 ia64_execve | ||
1340 | data8 sys_chdir | ||
1341 | data8 sys_fchdir // 1035 | ||
1342 | data8 sys_utimes | ||
1343 | data8 sys_mknod | ||
1344 | data8 sys_chmod | ||
1345 | data8 sys_chown | ||
1346 | data8 sys_lseek // 1040 | ||
1347 | data8 sys_getpid | ||
1348 | data8 sys_getppid | ||
1349 | data8 sys_mount | ||
1350 | data8 sys_umount | ||
1351 | data8 sys_setuid // 1045 | ||
1352 | data8 sys_getuid | ||
1353 | data8 sys_geteuid | ||
1354 | data8 sys_ptrace | ||
1355 | data8 sys_access | ||
1356 | data8 sys_sync // 1050 | ||
1357 | data8 sys_fsync | ||
1358 | data8 sys_fdatasync | ||
1359 | data8 sys_kill | ||
1360 | data8 sys_rename | ||
1361 | data8 sys_mkdir // 1055 | ||
1362 | data8 sys_rmdir | ||
1363 | data8 sys_dup | ||
1364 | data8 sys_pipe | ||
1365 | data8 sys_times | ||
1366 | data8 ia64_brk // 1060 | ||
1367 | data8 sys_setgid | ||
1368 | data8 sys_getgid | ||
1369 | data8 sys_getegid | ||
1370 | data8 sys_acct | ||
1371 | data8 sys_ioctl // 1065 | ||
1372 | data8 sys_fcntl | ||
1373 | data8 sys_umask | ||
1374 | data8 sys_chroot | ||
1375 | data8 sys_ustat | ||
1376 | data8 sys_dup2 // 1070 | ||
1377 | data8 sys_setreuid | ||
1378 | data8 sys_setregid | ||
1379 | data8 sys_getresuid | ||
1380 | data8 sys_setresuid | ||
1381 | data8 sys_getresgid // 1075 | ||
1382 | data8 sys_setresgid | ||
1383 | data8 sys_getgroups | ||
1384 | data8 sys_setgroups | ||
1385 | data8 sys_getpgid | ||
1386 | data8 sys_setpgid // 1080 | ||
1387 | data8 sys_setsid | ||
1388 | data8 sys_getsid | ||
1389 | data8 sys_sethostname | ||
1390 | data8 sys_setrlimit | ||
1391 | data8 sys_getrlimit // 1085 | ||
1392 | data8 sys_getrusage | ||
1393 | data8 sys_gettimeofday | ||
1394 | data8 sys_settimeofday | ||
1395 | data8 sys_select | ||
1396 | data8 sys_poll // 1090 | ||
1397 | data8 sys_symlink | ||
1398 | data8 sys_readlink | ||
1399 | data8 sys_uselib | ||
1400 | data8 sys_swapon | ||
1401 | data8 sys_swapoff // 1095 | ||
1402 | data8 sys_reboot | ||
1403 | data8 sys_truncate | ||
1404 | data8 sys_ftruncate | ||
1405 | data8 sys_fchmod | ||
1406 | data8 sys_fchown // 1100 | ||
1407 | data8 ia64_getpriority | ||
1408 | data8 sys_setpriority | ||
1409 | data8 sys_statfs | ||
1410 | data8 sys_fstatfs | ||
1411 | data8 sys_gettid // 1105 | ||
1412 | data8 sys_semget | ||
1413 | data8 sys_semop | ||
1414 | data8 sys_semctl | ||
1415 | data8 sys_msgget | ||
1416 | data8 sys_msgsnd // 1110 | ||
1417 | data8 sys_msgrcv | ||
1418 | data8 sys_msgctl | ||
1419 | data8 sys_shmget | ||
1420 | data8 ia64_shmat | ||
1421 | data8 sys_shmdt // 1115 | ||
1422 | data8 sys_shmctl | ||
1423 | data8 sys_syslog | ||
1424 | data8 sys_setitimer | ||
1425 | data8 sys_getitimer | ||
1426 | data8 sys_ni_syscall // 1120 /* was: ia64_oldstat */ | ||
1427 | data8 sys_ni_syscall /* was: ia64_oldlstat */ | ||
1428 | data8 sys_ni_syscall /* was: ia64_oldfstat */ | ||
1429 | data8 sys_vhangup | ||
1430 | data8 sys_lchown | ||
1431 | data8 sys_remap_file_pages // 1125 | ||
1432 | data8 sys_wait4 | ||
1433 | data8 sys_sysinfo | ||
1434 | data8 sys_clone | ||
1435 | data8 sys_setdomainname | ||
1436 | data8 sys_newuname // 1130 | ||
1437 | data8 sys_adjtimex | ||
1438 | data8 sys_ni_syscall /* was: ia64_create_module */ | ||
1439 | data8 sys_init_module | ||
1440 | data8 sys_delete_module | ||
1441 | data8 sys_ni_syscall // 1135 /* was: sys_get_kernel_syms */ | ||
1442 | data8 sys_ni_syscall /* was: sys_query_module */ | ||
1443 | data8 sys_quotactl | ||
1444 | data8 sys_bdflush | ||
1445 | data8 sys_sysfs | ||
1446 | data8 sys_personality // 1140 | ||
1447 | data8 sys_ni_syscall // sys_afs_syscall | ||
1448 | data8 sys_setfsuid | ||
1449 | data8 sys_setfsgid | ||
1450 | data8 sys_getdents | ||
1451 | data8 sys_flock // 1145 | ||
1452 | data8 sys_readv | ||
1453 | data8 sys_writev | ||
1454 | data8 sys_pread64 | ||
1455 | data8 sys_pwrite64 | ||
1456 | data8 sys_sysctl // 1150 | ||
1457 | data8 sys_mmap | ||
1458 | data8 sys_munmap | ||
1459 | data8 sys_mlock | ||
1460 | data8 sys_mlockall | ||
1461 | data8 sys_mprotect // 1155 | ||
1462 | data8 ia64_mremap | ||
1463 | data8 sys_msync | ||
1464 | data8 sys_munlock | ||
1465 | data8 sys_munlockall | ||
1466 | data8 sys_sched_getparam // 1160 | ||
1467 | data8 sys_sched_setparam | ||
1468 | data8 sys_sched_getscheduler | ||
1469 | data8 sys_sched_setscheduler | ||
1470 | data8 sys_sched_yield | ||
1471 | data8 sys_sched_get_priority_max // 1165 | ||
1472 | data8 sys_sched_get_priority_min | ||
1473 | data8 sys_sched_rr_get_interval | ||
1474 | data8 sys_nanosleep | ||
1475 | data8 sys_nfsservctl | ||
1476 | data8 sys_prctl // 1170 | ||
1477 | data8 sys_getpagesize | ||
1478 | data8 sys_mmap2 | ||
1479 | data8 sys_pciconfig_read | ||
1480 | data8 sys_pciconfig_write | ||
1481 | data8 sys_perfmonctl // 1175 | ||
1482 | data8 sys_sigaltstack | ||
1483 | data8 sys_rt_sigaction | ||
1484 | data8 sys_rt_sigpending | ||
1485 | data8 sys_rt_sigprocmask | ||
1486 | data8 sys_rt_sigqueueinfo // 1180 | ||
1487 | data8 sys_rt_sigreturn | ||
1488 | data8 sys_rt_sigsuspend | ||
1489 | data8 sys_rt_sigtimedwait | ||
1490 | data8 sys_getcwd | ||
1491 | data8 sys_capget // 1185 | ||
1492 | data8 sys_capset | ||
1493 | data8 sys_sendfile64 | ||
1494 | data8 sys_ni_syscall // sys_getpmsg (STREAMS) | ||
1495 | data8 sys_ni_syscall // sys_putpmsg (STREAMS) | ||
1496 | data8 sys_socket // 1190 | ||
1497 | data8 sys_bind | ||
1498 | data8 sys_connect | ||
1499 | data8 sys_listen | ||
1500 | data8 sys_accept | ||
1501 | data8 sys_getsockname // 1195 | ||
1502 | data8 sys_getpeername | ||
1503 | data8 sys_socketpair | ||
1504 | data8 sys_send | ||
1505 | data8 sys_sendto | ||
1506 | data8 sys_recv // 1200 | ||
1507 | data8 sys_recvfrom | ||
1508 | data8 sys_shutdown | ||
1509 | data8 sys_setsockopt | ||
1510 | data8 sys_getsockopt | ||
1511 | data8 sys_sendmsg // 1205 | ||
1512 | data8 sys_recvmsg | ||
1513 | data8 sys_pivot_root | ||
1514 | data8 sys_mincore | ||
1515 | data8 sys_madvise | ||
1516 | data8 sys_newstat // 1210 | ||
1517 | data8 sys_newlstat | ||
1518 | data8 sys_newfstat | ||
1519 | data8 sys_clone2 | ||
1520 | data8 sys_getdents64 | ||
1521 | data8 sys_getunwind // 1215 | ||
1522 | data8 sys_readahead | ||
1523 | data8 sys_setxattr | ||
1524 | data8 sys_lsetxattr | ||
1525 | data8 sys_fsetxattr | ||
1526 | data8 sys_getxattr // 1220 | ||
1527 | data8 sys_lgetxattr | ||
1528 | data8 sys_fgetxattr | ||
1529 | data8 sys_listxattr | ||
1530 | data8 sys_llistxattr | ||
1531 | data8 sys_flistxattr // 1225 | ||
1532 | data8 sys_removexattr | ||
1533 | data8 sys_lremovexattr | ||
1534 | data8 sys_fremovexattr | ||
1535 | data8 sys_tkill | ||
1536 | data8 sys_futex // 1230 | ||
1537 | data8 sys_sched_setaffinity | ||
1538 | data8 sys_sched_getaffinity | ||
1539 | data8 sys_set_tid_address | ||
1540 | data8 sys_fadvise64_64 | ||
1541 | data8 sys_tgkill // 1235 | ||
1542 | data8 sys_exit_group | ||
1543 | data8 sys_lookup_dcookie | ||
1544 | data8 sys_io_setup | ||
1545 | data8 sys_io_destroy | ||
1546 | data8 sys_io_getevents // 1240 | ||
1547 | data8 sys_io_submit | ||
1548 | data8 sys_io_cancel | ||
1549 | data8 sys_epoll_create | ||
1550 | data8 sys_epoll_ctl | ||
1551 | data8 sys_epoll_wait // 1245 | ||
1552 | data8 sys_restart_syscall | ||
1553 | data8 sys_semtimedop | ||
1554 | data8 sys_timer_create | ||
1555 | data8 sys_timer_settime | ||
1556 | data8 sys_timer_gettime // 1250 | ||
1557 | data8 sys_timer_getoverrun | ||
1558 | data8 sys_timer_delete | ||
1559 | data8 sys_clock_settime | ||
1560 | data8 sys_clock_gettime | ||
1561 | data8 sys_clock_getres // 1255 | ||
1562 | data8 sys_clock_nanosleep | ||
1563 | data8 sys_fstatfs64 | ||
1564 | data8 sys_statfs64 | ||
1565 | data8 sys_mbind | ||
1566 | data8 sys_get_mempolicy // 1260 | ||
1567 | data8 sys_set_mempolicy | ||
1568 | data8 sys_mq_open | ||
1569 | data8 sys_mq_unlink | ||
1570 | data8 sys_mq_timedsend | ||
1571 | data8 sys_mq_timedreceive // 1265 | ||
1572 | data8 sys_mq_notify | ||
1573 | data8 sys_mq_getsetattr | ||
1574 | data8 sys_ni_syscall // reserved for kexec_load | ||
1575 | data8 sys_ni_syscall // reserved for vserver | ||
1576 | data8 sys_waitid // 1270 | ||
1577 | data8 sys_add_key | ||
1578 | data8 sys_request_key | ||
1579 | data8 sys_keyctl | ||
1580 | data8 sys_ni_syscall | ||
1581 | data8 sys_ni_syscall // 1275 | ||
1582 | data8 sys_ni_syscall | ||
1583 | data8 sys_ni_syscall | ||
1584 | data8 sys_ni_syscall | ||
1585 | data8 sys_ni_syscall | ||
1586 | |||
1587 | .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls | ||
diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h new file mode 100644 index 000000000000..6d4ecec989b5 --- /dev/null +++ b/arch/ia64/kernel/entry.h | |||
@@ -0,0 +1,82 @@ | |||
1 | #include <linux/config.h> | ||
2 | |||
3 | /* | ||
4 | * Preserved registers that are shared between code in ivt.S and | ||
5 | * entry.S. Be careful not to step on these! | ||
6 | */ | ||
7 | #define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */ | ||
8 | #define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */ | ||
9 | #define PRED_USER_STACK 3 /* returning to user-stacks? */ | ||
10 | #define PRED_SYSCALL 4 /* inside a system call? */ | ||
11 | #define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */ | ||
12 | |||
13 | #ifdef __ASSEMBLY__ | ||
14 | # define PASTE2(x,y) x##y | ||
15 | # define PASTE(x,y) PASTE2(x,y) | ||
16 | |||
17 | # define pLvSys PASTE(p,PRED_LEAVE_SYSCALL) | ||
18 | # define pKStk PASTE(p,PRED_KERNEL_STACK) | ||
19 | # define pUStk PASTE(p,PRED_USER_STACK) | ||
20 | # define pSys PASTE(p,PRED_SYSCALL) | ||
21 | # define pNonSys PASTE(p,PRED_NON_SYSCALL) | ||
22 | #endif | ||
23 | |||
24 | #define PT(f) (IA64_PT_REGS_##f##_OFFSET) | ||
25 | #define SW(f) (IA64_SWITCH_STACK_##f##_OFFSET) | ||
26 | |||
27 | #define PT_REGS_SAVES(off) \ | ||
28 | .unwabi 3, 'i'; \ | ||
29 | .fframe IA64_PT_REGS_SIZE+16+(off); \ | ||
30 | .spillsp rp, PT(CR_IIP)+16+(off); \ | ||
31 | .spillsp ar.pfs, PT(CR_IFS)+16+(off); \ | ||
32 | .spillsp ar.unat, PT(AR_UNAT)+16+(off); \ | ||
33 | .spillsp ar.fpsr, PT(AR_FPSR)+16+(off); \ | ||
34 | .spillsp pr, PT(PR)+16+(off); | ||
35 | |||
36 | #define PT_REGS_UNWIND_INFO(off) \ | ||
37 | .prologue; \ | ||
38 | PT_REGS_SAVES(off); \ | ||
39 | .body | ||
40 | |||
41 | #define SWITCH_STACK_SAVES(off) \ | ||
42 | .savesp ar.unat,SW(CALLER_UNAT)+16+(off); \ | ||
43 | .savesp ar.fpsr,SW(AR_FPSR)+16+(off); \ | ||
44 | .spillsp f2,SW(F2)+16+(off); .spillsp f3,SW(F3)+16+(off); \ | ||
45 | .spillsp f4,SW(F4)+16+(off); .spillsp f5,SW(F5)+16+(off); \ | ||
46 | .spillsp f16,SW(F16)+16+(off); .spillsp f17,SW(F17)+16+(off); \ | ||
47 | .spillsp f18,SW(F18)+16+(off); .spillsp f19,SW(F19)+16+(off); \ | ||
48 | .spillsp f20,SW(F20)+16+(off); .spillsp f21,SW(F21)+16+(off); \ | ||
49 | .spillsp f22,SW(F22)+16+(off); .spillsp f23,SW(F23)+16+(off); \ | ||
50 | .spillsp f24,SW(F24)+16+(off); .spillsp f25,SW(F25)+16+(off); \ | ||
51 | .spillsp f26,SW(F26)+16+(off); .spillsp f27,SW(F27)+16+(off); \ | ||
52 | .spillsp f28,SW(F28)+16+(off); .spillsp f29,SW(F29)+16+(off); \ | ||
53 | .spillsp f30,SW(F30)+16+(off); .spillsp f31,SW(F31)+16+(off); \ | ||
54 | .spillsp r4,SW(R4)+16+(off); .spillsp r5,SW(R5)+16+(off); \ | ||
55 | .spillsp r6,SW(R6)+16+(off); .spillsp r7,SW(R7)+16+(off); \ | ||
56 | .spillsp b0,SW(B0)+16+(off); .spillsp b1,SW(B1)+16+(off); \ | ||
57 | .spillsp b2,SW(B2)+16+(off); .spillsp b3,SW(B3)+16+(off); \ | ||
58 | .spillsp b4,SW(B4)+16+(off); .spillsp b5,SW(B5)+16+(off); \ | ||
59 | .spillsp ar.pfs,SW(AR_PFS)+16+(off); .spillsp ar.lc,SW(AR_LC)+16+(off); \ | ||
60 | .spillsp @priunat,SW(AR_UNAT)+16+(off); \ | ||
61 | .spillsp ar.rnat,SW(AR_RNAT)+16+(off); \ | ||
62 | .spillsp ar.bspstore,SW(AR_BSPSTORE)+16+(off); \ | ||
63 | .spillsp pr,SW(PR)+16+(off) | ||
64 | |||
65 | #define DO_SAVE_SWITCH_STACK \ | ||
66 | movl r28=1f; \ | ||
67 | ;; \ | ||
68 | .fframe IA64_SWITCH_STACK_SIZE; \ | ||
69 | adds sp=-IA64_SWITCH_STACK_SIZE,sp; \ | ||
70 | mov.ret.sptk b7=r28,1f; \ | ||
71 | SWITCH_STACK_SAVES(0); \ | ||
72 | br.cond.sptk.many save_switch_stack; \ | ||
73 | 1: | ||
74 | |||
75 | #define DO_LOAD_SWITCH_STACK \ | ||
76 | movl r28=1f; \ | ||
77 | ;; \ | ||
78 | invala; \ | ||
79 | mov.ret.sptk b7=r28,1f; \ | ||
80 | br.cond.sptk.many load_switch_stack; \ | ||
81 | 1: .restore sp; \ | ||
82 | adds sp=IA64_SWITCH_STACK_SIZE,sp | ||
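The PT() and SW() helpers expand to IA64_PT_REGS_*_OFFSET and IA64_SWITCH_STACK_*_OFFSET constants, which keep these unwind annotations in sync with the C layouts of struct pt_regs and struct switch_stack. Constants of this kind are conventionally generated from the C structures rather than maintained by hand; a runnable C sketch of that generation technique, using an illustrative structure rather than the real ia64 definitions:

    #include <stddef.h>
    #include <stdio.h>

    /* illustrative stand-in for a register-save structure */
    struct toy_pt_regs {
            unsigned long cr_iip;
            unsigned long cr_ifs;
            unsigned long ar_unat;
    };

    /* emit "#define" lines that an assembly file could include, the way
       offset-generator programs turn struct layouts into symbolic constants */
    #define DEFINE(sym, val) \
            printf("#define %s %lu\n", #sym, (unsigned long)(val))

    int main(void)
    {
            DEFINE(TOY_PT_REGS_CR_IIP_OFFSET,  offsetof(struct toy_pt_regs, cr_iip));
            DEFINE(TOY_PT_REGS_CR_IFS_OFFSET,  offsetof(struct toy_pt_regs, cr_ifs));
            DEFINE(TOY_PT_REGS_AR_UNAT_OFFSET, offsetof(struct toy_pt_regs, ar_unat));
            return 0;
    }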
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S new file mode 100644 index 000000000000..0d8650f7fce7 --- /dev/null +++ b/arch/ia64/kernel/fsys.S | |||
@@ -0,0 +1,884 @@ | |||
1 | /* | ||
2 | * This file contains the light-weight system call handlers (fsyscall-handlers). | ||
3 | * | ||
4 | * Copyright (C) 2003 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * 25-Sep-03 davidm Implement fsys_rt_sigprocmask(). | ||
8 | * 18-Feb-03 louisk Implement fsys_gettimeofday(). | ||
9 | * 28-Feb-03 davidm Fixed several bugs in fsys_gettimeofday(). Tuned it some more, | ||
10 | * probably broke it along the way... ;-) | ||
11 | * 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make | ||
12 | * it capable of using memory based clocks without falling back to C code. | ||
13 | */ | ||
14 | |||
15 | #include <asm/asmmacro.h> | ||
16 | #include <asm/errno.h> | ||
17 | #include <asm/offsets.h> | ||
18 | #include <asm/percpu.h> | ||
19 | #include <asm/thread_info.h> | ||
20 | #include <asm/sal.h> | ||
21 | #include <asm/signal.h> | ||
22 | #include <asm/system.h> | ||
23 | #include <asm/unistd.h> | ||
24 | |||
25 | #include "entry.h" | ||
26 | |||
27 | /* | ||
28 | * See Documentation/ia64/fsys.txt for details on fsyscalls. | ||
29 | * | ||
30 | * On entry to an fsyscall handler: | ||
31 | * r10 = 0 (i.e., defaults to "successful syscall return") | ||
32 | * r11 = saved ar.pfs (a user-level value) | ||
33 | * r15 = system call number | ||
34 | * r16 = "current" task pointer (in normal kernel-mode, this is in r13) | ||
35 | * r32-r39 = system call arguments | ||
36 | * b6 = return address (a user-level value) | ||
37 | * ar.pfs = previous frame-state (a user-level value) | ||
38 | * PSR.be = cleared to zero (i.e., little-endian byte order is in effect) | ||
39 | * all other registers may contain values passed in from user-mode | ||
40 | * | ||
41 | * On return from an fsyscall handler: | ||
42 | * r11 = saved ar.pfs (as passed into the fsyscall handler) | ||
43 | * r15 = system call number (as passed into the fsyscall handler) | ||
44 | * r32-r39 = system call arguments (as passed into the fsyscall handler) | ||
45 | * b6 = return address (as passed into the fsyscall handler) | ||
46 | * ar.pfs = previous frame-state (as passed into the fsyscall handler) | ||
47 | */ | ||
48 | |||
49 | ENTRY(fsys_ni_syscall) | ||
50 | .prologue | ||
51 | .altrp b6 | ||
52 | .body | ||
53 | mov r8=ENOSYS | ||
54 | mov r10=-1 | ||
55 | FSYS_RETURN | ||
56 | END(fsys_ni_syscall) | ||
57 | |||
58 | ENTRY(fsys_getpid) | ||
59 | .prologue | ||
60 | .altrp b6 | ||
61 | .body | ||
62 | add r9=TI_FLAGS+IA64_TASK_SIZE,r16 | ||
63 | ;; | ||
64 | ld4 r9=[r9] | ||
65 | add r8=IA64_TASK_TGID_OFFSET,r16 | ||
66 | ;; | ||
67 | and r9=TIF_ALLWORK_MASK,r9 | ||
68 | ld4 r8=[r8] // r8 = current->tgid | ||
69 | ;; | ||
70 | cmp.ne p8,p0=0,r9 | ||
71 | (p8) br.spnt.many fsys_fallback_syscall | ||
72 | FSYS_RETURN | ||
73 | END(fsys_getpid) | ||
74 | |||
75 | ENTRY(fsys_getppid) | ||
76 | .prologue | ||
77 | .altrp b6 | ||
78 | .body | ||
79 | add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 | ||
80 | ;; | ||
81 | ld8 r17=[r17] // r17 = current->group_leader | ||
82 | add r9=TI_FLAGS+IA64_TASK_SIZE,r16 | ||
83 | ;; | ||
84 | |||
85 | ld4 r9=[r9] | ||
86 | add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = &current->group_leader->real_parent | ||
87 | ;; | ||
88 | and r9=TIF_ALLWORK_MASK,r9 | ||
89 | |||
90 | 1: ld8 r18=[r17] // r18 = current->group_leader->real_parent | ||
91 | ;; | ||
92 | cmp.ne p8,p0=0,r9 | ||
93 | add r8=IA64_TASK_TGID_OFFSET,r18 // r8 = &current->group_leader->real_parent->tgid | ||
94 | ;; | ||
95 | |||
96 | /* | ||
97 | * The .acq is needed to ensure that the read of tgid has returned its data before | ||
98 | * we re-check "real_parent". | ||
99 | */ | ||
100 | ld4.acq r8=[r8] // r8 = current->group_leader->real_parent->tgid | ||
101 | #ifdef CONFIG_SMP | ||
102 | /* | ||
103 | * Re-read current->group_leader->real_parent. | ||
104 | */ | ||
105 | ld8 r19=[r17] // r19 = current->group_leader->real_parent | ||
106 | (p8) br.spnt.many fsys_fallback_syscall | ||
107 | ;; | ||
108 | cmp.ne p6,p0=r18,r19 // did real_parent change? | ||
109 | mov r19=0 // i must not leak kernel bits... | ||
110 | (p6) br.cond.spnt.few 1b // yes -> redo the read of tgid and the check | ||
111 | ;; | ||
112 | mov r17=0 // i must not leak kernel bits... | ||
113 | mov r18=0 // i must not leak kernel bits... | ||
114 | #else | ||
115 | mov r17=0 // i must not leak kernel bits... | ||
116 | mov r18=0 // i must not leak kernel bits... | ||
117 | mov r19=0 // i must not leak kernel bits... | ||
118 | #endif | ||
119 | FSYS_RETURN | ||
120 | END(fsys_getppid) | ||
121 | |||
122 | ENTRY(fsys_set_tid_address) | ||
123 | .prologue | ||
124 | .altrp b6 | ||
125 | .body | ||
126 | add r9=TI_FLAGS+IA64_TASK_SIZE,r16 | ||
127 | ;; | ||
128 | ld4 r9=[r9] | ||
129 | tnat.z p6,p7=r32 // check argument register for being NaT | ||
130 | ;; | ||
131 | and r9=TIF_ALLWORK_MASK,r9 | ||
132 | add r8=IA64_TASK_PID_OFFSET,r16 | ||
133 | add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16 | ||
134 | ;; | ||
135 | ld4 r8=[r8] | ||
136 | cmp.ne p8,p0=0,r9 | ||
137 | mov r17=-1 | ||
138 | ;; | ||
139 | (p6) st8 [r18]=r32 | ||
140 | (p7) st8 [r18]=r17 | ||
141 | (p8) br.spnt.many fsys_fallback_syscall | ||
142 | ;; | ||
143 | mov r17=0 // i must not leak kernel bits... | ||
144 | mov r18=0 // i must not leak kernel bits... | ||
145 | FSYS_RETURN | ||
146 | END(fsys_set_tid_address) | ||
147 | |||
148 | /* | ||
149 | * Ensure that the time interpolator structure is compatible with the asm code | ||
150 | */ | ||
151 | #if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET != 0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \ | ||
152 | || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4 | ||
153 | #error fsys_gettimeofday incompatible with changes to struct time_interpolator | ||
154 | #endif | ||
155 | #define CLOCK_REALTIME 0 | ||
156 | #define CLOCK_MONOTONIC 1 | ||
157 | #define CLOCK_DIVIDE_BY_1000 0x4000 | ||
158 | #define CLOCK_ADD_MONOTONIC 0x8000 | ||
159 | |||
160 | ENTRY(fsys_gettimeofday) | ||
161 | .prologue | ||
162 | .altrp b6 | ||
163 | .body | ||
164 | mov r31 = r32 | ||
165 | tnat.nz p6,p0 = r33 // guard against NaT argument | ||
166 | (p6) br.cond.spnt.few .fail_einval | ||
167 | mov r30 = CLOCK_DIVIDE_BY_1000 | ||
168 | ;; | ||
169 | .gettime: | ||
170 | // Register map | ||
171 | // Incoming r31 = pointer to address where to place result | ||
172 | // r30 = flags determining how time is processed | ||
173 | // r2,r3 = temp r4-r7 preserved | ||
174 | // r8 = result nanoseconds | ||
175 | // r9 = result seconds | ||
176 | // r10 = temporary storage for clock difference | ||
177 | // r11 = preserved: saved ar.pfs | ||
178 | // r12 = preserved: memory stack | ||
179 | // r13 = preserved: thread pointer | ||
180 | // r14 = address of mask / mask | ||
181 | // r15 = preserved: system call number | ||
182 | // r16 = preserved: current task pointer | ||
183 | // r17 = wall_to_monotonic value | ||
184 | // r18 = time_interpolator->offset | ||
185 | // r19 = address of wall_to_monotonic | ||
186 | // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address | ||
187 | // r21 = shift factor | ||
188 | // r22 = address of time interpolator->last_counter | ||
189 | // r23 = address of time_interpolator->last_cycle | ||
190 | // r24 = address of time_interpolator->offset | ||
191 | // r25 = last_cycle value | ||
192 | // r26 = last_counter value | ||
193 | // r27 = pointer to xtime | ||
194 | // r28 = sequence number at the beginning of critical section | ||
195 | // r29 = address of seqlock | ||
196 | // r30 = time processing flags / memory address | ||
197 | // r31 = pointer to result | ||
198 | // Predicates | ||
199 | // p6,p7 short term use | ||
200 | // p8 = timesource ar.itc | ||
201 | // p9 = timesource mmio64 | ||
202 | // p10 = timesource mmio32 | ||
203 | // p11 = timesource not to be handled by asm code | ||
204 | // p12 = memory time source ( = p9 | p10) | ||
205 | // p13 = do cmpxchg with time_interpolator_last_cycle | ||
206 | // p14 = Divide by 1000 | ||
207 | // p15 = Add monotonic | ||
208 | // | ||
209 | // Note that instructions are optimized for McKinley. McKinley can process two | ||
210 | // bundles simultaneously and therefore we continuously try to feed the CPU | ||
211 | // two bundles and then a stop. | ||
212 | tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure | ||
213 | mov pr = r30,0xc000 // Set predicates according to function | ||
214 | add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 | ||
215 | movl r20 = time_interpolator | ||
216 | ;; | ||
217 | ld8 r20 = [r20] // get pointer to time_interpolator structure | ||
218 | movl r29 = xtime_lock | ||
219 | ld4 r2 = [r2] // process work pending flags | ||
220 | movl r27 = xtime | ||
221 | ;; // only one bundle here | ||
222 | ld8 r21 = [r20] // first quad with control information | ||
223 | and r2 = TIF_ALLWORK_MASK,r2 | ||
224 | (p6) br.cond.spnt.few .fail_einval // deferred branch | ||
225 | ;; | ||
226 | add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20 | ||
227 | extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc | ||
228 | extr r8 = r21,0,16 // time_interpolator->source | ||
229 | cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled | ||
230 | (p6) br.cond.spnt.many fsys_fallback_syscall | ||
231 | ;; | ||
232 | cmp.eq p8,p12 = 0,r8 // Check for cpu timer | ||
233 | cmp.eq p9,p0 = 1,r8 // MMIO64 ? | ||
234 | extr r2 = r21,24,8 // time_interpolator->jitter | ||
235 | cmp.eq p10,p0 = 2,r8 // MMIO32 ? | ||
236 | cmp.ltu p11,p0 = 2,r8 // function or other clock | ||
237 | (p11) br.cond.spnt.many fsys_fallback_syscall | ||
238 | ;; | ||
239 | setf.sig f7 = r3 // Setup for scaling of counter | ||
240 | (p15) movl r19 = wall_to_monotonic | ||
241 | (p12) ld8 r30 = [r10] | ||
242 | cmp.ne p13,p0 = r2,r0 // need jitter compensation? | ||
243 | extr r21 = r21,16,8 // shift factor | ||
244 | ;; | ||
245 | .time_redo: | ||
246 | .pred.rel.mutex p8,p9,p10 | ||
247 | ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes | ||
248 | (p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! | ||
249 | add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20 | ||
250 | (p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues.. | ||
251 | (p10) ld4 r2 = [r30] // readl(ti->address) | ||
252 | (p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20 | ||
253 | ;; // could be removed by moving the last add upward | ||
254 | ld8 r26 = [r22] // time_interpolator->last_counter | ||
255 | (p13) ld8 r25 = [r23] // time interpolator->last_cycle | ||
256 | add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20 | ||
257 | (p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET | ||
258 | ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET | ||
259 | add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20 | ||
260 | ;; | ||
261 | ld8 r18 = [r24] // time_interpolator->offset | ||
262 | ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec | ||
263 | (p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) | ||
264 | ;; | ||
265 | ld8 r14 = [r14] // time_interpolator->mask | ||
266 | (p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared | ||
267 | sub r10 = r2,r26 // current_counter - last_counter | ||
268 | ;; | ||
269 | (p6) sub r10 = r25,r26 // time we got was less than last_cycle | ||
270 | (p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg | ||
271 | ;; | ||
272 | and r10 = r10,r14 // Apply mask | ||
273 | ;; | ||
274 | setf.sig f8 = r10 | ||
275 | nop.i 123 | ||
276 | ;; | ||
277 | (p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv | ||
278 | EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time | ||
279 | xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) | ||
280 | (p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs | ||
281 | ;; | ||
282 | (p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET | ||
283 | (p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo | ||
284 | // simulate tbit.nz.or p7,p0 = r28,0 | ||
285 | and r28 = ~1,r28 // Make sequence even to force retry if odd | ||
286 | getf.sig r2 = f8 | ||
287 | mf | ||
288 | add r8 = r8,r18 // Add time interpolator offset | ||
289 | ;; | ||
290 | ld4 r10 = [r29] // xtime_lock.sequence | ||
291 | (p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs | ||
292 | shr.u r2 = r2,r21 | ||
293 | ;; // overloaded 3 bundles! | ||
294 | // End critical section. | ||
295 | add r8 = r8,r2 // Add xtime.nsecs | ||
296 | cmp4.ne.or p7,p0 = r28,r10 | ||
297 | (p7) br.cond.dpnt.few .time_redo // sequence number changed ? | ||
298 | // Now r8=tv->tv_nsec and r9=tv->tv_sec | ||
299 | mov r10 = r0 | ||
300 | movl r2 = 1000000000 | ||
301 | add r23 = IA64_TIMESPEC_TV_NSEC_OFFSET, r31 | ||
302 | (p14) movl r3 = 2361183241434822607 // Prep for / 1000 hack | ||
303 | ;; | ||
304 | .time_normalize: | ||
305 | mov r21 = r8 | ||
306 | cmp.ge p6,p0 = r8,r2 | ||
307 | (p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting some time | ||
308 | ;; | ||
309 | (p14) setf.sig f8 = r20 | ||
310 | (p6) sub r8 = r8,r2 | ||
311 | (p6) add r9 = 1,r9 // two nops before the branch. | ||
312 | (p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod | ||
313 | (p6) br.cond.dpnt.few .time_normalize | ||
314 | ;; | ||
315 | // Divided by 8 through shift. Now divide by 125 | ||
316 | // The compiler was able to do that with a multiply | ||
317 | // and a shift and we do the same | ||
318 | EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles | ||
319 | (p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it... | ||
320 | ;; | ||
321 | mov r8 = r0 | ||
322 | (p14) getf.sig r2 = f8 | ||
323 | ;; | ||
324 | (p14) shr.u r21 = r2, 4 | ||
325 | ;; | ||
326 | EX(.fail_efault, st8 [r31] = r9) | ||
327 | EX(.fail_efault, st8 [r23] = r21) | ||
328 | FSYS_RETURN | ||
329 | .fail_einval: | ||
330 | mov r8 = EINVAL | ||
331 | mov r10 = -1 | ||
332 | FSYS_RETURN | ||
333 | .fail_efault: | ||
334 | mov r8 = EFAULT | ||
335 | mov r10 = -1 | ||
336 | FSYS_RETURN | ||
337 | END(fsys_gettimeofday) | ||
338 | |||
339 | ENTRY(fsys_clock_gettime) | ||
340 | .prologue | ||
341 | .altrp b6 | ||
342 | .body | ||
343 | cmp4.ltu p6, p0 = CLOCK_MONOTONIC, r32 | ||
344 | // Fallback if this is not CLOCK_REALTIME or CLOCK_MONOTONIC | ||
345 | (p6) br.spnt.few fsys_fallback_syscall | ||
346 | mov r31 = r33 | ||
347 | shl r30 = r32,15 | ||
348 | br.many .gettime | ||
349 | END(fsys_clock_gettime) | ||
350 | |||
351 | /* | ||
352 | * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). | ||
353 | */ | ||
354 | #if _NSIG_WORDS != 1 | ||
355 | # error Sorry, fsys_rt_sigprocmask() needs to be updated for _NSIG_WORDS != 1. | ||
356 | #endif | ||
357 | ENTRY(fsys_rt_sigprocmask) | ||
358 | .prologue | ||
359 | .altrp b6 | ||
360 | .body | ||
361 | |||
362 | add r2=IA64_TASK_BLOCKED_OFFSET,r16 | ||
363 | add r9=TI_FLAGS+IA64_TASK_SIZE,r16 | ||
364 | cmp4.ltu p6,p0=SIG_SETMASK,r32 | ||
365 | |||
366 | cmp.ne p15,p0=r0,r34 // oset != NULL? | ||
367 | tnat.nz p8,p0=r34 | ||
368 | add r31=IA64_TASK_SIGHAND_OFFSET,r16 | ||
369 | ;; | ||
370 | ld8 r3=[r2] // read/prefetch current->blocked | ||
371 | ld4 r9=[r9] | ||
372 | tnat.nz.or p6,p0=r35 | ||
373 | |||
374 | cmp.ne.or p6,p0=_NSIG_WORDS*8,r35 | ||
375 | tnat.nz.or p6,p0=r32 | ||
376 | (p6) br.spnt.few .fail_einval // fail with EINVAL | ||
377 | ;; | ||
378 | #ifdef CONFIG_SMP | ||
379 | ld8 r31=[r31] // r31 <- current->sighand | ||
380 | #endif | ||
381 | and r9=TIF_ALLWORK_MASK,r9 | ||
382 | tnat.nz.or p8,p0=r33 | ||
383 | ;; | ||
384 | cmp.ne p7,p0=0,r9 | ||
385 | cmp.eq p6,p0=r0,r33 // set == NULL? | ||
386 | add r31=IA64_SIGHAND_SIGLOCK_OFFSET,r31 // r31 <- current->sighand->siglock | ||
387 | (p8) br.spnt.few .fail_efault // fail with EFAULT | ||
388 | (p7) br.spnt.many fsys_fallback_syscall // got pending kernel work... | ||
389 | (p6) br.dpnt.many .store_mask // -> short-circuit to just reading the signal mask | ||
390 | |||
391 | /* Argh, we actually have to do some work and _update_ the signal mask: */ | ||
392 | |||
393 | EX(.fail_efault, probe.r.fault r33, 3) // verify user has read-access to *set | ||
394 | EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set | ||
395 | mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1)) | ||
396 | ;; | ||
397 | |||
398 | rsm psr.i // mask interrupt delivery | ||
399 | mov ar.ccv=0 | ||
400 | andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP | ||
401 | |||
402 | #ifdef CONFIG_SMP | ||
403 | mov r17=1 | ||
404 | ;; | ||
405 | cmpxchg4.acq r18=[r31],r17,ar.ccv // try to acquire the lock | ||
406 | mov r8=EINVAL // default to EINVAL | ||
407 | ;; | ||
408 | ld8 r3=[r2] // re-read current->blocked now that we hold the lock | ||
409 | cmp4.ne p6,p0=r18,r0 | ||
410 | (p6) br.cond.spnt.many .lock_contention | ||
411 | ;; | ||
412 | #else | ||
413 | ld8 r3=[r2] // re-read current->blocked now that we hold the lock | ||
414 | mov r8=EINVAL // default to EINVAL | ||
415 | #endif | ||
416 | add r18=IA64_TASK_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r16 | ||
417 | add r19=IA64_TASK_SIGNAL_OFFSET,r16 | ||
418 | cmp4.eq p6,p0=SIG_BLOCK,r32 | ||
419 | ;; | ||
420 | ld8 r19=[r19] // r19 <- current->signal | ||
421 | cmp4.eq p7,p0=SIG_UNBLOCK,r32 | ||
422 | cmp4.eq p8,p0=SIG_SETMASK,r32 | ||
423 | ;; | ||
424 | ld8 r18=[r18] // r18 <- current->pending.signal | ||
425 | .pred.rel.mutex p6,p7,p8 | ||
426 | (p6) or r14=r3,r14 // SIG_BLOCK | ||
427 | (p7) andcm r14=r3,r14 // SIG_UNBLOCK | ||
428 | |||
429 | (p8) mov r14=r14 // SIG_SETMASK | ||
430 | (p6) mov r8=0 // clear error code | ||
431 | // recalc_sigpending() | ||
432 | add r17=IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,r19 | ||
433 | |||
434 | add r19=IA64_SIGNAL_SHARED_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r19 | ||
435 | ;; | ||
436 | ld4 r17=[r17] // r17 <- current->signal->group_stop_count | ||
437 | (p7) mov r8=0 // clear error code | ||
438 | |||
439 | ld8 r19=[r19] // r19 <- current->signal->shared_pending | ||
440 | ;; | ||
441 | cmp4.gt p6,p7=r17,r0 // p6/p7 <- (current->signal->group_stop_count > 0)? | ||
442 | (p8) mov r8=0 // clear error code | ||
443 | |||
444 | or r18=r18,r19 // r18 <- current->pending | current->signal->shared_pending | ||
445 | ;; | ||
446 | // r18 <- (current->pending | current->signal->shared_pending) & ~current->blocked: | ||
447 | andcm r18=r18,r14 | ||
448 | add r9=TI_FLAGS+IA64_TASK_SIZE,r16 | ||
449 | ;; | ||
450 | |||
451 | (p7) cmp.ne.or.andcm p6,p7=r18,r0 // p6/p7 <- signal pending | ||
452 | mov r19=0 // i must not leak kernel bits... | ||
453 | (p6) br.cond.dpnt.many .sig_pending | ||
454 | ;; | ||
455 | |||
456 | 1: ld4 r17=[r9] // r17 <- current->thread_info->flags | ||
457 | ;; | ||
458 | mov ar.ccv=r17 | ||
459 | and r18=~_TIF_SIGPENDING,r17 // r18 <- r17 & ~(1 << TIF_SIGPENDING) | ||
460 | ;; | ||
461 | |||
462 | st8 [r2]=r14 // update current->blocked with new mask | ||
463 | cmpxchg4.acq r14=[r9],r18,ar.ccv // current->thread_info->flags <- r18 | ||
464 | ;; | ||
465 | cmp.ne p6,p0=r17,r14 // update failed? | ||
466 | (p6) br.cond.spnt.few 1b // yes -> retry | ||
467 | |||
468 | #ifdef CONFIG_SMP | ||
469 | st4.rel [r31]=r0 // release the lock | ||
470 | #endif | ||
471 | ssm psr.i | ||
472 | ;; | ||
473 | |||
474 | srlz.d // ensure psr.i is set again | ||
475 | mov r18=0 // i must not leak kernel bits... | ||
476 | |||
477 | .store_mask: | ||
478 | EX(.fail_efault, (p15) probe.w.fault r34, 3) // verify user has write-access to *oset | ||
479 | EX(.fail_efault, (p15) st8 [r34]=r3) | ||
480 | mov r2=0 // i must not leak kernel bits... | ||
481 | mov r3=0 // i must not leak kernel bits... | ||
482 | mov r8=0 // return 0 | ||
483 | mov r9=0 // i must not leak kernel bits... | ||
484 | mov r14=0 // i must not leak kernel bits... | ||
485 | mov r17=0 // i must not leak kernel bits... | ||
486 | mov r31=0 // i must not leak kernel bits... | ||
487 | FSYS_RETURN | ||
488 | |||
489 | .sig_pending: | ||
490 | #ifdef CONFIG_SMP | ||
491 | st4.rel [r31]=r0 // release the lock | ||
492 | #endif | ||
493 | ssm psr.i | ||
494 | ;; | ||
495 | srlz.d | ||
496 | br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall | ||
497 | |||
498 | #ifdef CONFIG_SMP | ||
499 | .lock_contention: | ||
500 | /* Rather than spinning here, fall back on doing a heavy-weight syscall. */ | ||
501 | ssm psr.i | ||
502 | ;; | ||
503 | srlz.d | ||
504 | br.sptk.many fsys_fallback_syscall | ||
505 | #endif | ||
506 | END(fsys_rt_sigprocmask) | ||
507 | |||
508 | ENTRY(fsys_fallback_syscall) | ||
509 | .prologue | ||
510 | .altrp b6 | ||
511 | .body | ||
512 | /* | ||
513 | * We only get here from light-weight syscall handlers. Thus, we already | ||
514 | * know that r15 contains a valid syscall number. No need to re-check. | ||
515 | */ | ||
516 | adds r17=-1024,r15 | ||
517 | movl r14=sys_call_table | ||
518 | ;; | ||
519 | rsm psr.i | ||
520 | shladd r18=r17,3,r14 | ||
521 | ;; | ||
522 | ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point | ||
523 | mov r29=psr // read psr (12 cyc load latency) | ||
524 | mov r27=ar.rsc | ||
525 | mov r21=ar.fpsr | ||
526 | mov r26=ar.pfs | ||
527 | END(fsys_fallback_syscall) | ||
528 | /* FALL THROUGH */ | ||
529 | GLOBAL_ENTRY(fsys_bubble_down) | ||
530 | .prologue | ||
531 | .altrp b6 | ||
532 | .body | ||
533 | /* | ||
534 | * We get here for syscalls that don't have a lightweight handler. For those, we | ||
535 | * need to bubble down into the kernel and that requires setting up a minimal | ||
536 | * pt_regs structure, and initializing the CPU state more or less as if an | ||
537 | * interruption had occurred. To make syscall-restarts work, we set up pt_regs | ||
538 | * such that cr_iip points to the second instruction in syscall_via_break. | ||
539 | * Hence, decrementing the IP will restart the syscall via break, while leaving | ||
540 | * it unchanged will return us to the caller, as usual. Note that we preserve | ||
541 | * the value of psr.pp rather than initializing it from dcr.pp. This makes it | ||
542 | * possible to distinguish fsyscall execution from other privileged execution. | ||
543 | * | ||
544 | * On entry: | ||
545 | * - normal fsyscall handler register usage, except that we also have: | ||
546 | * - r18: address of syscall entry point | ||
547 | * - r21: ar.fpsr | ||
548 | * - r26: ar.pfs | ||
549 | * - r27: ar.rsc | ||
550 | * - r29: psr | ||
551 | */ | ||
552 | # define PSR_PRESERVED_BITS (IA64_PSR_UP | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_PK \ | ||
553 | | IA64_PSR_DT | IA64_PSR_PP | IA64_PSR_SP | IA64_PSR_RT \ | ||
554 | | IA64_PSR_IC) | ||
555 | /* | ||
556 | * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc. The rest we have | ||
557 | * to synthesize. | ||
558 | */ | ||
559 | # define PSR_ONE_BITS ((3 << IA64_PSR_CPL0_BIT) | (0x1 << IA64_PSR_RI_BIT) \ | ||
560 | | IA64_PSR_BN | IA64_PSR_I) | ||
561 | |||
562 | invala | ||
563 | movl r8=PSR_ONE_BITS | ||
564 | |||
565 | mov r25=ar.unat // save ar.unat (5 cyc) | ||
566 | movl r9=PSR_PRESERVED_BITS | ||
567 | |||
568 | mov ar.rsc=0 // set enforced lazy mode, pl 0, little-endian, loadrs=0 | ||
569 | movl r28=__kernel_syscall_via_break | ||
570 | ;; | ||
571 | mov r23=ar.bspstore // save ar.bspstore (12 cyc) | ||
572 | mov r31=pr // save pr (2 cyc) | ||
573 | mov r20=r1 // save caller's gp in r20 | ||
574 | ;; | ||
575 | mov r2=r16 // copy current task addr to addl-addressable register | ||
576 | and r9=r9,r29 | ||
577 | mov r19=b6 // save b6 (2 cyc) | ||
578 | ;; | ||
579 | mov psr.l=r9 // slam the door (17 cyc to srlz.i) | ||
580 | or r29=r8,r29 // construct cr.ipsr value to save | ||
581 | addl r22=IA64_RBS_OFFSET,r2 // compute base of RBS | ||
582 | ;; | ||
583 | // GAS reports a spurious RAW hazard on the read of ar.rnat because it thinks | ||
584 | // we may be reading ar.itc after writing to psr.l. Avoid that message with | ||
585 | // this directive: | ||
586 | dv_serialize_data | ||
587 | mov.m r24=ar.rnat // read ar.rnat (5 cyc lat) | ||
588 | lfetch.fault.excl.nt1 [r22] | ||
589 | adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r2 | ||
590 | |||
591 | // ensure previous insn group is issued before we stall for srlz.i: | ||
592 | ;; | ||
593 | srlz.i // ensure new psr.l has been established | ||
594 | ///////////////////////////////////////////////////////////////////////////// | ||
595 | ////////// from this point on, execution is not interruptible anymore | ||
596 | ///////////////////////////////////////////////////////////////////////////// | ||
597 | addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // compute base of memory stack | ||
598 | cmp.ne pKStk,pUStk=r0,r0 // set pKStk <- 0, pUStk <- 1 | ||
599 | ;; | ||
600 | st1 [r16]=r0 // clear current->thread.on_ustack flag | ||
601 | mov ar.bspstore=r22 // switch to kernel RBS | ||
602 | mov b6=r18 // copy syscall entry-point to b6 (7 cyc) | ||
603 | add r3=TI_FLAGS+IA64_TASK_SIZE,r2 | ||
604 | ;; | ||
605 | ld4 r3=[r3] // r3 = current_thread_info()->flags | ||
606 | mov r18=ar.bsp // save (kernel) ar.bsp (12 cyc) | ||
607 | mov ar.rsc=0x3 // set eager mode, pl 0, little-endian, loadrs=0 | ||
608 | br.call.sptk.many b7=ia64_syscall_setup | ||
609 | ;; | ||
610 | ssm psr.i | ||
611 | movl r2=ia64_ret_from_syscall | ||
612 | ;; | ||
613 | mov rp=r2 // set the real return addr | ||
614 | tbit.z p8,p0=r3,TIF_SYSCALL_TRACE | ||
615 | ;; | ||
616 | (p10) br.cond.spnt.many ia64_ret_from_syscall // p10==true means there are more than 8 out registers | ||
617 | (p8) br.call.sptk.many b6=b6 // ignore this return addr | ||
618 | br.cond.sptk ia64_trace_syscall | ||
619 | END(fsys_bubble_down) | ||
620 | |||
621 | .rodata | ||
622 | .align 8 | ||
623 | .globl fsyscall_table | ||
624 | |||
625 | data8 fsys_bubble_down | ||
626 | fsyscall_table: | ||
627 | data8 fsys_ni_syscall | ||
628 | data8 0 // exit // 1025 | ||
629 | data8 0 // read | ||
630 | data8 0 // write | ||
631 | data8 0 // open | ||
632 | data8 0 // close | ||
633 | data8 0 // creat // 1030 | ||
634 | data8 0 // link | ||
635 | data8 0 // unlink | ||
636 | data8 0 // execve | ||
637 | data8 0 // chdir | ||
638 | data8 0 // fchdir // 1035 | ||
639 | data8 0 // utimes | ||
640 | data8 0 // mknod | ||
641 | data8 0 // chmod | ||
642 | data8 0 // chown | ||
643 | data8 0 // lseek // 1040 | ||
644 | data8 fsys_getpid // getpid | ||
645 | data8 fsys_getppid // getppid | ||
646 | data8 0 // mount | ||
647 | data8 0 // umount | ||
648 | data8 0 // setuid // 1045 | ||
649 | data8 0 // getuid | ||
650 | data8 0 // geteuid | ||
651 | data8 0 // ptrace | ||
652 | data8 0 // access | ||
653 | data8 0 // sync // 1050 | ||
654 | data8 0 // fsync | ||
655 | data8 0 // fdatasync | ||
656 | data8 0 // kill | ||
657 | data8 0 // rename | ||
658 | data8 0 // mkdir // 1055 | ||
659 | data8 0 // rmdir | ||
660 | data8 0 // dup | ||
661 | data8 0 // pipe | ||
662 | data8 0 // times | ||
663 | data8 0 // brk // 1060 | ||
664 | data8 0 // setgid | ||
665 | data8 0 // getgid | ||
666 | data8 0 // getegid | ||
667 | data8 0 // acct | ||
668 | data8 0 // ioctl // 1065 | ||
669 | data8 0 // fcntl | ||
670 | data8 0 // umask | ||
671 | data8 0 // chroot | ||
672 | data8 0 // ustat | ||
673 | data8 0 // dup2 // 1070 | ||
674 | data8 0 // setreuid | ||
675 | data8 0 // setregid | ||
676 | data8 0 // getresuid | ||
677 | data8 0 // setresuid | ||
678 | data8 0 // getresgid // 1075 | ||
679 | data8 0 // setresgid | ||
680 | data8 0 // getgroups | ||
681 | data8 0 // setgroups | ||
682 | data8 0 // getpgid | ||
683 | data8 0 // setpgid // 1080 | ||
684 | data8 0 // setsid | ||
685 | data8 0 // getsid | ||
686 | data8 0 // sethostname | ||
687 | data8 0 // setrlimit | ||
688 | data8 0 // getrlimit // 1085 | ||
689 | data8 0 // getrusage | ||
690 | data8 fsys_gettimeofday // gettimeofday | ||
691 | data8 0 // settimeofday | ||
692 | data8 0 // select | ||
693 | data8 0 // poll // 1090 | ||
694 | data8 0 // symlink | ||
695 | data8 0 // readlink | ||
696 | data8 0 // uselib | ||
697 | data8 0 // swapon | ||
698 | data8 0 // swapoff // 1095 | ||
699 | data8 0 // reboot | ||
700 | data8 0 // truncate | ||
701 | data8 0 // ftruncate | ||
702 | data8 0 // fchmod | ||
703 | data8 0 // fchown // 1100 | ||
704 | data8 0 // getpriority | ||
705 | data8 0 // setpriority | ||
706 | data8 0 // statfs | ||
707 | data8 0 // fstatfs | ||
708 | data8 0 // gettid // 1105 | ||
709 | data8 0 // semget | ||
710 | data8 0 // semop | ||
711 | data8 0 // semctl | ||
712 | data8 0 // msgget | ||
713 | data8 0 // msgsnd // 1110 | ||
714 | data8 0 // msgrcv | ||
715 | data8 0 // msgctl | ||
716 | data8 0 // shmget | ||
717 | data8 0 // shmat | ||
718 | data8 0 // shmdt // 1115 | ||
719 | data8 0 // shmctl | ||
720 | data8 0 // syslog | ||
721 | data8 0 // setitimer | ||
722 | data8 0 // getitimer | ||
723 | data8 0 // 1120 | ||
724 | data8 0 | ||
725 | data8 0 | ||
726 | data8 0 // vhangup | ||
727 | data8 0 // lchown | ||
728 | data8 0 // remap_file_pages // 1125 | ||
729 | data8 0 // wait4 | ||
730 | data8 0 // sysinfo | ||
731 | data8 0 // clone | ||
732 | data8 0 // setdomainname | ||
733 | data8 0 // newuname // 1130 | ||
734 | data8 0 // adjtimex | ||
735 | data8 0 | ||
736 | data8 0 // init_module | ||
737 | data8 0 // delete_module | ||
738 | data8 0 // 1135 | ||
739 | data8 0 | ||
740 | data8 0 // quotactl | ||
741 | data8 0 // bdflush | ||
742 | data8 0 // sysfs | ||
743 | data8 0 // personality // 1140 | ||
744 | data8 0 // afs_syscall | ||
745 | data8 0 // setfsuid | ||
746 | data8 0 // setfsgid | ||
747 | data8 0 // getdents | ||
748 | data8 0 // flock // 1145 | ||
749 | data8 0 // readv | ||
750 | data8 0 // writev | ||
751 | data8 0 // pread64 | ||
752 | data8 0 // pwrite64 | ||
753 | data8 0 // sysctl // 1150 | ||
754 | data8 0 // mmap | ||
755 | data8 0 // munmap | ||
756 | data8 0 // mlock | ||
757 | data8 0 // mlockall | ||
758 | data8 0 // mprotect // 1155 | ||
759 | data8 0 // mremap | ||
760 | data8 0 // msync | ||
761 | data8 0 // munlock | ||
762 | data8 0 // munlockall | ||
763 | data8 0 // sched_getparam // 1160 | ||
764 | data8 0 // sched_setparam | ||
765 | data8 0 // sched_getscheduler | ||
766 | data8 0 // sched_setscheduler | ||
767 | data8 0 // sched_yield | ||
768 | data8 0 // sched_get_priority_max // 1165 | ||
769 | data8 0 // sched_get_priority_min | ||
770 | data8 0 // sched_rr_get_interval | ||
771 | data8 0 // nanosleep | ||
772 | data8 0 // nfsservctl | ||
773 | data8 0 // prctl // 1170 | ||
774 | data8 0 // getpagesize | ||
775 | data8 0 // mmap2 | ||
776 | data8 0 // pciconfig_read | ||
777 | data8 0 // pciconfig_write | ||
778 | data8 0 // perfmonctl // 1175 | ||
779 | data8 0 // sigaltstack | ||
780 | data8 0 // rt_sigaction | ||
781 | data8 0 // rt_sigpending | ||
782 | data8 fsys_rt_sigprocmask // rt_sigprocmask | ||
783 | data8 0 // rt_sigqueueinfo // 1180 | ||
784 | data8 0 // rt_sigreturn | ||
785 | data8 0 // rt_sigsuspend | ||
786 | data8 0 // rt_sigtimedwait | ||
787 | data8 0 // getcwd | ||
788 | data8 0 // capget // 1185 | ||
789 | data8 0 // capset | ||
790 | data8 0 // sendfile | ||
791 | data8 0 | ||
792 | data8 0 | ||
793 | data8 0 // socket // 1190 | ||
794 | data8 0 // bind | ||
795 | data8 0 // connect | ||
796 | data8 0 // listen | ||
797 | data8 0 // accept | ||
798 | data8 0 // getsockname // 1195 | ||
799 | data8 0 // getpeername | ||
800 | data8 0 // socketpair | ||
801 | data8 0 // send | ||
802 | data8 0 // sendto | ||
803 | data8 0 // recv // 1200 | ||
804 | data8 0 // recvfrom | ||
805 | data8 0 // shutdown | ||
806 | data8 0 // setsockopt | ||
807 | data8 0 // getsockopt | ||
808 | data8 0 // sendmsg // 1205 | ||
809 | data8 0 // recvmsg | ||
810 | data8 0 // pivot_root | ||
811 | data8 0 // mincore | ||
812 | data8 0 // madvise | ||
813 | data8 0 // newstat // 1210 | ||
814 | data8 0 // newlstat | ||
815 | data8 0 // newfstat | ||
816 | data8 0 // clone2 | ||
817 | data8 0 // getdents64 | ||
818 | data8 0 // getunwind // 1215 | ||
819 | data8 0 // readahead | ||
820 | data8 0 // setxattr | ||
821 | data8 0 // lsetxattr | ||
822 | data8 0 // fsetxattr | ||
823 | data8 0 // getxattr // 1220 | ||
824 | data8 0 // lgetxattr | ||
825 | data8 0 // fgetxattr | ||
826 | data8 0 // listxattr | ||
827 | data8 0 // llistxattr | ||
828 | data8 0 // flistxattr // 1225 | ||
829 | data8 0 // removexattr | ||
830 | data8 0 // lremovexattr | ||
831 | data8 0 // fremovexattr | ||
832 | data8 0 // tkill | ||
833 | data8 0 // futex // 1230 | ||
834 | data8 0 // sched_setaffinity | ||
835 | data8 0 // sched_getaffinity | ||
836 | data8 fsys_set_tid_address // set_tid_address | ||
837 | data8 0 // fadvise64_64 | ||
838 | data8 0 // tgkill // 1235 | ||
839 | data8 0 // exit_group | ||
840 | data8 0 // lookup_dcookie | ||
841 | data8 0 // io_setup | ||
842 | data8 0 // io_destroy | ||
843 | data8 0 // io_getevents // 1240 | ||
844 | data8 0 // io_submit | ||
845 | data8 0 // io_cancel | ||
846 | data8 0 // epoll_create | ||
847 | data8 0 // epoll_ctl | ||
848 | data8 0 // epoll_wait // 1245 | ||
849 | data8 0 // restart_syscall | ||
850 | data8 0 // semtimedop | ||
851 | data8 0 // timer_create | ||
852 | data8 0 // timer_settime | ||
853 | data8 0 // timer_gettime // 1250 | ||
854 | data8 0 // timer_getoverrun | ||
855 | data8 0 // timer_delete | ||
856 | data8 0 // clock_settime | ||
857 | data8 fsys_clock_gettime // clock_gettime | ||
858 | data8 0 // clock_getres // 1255 | ||
859 | data8 0 // clock_nanosleep | ||
860 | data8 0 // fstatfs64 | ||
861 | data8 0 // statfs64 | ||
862 | data8 0 | ||
863 | data8 0 // 1260 | ||
864 | data8 0 | ||
865 | data8 0 // mq_open | ||
866 | data8 0 // mq_unlink | ||
867 | data8 0 // mq_timedsend | ||
868 | data8 0 // mq_timedreceive // 1265 | ||
869 | data8 0 // mq_notify | ||
870 | data8 0 // mq_getsetattr | ||
871 | data8 0 // kexec_load | ||
872 | data8 0 | ||
873 | data8 0 // 1270 | ||
874 | data8 0 | ||
875 | data8 0 | ||
876 | data8 0 | ||
877 | data8 0 | ||
878 | data8 0 // 1275 | ||
879 | data8 0 | ||
880 | data8 0 | ||
881 | data8 0 | ||
882 | data8 0 | ||
883 | |||
884 | .org fsyscall_table + 8*NR_syscalls // guard against failures to increase NR_syscalls | ||
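The .time_normalize path above converts nanoseconds to microseconds without a divide instruction: a shift by 3 divides by 8, then a multiply-high by 2361183241434822607 (which is round(2^68/125)) followed by a shift by 4 divides by 125. A runnable C sketch of that fixed-point arithmetic, assuming a GCC-style unsigned __int128 to model the 64x64->128 multiply that xmpy.hu performs in hardware:

    #include <assert.h>
    #include <stdint.h>

    /* ns/1000 via the same steps as the assembly above */
    static uint64_t ns_to_us(uint64_t ns)
    {
            uint64_t q  = ns >> 3;          /* "(p14) shr.u r20 = r8, 3" */
            uint64_t hi = (uint64_t)(((unsigned __int128)q *
                          2361183241434822607ULL) >> 64);  /* "xmpy.hu f8 = f8, f7" */
            return hi >> 4;                 /* "(p14) shr.u r21 = r2, 4" */
    }

    int main(void)
    {
            for (uint64_t ns = 0; ns < 2000000; ns += 333)
                    assert(ns_to_us(ns) == ns / 1000);
            return 0;
    }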
diff --git a/arch/ia64/kernel/gate-data.S b/arch/ia64/kernel/gate-data.S new file mode 100644 index 000000000000..258c0a3238fb --- /dev/null +++ b/arch/ia64/kernel/gate-data.S | |||
@@ -0,0 +1,3 @@ | |||
1 | .section .data.gate, "aw" | ||
2 | |||
3 | .incbin "arch/ia64/kernel/gate.so" | ||
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S new file mode 100644 index 000000000000..facf75acdc85 --- /dev/null +++ b/arch/ia64/kernel/gate.S | |||
@@ -0,0 +1,372 @@ | |||
1 | /* | ||
2 | * This file contains the code that gets mapped at the upper end of each task's text | ||
3 | * region. For now, it contains the signal trampoline code only. | ||
4 | * | ||
5 | * Copyright (C) 1999-2003 Hewlett-Packard Co | ||
6 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
7 | */ | ||
8 | |||
9 | #include <linux/config.h> | ||
10 | |||
11 | #include <asm/asmmacro.h> | ||
12 | #include <asm/errno.h> | ||
13 | #include <asm/offsets.h> | ||
14 | #include <asm/sigcontext.h> | ||
15 | #include <asm/system.h> | ||
16 | #include <asm/unistd.h> | ||
17 | |||
18 | /* | ||
19 | * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation, | ||
20 | * complications with the linker (which likes to create PLT stubs for branches | ||
21 | * to targets outside the shared object), and multi-phase kernel builds, we | ||
22 | * simply create minimalistic "patch lists" in special ELF sections. | ||
23 | */ | ||
24 | .section ".data.patch.fsyscall_table", "a" | ||
25 | .previous | ||
26 | #define LOAD_FSYSCALL_TABLE(reg) \ | ||
27 | [1:] movl reg=0; \ | ||
28 | .xdata4 ".data.patch.fsyscall_table", 1b-. | ||
29 | |||
30 | .section ".data.patch.brl_fsys_bubble_down", "a" | ||
31 | .previous | ||
32 | #define BRL_COND_FSYS_BUBBLE_DOWN(pr) \ | ||
33 | [1:](pr)brl.cond.sptk 0; \ | ||
34 | .xdata4 ".data.patch.brl_fsys_bubble_down", 1b-. | ||
35 | |||
36 | GLOBAL_ENTRY(__kernel_syscall_via_break) | ||
37 | .prologue | ||
38 | .altrp b6 | ||
39 | .body | ||
40 | /* | ||
41 | * Note: for (fast) syscall restart to work, the break instruction must be | ||
42 | * the first one in the bundle addressed by syscall_via_break. | ||
43 | */ | ||
44 | { .mib | ||
45 | break 0x100000 | ||
46 | nop.i 0 | ||
47 | br.ret.sptk.many b6 | ||
48 | } | ||
49 | END(__kernel_syscall_via_break) | ||
50 | |||
51 | /* | ||
52 | * On entry: | ||
53 | * r11 = saved ar.pfs | ||
54 | * r15 = system call # | ||
55 | * b0 = saved return address | ||
56 | * b6 = return address | ||
57 | * On exit: | ||
58 | * r11 = saved ar.pfs | ||
59 | * r15 = system call # | ||
60 | * b0 = saved return address | ||
61 | * all other "scratch" registers: undefined | ||
62 | * all "preserved" registers: same as on entry | ||
63 | */ | ||
64 | |||
65 | GLOBAL_ENTRY(__kernel_syscall_via_epc) | ||
66 | .prologue | ||
67 | .altrp b6 | ||
68 | .body | ||
69 | { | ||
70 | /* | ||
71 | * Note: the kernel cannot assume that the first two instructions in this | ||
72 | * bundle get executed. The remaining code must be safe even if | ||
73 | * they do not get executed. | ||
74 | */ | ||
75 | adds r17=-1024,r15 | ||
76 | mov r10=0 // default to successful syscall execution | ||
77 | epc | ||
78 | } | ||
79 | ;; | ||
80 | rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be" | ||
81 | LOAD_FSYSCALL_TABLE(r14) | ||
82 | |||
83 | mov r16=IA64_KR(CURRENT) // 12 cycle read latency | ||
84 | tnat.nz p10,p9=r15 | ||
85 | mov r19=NR_syscalls-1 | ||
86 | ;; | ||
87 | shladd r18=r17,3,r14 | ||
88 | |||
89 | srlz.d | ||
90 | cmp.ne p8,p0=r0,r0 // p8 <- FALSE | ||
91 | /* Note: if r17 is a NaT, p6 will be set to zero. */ | ||
92 | cmp.geu p6,p7=r19,r17 // (1024 <= syscall < 1024+NR_syscalls)? | ||
93 | ;; | ||
94 | (p6) ld8 r18=[r18] | ||
95 | mov r21=ar.fpsr | ||
96 | add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry | ||
97 | ;; | ||
98 | (p6) mov b7=r18 | ||
99 | (p6) tbit.z p8,p0=r18,0 | ||
100 | (p8) br.dptk.many b7 | ||
101 | |||
102 | (p6) rsm psr.i | ||
103 | mov r27=ar.rsc | ||
104 | mov r26=ar.pfs | ||
105 | ;; | ||
106 | mov r29=psr // read psr (12 cyc load latency) | ||
107 | /* | ||
108 | * brl.cond doesn't work as intended because the linker would convert this branch | ||
109 | * into a branch to a PLT. Perhaps there will be a way to avoid this with some | ||
110 | * future version of the linker. In the meantime, we just use an indirect branch | ||
111 | * instead. | ||
112 | */ | ||
113 | #ifdef CONFIG_ITANIUM | ||
114 | (p6) ld8 r14=[r14] // r14 <- fsys_bubble_down | ||
115 | ;; | ||
116 | (p6) mov b7=r14 | ||
117 | (p6) br.sptk.many b7 | ||
118 | #else | ||
119 | BRL_COND_FSYS_BUBBLE_DOWN(p6) | ||
120 | #endif | ||
121 | |||
122 | mov r10=-1 | ||
123 | (p10) mov r8=EINVAL | ||
124 | (p9) mov r8=ENOSYS | ||
125 | FSYS_RETURN | ||
126 | END(__kernel_syscall_via_epc) | ||
127 | |||
128 | # define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) | ||
129 | # define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET) | ||
130 | # define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET) | ||
131 | # define SIGHANDLER_OFF (16 + IA64_SIGFRAME_HANDLER_OFFSET) | ||
132 | # define SIGCONTEXT_OFF (16 + IA64_SIGFRAME_SIGCONTEXT_OFFSET) | ||
133 | |||
134 | # define FLAGS_OFF IA64_SIGCONTEXT_FLAGS_OFFSET | ||
135 | # define CFM_OFF IA64_SIGCONTEXT_CFM_OFFSET | ||
136 | # define FR6_OFF IA64_SIGCONTEXT_FR6_OFFSET | ||
137 | # define BSP_OFF IA64_SIGCONTEXT_AR_BSP_OFFSET | ||
138 | # define RNAT_OFF IA64_SIGCONTEXT_AR_RNAT_OFFSET | ||
139 | # define UNAT_OFF IA64_SIGCONTEXT_AR_UNAT_OFFSET | ||
140 | # define FPSR_OFF IA64_SIGCONTEXT_AR_FPSR_OFFSET | ||
141 | # define PR_OFF IA64_SIGCONTEXT_PR_OFFSET | ||
142 | # define RP_OFF IA64_SIGCONTEXT_IP_OFFSET | ||
143 | # define SP_OFF IA64_SIGCONTEXT_R12_OFFSET | ||
144 | # define RBS_BASE_OFF IA64_SIGCONTEXT_RBS_BASE_OFFSET | ||
145 | # define LOADRS_OFF IA64_SIGCONTEXT_LOADRS_OFFSET | ||
146 | # define base0 r2 | ||
147 | # define base1 r3 | ||
148 | /* | ||
149 | * When we get here, the memory stack looks like this: | ||
150 | * | ||
151 | * +===============================+ | ||
152 | * | | | ||
153 | * // struct sigframe // | ||
154 | * | | | ||
155 | * +-------------------------------+ <-- sp+16 | ||
156 | * | 16 byte of scratch | | ||
157 | * | space | | ||
158 | * +-------------------------------+ <-- sp | ||
159 | * | ||
160 | * The register stack looks _exactly_ the way it looked at the time the signal | ||
161 | * occurred. In other words, we're treading on a potential mine-field: each | ||
162 | * incoming general register may be a NaT value (including sp, in which case the | ||
163 | * process ends up dying with a SIGSEGV). | ||
164 | * | ||
165 | * The first thing we need to do is a cover to get the registers onto the backing | ||
166 | * store. Once that is done, we invoke the signal handler which may modify some | ||
167 | * of the machine state. After returning from the signal handler, we return | ||
168 | * control to the previous context by executing a sigreturn system call. A signal | ||
169 | * handler may call the rt_sigreturn() function to directly return to a given | ||
170 | * sigcontext. However, the user-level sigreturn() needs to do much more than | ||
171 | * calling the rt_sigreturn() system call as it needs to unwind the stack to | ||
172 | * restore preserved registers that may have been saved on the signal handler's | ||
173 | * call stack. | ||
174 | */ | ||
175 | |||
176 | #define SIGTRAMP_SAVES \ | ||
177 | .unwabi 3, 's'; /* mark this as a sigtramp handler (saves scratch regs) */ \ | ||
178 | .unwabi @svr4, 's'; /* backwards compatibility with old unwinders (remove in v2.7) */ \ | ||
179 | .savesp ar.unat, UNAT_OFF+SIGCONTEXT_OFF; \ | ||
180 | .savesp ar.fpsr, FPSR_OFF+SIGCONTEXT_OFF; \ | ||
181 | .savesp pr, PR_OFF+SIGCONTEXT_OFF; \ | ||
182 | .savesp rp, RP_OFF+SIGCONTEXT_OFF; \ | ||
183 | .savesp ar.pfs, CFM_OFF+SIGCONTEXT_OFF; \ | ||
184 | .vframesp SP_OFF+SIGCONTEXT_OFF | ||
185 | |||
186 | GLOBAL_ENTRY(__kernel_sigtramp) | ||
187 | // describe the state that is active when we get here: | ||
188 | .prologue | ||
189 | SIGTRAMP_SAVES | ||
190 | .body | ||
191 | |||
192 | .label_state 1 | ||
193 | |||
194 | adds base0=SIGHANDLER_OFF,sp | ||
195 | adds base1=RBS_BASE_OFF+SIGCONTEXT_OFF,sp | ||
196 | br.call.sptk.many rp=1f | ||
197 | 1: | ||
198 | ld8 r17=[base0],(ARG0_OFF-SIGHANDLER_OFF) // get pointer to signal handler's plabel | ||
199 | ld8 r15=[base1] // get address of new RBS base (or NULL) | ||
200 | cover // push args in interrupted frame onto backing store | ||
201 | ;; | ||
202 | cmp.ne p1,p0=r15,r0 // do we need to switch rbs? (note: pr is saved by kernel) | ||
203 | mov.m r9=ar.bsp // fetch ar.bsp | ||
204 | .spillsp.p p1, ar.rnat, RNAT_OFF+SIGCONTEXT_OFF | ||
205 | (p1) br.cond.spnt setup_rbs // yup -> (clobbers p8, r14-r16, and r18-r20) | ||
206 | back_from_setup_rbs: | ||
207 | alloc r8=ar.pfs,0,0,3,0 | ||
208 | ld8 out0=[base0],16 // load arg0 (signum) | ||
209 | adds base1=(ARG1_OFF-(RBS_BASE_OFF+SIGCONTEXT_OFF)),base1 | ||
210 | ;; | ||
211 | ld8 out1=[base1] // load arg1 (siginfop) | ||
212 | ld8 r10=[r17],8 // get signal handler entry point | ||
213 | ;; | ||
214 | ld8 out2=[base0] // load arg2 (sigcontextp) | ||
215 | ld8 gp=[r17] // get signal handler's global pointer | ||
216 | adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp | ||
217 | ;; | ||
218 | .spillsp ar.bsp, BSP_OFF+SIGCONTEXT_OFF | ||
219 | st8 [base0]=r9 // save sc_ar_bsp | ||
220 | adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp | ||
221 | adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp | ||
222 | ;; | ||
223 | stf.spill [base0]=f6,32 | ||
224 | stf.spill [base1]=f7,32 | ||
225 | ;; | ||
226 | stf.spill [base0]=f8,32 | ||
227 | stf.spill [base1]=f9,32 | ||
228 | mov b6=r10 | ||
229 | ;; | ||
230 | stf.spill [base0]=f10,32 | ||
231 | stf.spill [base1]=f11,32 | ||
232 | ;; | ||
233 | stf.spill [base0]=f12,32 | ||
234 | stf.spill [base1]=f13,32 | ||
235 | ;; | ||
236 | stf.spill [base0]=f14,32 | ||
237 | stf.spill [base1]=f15,32 | ||
238 | br.call.sptk.many rp=b6 // call the signal handler | ||
239 | .ret0: adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp | ||
240 | ;; | ||
241 | ld8 r15=[base0] // fetch sc_ar_bsp | ||
242 | mov r14=ar.bsp | ||
243 | ;; | ||
244 | cmp.ne p1,p0=r14,r15 // do we need to restore the rbs? | ||
245 | (p1) br.cond.spnt restore_rbs // yup -> (clobbers r14-r18, f6 & f7) | ||
246 | ;; | ||
247 | back_from_restore_rbs: | ||
248 | adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp | ||
249 | adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp | ||
250 | ;; | ||
251 | ldf.fill f6=[base0],32 | ||
252 | ldf.fill f7=[base1],32 | ||
253 | ;; | ||
254 | ldf.fill f8=[base0],32 | ||
255 | ldf.fill f9=[base1],32 | ||
256 | ;; | ||
257 | ldf.fill f10=[base0],32 | ||
258 | ldf.fill f11=[base1],32 | ||
259 | ;; | ||
260 | ldf.fill f12=[base0],32 | ||
261 | ldf.fill f13=[base1],32 | ||
262 | ;; | ||
263 | ldf.fill f14=[base0],32 | ||
264 | ldf.fill f15=[base1],32 | ||
265 | mov r15=__NR_rt_sigreturn | ||
266 | .restore sp // pop .prologue | ||
267 | break __BREAK_SYSCALL | ||
268 | |||
269 | .prologue | ||
270 | SIGTRAMP_SAVES | ||
271 | setup_rbs: | ||
272 | mov ar.rsc=0 // put RSE into enforced lazy mode | ||
273 | ;; | ||
274 | .save ar.rnat, r19 | ||
275 | mov r19=ar.rnat // save RNaT before switching backing store area | ||
276 | adds r14=(RNAT_OFF+SIGCONTEXT_OFF),sp | ||
277 | |||
278 | mov r18=ar.bspstore | ||
279 | mov ar.bspstore=r15 // switch over to new register backing store area | ||
280 | ;; | ||
281 | |||
282 | .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF | ||
283 | st8 [r14]=r19 // save sc_ar_rnat | ||
284 | .body | ||
285 | mov.m r16=ar.bsp // sc_loadrs <- (new bsp - new bspstore) << 16 | ||
286 | adds r14=(LOADRS_OFF+SIGCONTEXT_OFF),sp | ||
287 | ;; | ||
288 | invala | ||
289 | sub r15=r16,r15 | ||
290 | extr.u r20=r18,3,6 | ||
291 | ;; | ||
292 | mov ar.rsc=0xf // set RSE into eager mode, pl 3 | ||
293 | cmp.eq p8,p0=63,r20 | ||
294 | shl r15=r15,16 | ||
295 | ;; | ||
296 | st8 [r14]=r15 // save sc_loadrs | ||
297 | (p8) st8 [r18]=r19 // if bspstore points at RNaT slot, store RNaT there now | ||
298 | .restore sp // pop .prologue | ||
299 | br.cond.sptk back_from_setup_rbs | ||
300 | |||
301 | .prologue | ||
302 | SIGTRAMP_SAVES | ||
303 | .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF | ||
304 | .body | ||
305 | restore_rbs: | ||
306 | // On input: | ||
307 | // r14 = bsp1 (bsp at the time of return from signal handler) | ||
308 | // r15 = bsp0 (bsp at the time the signal occurred) | ||
309 | // | ||
310 | // Here, we need to calculate bspstore0, the value that ar.bspstore needs | ||
311 | // to be set to, based on bsp0 and the size of the dirty partition on | ||
312 | // the alternate stack (sc_loadrs >> 16). This can be done with the | ||
313 | // following algorithm: | ||
314 | // | ||
315 | // bspstore0 = rse_skip_regs(bsp0, -rse_num_regs(bsp1 - (loadrs >> 19), bsp1)); | ||
316 | // | ||
317 | // This is what the code below does. | ||
318 | // | ||
319 | alloc r2=ar.pfs,0,0,0,0 // alloc null frame | ||
320 | adds r16=(LOADRS_OFF+SIGCONTEXT_OFF),sp | ||
321 | adds r18=(RNAT_OFF+SIGCONTEXT_OFF),sp | ||
322 | ;; | ||
323 | ld8 r17=[r16] | ||
324 | ld8 r16=[r18] // get new rnat | ||
325 | extr.u r18=r15,3,6 // r18 <- rse_slot_num(bsp0) | ||
326 | ;; | ||
327 | mov ar.rsc=r17 // put RSE into enforced lazy mode | ||
328 | shr.u r17=r17,16 | ||
329 | ;; | ||
330 | sub r14=r14,r17 // r14 (bspstore1) <- bsp1 - (sc_loadrs >> 16) | ||
331 | shr.u r17=r17,3 // r17 <- (sc_loadrs >> 19) | ||
332 | ;; | ||
333 | loadrs // restore dirty partition | ||
334 | extr.u r14=r14,3,6 // r14 <- rse_slot_num(bspstore1) | ||
335 | ;; | ||
336 | add r14=r14,r17 // r14 <- rse_slot_num(bspstore1) + (sc_loadrs >> 19) | ||
337 | ;; | ||
338 | shr.u r14=r14,6 // r14 <- (rse_slot_num(bspstore1) + (sc_loadrs >> 19))/0x40 | ||
339 | ;; | ||
340 | sub r14=r14,r17 // r14 <- -rse_num_regs(bspstore1, bsp1) | ||
341 | movl r17=0x8208208208208209 | ||
342 | ;; | ||
343 | add r18=r18,r14 // r18 (delta) <- rse_slot_num(bsp0) - rse_num_regs(bspstore1,bsp1) | ||
344 | setf.sig f7=r17 | ||
345 | cmp.lt p7,p0=r14,r0 // p7 <- (r14 < 0)? | ||
346 | ;; | ||
347 | (p7) adds r18=-62,r18 // delta -= 62 | ||
348 | ;; | ||
349 | setf.sig f6=r18 | ||
350 | ;; | ||
351 | xmpy.h f6=f6,f7 | ||
352 | ;; | ||
353 | getf.sig r17=f6 | ||
354 | ;; | ||
355 | add r17=r17,r18 | ||
356 | shr r18=r18,63 | ||
357 | ;; | ||
358 | shr r17=r17,5 | ||
359 | ;; | ||
360 | sub r17=r17,r18 // r17 = delta/63 | ||
361 | ;; | ||
362 | add r17=r14,r17 // r17 <- delta/63 - rse_num_regs(bspstore1, bsp1) | ||
363 | ;; | ||
364 | shladd r15=r17,3,r15 // r15 <- bsp0 + 8*(delta/63 - rse_num_regs(bspstore1, bsp1)) | ||
365 | ;; | ||
366 | mov ar.bspstore=r15 // switch back to old register backing store area | ||
367 | ;; | ||
368 | mov ar.rnat=r16 // restore RNaT | ||
369 | mov ar.rsc=0xf // (will be restored later on from sc_ar_rsc) | ||
370 | // invala not necessary as that will happen when returning to user-mode | ||
371 | br.cond.sptk back_from_restore_rbs | ||
372 | END(__kernel_sigtramp) | ||
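restore_rbs above needs delta/63 because the RSE lays down one NaT-collection word after every 63 stacked registers, and the gate page cannot call out to a divide routine. The movl r17=0x8208208208208209 / xmpy.h / shift-by-5 sequence is the standard signed magic-number reciprocal for 63, with "shr r18=r18,63; sub r17=r17,r18" correcting the result for negative inputs. A runnable C sketch of the same arithmetic, assuming GCC-style __int128 and arithmetic right shifts of signed integers:

    #include <assert.h>
    #include <stdint.h>

    /* signed n/63 without a divide, truncating toward zero like C's '/' */
    static int64_t div63(int64_t n)
    {
            int64_t hi = (int64_t)(((__int128)n *
                         (int64_t)0x8208208208208209ULL) >> 64); /* "xmpy.h f6=f6,f7" */
            int64_t q  = (hi + n) >> 5;     /* "add r17=r17,r18; shr r17=r17,5" */
            return q - (n >> 63);           /* adds 1 back when n is negative */
    }

    int main(void)
    {
            for (int64_t n = -200; n <= 200; n++)
                    assert(div63(n) == n / 63);
            return 0;
    }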
diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S new file mode 100644 index 000000000000..e1e4aba9ecd0 --- /dev/null +++ b/arch/ia64/kernel/gate.lds.S | |||
@@ -0,0 +1,95 @@ | |||
1 | /* | ||
2 | * Linker script for gate DSO. The gate pages are an ELF shared object prelinked to its | ||
3 | * virtual address, with only one read-only segment and one execute-only segment (both fit | ||
4 | * in one page). This script controls its layout. | ||
5 | */ | ||
6 | |||
7 | #include <linux/config.h> | ||
8 | |||
9 | #include <asm/system.h> | ||
10 | |||
11 | SECTIONS | ||
12 | { | ||
13 | . = GATE_ADDR + SIZEOF_HEADERS; | ||
14 | |||
15 | .hash : { *(.hash) } :readable | ||
16 | .dynsym : { *(.dynsym) } | ||
17 | .dynstr : { *(.dynstr) } | ||
18 | .gnu.version : { *(.gnu.version) } | ||
19 | .gnu.version_d : { *(.gnu.version_d) } | ||
20 | .gnu.version_r : { *(.gnu.version_r) } | ||
21 | .dynamic : { *(.dynamic) } :readable :dynamic | ||
22 | |||
23 | /* | ||
24 | * This linker script is used both with -r and with -shared. For the layouts to match, | ||
25 | * we need to skip more than enough space for the dynamic symbol table et al. If this | ||
26 | * amount is insufficient, ld -shared will barf. Just increase it here. | ||
27 | */ | ||
28 | . = GATE_ADDR + 0x500; | ||
29 | |||
30 | .data.patch : { | ||
31 | __start_gate_mckinley_e9_patchlist = .; | ||
32 | *(.data.patch.mckinley_e9) | ||
33 | __end_gate_mckinley_e9_patchlist = .; | ||
34 | |||
35 | __start_gate_vtop_patchlist = .; | ||
36 | *(.data.patch.vtop) | ||
37 | __end_gate_vtop_patchlist = .; | ||
38 | |||
39 | __start_gate_fsyscall_patchlist = .; | ||
40 | *(.data.patch.fsyscall_table) | ||
41 | __end_gate_fsyscall_patchlist = .; | ||
42 | |||
43 | __start_gate_brl_fsys_bubble_down_patchlist = .; | ||
44 | *(.data.patch.brl_fsys_bubble_down) | ||
45 | __end_gate_brl_fsys_bubble_down_patchlist = .; | ||
46 | } :readable | ||
47 | .IA_64.unwind_info : { *(.IA_64.unwind_info*) } | ||
48 | .IA_64.unwind : { *(.IA_64.unwind*) } :readable :unwind | ||
49 | #ifdef HAVE_BUGGY_SEGREL | ||
50 | .text (GATE_ADDR + PAGE_SIZE) : { *(.text) *(.text.*) } :readable | ||
51 | #else | ||
52 | . = ALIGN (PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1)); | ||
53 | .text : { *(.text) *(.text.*) } :epc | ||
54 | #endif | ||
55 | |||
56 | /DISCARD/ : { | ||
57 | *(.got.plt) *(.got) | ||
58 | *(.data .data.* .gnu.linkonce.d.*) | ||
59 | *(.dynbss) | ||
60 | *(.bss .bss.* .gnu.linkonce.b.*) | ||
61 | *(__ex_table) | ||
62 | } | ||
63 | } | ||
64 | |||
65 | /* | ||
66 | * We must supply the ELF program headers explicitly to get just one | ||
67 | * PT_LOAD segment, and set the flags explicitly to make segments read-only. | ||
68 | */ | ||
69 | PHDRS | ||
70 | { | ||
71 | readable PT_LOAD FILEHDR PHDRS FLAGS(4); /* PF_R */ | ||
72 | #ifndef HAVE_BUGGY_SEGREL | ||
73 | epc PT_LOAD FILEHDR PHDRS FLAGS(1); /* PF_X */ | ||
74 | #endif | ||
75 | dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ | ||
76 | unwind 0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */ | ||
77 | } | ||
78 | |||
79 | /* | ||
80 | * This controls what symbols we export from the DSO. | ||
81 | */ | ||
82 | VERSION | ||
83 | { | ||
84 | LINUX_2.5 { | ||
85 | global: | ||
86 | __kernel_syscall_via_break; | ||
87 | __kernel_syscall_via_epc; | ||
88 | __kernel_sigtramp; | ||
89 | |||
90 | local: *; | ||
91 | }; | ||
92 | } | ||
93 | |||
94 | /* The ELF entry point can be used to set the AT_SYSINFO value. */ | ||
95 | ENTRY(__kernel_syscall_via_epc) | ||
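The ENTRY() directive above sets the gate DSO's ELF entry point, which, as the comment notes, can be used as the AT_SYSINFO value passed to new processes. A minimal user-space sketch of reading that value, assuming a libc that provides getauxval and an elf.h that defines AT_SYSINFO (getauxval returns 0 when the kernel did not supply the entry):

    #include <elf.h>        /* AT_SYSINFO */
    #include <stdio.h>
    #include <sys/auxv.h>   /* getauxval */

    int main(void)
    {
            unsigned long entry = getauxval(AT_SYSINFO);
            if (entry)
                    printf("syscall entry point: %#lx\n", entry);
            else
                    printf("AT_SYSINFO not provided\n");
            return 0;
    }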
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S new file mode 100644 index 000000000000..105c7fec8c6d --- /dev/null +++ b/arch/ia64/kernel/head.S | |||
@@ -0,0 +1,996 @@ | |||
1 | /* | ||
2 | * Here is where the ball gets rolling as far as the kernel is concerned. | ||
3 | * When control is transferred to _start, the bootloader has already | ||
4 | * loaded us to the correct address. All that's left to do here is | ||
5 | * to set up the kernel's global pointer and jump to the kernel | ||
6 | * entry point. | ||
7 | * | ||
8 | * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co | ||
9 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
10 | * Stephane Eranian <eranian@hpl.hp.com> | ||
11 | * Copyright (C) 1999 VA Linux Systems | ||
12 | * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> | ||
13 | * Copyright (C) 1999 Intel Corp. | ||
14 | * Copyright (C) 1999 Asit Mallick <Asit.K.Mallick@intel.com> | ||
15 | * Copyright (C) 1999 Don Dugger <Don.Dugger@intel.com> | ||
16 | * Copyright (C) 2002 Fenghua Yu <fenghua.yu@intel.com> | ||
17 | * -Optimize __ia64_save_fpu() and __ia64_load_fpu() for Itanium 2. | ||
18 | */ | ||
19 | |||
20 | #include <linux/config.h> | ||
21 | |||
22 | #include <asm/asmmacro.h> | ||
23 | #include <asm/fpu.h> | ||
24 | #include <asm/kregs.h> | ||
25 | #include <asm/mmu_context.h> | ||
26 | #include <asm/offsets.h> | ||
27 | #include <asm/pal.h> | ||
28 | #include <asm/pgtable.h> | ||
29 | #include <asm/processor.h> | ||
30 | #include <asm/ptrace.h> | ||
31 | #include <asm/system.h> | ||
32 | |||
33 | .section __special_page_section,"ax" | ||
34 | |||
35 | .global empty_zero_page | ||
36 | empty_zero_page: | ||
37 | .skip PAGE_SIZE | ||
38 | |||
39 | .global swapper_pg_dir | ||
40 | swapper_pg_dir: | ||
41 | .skip PAGE_SIZE | ||
42 | |||
43 | .rodata | ||
44 | halt_msg: | ||
45 | stringz "Halting kernel\n" | ||
46 | |||
47 | .text | ||
48 | |||
49 | .global start_ap | ||
50 | |||
51 | /* | ||
52 | * Start the kernel. When the bootloader passes control to _start(), r28 | ||
53 | * points to the address of the boot parameter area. Execution reaches | ||
54 | * here in physical mode. | ||
55 | */ | ||
56 | GLOBAL_ENTRY(_start) | ||
57 | start_ap: | ||
58 | .prologue | ||
59 | .save rp, r0 // terminate unwind chain with a NULL rp | ||
60 | .body | ||
61 | |||
62 | rsm psr.i | psr.ic | ||
63 | ;; | ||
64 | srlz.i | ||
65 | ;; | ||
66 | /* | ||
67 | * Initialize kernel region registers: | ||
68 | * rr[0]: VHPT enabled, page size = PAGE_SHIFT | ||
69 | * rr[1]: VHPT enabled, page size = PAGE_SHIFT | ||
70 | * rr[2]: VHPT enabled, page size = PAGE_SHIFT | ||
71 | * rr[3]: VHPT enabled, page size = PAGE_SHIFT | ||
72 | * rr[4]: VHPT enabled, page size = PAGE_SHIFT | ||
73 | * rr[5]: VHPT enabled, page size = PAGE_SHIFT | ||
74 | * rr[6]: VHPT disabled, page size = IA64_GRANULE_SHIFT | ||
75 | * rr[7]: VHPT disabled, page size = IA64_GRANULE_SHIFT | ||
76 | * We initialize all of them to prevent inadvertently assuming | ||
77 | * something about the state of address translation early in boot. | ||
78 | */ | ||
79 | mov r6=((ia64_rid(IA64_REGION_ID_KERNEL, (0<<61)) << 8) | (PAGE_SHIFT << 2) | 1) | ||
80 | movl r7=(0<<61) | ||
81 | mov r8=((ia64_rid(IA64_REGION_ID_KERNEL, (1<<61)) << 8) | (PAGE_SHIFT << 2) | 1) | ||
82 | movl r9=(1<<61) | ||
83 | mov r10=((ia64_rid(IA64_REGION_ID_KERNEL, (2<<61)) << 8) | (PAGE_SHIFT << 2) | 1) | ||
84 | movl r11=(2<<61) | ||
85 | mov r12=((ia64_rid(IA64_REGION_ID_KERNEL, (3<<61)) << 8) | (PAGE_SHIFT << 2) | 1) | ||
86 | movl r13=(3<<61) | ||
87 | mov r14=((ia64_rid(IA64_REGION_ID_KERNEL, (4<<61)) << 8) | (PAGE_SHIFT << 2) | 1) | ||
88 | movl r15=(4<<61) | ||
89 | mov r16=((ia64_rid(IA64_REGION_ID_KERNEL, (5<<61)) << 8) | (PAGE_SHIFT << 2) | 1) | ||
90 | movl r17=(5<<61) | ||
91 | mov r18=((ia64_rid(IA64_REGION_ID_KERNEL, (6<<61)) << 8) | (IA64_GRANULE_SHIFT << 2)) | ||
92 | movl r19=(6<<61) | ||
93 | mov r20=((ia64_rid(IA64_REGION_ID_KERNEL, (7<<61)) << 8) | (IA64_GRANULE_SHIFT << 2)) | ||
94 | movl r21=(7<<61) | ||
95 | ;; | ||
96 | mov rr[r7]=r6 | ||
97 | mov rr[r9]=r8 | ||
98 | mov rr[r11]=r10 | ||
99 | mov rr[r13]=r12 | ||
100 | mov rr[r15]=r14 | ||
101 | mov rr[r17]=r16 | ||
102 | mov rr[r19]=r18 | ||
103 | mov rr[r21]=r20 | ||
104 | ;; | ||
105 | /* | ||
106 | * Now pin mappings into the TLB for kernel text and data | ||
107 | */ | ||
108 | mov r18=KERNEL_TR_PAGE_SHIFT<<2 | ||
109 | movl r17=KERNEL_START | ||
110 | ;; | ||
111 | mov cr.itir=r18 | ||
112 | mov cr.ifa=r17 | ||
113 | mov r16=IA64_TR_KERNEL | ||
114 | mov r3=ip | ||
115 | movl r18=PAGE_KERNEL | ||
116 | ;; | ||
117 | dep r2=0,r3,0,KERNEL_TR_PAGE_SHIFT | ||
118 | ;; | ||
119 | or r18=r2,r18 | ||
120 | ;; | ||
121 | srlz.i | ||
122 | ;; | ||
123 | itr.i itr[r16]=r18 | ||
124 | ;; | ||
125 | itr.d dtr[r16]=r18 | ||
126 | ;; | ||
127 | srlz.i | ||
128 | |||
129 | /* | ||
130 | * Switch into virtual mode: | ||
131 | */ | ||
132 | movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \ | ||
133 | |IA64_PSR_DI) | ||
134 | ;; | ||
135 | mov cr.ipsr=r16 | ||
136 | movl r17=1f | ||
137 | ;; | ||
138 | mov cr.iip=r17 | ||
139 | mov cr.ifs=r0 | ||
140 | ;; | ||
141 | rfi | ||
142 | ;; | ||
143 | 1: // now we are in virtual mode | ||
144 | |||
145 | // set IVT entry point---can't access I/O ports without it | ||
146 | movl r3=ia64_ivt | ||
147 | ;; | ||
148 | mov cr.iva=r3 | ||
149 | movl r2=FPSR_DEFAULT | ||
150 | ;; | ||
151 | srlz.i | ||
152 | movl gp=__gp | ||
153 | |||
154 | mov ar.fpsr=r2 | ||
155 | ;; | ||
156 | |||
157 | #define isAP p2 // are we an Application Processor? | ||
158 | #define isBP p3 // are we the Bootstrap Processor? | ||
159 | |||
160 | #ifdef CONFIG_SMP | ||
161 | /* | ||
162 | * Find the init_task for the currently booting CPU. At poweron, and in | ||
163 | * UP mode, task_for_booting_cpu is NULL. | ||
164 | */ | ||
165 | movl r3=task_for_booting_cpu | ||
166 | ;; | ||
167 | ld8 r3=[r3] | ||
168 | movl r2=init_task | ||
169 | ;; | ||
170 | cmp.eq isBP,isAP=r3,r0 | ||
171 | ;; | ||
172 | (isAP) mov r2=r3 | ||
173 | #else | ||
174 | movl r2=init_task | ||
175 | cmp.eq isBP,isAP=r0,r0 | ||
176 | #endif | ||
177 | ;; | ||
178 | tpa r3=r2 // r3 == phys addr of task struct | ||
179 | mov r16=-1 | ||
180 | (isBP) br.cond.dpnt .load_current // BP stack is on region 5 --- no need to map it | ||
181 | |||
182 | // load mapping for stack (virtaddr in r2, physaddr in r3) | ||
183 | rsm psr.ic | ||
184 | movl r17=PAGE_KERNEL | ||
185 | ;; | ||
186 | srlz.d | ||
187 | dep r18=0,r3,0,12 | ||
188 | ;; | ||
189 | or r18=r17,r18 | ||
190 | dep r2=-1,r3,61,3 // IMVA of task | ||
191 | ;; | ||
192 | mov r17=rr[r2] | ||
193 | shr.u r16=r3,IA64_GRANULE_SHIFT | ||
194 | ;; | ||
195 | dep r17=0,r17,8,24 | ||
196 | ;; | ||
197 | mov cr.itir=r17 | ||
198 | mov cr.ifa=r2 | ||
199 | |||
200 | mov r19=IA64_TR_CURRENT_STACK | ||
201 | ;; | ||
202 | itr.d dtr[r19]=r18 | ||
203 | ;; | ||
204 | ssm psr.ic | ||
205 | srlz.d | ||
206 | ;; | ||
207 | |||
208 | .load_current: | ||
209 | // load the "current" pointer (r13) and ar.k6 with the current task | ||
210 | mov IA64_KR(CURRENT)=r2 // virtual address | ||
211 | mov IA64_KR(CURRENT_STACK)=r16 | ||
212 | mov r13=r2 | ||
213 | /* | ||
214 | * Reserve space at the top of the stack for "struct pt_regs". Kernel threads | ||
215 | * don't store interesting values in that structure, but the space still needs | ||
216 | * to be there because time-critical code such as context switching can | ||
217 | * be implemented more efficiently (for example, __switch_to() | ||
218 | * always sets the psr.dfh bit of the task it is switching to). | ||
219 | */ | ||
220 | addl r12=IA64_STK_OFFSET-IA64_PT_REGS_SIZE-16,r2 | ||
221 | addl r2=IA64_RBS_OFFSET,r2 // initialize the RSE | ||
222 | mov ar.rsc=0 // place RSE in enforced lazy mode | ||
223 | ;; | ||
224 | loadrs // clear the dirty partition | ||
225 | ;; | ||
226 | mov ar.bspstore=r2 // establish the new RSE stack | ||
227 | ;; | ||
228 | mov ar.rsc=0x3 // place RSE in eager mode | ||
229 | |||
230 | (isBP) dep r28=-1,r28,61,3 // make address virtual | ||
231 | (isBP) movl r2=ia64_boot_param | ||
232 | ;; | ||
233 | (isBP) st8 [r2]=r28 // save the address of the boot param area passed by the bootloader | ||
234 | |||
235 | #ifdef CONFIG_SMP | ||
236 | (isAP) br.call.sptk.many rp=start_secondary | ||
237 | .ret0: | ||
238 | (isAP) br.cond.sptk self | ||
239 | #endif | ||
240 | |||
241 | // This is executed by the bootstrap processor (bsp) only: | ||
242 | |||
243 | #ifdef CONFIG_IA64_FW_EMU | ||
244 | // initialize PAL & SAL emulator: | ||
245 | br.call.sptk.many rp=sys_fw_init | ||
246 | .ret1: | ||
247 | #endif | ||
248 | br.call.sptk.many rp=start_kernel | ||
249 | .ret2: addl r3=@ltoff(halt_msg),gp | ||
250 | ;; | ||
251 | alloc r2=ar.pfs,8,0,2,0 | ||
252 | ;; | ||
253 | ld8 out0=[r3] | ||
254 | br.call.sptk.many b0=console_print | ||
255 | |||
256 | self: hint @pause | ||
257 | br.sptk.many self // endless loop | ||
258 | END(_start) | ||
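The region-register values composed near the top of _start all follow one bit layout: the VHPT-enable flag in bit 0, the preferred page size in bits 2-7, and the region ID from bit 8 upward, which is exactly the (rid << 8) | (page_shift << 2) | 1 arithmetic in the mov instructions. A hedged C sketch of that encoding (make_rr() is an illustrative name, not a kernel interface):

/* Compose a region-register value; regions 0-5 use PAGE_SHIFT with the
 * VHPT enabled, regions 6-7 use IA64_GRANULE_SHIFT with it disabled. */
static inline unsigned long
make_rr (unsigned long rid, unsigned long page_shift, int vhpt_enabled)
{
	return (rid << 8) | (page_shift << 2) | (vhpt_enabled ? 1 : 0);
}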
259 | |||
260 | GLOBAL_ENTRY(ia64_save_debug_regs) | ||
261 | alloc r16=ar.pfs,1,0,0,0 | ||
262 | mov r20=ar.lc // preserve ar.lc | ||
263 | mov ar.lc=IA64_NUM_DBG_REGS-1 | ||
264 | mov r18=0 | ||
265 | add r19=IA64_NUM_DBG_REGS*8,in0 | ||
266 | ;; | ||
267 | 1: mov r16=dbr[r18] | ||
268 | #ifdef CONFIG_ITANIUM | ||
269 | ;; | ||
270 | srlz.d | ||
271 | #endif | ||
272 | mov r17=ibr[r18] | ||
273 | add r18=1,r18 | ||
274 | ;; | ||
275 | st8.nta [in0]=r16,8 | ||
276 | st8.nta [r19]=r17,8 | ||
277 | br.cloop.sptk.many 1b | ||
278 | ;; | ||
279 | mov ar.lc=r20 // restore ar.lc | ||
280 | br.ret.sptk.many rp | ||
281 | END(ia64_save_debug_regs) | ||
282 | |||
283 | GLOBAL_ENTRY(ia64_load_debug_regs) | ||
284 | alloc r16=ar.pfs,1,0,0,0 | ||
285 | lfetch.nta [in0] | ||
286 | mov r20=ar.lc // preserve ar.lc | ||
287 | add r19=IA64_NUM_DBG_REGS*8,in0 | ||
288 | mov ar.lc=IA64_NUM_DBG_REGS-1 | ||
289 | mov r18=-1 | ||
290 | ;; | ||
291 | 1: ld8.nta r16=[in0],8 | ||
292 | ld8.nta r17=[r19],8 | ||
293 | add r18=1,r18 | ||
294 | ;; | ||
295 | mov dbr[r18]=r16 | ||
296 | #ifdef CONFIG_ITANIUM | ||
297 | ;; | ||
298 | srlz.d // Errata 132 (NoFix status) | ||
299 | #endif | ||
300 | mov ibr[r18]=r17 | ||
301 | br.cloop.sptk.many 1b | ||
302 | ;; | ||
303 | mov ar.lc=r20 // restore ar.lc | ||
304 | br.ret.sptk.many rp | ||
305 | END(ia64_load_debug_regs) | ||
306 | |||
307 | GLOBAL_ENTRY(__ia64_save_fpu) | ||
308 | alloc r2=ar.pfs,1,4,0,0 | ||
309 | adds loc0=96*16-16,in0 | ||
310 | adds loc1=96*16-16-128,in0 | ||
311 | ;; | ||
312 | stf.spill.nta [loc0]=f127,-256 | ||
313 | stf.spill.nta [loc1]=f119,-256 | ||
314 | ;; | ||
315 | stf.spill.nta [loc0]=f111,-256 | ||
316 | stf.spill.nta [loc1]=f103,-256 | ||
317 | ;; | ||
318 | stf.spill.nta [loc0]=f95,-256 | ||
319 | stf.spill.nta [loc1]=f87,-256 | ||
320 | ;; | ||
321 | stf.spill.nta [loc0]=f79,-256 | ||
322 | stf.spill.nta [loc1]=f71,-256 | ||
323 | ;; | ||
324 | stf.spill.nta [loc0]=f63,-256 | ||
325 | stf.spill.nta [loc1]=f55,-256 | ||
326 | adds loc2=96*16-32,in0 | ||
327 | ;; | ||
328 | stf.spill.nta [loc0]=f47,-256 | ||
329 | stf.spill.nta [loc1]=f39,-256 | ||
330 | adds loc3=96*16-32-128,in0 | ||
331 | ;; | ||
332 | stf.spill.nta [loc2]=f126,-256 | ||
333 | stf.spill.nta [loc3]=f118,-256 | ||
334 | ;; | ||
335 | stf.spill.nta [loc2]=f110,-256 | ||
336 | stf.spill.nta [loc3]=f102,-256 | ||
337 | ;; | ||
338 | stf.spill.nta [loc2]=f94,-256 | ||
339 | stf.spill.nta [loc3]=f86,-256 | ||
340 | ;; | ||
341 | stf.spill.nta [loc2]=f78,-256 | ||
342 | stf.spill.nta [loc3]=f70,-256 | ||
343 | ;; | ||
344 | stf.spill.nta [loc2]=f62,-256 | ||
345 | stf.spill.nta [loc3]=f54,-256 | ||
346 | adds loc0=96*16-48,in0 | ||
347 | ;; | ||
348 | stf.spill.nta [loc2]=f46,-256 | ||
349 | stf.spill.nta [loc3]=f38,-256 | ||
350 | adds loc1=96*16-48-128,in0 | ||
351 | ;; | ||
352 | stf.spill.nta [loc0]=f125,-256 | ||
353 | stf.spill.nta [loc1]=f117,-256 | ||
354 | ;; | ||
355 | stf.spill.nta [loc0]=f109,-256 | ||
356 | stf.spill.nta [loc1]=f101,-256 | ||
357 | ;; | ||
358 | stf.spill.nta [loc0]=f93,-256 | ||
359 | stf.spill.nta [loc1]=f85,-256 | ||
360 | ;; | ||
361 | stf.spill.nta [loc0]=f77,-256 | ||
362 | stf.spill.nta [loc1]=f69,-256 | ||
363 | ;; | ||
364 | stf.spill.nta [loc0]=f61,-256 | ||
365 | stf.spill.nta [loc1]=f53,-256 | ||
366 | adds loc2=96*16-64,in0 | ||
367 | ;; | ||
368 | stf.spill.nta [loc0]=f45,-256 | ||
369 | stf.spill.nta [loc1]=f37,-256 | ||
370 | adds loc3=96*16-64-128,in0 | ||
371 | ;; | ||
372 | stf.spill.nta [loc2]=f124,-256 | ||
373 | stf.spill.nta [loc3]=f116,-256 | ||
374 | ;; | ||
375 | stf.spill.nta [loc2]=f108,-256 | ||
376 | stf.spill.nta [loc3]=f100,-256 | ||
377 | ;; | ||
378 | stf.spill.nta [loc2]=f92,-256 | ||
379 | stf.spill.nta [loc3]=f84,-256 | ||
380 | ;; | ||
381 | stf.spill.nta [loc2]=f76,-256 | ||
382 | stf.spill.nta [loc3]=f68,-256 | ||
383 | ;; | ||
384 | stf.spill.nta [loc2]=f60,-256 | ||
385 | stf.spill.nta [loc3]=f52,-256 | ||
386 | adds loc0=96*16-80,in0 | ||
387 | ;; | ||
388 | stf.spill.nta [loc2]=f44,-256 | ||
389 | stf.spill.nta [loc3]=f36,-256 | ||
390 | adds loc1=96*16-80-128,in0 | ||
391 | ;; | ||
392 | stf.spill.nta [loc0]=f123,-256 | ||
393 | stf.spill.nta [loc1]=f115,-256 | ||
394 | ;; | ||
395 | stf.spill.nta [loc0]=f107,-256 | ||
396 | stf.spill.nta [loc1]=f99,-256 | ||
397 | ;; | ||
398 | stf.spill.nta [loc0]=f91,-256 | ||
399 | stf.spill.nta [loc1]=f83,-256 | ||
400 | ;; | ||
401 | stf.spill.nta [loc0]=f75,-256 | ||
402 | stf.spill.nta [loc1]=f67,-256 | ||
403 | ;; | ||
404 | stf.spill.nta [loc0]=f59,-256 | ||
405 | stf.spill.nta [loc1]=f51,-256 | ||
406 | adds loc2=96*16-96,in0 | ||
407 | ;; | ||
408 | stf.spill.nta [loc0]=f43,-256 | ||
409 | stf.spill.nta [loc1]=f35,-256 | ||
410 | adds loc3=96*16-96-128,in0 | ||
411 | ;; | ||
412 | stf.spill.nta [loc2]=f122,-256 | ||
413 | stf.spill.nta [loc3]=f114,-256 | ||
414 | ;; | ||
415 | stf.spill.nta [loc2]=f106,-256 | ||
416 | stf.spill.nta [loc3]=f98,-256 | ||
417 | ;; | ||
418 | stf.spill.nta [loc2]=f90,-256 | ||
419 | stf.spill.nta [loc3]=f82,-256 | ||
420 | ;; | ||
421 | stf.spill.nta [loc2]=f74,-256 | ||
422 | stf.spill.nta [loc3]=f66,-256 | ||
423 | ;; | ||
424 | stf.spill.nta [loc2]=f58,-256 | ||
425 | stf.spill.nta [loc3]=f50,-256 | ||
426 | adds loc0=96*16-112,in0 | ||
427 | ;; | ||
428 | stf.spill.nta [loc2]=f42,-256 | ||
429 | stf.spill.nta [loc3]=f34,-256 | ||
430 | adds loc1=96*16-112-128,in0 | ||
431 | ;; | ||
432 | stf.spill.nta [loc0]=f121,-256 | ||
433 | stf.spill.nta [loc1]=f113,-256 | ||
434 | ;; | ||
435 | stf.spill.nta [loc0]=f105,-256 | ||
436 | stf.spill.nta [loc1]=f97,-256 | ||
437 | ;; | ||
438 | stf.spill.nta [loc0]=f89,-256 | ||
439 | stf.spill.nta [loc1]=f81,-256 | ||
440 | ;; | ||
441 | stf.spill.nta [loc0]=f73,-256 | ||
442 | stf.spill.nta [loc1]=f65,-256 | ||
443 | ;; | ||
444 | stf.spill.nta [loc0]=f57,-256 | ||
445 | stf.spill.nta [loc1]=f49,-256 | ||
446 | adds loc2=96*16-128,in0 | ||
447 | ;; | ||
448 | stf.spill.nta [loc0]=f41,-256 | ||
449 | stf.spill.nta [loc1]=f33,-256 | ||
450 | adds loc3=96*16-128-128,in0 | ||
451 | ;; | ||
452 | stf.spill.nta [loc2]=f120,-256 | ||
453 | stf.spill.nta [loc3]=f112,-256 | ||
454 | ;; | ||
455 | stf.spill.nta [loc2]=f104,-256 | ||
456 | stf.spill.nta [loc3]=f96,-256 | ||
457 | ;; | ||
458 | stf.spill.nta [loc2]=f88,-256 | ||
459 | stf.spill.nta [loc3]=f80,-256 | ||
460 | ;; | ||
461 | stf.spill.nta [loc2]=f72,-256 | ||
462 | stf.spill.nta [loc3]=f64,-256 | ||
463 | ;; | ||
464 | stf.spill.nta [loc2]=f56,-256 | ||
465 | stf.spill.nta [loc3]=f48,-256 | ||
466 | ;; | ||
467 | stf.spill.nta [loc2]=f40 | ||
468 | stf.spill.nta [loc3]=f32 | ||
469 | br.ret.sptk.many rp | ||
470 | END(__ia64_save_fpu) | ||
471 | |||
472 | GLOBAL_ENTRY(__ia64_load_fpu) | ||
473 | alloc r2=ar.pfs,1,2,0,0 | ||
474 | adds r3=128,in0 | ||
475 | adds r14=256,in0 | ||
476 | adds r15=384,in0 | ||
477 | mov loc0=512 | ||
478 | mov loc1=-1024+16 | ||
479 | ;; | ||
480 | ldf.fill.nta f32=[in0],loc0 | ||
481 | ldf.fill.nta f40=[ r3],loc0 | ||
482 | ldf.fill.nta f48=[r14],loc0 | ||
483 | ldf.fill.nta f56=[r15],loc0 | ||
484 | ;; | ||
485 | ldf.fill.nta f64=[in0],loc0 | ||
486 | ldf.fill.nta f72=[ r3],loc0 | ||
487 | ldf.fill.nta f80=[r14],loc0 | ||
488 | ldf.fill.nta f88=[r15],loc0 | ||
489 | ;; | ||
490 | ldf.fill.nta f96=[in0],loc1 | ||
491 | ldf.fill.nta f104=[ r3],loc1 | ||
492 | ldf.fill.nta f112=[r14],loc1 | ||
493 | ldf.fill.nta f120=[r15],loc1 | ||
494 | ;; | ||
495 | ldf.fill.nta f33=[in0],loc0 | ||
496 | ldf.fill.nta f41=[ r3],loc0 | ||
497 | ldf.fill.nta f49=[r14],loc0 | ||
498 | ldf.fill.nta f57=[r15],loc0 | ||
499 | ;; | ||
500 | ldf.fill.nta f65=[in0],loc0 | ||
501 | ldf.fill.nta f73=[ r3],loc0 | ||
502 | ldf.fill.nta f81=[r14],loc0 | ||
503 | ldf.fill.nta f89=[r15],loc0 | ||
504 | ;; | ||
505 | ldf.fill.nta f97=[in0],loc1 | ||
506 | ldf.fill.nta f105=[ r3],loc1 | ||
507 | ldf.fill.nta f113=[r14],loc1 | ||
508 | ldf.fill.nta f121=[r15],loc1 | ||
509 | ;; | ||
510 | ldf.fill.nta f34=[in0],loc0 | ||
511 | ldf.fill.nta f42=[ r3],loc0 | ||
512 | ldf.fill.nta f50=[r14],loc0 | ||
513 | ldf.fill.nta f58=[r15],loc0 | ||
514 | ;; | ||
515 | ldf.fill.nta f66=[in0],loc0 | ||
516 | ldf.fill.nta f74=[ r3],loc0 | ||
517 | ldf.fill.nta f82=[r14],loc0 | ||
518 | ldf.fill.nta f90=[r15],loc0 | ||
519 | ;; | ||
520 | ldf.fill.nta f98=[in0],loc1 | ||
521 | ldf.fill.nta f106=[ r3],loc1 | ||
522 | ldf.fill.nta f114=[r14],loc1 | ||
523 | ldf.fill.nta f122=[r15],loc1 | ||
524 | ;; | ||
525 | ldf.fill.nta f35=[in0],loc0 | ||
526 | ldf.fill.nta f43=[ r3],loc0 | ||
527 | ldf.fill.nta f51=[r14],loc0 | ||
528 | ldf.fill.nta f59=[r15],loc0 | ||
529 | ;; | ||
530 | ldf.fill.nta f67=[in0],loc0 | ||
531 | ldf.fill.nta f75=[ r3],loc0 | ||
532 | ldf.fill.nta f83=[r14],loc0 | ||
533 | ldf.fill.nta f91=[r15],loc0 | ||
534 | ;; | ||
535 | ldf.fill.nta f99=[in0],loc1 | ||
536 | ldf.fill.nta f107=[ r3],loc1 | ||
537 | ldf.fill.nta f115=[r14],loc1 | ||
538 | ldf.fill.nta f123=[r15],loc1 | ||
539 | ;; | ||
540 | ldf.fill.nta f36=[in0],loc0 | ||
541 | ldf.fill.nta f44=[ r3],loc0 | ||
542 | ldf.fill.nta f52=[r14],loc0 | ||
543 | ldf.fill.nta f60=[r15],loc0 | ||
544 | ;; | ||
545 | ldf.fill.nta f68=[in0],loc0 | ||
546 | ldf.fill.nta f76=[ r3],loc0 | ||
547 | ldf.fill.nta f84=[r14],loc0 | ||
548 | ldf.fill.nta f92=[r15],loc0 | ||
549 | ;; | ||
550 | ldf.fill.nta f100=[in0],loc1 | ||
551 | ldf.fill.nta f108=[ r3],loc1 | ||
552 | ldf.fill.nta f116=[r14],loc1 | ||
553 | ldf.fill.nta f124=[r15],loc1 | ||
554 | ;; | ||
555 | ldf.fill.nta f37=[in0],loc0 | ||
556 | ldf.fill.nta f45=[ r3],loc0 | ||
557 | ldf.fill.nta f53=[r14],loc0 | ||
558 | ldf.fill.nta f61=[r15],loc0 | ||
559 | ;; | ||
560 | ldf.fill.nta f69=[in0],loc0 | ||
561 | ldf.fill.nta f77=[ r3],loc0 | ||
562 | ldf.fill.nta f85=[r14],loc0 | ||
563 | ldf.fill.nta f93=[r15],loc0 | ||
564 | ;; | ||
565 | ldf.fill.nta f101=[in0],loc1 | ||
566 | ldf.fill.nta f109=[ r3],loc1 | ||
567 | ldf.fill.nta f117=[r14],loc1 | ||
568 | ldf.fill.nta f125=[r15],loc1 | ||
569 | ;; | ||
570 | ldf.fill.nta f38 =[in0],loc0 | ||
571 | ldf.fill.nta f46 =[ r3],loc0 | ||
572 | ldf.fill.nta f54 =[r14],loc0 | ||
573 | ldf.fill.nta f62 =[r15],loc0 | ||
574 | ;; | ||
575 | ldf.fill.nta f70 =[in0],loc0 | ||
576 | ldf.fill.nta f78 =[ r3],loc0 | ||
577 | ldf.fill.nta f86 =[r14],loc0 | ||
578 | ldf.fill.nta f94 =[r15],loc0 | ||
579 | ;; | ||
580 | ldf.fill.nta f102=[in0],loc1 | ||
581 | ldf.fill.nta f110=[ r3],loc1 | ||
582 | ldf.fill.nta f118=[r14],loc1 | ||
583 | ldf.fill.nta f126=[r15],loc1 | ||
584 | ;; | ||
585 | ldf.fill.nta f39 =[in0],loc0 | ||
586 | ldf.fill.nta f47 =[ r3],loc0 | ||
587 | ldf.fill.nta f55 =[r14],loc0 | ||
588 | ldf.fill.nta f63 =[r15],loc0 | ||
589 | ;; | ||
590 | ldf.fill.nta f71 =[in0],loc0 | ||
591 | ldf.fill.nta f79 =[ r3],loc0 | ||
592 | ldf.fill.nta f87 =[r14],loc0 | ||
593 | ldf.fill.nta f95 =[r15],loc0 | ||
594 | ;; | ||
595 | ldf.fill.nta f103=[in0] | ||
596 | ldf.fill.nta f111=[ r3] | ||
597 | ldf.fill.nta f119=[r14] | ||
598 | ldf.fill.nta f127=[r15] | ||
599 | br.ret.sptk.many rp | ||
600 | END(__ia64_load_fpu) | ||
601 | |||
602 | GLOBAL_ENTRY(__ia64_init_fpu) | ||
603 | stf.spill [sp]=f0 // M3 | ||
604 | mov f32=f0 // F | ||
605 | nop.b 0 | ||
606 | |||
607 | ldfps f33,f34=[sp] // M0 | ||
608 | ldfps f35,f36=[sp] // M1 | ||
609 | mov f37=f0 // F | ||
610 | ;; | ||
611 | |||
612 | setf.s f38=r0 // M2 | ||
613 | setf.s f39=r0 // M3 | ||
614 | mov f40=f0 // F | ||
615 | |||
616 | ldfps f41,f42=[sp] // M0 | ||
617 | ldfps f43,f44=[sp] // M1 | ||
618 | mov f45=f0 // F | ||
619 | |||
620 | setf.s f46=r0 // M2 | ||
621 | setf.s f47=r0 // M3 | ||
622 | mov f48=f0 // F | ||
623 | |||
624 | ldfps f49,f50=[sp] // M0 | ||
625 | ldfps f51,f52=[sp] // M1 | ||
626 | mov f53=f0 // F | ||
627 | |||
628 | setf.s f54=r0 // M2 | ||
629 | setf.s f55=r0 // M3 | ||
630 | mov f56=f0 // F | ||
631 | |||
632 | ldfps f57,f58=[sp] // M0 | ||
633 | ldfps f59,f60=[sp] // M1 | ||
634 | mov f61=f0 // F | ||
635 | |||
636 | setf.s f62=r0 // M2 | ||
637 | setf.s f63=r0 // M3 | ||
638 | mov f64=f0 // F | ||
639 | |||
640 | ldfps f65,f66=[sp] // M0 | ||
641 | ldfps f67,f68=[sp] // M1 | ||
642 | mov f69=f0 // F | ||
643 | |||
644 | setf.s f70=r0 // M2 | ||
645 | setf.s f71=r0 // M3 | ||
646 | mov f72=f0 // F | ||
647 | |||
648 | ldfps f73,f74=[sp] // M0 | ||
649 | ldfps f75,f76=[sp] // M1 | ||
650 | mov f77=f0 // F | ||
651 | |||
652 | setf.s f78=r0 // M2 | ||
653 | setf.s f79=r0 // M3 | ||
654 | mov f80=f0 // F | ||
655 | |||
656 | ldfps f81,f82=[sp] // M0 | ||
657 | ldfps f83,f84=[sp] // M1 | ||
658 | mov f85=f0 // F | ||
659 | |||
660 | setf.s f86=r0 // M2 | ||
661 | setf.s f87=r0 // M3 | ||
662 | mov f88=f0 // F | ||
663 | |||
664 | /* | ||
665 | * When the instructions are cached, it would be faster to initialize | ||
666 | * the remaining registers simply with mov instructions (F-unit). | ||
667 | * This gets the time down to ~29 cycles. However, this would use up | ||
668 | * 33 bundles, whereas continuing with the above pattern yields | ||
669 | * 10 bundles and ~30 cycles. | ||
670 | */ | ||
671 | |||
672 | ldfps f89,f90=[sp] // M0 | ||
673 | ldfps f91,f92=[sp] // M1 | ||
674 | mov f93=f0 // F | ||
675 | |||
676 | setf.s f94=r0 // M2 | ||
677 | setf.s f95=r0 // M3 | ||
678 | mov f96=f0 // F | ||
679 | |||
680 | ldfps f97,f98=[sp] // M0 | ||
681 | ldfps f99,f100=[sp] // M1 | ||
682 | mov f101=f0 // F | ||
683 | |||
684 | setf.s f102=r0 // M2 | ||
685 | setf.s f103=r0 // M3 | ||
686 | mov f104=f0 // F | ||
687 | |||
688 | ldfps f105,f106=[sp] // M0 | ||
689 | ldfps f107,f108=[sp] // M1 | ||
690 | mov f109=f0 // F | ||
691 | |||
692 | setf.s f110=r0 // M2 | ||
693 | setf.s f111=r0 // M3 | ||
694 | mov f112=f0 // F | ||
695 | |||
696 | ldfps f113,f114=[sp] // M0 | ||
697 | ldfps f115,f116=[sp] // M1 | ||
698 | mov f117=f0 // F | ||
699 | |||
700 | setf.s f118=r0 // M2 | ||
701 | setf.s f119=r0 // M3 | ||
702 | mov f120=f0 // F | ||
703 | |||
704 | ldfps f121,f122=[sp] // M0 | ||
705 | ldfps f123,f124=[sp] // M1 | ||
706 | mov f125=f0 // F | ||
707 | |||
708 | setf.s f126=r0 // M2 | ||
709 | setf.s f127=r0 // M3 | ||
710 | br.ret.sptk.many rp // F | ||
711 | END(__ia64_init_fpu) | ||
712 | |||
713 | /* | ||
714 | * Switch execution mode from virtual to physical | ||
715 | * | ||
716 | * Inputs: | ||
717 | * r16 = new psr to establish | ||
718 | * Output: | ||
719 | * r19 = old virtual address of ar.bsp | ||
720 | * r20 = old virtual address of sp | ||
721 | * | ||
722 | * Note: RSE must already be in enforced lazy mode | ||
723 | */ | ||
724 | GLOBAL_ENTRY(ia64_switch_mode_phys) | ||
725 | { | ||
726 | alloc r2=ar.pfs,0,0,0,0 | ||
727 | rsm psr.i | psr.ic // disable interrupts and interrupt collection | ||
728 | mov r15=ip | ||
729 | } | ||
730 | ;; | ||
731 | { | ||
732 | flushrs // must be first insn in group | ||
733 | srlz.i | ||
734 | } | ||
735 | ;; | ||
736 | mov cr.ipsr=r16 // set new PSR | ||
737 | add r3=1f-ia64_switch_mode_phys,r15 | ||
738 | |||
739 | mov r19=ar.bsp | ||
740 | mov r20=sp | ||
741 | mov r14=rp // get return address into a general register | ||
742 | ;; | ||
743 | |||
744 | // going to physical mode, use tpa to translate virt->phys | ||
745 | tpa r17=r19 | ||
746 | tpa r3=r3 | ||
747 | tpa sp=sp | ||
748 | tpa r14=r14 | ||
749 | ;; | ||
750 | |||
751 | mov r18=ar.rnat // save ar.rnat | ||
752 | mov ar.bspstore=r17 // this steps on ar.rnat | ||
753 | mov cr.iip=r3 | ||
754 | mov cr.ifs=r0 | ||
755 | ;; | ||
756 | mov ar.rnat=r18 // restore ar.rnat | ||
757 | rfi // must be last insn in group | ||
758 | ;; | ||
759 | 1: mov rp=r14 | ||
760 | br.ret.sptk.many rp | ||
761 | END(ia64_switch_mode_phys) | ||
762 | |||
763 | /* | ||
764 | * Switch execution mode from physical to virtual | ||
765 | * | ||
766 | * Inputs: | ||
767 | * r16 = new psr to establish | ||
768 | * r19 = new bspstore to establish | ||
769 | * r20 = new sp to establish | ||
770 | * | ||
771 | * Note: RSE must already be in enforced lazy mode | ||
772 | */ | ||
773 | GLOBAL_ENTRY(ia64_switch_mode_virt) | ||
774 | { | ||
775 | alloc r2=ar.pfs,0,0,0,0 | ||
776 | rsm psr.i | psr.ic // disable interrupts and interrupt collection | ||
777 | mov r15=ip | ||
778 | } | ||
779 | ;; | ||
780 | { | ||
781 | flushrs // must be first insn in group | ||
782 | srlz.i | ||
783 | } | ||
784 | ;; | ||
785 | mov cr.ipsr=r16 // set new PSR | ||
786 | add r3=1f-ia64_switch_mode_virt,r15 | ||
787 | |||
788 | mov r14=rp // get return address into a general register | ||
789 | ;; | ||
790 | |||
791 | // going to virtual | ||
792 | // - for code addresses, set upper bits of addr to KERNEL_START | ||
793 | // - for stack addresses, copy from input argument | ||
794 | movl r18=KERNEL_START | ||
795 | dep r3=0,r3,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT | ||
796 | dep r14=0,r14,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT | ||
797 | mov sp=r20 | ||
798 | ;; | ||
799 | or r3=r3,r18 | ||
800 | or r14=r14,r18 | ||
801 | ;; | ||
802 | |||
803 | mov r18=ar.rnat // save ar.rnat | ||
804 | mov ar.bspstore=r19 // this steps on ar.rnat | ||
805 | mov cr.iip=r3 | ||
806 | mov cr.ifs=r0 | ||
807 | ;; | ||
808 | mov ar.rnat=r18 // restore ar.rnat | ||
809 | rfi // must be last insn in group | ||
810 | ;; | ||
811 | 1: mov rp=r14 | ||
812 | br.ret.sptk.many rp | ||
813 | END(ia64_switch_mode_virt) | ||
814 | |||
815 | GLOBAL_ENTRY(ia64_delay_loop) | ||
816 | .prologue | ||
817 | { nop 0 // work around GAS unwind info generation bug... | ||
818 | .save ar.lc,r2 | ||
819 | mov r2=ar.lc | ||
820 | .body | ||
821 | ;; | ||
822 | mov ar.lc=r32 | ||
823 | } | ||
824 | ;; | ||
825 | // force loop to be 32-byte aligned (GAS bug means we cannot use .align | ||
826 | // inside function body without corrupting unwind info). | ||
827 | { nop 0 } | ||
828 | 1: br.cloop.sptk.few 1b | ||
829 | ;; | ||
830 | mov ar.lc=r2 | ||
831 | br.ret.sptk.many rp | ||
832 | END(ia64_delay_loop) | ||
833 | |||
834 | /* | ||
835 | * Return a CPU-local timestamp in nano-seconds. This timestamp is | ||
836 | * NOT synchronized across CPUs; its return value must never be | ||
837 | * compared against the values returned on another CPU. The usage in | ||
838 | * kernel/sched.c ensures that. | ||
839 | * | ||
840 | * The return-value of sched_clock() is NOT supposed to wrap-around. | ||
841 | * If it did, it would cause some scheduling hiccups (at the worst). | ||
842 | * Fortunately, with a 64-bit cycle-counter ticking at 100GHz, even | ||
843 | * that would happen only once every 5+ years. | ||
844 | * | ||
845 | * The code below basically calculates: | ||
846 | * | ||
847 | * (ia64_get_itc() * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT | ||
848 | * | ||
849 | * except that the multiplication and the shift are done with 128-bit | ||
850 | * intermediate precision so that we can produce a full 64-bit result. | ||
851 | */ | ||
852 | GLOBAL_ENTRY(sched_clock) | ||
853 | addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 | ||
854 | mov.m r9=ar.itc // fetch cycle-counter (35 cyc) | ||
855 | ;; | ||
856 | ldf8 f8=[r8] | ||
857 | ;; | ||
858 | setf.sig f9=r9 // certain to stall, so issue it _after_ ldf8... | ||
859 | ;; | ||
860 | xmpy.lu f10=f9,f8 // calculate low 64 bits of 128-bit product (4 cyc) | ||
861 | xmpy.hu f11=f9,f8 // calculate high 64 bits of 128-bit product | ||
862 | ;; | ||
863 | getf.sig r8=f10 // (5 cyc) | ||
864 | getf.sig r9=f11 | ||
865 | ;; | ||
866 | shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT | ||
867 | br.ret.sptk.many rp | ||
868 | END(sched_clock) | ||
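In C, the xmpy.lu/xmpy.hu/shrp sequence above corresponds to a 128-bit multiply followed by a right shift; a minimal sketch, assuming a compiler with __uint128_t (cyc_to_nsec() is an illustrative name):

static unsigned long long
cyc_to_nsec (unsigned long long itc, unsigned long long nsec_per_cyc,
	     unsigned int shift /* IA64_NSEC_PER_CYC_SHIFT */)
{
	/* xmpy.lu/xmpy.hu: full 128-bit product of two 64-bit operands */
	__uint128_t prod = (__uint128_t) itc * nsec_per_cyc;

	/* shrp: take 64 bits of the product starting at 'shift' */
	return (unsigned long long) (prod >> shift);
}

The wrap-around estimate in the comment also checks out: 2^64 cycles at 100 GHz is about 1.8e8 seconds, i.e. roughly 5.8 years.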
869 | |||
870 | GLOBAL_ENTRY(start_kernel_thread) | ||
871 | .prologue | ||
872 | .save rp, r0 // this is the end of the call-chain | ||
873 | .body | ||
874 | alloc r2 = ar.pfs, 0, 0, 2, 0 | ||
875 | mov out0 = r9 | ||
876 | mov out1 = r11;; | ||
877 | br.call.sptk.many rp = kernel_thread_helper;; | ||
878 | mov out0 = r8 | ||
879 | br.call.sptk.many rp = sys_exit;; | ||
880 | 1: br.sptk.few 1b // not reached | ||
881 | END(start_kernel_thread) | ||
882 | |||
883 | #ifdef CONFIG_IA64_BRL_EMU | ||
884 | |||
885 | /* | ||
886 | * Assembly routines used by brl_emu.c to set preserved register state. | ||
887 | */ | ||
888 | |||
889 | #define SET_REG(reg) \ | ||
890 | GLOBAL_ENTRY(ia64_set_##reg); \ | ||
891 | alloc r16=ar.pfs,1,0,0,0; \ | ||
892 | mov reg=r32; \ | ||
893 | ;; \ | ||
894 | br.ret.sptk.many rp; \ | ||
895 | END(ia64_set_##reg) | ||
896 | |||
897 | SET_REG(b1); | ||
898 | SET_REG(b2); | ||
899 | SET_REG(b3); | ||
900 | SET_REG(b4); | ||
901 | SET_REG(b5); | ||
902 | |||
903 | #endif /* CONFIG_IA64_BRL_EMU */ | ||
904 | |||
905 | #ifdef CONFIG_SMP | ||
906 | /* | ||
907 | * This routine handles spinlock contention. It uses a non-standard calling | ||
908 | * convention to avoid converting leaf routines into interior routines. Because | ||
909 | * of this special convention, there are several restrictions: | ||
910 | * | ||
911 | * - do not use gp-relative variables; this code is called from the kernel | ||
912 | * and from modules, so r1 is undefined. | ||
913 | * - do not use stacked registers, the caller owns them. | ||
914 | * - do not use the scratch stack space, the caller owns it. | ||
915 | * - do not use any registers other than the ones listed below | ||
916 | * | ||
917 | * Inputs: | ||
918 | * ar.pfs - saved CFM of caller | ||
919 | * ar.ccv - 0 (and available for use) | ||
920 | * r27 - flags from spin_lock_irqsave or 0. Must be preserved. | ||
921 | * r28 - available for use. | ||
922 | * r29 - available for use. | ||
923 | * r30 - available for use. | ||
924 | * r31 - address of lock, available for use. | ||
925 | * b6 - return address | ||
926 | * p14 - available for use. | ||
927 | * p15 - used to track flag status. | ||
928 | * | ||
929 | * If you patch this code to use more registers, do not forget to update | ||
930 | * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h. | ||
931 | */ | ||
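In effect, the contention path below is the slow half of a test-and-test-and-set lock: spin with plain loads until the word reads free, then attempt a single atomic acquire. A hedged C analogue using GCC's atomic builtins (illustrative only, not the kernel's locking code):

static void
spin_contend (volatile unsigned int *lock)
{
	unsigned int expected;

	for (;;) {
		while (*lock != 0)	/* read-only spin; maps to the ld4 loop */
			;		/* (the hint @pause would go here) */
		expected = 0;		/* lock word: 0 = free, nonzero = taken */
		if (__atomic_compare_exchange_n(lock, &expected, 1, 0,
						__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
			return;		/* cmpxchg4.acq succeeded */
	}
}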
932 | |||
933 | #if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) | ||
934 | |||
935 | GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4) | ||
936 | .prologue | ||
937 | .save ar.pfs, r0 // this code effectively has a zero frame size | ||
938 | .save rp, r28 | ||
939 | .body | ||
940 | nop 0 | ||
941 | tbit.nz p15,p0=r27,IA64_PSR_I_BIT | ||
942 | .restore sp // pop existing prologue after next insn | ||
943 | mov b6 = r28 | ||
944 | .prologue | ||
945 | .save ar.pfs, r0 | ||
946 | .altrp b6 | ||
947 | .body | ||
948 | ;; | ||
949 | (p15) ssm psr.i // reenable interrupts if they were on | ||
950 | // DavidM says that srlz.d is slow and is not required in this case | ||
951 | .wait: | ||
952 | // exponential backoff, kdb, lockmeter etc. go in here | ||
953 | hint @pause | ||
954 | ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word | ||
955 | nop 0 | ||
956 | ;; | ||
957 | cmp4.ne p14,p0=r30,r0 | ||
958 | (p14) br.cond.sptk.few .wait | ||
959 | (p15) rsm psr.i // disable interrupts if we reenabled them | ||
960 | br.cond.sptk.few b6 // lock is now free, try to acquire | ||
961 | .global ia64_spinlock_contention_pre3_4_end // for kernprof | ||
962 | ia64_spinlock_contention_pre3_4_end: | ||
963 | END(ia64_spinlock_contention_pre3_4) | ||
964 | |||
965 | #else | ||
966 | |||
967 | GLOBAL_ENTRY(ia64_spinlock_contention) | ||
968 | .prologue | ||
969 | .altrp b6 | ||
970 | .body | ||
971 | tbit.nz p15,p0=r27,IA64_PSR_I_BIT | ||
972 | ;; | ||
973 | .wait: | ||
974 | (p15) ssm psr.i // reenable interrupts if they were on | ||
975 | // DavidM says that srlz.d is slow and is not required in this case | ||
976 | .wait2: | ||
977 | // exponential backoff, kdb, lockmeter etc. go in here | ||
978 | hint @pause | ||
979 | ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word | ||
980 | ;; | ||
981 | cmp4.ne p14,p0=r30,r0 | ||
982 | mov r30 = 1 | ||
983 | (p14) br.cond.sptk.few .wait2 | ||
984 | (p15) rsm psr.i // disable interrupts if we reenabled them | ||
985 | ;; | ||
986 | cmpxchg4.acq r30=[r31], r30, ar.ccv | ||
987 | ;; | ||
988 | cmp4.ne p14,p0=r0,r30 | ||
989 | (p14) br.cond.sptk.few .wait | ||
990 | |||
991 | br.ret.sptk.many b6 // lock is now taken | ||
992 | END(ia64_spinlock_contention) | ||
993 | |||
994 | #endif | ||
995 | |||
996 | #endif /* CONFIG_SMP */ | ||
diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c new file mode 100644 index 000000000000..7bbf019c9867 --- /dev/null +++ b/arch/ia64/kernel/ia64_ksyms.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * Architecture-specific kernel symbols | ||
3 | * | ||
4 | * Don't put any exports here unless the symbol is defined in an assembler file. | ||
5 | * All other exports should be put directly after the definition. | ||
6 | */ | ||
7 | |||
8 | #include <linux/config.h> | ||
9 | #include <linux/module.h> | ||
10 | |||
11 | #include <linux/string.h> | ||
12 | EXPORT_SYMBOL(memset); | ||
13 | EXPORT_SYMBOL(memchr); | ||
14 | EXPORT_SYMBOL(memcmp); | ||
15 | EXPORT_SYMBOL(memcpy); | ||
16 | EXPORT_SYMBOL(memmove); | ||
17 | EXPORT_SYMBOL(memscan); | ||
18 | EXPORT_SYMBOL(strcat); | ||
19 | EXPORT_SYMBOL(strchr); | ||
20 | EXPORT_SYMBOL(strcmp); | ||
21 | EXPORT_SYMBOL(strcpy); | ||
22 | EXPORT_SYMBOL(strlen); | ||
23 | EXPORT_SYMBOL(strncat); | ||
24 | EXPORT_SYMBOL(strncmp); | ||
25 | EXPORT_SYMBOL(strncpy); | ||
26 | EXPORT_SYMBOL(strnlen); | ||
27 | EXPORT_SYMBOL(strrchr); | ||
28 | EXPORT_SYMBOL(strstr); | ||
29 | EXPORT_SYMBOL(strpbrk); | ||
30 | |||
31 | #include <asm/checksum.h> | ||
32 | EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */ | ||
33 | |||
34 | #include <asm/semaphore.h> | ||
35 | EXPORT_SYMBOL(__down); | ||
36 | EXPORT_SYMBOL(__down_interruptible); | ||
37 | EXPORT_SYMBOL(__down_trylock); | ||
38 | EXPORT_SYMBOL(__up); | ||
39 | |||
40 | #include <asm/page.h> | ||
41 | EXPORT_SYMBOL(clear_page); | ||
42 | |||
43 | #ifdef CONFIG_VIRTUAL_MEM_MAP | ||
44 | #include <linux/bootmem.h> | ||
45 | EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */ | ||
46 | #endif | ||
47 | |||
48 | #include <asm/processor.h> | ||
49 | EXPORT_SYMBOL(per_cpu__cpu_info); | ||
50 | #ifdef CONFIG_SMP | ||
51 | EXPORT_SYMBOL(per_cpu__local_per_cpu_offset); | ||
52 | #endif | ||
53 | |||
54 | #include <asm/uaccess.h> | ||
55 | EXPORT_SYMBOL(__copy_user); | ||
56 | EXPORT_SYMBOL(__do_clear_user); | ||
57 | EXPORT_SYMBOL(__strlen_user); | ||
58 | EXPORT_SYMBOL(__strncpy_from_user); | ||
59 | EXPORT_SYMBOL(__strnlen_user); | ||
60 | |||
61 | #include <asm/unistd.h> | ||
62 | EXPORT_SYMBOL(__ia64_syscall); | ||
63 | |||
64 | /* from arch/ia64/lib */ | ||
65 | extern void __divsi3(void); | ||
66 | extern void __udivsi3(void); | ||
67 | extern void __modsi3(void); | ||
68 | extern void __umodsi3(void); | ||
69 | extern void __divdi3(void); | ||
70 | extern void __udivdi3(void); | ||
71 | extern void __moddi3(void); | ||
72 | extern void __umoddi3(void); | ||
73 | |||
74 | EXPORT_SYMBOL(__divsi3); | ||
75 | EXPORT_SYMBOL(__udivsi3); | ||
76 | EXPORT_SYMBOL(__modsi3); | ||
77 | EXPORT_SYMBOL(__umodsi3); | ||
78 | EXPORT_SYMBOL(__divdi3); | ||
79 | EXPORT_SYMBOL(__udivdi3); | ||
80 | EXPORT_SYMBOL(__moddi3); | ||
81 | EXPORT_SYMBOL(__umoddi3); | ||
82 | |||
83 | #if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE) | ||
84 | extern void xor_ia64_2(void); | ||
85 | extern void xor_ia64_3(void); | ||
86 | extern void xor_ia64_4(void); | ||
87 | extern void xor_ia64_5(void); | ||
88 | |||
89 | EXPORT_SYMBOL(xor_ia64_2); | ||
90 | EXPORT_SYMBOL(xor_ia64_3); | ||
91 | EXPORT_SYMBOL(xor_ia64_4); | ||
92 | EXPORT_SYMBOL(xor_ia64_5); | ||
93 | #endif | ||
94 | |||
95 | #include <asm/pal.h> | ||
96 | EXPORT_SYMBOL(ia64_pal_call_phys_stacked); | ||
97 | EXPORT_SYMBOL(ia64_pal_call_phys_static); | ||
98 | EXPORT_SYMBOL(ia64_pal_call_stacked); | ||
99 | EXPORT_SYMBOL(ia64_pal_call_static); | ||
100 | EXPORT_SYMBOL(ia64_load_scratch_fpregs); | ||
101 | EXPORT_SYMBOL(ia64_save_scratch_fpregs); | ||
102 | |||
103 | #include <asm/unwind.h> | ||
104 | EXPORT_SYMBOL(unw_init_running); | ||
105 | |||
106 | #ifdef ASM_SUPPORTED | ||
107 | # ifdef CONFIG_SMP | ||
108 | # if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) | ||
109 | /* | ||
110 | * This is not a normal routine and we don't want a function descriptor for it, so we use | ||
111 | * a fake declaration here. | ||
112 | */ | ||
113 | extern char ia64_spinlock_contention_pre3_4; | ||
114 | EXPORT_SYMBOL(ia64_spinlock_contention_pre3_4); | ||
115 | # else | ||
116 | /* | ||
117 | * This is not a normal routine and we don't want a function descriptor for it, so we use | ||
118 | * a fake declaration here. | ||
119 | */ | ||
120 | extern char ia64_spinlock_contention; | ||
121 | EXPORT_SYMBOL(ia64_spinlock_contention); | ||
122 | # endif | ||
123 | # endif | ||
124 | #endif | ||
125 | |||
126 | extern char ia64_ivt[]; | ||
127 | EXPORT_SYMBOL(ia64_ivt); | ||
diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c new file mode 100644 index 000000000000..b69c397ed1bf --- /dev/null +++ b/arch/ia64/kernel/init_task.c | |||
@@ -0,0 +1,46 @@ | |||
1 | /* | ||
2 | * This is where we statically allocate and initialize the initial | ||
3 | * task. | ||
4 | * | ||
5 | * Copyright (C) 1999, 2002-2003 Hewlett-Packard Co | ||
6 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
7 | */ | ||
8 | |||
9 | #include <linux/init.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/init_task.h> | ||
14 | #include <linux/mqueue.h> | ||
15 | |||
16 | #include <asm/uaccess.h> | ||
17 | #include <asm/pgtable.h> | ||
18 | |||
19 | static struct fs_struct init_fs = INIT_FS; | ||
20 | static struct files_struct init_files = INIT_FILES; | ||
21 | static struct signal_struct init_signals = INIT_SIGNALS(init_signals); | ||
22 | static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | ||
23 | struct mm_struct init_mm = INIT_MM(init_mm); | ||
24 | |||
25 | EXPORT_SYMBOL(init_mm); | ||
26 | |||
27 | /* | ||
28 | * Initial task structure. | ||
29 | * | ||
30 | * We need to make sure that this is properly aligned due to the way process stacks are | ||
31 | * handled. This is done by having a special ".data.init_task" section... | ||
32 | */ | ||
33 | #define init_thread_info init_task_mem.s.thread_info | ||
34 | |||
35 | union { | ||
36 | struct { | ||
37 | struct task_struct task; | ||
38 | struct thread_info thread_info; | ||
39 | } s; | ||
40 | unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)]; | ||
41 | } init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{ | ||
42 | .task = INIT_TASK(init_task_mem.s.task), | ||
43 | .thread_info = INIT_THREAD_INFO(init_task_mem.s.task) | ||
44 | }}; | ||
45 | |||
46 | EXPORT_SYMBOL(init_task); | ||
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c new file mode 100644 index 000000000000..c15be5c38f56 --- /dev/null +++ b/arch/ia64/kernel/iosapic.c | |||
@@ -0,0 +1,827 @@ | |||
1 | /* | ||
2 | * I/O SAPIC support. | ||
3 | * | ||
4 | * Copyright (C) 1999 Intel Corp. | ||
5 | * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com> | ||
6 | * Copyright (C) 2000-2002 J.I. Lee <jung-ik.lee@intel.com> | ||
7 | * Copyright (C) 1999-2000, 2002-2003 Hewlett-Packard Co. | ||
8 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
9 | * Copyright (C) 1999 VA Linux Systems | ||
10 | * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com> | ||
11 | * | ||
12 | * 00/04/19 D. Mosberger Rewritten to mirror more closely the x86 I/O APIC code. | ||
13 | * In particular, we now have separate handlers for edge | ||
14 | * and level triggered interrupts. | ||
15 | * 00/10/27 Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector allocation | ||
16 | * PCI to vector mapping, shared PCI interrupts. | ||
17 | * 00/10/27 D. Mosberger Document things a bit more to make them more understandable. | ||
18 | * Clean up much of the old IOSAPIC cruft. | ||
19 | * 01/07/27 J.I. Lee PCI irq routing, Platform/Legacy interrupts and fixes for | ||
20 | * ACPI S5(SoftOff) support. | ||
21 | * 02/01/23 J.I. Lee iosapic pgm fixes for PCI irq routing from _PRT | ||
22 | * 02/01/07 E. Focht <efocht@ess.nec.de> Redirectable interrupt vectors in | ||
23 | * iosapic_set_affinity(), initializations for | ||
24 | * /proc/irq/#/smp_affinity | ||
25 | * 02/04/02 P. Diefenbaugh Cleaned up ACPI PCI IRQ routing. | ||
26 | * 02/04/18 J.I. Lee bug fix in iosapic_init_pci_irq | ||
27 | * 02/04/30 J.I. Lee bug fix in find_iosapic to fix ACPI PCI IRQ to IOSAPIC mapping | ||
28 | * error | ||
29 | * 02/07/29 T. Kochi Allocate interrupt vectors dynamically | ||
30 | * 02/08/04 T. Kochi Cleaned up terminology (irq, global system interrupt, vector, etc.) | ||
31 | * 02/09/20 D. Mosberger Simplified by taking advantage of ACPI's pci_irq code. | ||
32 | * 03/02/19 B. Helgaas Make pcat_compat system-wide, not per-IOSAPIC. | ||
33 | * Remove iosapic_address & gsi_base from external interfaces. | ||
34 | * Rationalize __init/__devinit attributes. | ||
35 | * 04/12/04 Ashok Raj <ashok.raj@intel.com> Intel Corporation 2004 | ||
36 | * Updated to work with irq migration necessary for CPU Hotplug | ||
37 | */ | ||
38 | /* | ||
39 | * Here is what the interrupt logic between a PCI device and the kernel looks like: | ||
40 | * | ||
41 | * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC, INTD). The | ||
42 | * device is uniquely identified by its bus- and slot-number (the function | ||
43 | * number does not matter here because all functions share the same interrupt | ||
44 | * lines). | ||
45 | * | ||
46 | * (2) The motherboard routes the interrupt line to a pin on a IOSAPIC controller. | ||
47 | * Multiple interrupt lines may have to share the same IOSAPIC pin (if they're level | ||
48 | * triggered and use the same polarity). Each interrupt line has a unique Global | ||
49 | * System Interrupt (GSI) number which can be calculated as the sum of the controller's | ||
50 | * base GSI number and the IOSAPIC pin number to which the line connects. | ||
51 | * | ||
52 | * (3) The IOSAPIC uses internal routing table entries (RTEs) to map the IOSAPIC pin | ||
53 | * into the IA-64 interrupt vector. This interrupt vector is then sent to the CPU. | ||
54 | * | ||
55 | * (4) The kernel recognizes an interrupt as an IRQ. The IRQ interface is used as | ||
56 | * the architecture-independent interrupt handling mechanism in Linux. Since an | ||
57 | * IRQ is just a number, we need an IA-64 interrupt vector number <-> IRQ number | ||
58 | * mapping. On smaller systems, we use a one-to-one mapping between IA-64 vector and | ||
59 | * IRQ. A platform can implement platform_irq_to_vector(irq) and | ||
60 | * platform_local_vector_to_irq(vector) APIs to differentiate the mapping. | ||
61 | * Please see also include/asm-ia64/hw_irq.h for those APIs. | ||
62 | * | ||
63 | * To sum up, there are three levels of mappings involved: | ||
64 | * | ||
65 | * PCI pin -> global system interrupt (GSI) -> IA-64 vector <-> IRQ | ||
66 | * | ||
67 | * Note: The term "IRQ" is loosely used everywhere in Linux kernel to describe interrupts. | ||
68 | * Now we use "IRQ" only for Linux IRQs. ISA IRQ (isa_irq) is the only exception in this | ||
69 | * source code. | ||
70 | */ | ||
71 | #include <linux/config.h> | ||
72 | |||
73 | #include <linux/acpi.h> | ||
74 | #include <linux/init.h> | ||
75 | #include <linux/irq.h> | ||
76 | #include <linux/kernel.h> | ||
77 | #include <linux/list.h> | ||
78 | #include <linux/pci.h> | ||
79 | #include <linux/smp.h> | ||
80 | #include <linux/smp_lock.h> | ||
81 | #include <linux/string.h> | ||
82 | |||
83 | #include <asm/delay.h> | ||
84 | #include <asm/hw_irq.h> | ||
85 | #include <asm/io.h> | ||
86 | #include <asm/iosapic.h> | ||
87 | #include <asm/machvec.h> | ||
88 | #include <asm/processor.h> | ||
89 | #include <asm/ptrace.h> | ||
90 | #include <asm/system.h> | ||
91 | |||
92 | |||
93 | #undef DEBUG_INTERRUPT_ROUTING | ||
94 | |||
95 | #ifdef DEBUG_INTERRUPT_ROUTING | ||
96 | #define DBG(fmt...) printk(fmt) | ||
97 | #else | ||
98 | #define DBG(fmt...) | ||
99 | #endif | ||
100 | |||
101 | static DEFINE_SPINLOCK(iosapic_lock); | ||
102 | |||
103 | /* These tables map IA-64 vectors to the IOSAPIC pin that generates this vector. */ | ||
104 | |||
105 | static struct iosapic_intr_info { | ||
106 | char __iomem *addr; /* base address of IOSAPIC */ | ||
107 | u32 low32; /* current value of low word of Redirection table entry */ | ||
108 | unsigned int gsi_base; /* first GSI assigned to this IOSAPIC */ | ||
109 | char rte_index; /* IOSAPIC RTE index (-1 => not an IOSAPIC interrupt) */ | ||
110 | unsigned char dmode : 3; /* delivery mode (see iosapic.h) */ | ||
111 | unsigned char polarity: 1; /* interrupt polarity (see iosapic.h) */ | ||
112 | unsigned char trigger : 1; /* trigger mode (see iosapic.h) */ | ||
113 | int refcnt; /* reference counter */ | ||
114 | } iosapic_intr_info[IA64_NUM_VECTORS]; | ||
115 | |||
116 | static struct iosapic { | ||
117 | char __iomem *addr; /* base address of IOSAPIC */ | ||
118 | unsigned int gsi_base; /* first GSI assigned to this IOSAPIC */ | ||
119 | unsigned short num_rte; /* number of RTE in this IOSAPIC */ | ||
120 | #ifdef CONFIG_NUMA | ||
121 | unsigned short node; /* numa node association via pxm */ | ||
122 | #endif | ||
123 | } iosapic_lists[NR_IOSAPICS]; | ||
124 | |||
125 | static int num_iosapic; | ||
126 | |||
127 | static unsigned char pcat_compat __initdata; /* 8259 compatibility flag */ | ||
128 | |||
129 | |||
130 | /* | ||
131 | * Find an IOSAPIC associated with a GSI | ||
132 | */ | ||
133 | static inline int | ||
134 | find_iosapic (unsigned int gsi) | ||
135 | { | ||
136 | int i; | ||
137 | |||
138 | for (i = 0; i < num_iosapic; i++) { | ||
139 | if ((unsigned) (gsi - iosapic_lists[i].gsi_base) < iosapic_lists[i].num_rte) | ||
140 | return i; | ||
141 | } | ||
142 | |||
143 | return -1; | ||
144 | } | ||
145 | |||
146 | static inline int | ||
147 | _gsi_to_vector (unsigned int gsi) | ||
148 | { | ||
149 | struct iosapic_intr_info *info; | ||
150 | |||
151 | for (info = iosapic_intr_info; info < iosapic_intr_info + IA64_NUM_VECTORS; ++info) | ||
152 | if (info->gsi_base + info->rte_index == gsi) | ||
153 | return info - iosapic_intr_info; | ||
154 | return -1; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Translate GSI number to the corresponding IA-64 interrupt vector. If no | ||
159 | * entry exists, return -1. | ||
160 | */ | ||
161 | inline int | ||
162 | gsi_to_vector (unsigned int gsi) | ||
163 | { | ||
164 | return _gsi_to_vector(gsi); | ||
165 | } | ||
166 | |||
167 | int | ||
168 | gsi_to_irq (unsigned int gsi) | ||
169 | { | ||
170 | /* | ||
171 | * XXX fix me: this assumes an identity mapping between IA-64 vector and Linux irq | ||
172 | * numbers... | ||
173 | */ | ||
174 | return _gsi_to_vector(gsi); | ||
175 | } | ||
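To make the arithmetic from the header comment concrete: a line's GSI is just the controller's base GSI plus the pin number, and the vector then comes from the reverse lookup above. An illustrative helper (pin_to_vector() is not part of this patch):

static int
pin_to_vector (int iosapic_index, unsigned int pin)
{
	/* e.g. gsi_base 16, pin 7  =>  GSI 23 */
	unsigned int gsi = iosapic_lists[iosapic_index].gsi_base + pin;

	return _gsi_to_vector(gsi);	/* -1 if no RTE maps this GSI */
}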
176 | |||
177 | static void | ||
178 | set_rte (unsigned int vector, unsigned int dest, int mask) | ||
179 | { | ||
180 | unsigned long pol, trigger, dmode; | ||
181 | u32 low32, high32; | ||
182 | char __iomem *addr; | ||
183 | int rte_index; | ||
184 | char redir; | ||
185 | |||
186 | DBG(KERN_DEBUG"IOSAPIC: routing vector %d to 0x%x\n", vector, dest); | ||
187 | |||
188 | rte_index = iosapic_intr_info[vector].rte_index; | ||
189 | if (rte_index < 0) | ||
190 | return; /* not an IOSAPIC interrupt */ | ||
191 | |||
192 | addr = iosapic_intr_info[vector].addr; | ||
193 | pol = iosapic_intr_info[vector].polarity; | ||
194 | trigger = iosapic_intr_info[vector].trigger; | ||
195 | dmode = iosapic_intr_info[vector].dmode; | ||
196 | vector &= (~IA64_IRQ_REDIRECTED); | ||
197 | |||
198 | redir = (dmode == IOSAPIC_LOWEST_PRIORITY) ? 1 : 0; | ||
199 | |||
200 | #ifdef CONFIG_SMP | ||
201 | { | ||
202 | unsigned int irq; | ||
203 | |||
204 | for (irq = 0; irq < NR_IRQS; ++irq) | ||
205 | if (irq_to_vector(irq) == vector) { | ||
206 | set_irq_affinity_info(irq, (int)(dest & 0xffff), redir); | ||
207 | break; | ||
208 | } | ||
209 | } | ||
210 | #endif | ||
211 | |||
212 | low32 = ((pol << IOSAPIC_POLARITY_SHIFT) | | ||
213 | (trigger << IOSAPIC_TRIGGER_SHIFT) | | ||
214 | (dmode << IOSAPIC_DELIVERY_SHIFT) | | ||
215 | ((mask ? 1 : 0) << IOSAPIC_MASK_SHIFT) | | ||
216 | vector); | ||
217 | |||
218 | /* dest contains both id and eid */ | ||
219 | high32 = (dest << IOSAPIC_DEST_SHIFT); | ||
220 | |||
221 | iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32); | ||
222 | iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); | ||
223 | iosapic_intr_info[vector].low32 = low32; | ||
224 | } | ||
225 | |||
226 | static void | ||
227 | nop (unsigned int vector) | ||
228 | { | ||
229 | /* do nothing... */ | ||
230 | } | ||
231 | |||
232 | static void | ||
233 | mask_irq (unsigned int irq) | ||
234 | { | ||
235 | unsigned long flags; | ||
236 | char __iomem *addr; | ||
237 | u32 low32; | ||
238 | int rte_index; | ||
239 | ia64_vector vec = irq_to_vector(irq); | ||
240 | |||
241 | addr = iosapic_intr_info[vec].addr; | ||
242 | rte_index = iosapic_intr_info[vec].rte_index; | ||
243 | |||
244 | if (rte_index < 0) | ||
245 | return; /* not an IOSAPIC interrupt! */ | ||
246 | |||
247 | spin_lock_irqsave(&iosapic_lock, flags); | ||
248 | { | ||
249 | /* set only the mask bit */ | ||
250 | low32 = iosapic_intr_info[vec].low32 |= IOSAPIC_MASK; | ||
251 | iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); | ||
252 | } | ||
253 | spin_unlock_irqrestore(&iosapic_lock, flags); | ||
254 | } | ||
255 | |||
256 | static void | ||
257 | unmask_irq (unsigned int irq) | ||
258 | { | ||
259 | unsigned long flags; | ||
260 | char __iomem *addr; | ||
261 | u32 low32; | ||
262 | int rte_index; | ||
263 | ia64_vector vec = irq_to_vector(irq); | ||
264 | |||
265 | addr = iosapic_intr_info[vec].addr; | ||
266 | rte_index = iosapic_intr_info[vec].rte_index; | ||
267 | if (rte_index < 0) | ||
268 | return; /* not an IOSAPIC interrupt! */ | ||
269 | |||
270 | spin_lock_irqsave(&iosapic_lock, flags); | ||
271 | { | ||
272 | low32 = iosapic_intr_info[vec].low32 &= ~IOSAPIC_MASK; | ||
273 | iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); | ||
274 | } | ||
275 | spin_unlock_irqrestore(&iosapic_lock, flags); | ||
276 | } | ||
277 | |||
278 | |||
279 | static void | ||
280 | iosapic_set_affinity (unsigned int irq, cpumask_t mask) | ||
281 | { | ||
282 | #ifdef CONFIG_SMP | ||
283 | unsigned long flags; | ||
284 | u32 high32, low32; | ||
285 | int dest, rte_index; | ||
286 | char __iomem *addr; | ||
287 | int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0; | ||
288 | ia64_vector vec; | ||
289 | |||
290 | irq &= (~IA64_IRQ_REDIRECTED); | ||
291 | vec = irq_to_vector(irq); | ||
292 | |||
293 | if (cpus_empty(mask)) | ||
294 | return; | ||
295 | |||
296 | dest = cpu_physical_id(first_cpu(mask)); | ||
297 | |||
298 | rte_index = iosapic_intr_info[vec].rte_index; | ||
299 | addr = iosapic_intr_info[vec].addr; | ||
300 | |||
301 | if (rte_index < 0) | ||
302 | return; /* not an IOSAPIC interrupt */ | ||
303 | |||
304 | set_irq_affinity_info(irq, dest, redir); | ||
305 | |||
306 | /* dest contains both id and eid */ | ||
307 | high32 = dest << IOSAPIC_DEST_SHIFT; | ||
308 | |||
309 | spin_lock_irqsave(&iosapic_lock, flags); | ||
310 | { | ||
311 | low32 = iosapic_intr_info[vec].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT); | ||
312 | |||
313 | if (redir) | ||
314 | /* change delivery mode to lowest priority */ | ||
315 | low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); | ||
316 | else | ||
317 | /* change delivery mode to fixed */ | ||
318 | low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT); | ||
319 | |||
320 | iosapic_intr_info[vec].low32 = low32; | ||
321 | iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32); | ||
322 | iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); | ||
323 | } | ||
324 | spin_unlock_irqrestore(&iosapic_lock, flags); | ||
325 | #endif | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * Handlers for level-triggered interrupts. | ||
330 | */ | ||
331 | |||
332 | static unsigned int | ||
333 | iosapic_startup_level_irq (unsigned int irq) | ||
334 | { | ||
335 | unmask_irq(irq); | ||
336 | return 0; | ||
337 | } | ||
338 | |||
339 | static void | ||
340 | iosapic_end_level_irq (unsigned int irq) | ||
341 | { | ||
342 | ia64_vector vec = irq_to_vector(irq); | ||
343 | |||
344 | move_irq(irq); | ||
345 | iosapic_eoi(iosapic_intr_info[vec].addr, vec); | ||
346 | } | ||
347 | |||
348 | #define iosapic_shutdown_level_irq mask_irq | ||
349 | #define iosapic_enable_level_irq unmask_irq | ||
350 | #define iosapic_disable_level_irq mask_irq | ||
351 | #define iosapic_ack_level_irq nop | ||
352 | |||
353 | struct hw_interrupt_type irq_type_iosapic_level = { | ||
354 | .typename = "IO-SAPIC-level", | ||
355 | .startup = iosapic_startup_level_irq, | ||
356 | .shutdown = iosapic_shutdown_level_irq, | ||
357 | .enable = iosapic_enable_level_irq, | ||
358 | .disable = iosapic_disable_level_irq, | ||
359 | .ack = iosapic_ack_level_irq, | ||
360 | .end = iosapic_end_level_irq, | ||
361 | .set_affinity = iosapic_set_affinity | ||
362 | }; | ||
363 | |||
364 | /* | ||
365 | * Handlers for edge-triggered interrupts. | ||
366 | */ | ||
367 | |||
368 | static unsigned int | ||
369 | iosapic_startup_edge_irq (unsigned int irq) | ||
370 | { | ||
371 | unmask_irq(irq); | ||
372 | /* | ||
373 | * IOSAPIC simply drops interrupts pended while the | ||
374 | * corresponding pin was masked, so we can't know if an | ||
375 | * interrupt is pending already. Let's hope not... | ||
376 | */ | ||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | static void | ||
381 | iosapic_ack_edge_irq (unsigned int irq) | ||
382 | { | ||
383 | irq_desc_t *idesc = irq_descp(irq); | ||
384 | |||
385 | move_irq(irq); | ||
386 | /* | ||
387 | * Once we have recorded IRQ_PENDING already, we can mask the | ||
388 | * interrupt for real. This prevents IRQ storms from unhandled | ||
389 | * devices. | ||
390 | */ | ||
391 | if ((idesc->status & (IRQ_PENDING|IRQ_DISABLED)) == (IRQ_PENDING|IRQ_DISABLED)) | ||
392 | mask_irq(irq); | ||
393 | } | ||
394 | |||
395 | #define iosapic_enable_edge_irq unmask_irq | ||
396 | #define iosapic_disable_edge_irq nop | ||
397 | #define iosapic_end_edge_irq nop | ||
398 | |||
399 | struct hw_interrupt_type irq_type_iosapic_edge = { | ||
400 | .typename = "IO-SAPIC-edge", | ||
401 | .startup = iosapic_startup_edge_irq, | ||
402 | .shutdown = iosapic_disable_edge_irq, | ||
403 | .enable = iosapic_enable_edge_irq, | ||
404 | .disable = iosapic_disable_edge_irq, | ||
405 | .ack = iosapic_ack_edge_irq, | ||
406 | .end = iosapic_end_edge_irq, | ||
407 | .set_affinity = iosapic_set_affinity | ||
408 | }; | ||
409 | |||
410 | unsigned int | ||
411 | iosapic_version (char __iomem *addr) | ||
412 | { | ||
413 | /* | ||
414 | * The IOSAPIC Version Register returns a 32-bit structure like: | ||
415 | * { | ||
416 | * unsigned int version : 8; | ||
417 | * unsigned int reserved1 : 8; | ||
418 | * unsigned int max_redir : 8; | ||
419 | * unsigned int reserved2 : 8; | ||
420 | * } | ||
421 | */ | ||
422 | return iosapic_read(addr, IOSAPIC_VERSION); | ||
423 | } | ||
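Given the layout documented above, the fields can be extracted with shifts and masks; a small sketch (these accessors are illustrative, not part of the patch):

static inline unsigned int
iosapic_version_number (unsigned int word)
{
	return word & 0xff;		/* version: bits 0-7 */
}

static inline unsigned int
iosapic_max_redir (unsigned int word)
{
	return (word >> 16) & 0xff;	/* max_redir: bits 16-23 */
}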
424 | |||
425 | /* | ||
426 | * If the given vector is already owned by another interrupt, | ||
427 | * assign a new vector to that owner and make the given vector available. | ||
428 | */ | ||
429 | static void __init | ||
430 | iosapic_reassign_vector (int vector) | ||
431 | { | ||
432 | int new_vector; | ||
433 | |||
434 | if (iosapic_intr_info[vector].rte_index >= 0 || iosapic_intr_info[vector].addr | ||
435 | || iosapic_intr_info[vector].gsi_base || iosapic_intr_info[vector].dmode | ||
436 | || iosapic_intr_info[vector].polarity || iosapic_intr_info[vector].trigger) | ||
437 | { | ||
438 | new_vector = assign_irq_vector(AUTO_ASSIGN); | ||
439 | printk(KERN_INFO "Reassigning vector %d to %d\n", vector, new_vector); | ||
440 | memcpy(&iosapic_intr_info[new_vector], &iosapic_intr_info[vector], | ||
441 | sizeof(struct iosapic_intr_info)); | ||
442 | memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info)); | ||
443 | iosapic_intr_info[vector].rte_index = -1; | ||
444 | } | ||
445 | } | ||
446 | |||
447 | static void | ||
448 | register_intr (unsigned int gsi, int vector, unsigned char delivery, | ||
449 | unsigned long polarity, unsigned long trigger) | ||
450 | { | ||
451 | irq_desc_t *idesc; | ||
452 | struct hw_interrupt_type *irq_type; | ||
453 | int rte_index; | ||
454 | int index; | ||
455 | unsigned long gsi_base; | ||
456 | void __iomem *iosapic_address; | ||
457 | |||
458 | index = find_iosapic(gsi); | ||
459 | if (index < 0) { | ||
460 | printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", __FUNCTION__, gsi); | ||
461 | return; | ||
462 | } | ||
463 | |||
464 | iosapic_address = iosapic_lists[index].addr; | ||
465 | gsi_base = iosapic_lists[index].gsi_base; | ||
466 | |||
467 | rte_index = gsi - gsi_base; | ||
468 | iosapic_intr_info[vector].rte_index = rte_index; | ||
469 | iosapic_intr_info[vector].polarity = polarity; | ||
470 | iosapic_intr_info[vector].dmode = delivery; | ||
471 | iosapic_intr_info[vector].addr = iosapic_address; | ||
472 | iosapic_intr_info[vector].gsi_base = gsi_base; | ||
473 | iosapic_intr_info[vector].trigger = trigger; | ||
474 | iosapic_intr_info[vector].refcnt++; | ||
475 | |||
476 | if (trigger == IOSAPIC_EDGE) | ||
477 | irq_type = &irq_type_iosapic_edge; | ||
478 | else | ||
479 | irq_type = &irq_type_iosapic_level; | ||
480 | |||
481 | idesc = irq_descp(vector); | ||
482 | if (idesc->handler != irq_type) { | ||
483 | if (idesc->handler != &no_irq_type) | ||
484 | printk(KERN_WARNING "%s: changing vector %d from %s to %s\n", | ||
485 | __FUNCTION__, vector, idesc->handler->typename, irq_type->typename); | ||
486 | idesc->handler = irq_type; | ||
487 | } | ||
488 | } | ||
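A worked example of the GSI-to-RTE mapping computed above, with hypothetical numbers (the gsi and gsi_base values are invented):

/* Hypothetical: an IOSAPIC registered with gsi_base 48 covering GSIs 48-71. */
static int example_rte_index(void)
{
	unsigned int gsi = 52, gsi_base = 48;
	return gsi - gsi_base;	/* GSI 52 is programmed at RTE index 4 */
}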
489 | |||
490 | static unsigned int | ||
491 | get_target_cpu (unsigned int gsi, int vector) | ||
492 | { | ||
493 | #ifdef CONFIG_SMP | ||
494 | static int cpu = -1; | ||
495 | |||
496 | /* | ||
497 | * If the platform supports redirection via XTP, let it | ||
498 | * distribute interrupts. | ||
499 | */ | ||
500 | if (smp_int_redirect & SMP_IRQ_REDIRECTION) | ||
501 | return cpu_physical_id(smp_processor_id()); | ||
502 | |||
503 | /* | ||
504 | * Some interrupts (ACPI SCI, for instance) are registered | ||
505 | * before the BSP is marked as online. | ||
506 | */ | ||
507 | if (!cpu_online(smp_processor_id())) | ||
508 | return cpu_physical_id(smp_processor_id()); | ||
509 | |||
510 | #ifdef CONFIG_NUMA | ||
511 | { | ||
512 | int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0; | ||
513 | cpumask_t cpu_mask; | ||
514 | |||
515 | iosapic_index = find_iosapic(gsi); | ||
516 | if (iosapic_index < 0 || | ||
517 | iosapic_lists[iosapic_index].node == MAX_NUMNODES) | ||
518 | goto skip_numa_setup; | ||
519 | |||
520 | cpu_mask = node_to_cpumask(iosapic_lists[iosapic_index].node); | ||
521 | |||
522 | for_each_cpu_mask(numa_cpu, cpu_mask) { | ||
523 | if (!cpu_online(numa_cpu)) | ||
524 | cpu_clear(numa_cpu, cpu_mask); | ||
525 | } | ||
526 | |||
527 | num_cpus = cpus_weight(cpu_mask); | ||
528 | |||
529 | if (!num_cpus) | ||
530 | goto skip_numa_setup; | ||
531 | |||
532 | 		/* Use vector assignment to distribute across cpus in node */ | ||
533 | cpu_index = vector % num_cpus; | ||
534 | |||
535 | for (numa_cpu = first_cpu(cpu_mask) ; i < cpu_index ; i++) | ||
536 | numa_cpu = next_cpu(numa_cpu, cpu_mask); | ||
537 | |||
538 | if (numa_cpu != NR_CPUS) | ||
539 | return cpu_physical_id(numa_cpu); | ||
540 | } | ||
541 | skip_numa_setup: | ||
542 | #endif | ||
543 | /* | ||
544 | * Otherwise, round-robin interrupt vectors across all the | ||
545 | * processors. (It'd be nice if we could be smarter in the | ||
546 | * case of NUMA.) | ||
547 | */ | ||
548 | do { | ||
549 | if (++cpu >= NR_CPUS) | ||
550 | cpu = 0; | ||
551 | } while (!cpu_online(cpu)); | ||
552 | |||
553 | return cpu_physical_id(cpu); | ||
554 | #else | ||
555 | return cpu_physical_id(smp_processor_id()); | ||
556 | #endif | ||
557 | } | ||
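A minimal sketch of the NUMA distribution step above, with assumed numbers (the vector value and CPU count are invented):

static int example_numa_pick(void)
{
	int vector = 0x33, num_cpus = 4;	/* assumed: 4 online CPUs in the node */
	return vector % num_cpus;	/* cpu_index == 3: the 4th CPU in the node's mask */
}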
558 | |||
559 | /* | ||
560 | * ACPI can describe IOSAPIC interrupts via static tables and namespace | ||
561 | * methods. This provides an interface to register those interrupts and | ||
562 | * program the IOSAPIC RTE. | ||
563 | */ | ||
564 | int | ||
565 | iosapic_register_intr (unsigned int gsi, | ||
566 | unsigned long polarity, unsigned long trigger) | ||
567 | { | ||
568 | int vector; | ||
569 | unsigned int dest; | ||
570 | unsigned long flags; | ||
571 | |||
572 | /* | ||
573 | * If this GSI has already been registered (i.e., it's a | ||
574 | * shared interrupt, or we lost a race to register it), | ||
575 | * don't touch the RTE. | ||
576 | */ | ||
577 | spin_lock_irqsave(&iosapic_lock, flags); | ||
578 | { | ||
579 | vector = gsi_to_vector(gsi); | ||
580 | if (vector > 0) { | ||
581 | iosapic_intr_info[vector].refcnt++; | ||
582 | spin_unlock_irqrestore(&iosapic_lock, flags); | ||
583 | return vector; | ||
584 | } | ||
585 | |||
586 | vector = assign_irq_vector(AUTO_ASSIGN); | ||
587 | dest = get_target_cpu(gsi, vector); | ||
588 | register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY, | ||
589 | polarity, trigger); | ||
590 | |||
591 | set_rte(vector, dest, 1); | ||
592 | } | ||
593 | spin_unlock_irqrestore(&iosapic_lock, flags); | ||
594 | |||
595 | printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n", | ||
596 | gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), | ||
597 | (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), | ||
598 | cpu_logical_id(dest), dest, vector); | ||
599 | |||
600 | return vector; | ||
601 | } | ||
602 | |||
603 | #ifdef CONFIG_ACPI_DEALLOCATE_IRQ | ||
604 | void | ||
605 | iosapic_unregister_intr (unsigned int gsi) | ||
606 | { | ||
607 | unsigned long flags; | ||
608 | int irq, vector; | ||
609 | irq_desc_t *idesc; | ||
610 | int rte_index; | ||
611 | unsigned long trigger, polarity; | ||
612 | |||
613 | /* | ||
614 | * If the irq associated with the gsi is not found, | ||
615 | * iosapic_unregister_intr() is unbalanced. We need to check | ||
616 | * this again after getting locks. | ||
617 | */ | ||
618 | irq = gsi_to_irq(gsi); | ||
619 | if (irq < 0) { | ||
620 | printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi); | ||
621 | WARN_ON(1); | ||
622 | return; | ||
623 | } | ||
624 | vector = irq_to_vector(irq); | ||
625 | |||
626 | idesc = irq_descp(irq); | ||
627 | spin_lock_irqsave(&idesc->lock, flags); | ||
628 | spin_lock(&iosapic_lock); | ||
629 | { | ||
630 | rte_index = iosapic_intr_info[vector].rte_index; | ||
631 | if (rte_index < 0) { | ||
632 | spin_unlock(&iosapic_lock); | ||
633 | spin_unlock_irqrestore(&idesc->lock, flags); | ||
634 | printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi); | ||
635 | WARN_ON(1); | ||
636 | return; | ||
637 | } | ||
638 | |||
639 | if (--iosapic_intr_info[vector].refcnt > 0) { | ||
640 | spin_unlock(&iosapic_lock); | ||
641 | spin_unlock_irqrestore(&idesc->lock, flags); | ||
642 | return; | ||
643 | } | ||
644 | |||
645 | /* | ||
646 | * If interrupt handlers still exist on the irq | ||
647 | * associated with the gsi, don't unregister the | ||
648 | * interrupt. | ||
649 | */ | ||
650 | if (idesc->action) { | ||
651 | iosapic_intr_info[vector].refcnt++; | ||
652 | spin_unlock(&iosapic_lock); | ||
653 | spin_unlock_irqrestore(&idesc->lock, flags); | ||
654 | printk(KERN_WARNING "Cannot unregister GSI. IRQ %u is still in use.\n", irq); | ||
655 | return; | ||
656 | } | ||
657 | |||
658 | /* Clear the interrupt controller descriptor. */ | ||
659 | idesc->handler = &no_irq_type; | ||
660 | |||
661 | trigger = iosapic_intr_info[vector].trigger; | ||
662 | polarity = iosapic_intr_info[vector].polarity; | ||
663 | |||
664 | /* Clear the interrupt information. */ | ||
665 | memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info)); | ||
666 | iosapic_intr_info[vector].rte_index = -1; /* mark as unused */ | ||
667 | } | ||
668 | spin_unlock(&iosapic_lock); | ||
669 | spin_unlock_irqrestore(&idesc->lock, flags); | ||
670 | |||
671 | /* Free the interrupt vector */ | ||
672 | free_irq_vector(vector); | ||
673 | |||
674 | 	printk(KERN_INFO "GSI %u (%s, %s) -> vector %d unregistered.\n", | ||
675 | gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), | ||
676 | (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), | ||
677 | vector); | ||
678 | } | ||
679 | #endif /* CONFIG_ACPI_DEALLOCATE_IRQ */ | ||
680 | |||
681 | /* | ||
682 | * ACPI calls this when it finds an entry for a platform interrupt. | ||
683 |  * Note that the gsi_base and IOSAPIC address must be set in iosapic_init(). | ||
684 | */ | ||
685 | int __init | ||
686 | iosapic_register_platform_intr (u32 int_type, unsigned int gsi, | ||
687 | int iosapic_vector, u16 eid, u16 id, | ||
688 | unsigned long polarity, unsigned long trigger) | ||
689 | { | ||
690 | static const char * const name[] = {"unknown", "PMI", "INIT", "CPEI"}; | ||
691 | unsigned char delivery; | ||
692 | int vector, mask = 0; | ||
693 | unsigned int dest = ((id << 8) | eid) & 0xffff; | ||
694 | |||
695 | switch (int_type) { | ||
696 | case ACPI_INTERRUPT_PMI: | ||
697 | vector = iosapic_vector; | ||
698 | /* | ||
699 | 		 * Since the PMI vector is allocated by firmware (ACPI), not by the | ||
700 | 		 * kernel, we need to make sure the vector is available. | ||
701 | */ | ||
702 | iosapic_reassign_vector(vector); | ||
703 | delivery = IOSAPIC_PMI; | ||
704 | break; | ||
705 | case ACPI_INTERRUPT_INIT: | ||
706 | vector = assign_irq_vector(AUTO_ASSIGN); | ||
707 | delivery = IOSAPIC_INIT; | ||
708 | break; | ||
709 | case ACPI_INTERRUPT_CPEI: | ||
710 | vector = IA64_CPE_VECTOR; | ||
711 | delivery = IOSAPIC_LOWEST_PRIORITY; | ||
712 | mask = 1; | ||
713 | break; | ||
714 | default: | ||
715 | 		printk(KERN_ERR "iosapic_register_platform_intr(): invalid int type 0x%x\n", int_type); | ||
716 | return -1; | ||
717 | } | ||
718 | |||
719 | register_intr(gsi, vector, delivery, polarity, trigger); | ||
720 | |||
721 | printk(KERN_INFO "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n", | ||
722 | int_type < ARRAY_SIZE(name) ? name[int_type] : "unknown", | ||
723 | int_type, gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), | ||
724 | (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), | ||
725 | cpu_logical_id(dest), dest, vector); | ||
726 | |||
727 | set_rte(vector, dest, mask); | ||
728 | return vector; | ||
729 | } | ||
730 | |||
731 | |||
732 | /* | ||
733 | * ACPI calls this when it finds an entry for a legacy ISA IRQ override. | ||
734 | * Note that the gsi_base and IOSAPIC address must be set in iosapic_init(). | ||
735 | */ | ||
736 | void __init | ||
737 | iosapic_override_isa_irq (unsigned int isa_irq, unsigned int gsi, | ||
738 | unsigned long polarity, | ||
739 | unsigned long trigger) | ||
740 | { | ||
741 | int vector; | ||
742 | unsigned int dest = cpu_physical_id(smp_processor_id()); | ||
743 | |||
744 | vector = isa_irq_to_vector(isa_irq); | ||
745 | |||
746 | register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY, polarity, trigger); | ||
747 | |||
748 | DBG("ISA: IRQ %u -> GSI %u (%s,%s) -> CPU %d (0x%04x) vector %d\n", | ||
749 | isa_irq, gsi, trigger == IOSAPIC_EDGE ? "edge" : "level", | ||
750 | polarity == IOSAPIC_POL_HIGH ? "high" : "low", | ||
751 | cpu_logical_id(dest), dest, vector); | ||
752 | |||
753 | set_rte(vector, dest, 1); | ||
754 | } | ||
755 | |||
756 | void __init | ||
757 | iosapic_system_init (int system_pcat_compat) | ||
758 | { | ||
759 | int vector; | ||
760 | |||
761 | for (vector = 0; vector < IA64_NUM_VECTORS; ++vector) | ||
762 | iosapic_intr_info[vector].rte_index = -1; /* mark as unused */ | ||
763 | |||
764 | pcat_compat = system_pcat_compat; | ||
765 | if (pcat_compat) { | ||
766 | /* | ||
767 | 		 * Disable the compatibility-mode interrupts (8259 style); this needs | ||
768 | 		 * IN/OUT support enabled. | ||
769 | */ | ||
770 | printk(KERN_INFO "%s: Disabling PC-AT compatible 8259 interrupts\n", __FUNCTION__); | ||
771 | outb(0xff, 0xA1); | ||
772 | outb(0xff, 0x21); | ||
773 | } | ||
774 | } | ||
775 | |||
776 | void __init | ||
777 | iosapic_init (unsigned long phys_addr, unsigned int gsi_base) | ||
778 | { | ||
779 | int num_rte; | ||
780 | unsigned int isa_irq, ver; | ||
781 | char __iomem *addr; | ||
782 | |||
783 | addr = ioremap(phys_addr, 0); | ||
784 | ver = iosapic_version(addr); | ||
785 | |||
786 | /* | ||
787 | * The MAX_REDIR register holds the highest input pin | ||
788 | * number (starting from 0). | ||
789 | 	 * We add 1 so that we can use it as the number of pins (= RTEs). | ||
790 | */ | ||
791 | num_rte = ((ver >> 16) & 0xff) + 1; | ||
792 | |||
793 | iosapic_lists[num_iosapic].addr = addr; | ||
794 | iosapic_lists[num_iosapic].gsi_base = gsi_base; | ||
795 | iosapic_lists[num_iosapic].num_rte = num_rte; | ||
796 | #ifdef CONFIG_NUMA | ||
797 | iosapic_lists[num_iosapic].node = MAX_NUMNODES; | ||
798 | #endif | ||
799 | num_iosapic++; | ||
800 | |||
801 | if ((gsi_base == 0) && pcat_compat) { | ||
802 | /* | ||
803 | * Map the legacy ISA devices into the IOSAPIC data. Some of these may | ||
804 | * get reprogrammed later on with data from the ACPI Interrupt Source | ||
805 | * Override table. | ||
806 | */ | ||
807 | for (isa_irq = 0; isa_irq < 16; ++isa_irq) | ||
808 | iosapic_override_isa_irq(isa_irq, isa_irq, IOSAPIC_POL_HIGH, IOSAPIC_EDGE); | ||
809 | } | ||
810 | } | ||
811 | |||
812 | #ifdef CONFIG_NUMA | ||
813 | void __init | ||
814 | map_iosapic_to_node(unsigned int gsi_base, int node) | ||
815 | { | ||
816 | int index; | ||
817 | |||
818 | index = find_iosapic(gsi_base); | ||
819 | if (index < 0) { | ||
820 | printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", | ||
821 | __FUNCTION__, gsi_base); | ||
822 | return; | ||
823 | } | ||
824 | iosapic_lists[index].node = node; | ||
825 | return; | ||
826 | } | ||
827 | #endif | ||
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c new file mode 100644 index 000000000000..28f2aadc38d0 --- /dev/null +++ b/arch/ia64/kernel/irq.c | |||
@@ -0,0 +1,238 @@ | |||
1 | /* | ||
2 | * linux/arch/ia64/kernel/irq.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar | ||
5 | * | ||
6 | * This file contains the code used by various IRQ handling routines: | ||
7 |  * asking for different IRQs should be done through these routines | ||
8 | * instead of just grabbing them. Thus setups with different IRQ numbers | ||
9 | * shouldn't result in any weird surprises, and installing new handlers | ||
10 | * should be easier. | ||
11 | * | ||
12 | * Copyright (C) Ashok Raj<ashok.raj@intel.com>, Intel Corporation 2004 | ||
13 | * | ||
14 | * 4/14/2004: Added code to handle cpu migration and do safe irq | ||
15 |  *			migration without losing interrupts for iosapic | ||
16 | * architecture. | ||
17 | */ | ||
18 | |||
19 | #include <asm/delay.h> | ||
20 | #include <asm/uaccess.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/seq_file.h> | ||
23 | #include <linux/interrupt.h> | ||
24 | #include <linux/kernel_stat.h> | ||
25 | |||
26 | /* | ||
27 |  * 'What should we do if we get a hw irq event on an illegal vector?' | ||
28 |  * Each architecture has to answer this for itself. | ||
29 | */ | ||
30 | void ack_bad_irq(unsigned int irq) | ||
31 | { | ||
32 | printk(KERN_ERR "Unexpected irq vector 0x%x on CPU %u!\n", irq, smp_processor_id()); | ||
33 | } | ||
34 | |||
35 | #ifdef CONFIG_IA64_GENERIC | ||
36 | unsigned int __ia64_local_vector_to_irq (ia64_vector vec) | ||
37 | { | ||
38 | return (unsigned int) vec; | ||
39 | } | ||
40 | #endif | ||
41 | |||
42 | /* | ||
43 | * Interrupt statistics: | ||
44 | */ | ||
45 | |||
46 | atomic_t irq_err_count; | ||
47 | |||
48 | /* | ||
49 | * /proc/interrupts printing: | ||
50 | */ | ||
51 | |||
52 | int show_interrupts(struct seq_file *p, void *v) | ||
53 | { | ||
54 | int i = *(loff_t *) v, j; | ||
55 | struct irqaction * action; | ||
56 | unsigned long flags; | ||
57 | |||
58 | if (i == 0) { | ||
59 | seq_printf(p, " "); | ||
60 | for (j=0; j<NR_CPUS; j++) | ||
61 | if (cpu_online(j)) | ||
62 | seq_printf(p, "CPU%d ",j); | ||
63 | seq_putc(p, '\n'); | ||
64 | } | ||
65 | |||
66 | if (i < NR_IRQS) { | ||
67 | spin_lock_irqsave(&irq_desc[i].lock, flags); | ||
68 | action = irq_desc[i].action; | ||
69 | if (!action) | ||
70 | goto skip; | ||
71 | seq_printf(p, "%3d: ",i); | ||
72 | #ifndef CONFIG_SMP | ||
73 | seq_printf(p, "%10u ", kstat_irqs(i)); | ||
74 | #else | ||
75 | for (j = 0; j < NR_CPUS; j++) | ||
76 | if (cpu_online(j)) | ||
77 | seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); | ||
78 | #endif | ||
79 | seq_printf(p, " %14s", irq_desc[i].handler->typename); | ||
80 | seq_printf(p, " %s", action->name); | ||
81 | |||
82 | for (action=action->next; action; action = action->next) | ||
83 | seq_printf(p, ", %s", action->name); | ||
84 | |||
85 | seq_putc(p, '\n'); | ||
86 | skip: | ||
87 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); | ||
88 | } else if (i == NR_IRQS) | ||
89 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | ||
90 | return 0; | ||
91 | } | ||
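For reference, a hypothetical /proc/interrupts excerpt in the format the loop above produces (the counts and device name are invented):

/*
 *            CPU0       CPU1
 *  39:      10460       9875  IO-SAPIC-level  eth0
 * ERR:          0
 */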
92 | |||
93 | #ifdef CONFIG_SMP | ||
94 | /* | ||
95 | * This is updated when the user sets irq affinity via /proc | ||
96 | */ | ||
97 | static cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; | ||
98 | static unsigned long pending_irq_redir[BITS_TO_LONGS(NR_IRQS)]; | ||
99 | |||
100 | static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 }; | ||
101 | |||
102 | /* | ||
103 |  * Arch-specific routine for a deferred write to an IOSAPIC RTE, used to | ||
104 |  * reprogram the interrupt destination. | ||
105 | */ | ||
106 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | ||
107 | { | ||
108 | pending_irq_cpumask[irq] = mask_val; | ||
109 | } | ||
110 | |||
111 | void set_irq_affinity_info (unsigned int irq, int hwid, int redir) | ||
112 | { | ||
113 | cpumask_t mask = CPU_MASK_NONE; | ||
114 | |||
115 | cpu_set(cpu_logical_id(hwid), mask); | ||
116 | |||
117 | if (irq < NR_IRQS) { | ||
118 | irq_affinity[irq] = mask; | ||
119 | irq_redir[irq] = (char) (redir & 0xff); | ||
120 | } | ||
121 | } | ||
122 | |||
123 | |||
124 | void move_irq(int irq) | ||
125 | { | ||
126 | /* note - we hold desc->lock */ | ||
127 | cpumask_t tmp; | ||
128 | irq_desc_t *desc = irq_descp(irq); | ||
129 | int redir = test_bit(irq, pending_irq_redir); | ||
130 | |||
131 | if (unlikely(!desc->handler->set_affinity)) | ||
132 | return; | ||
133 | |||
134 | if (!cpus_empty(pending_irq_cpumask[irq])) { | ||
135 | cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); | ||
136 | if (unlikely(!cpus_empty(tmp))) { | ||
137 | desc->handler->set_affinity(irq | (redir ? IA64_IRQ_REDIRECTED : 0), | ||
138 | pending_irq_cpumask[irq]); | ||
139 | } | ||
140 | cpus_clear(pending_irq_cpumask[irq]); | ||
141 | } | ||
142 | } | ||
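A sketch of the intended deferral flow (reconstructed from this file; example_affinity_write() is a hypothetical caller): the /proc write only records the request, and a later interrupt applies it while desc->lock is held:

/* Hypothetical caller: the /proc write path merely records the mask... */
static void example_affinity_write(unsigned int irq, cpumask_t new_mask)
{
	proc_set_irq_affinity(irq, new_mask);	/* remember the request */
}
/* ...and the next interrupt applies it: iosapic_ack_edge_irq() above
 * calls move_irq(), which invokes desc->handler->set_affinity() with
 * the pending mask (desc->lock is held at that point). */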
143 | |||
144 | |||
145 | #endif /* CONFIG_SMP */ | ||
146 | |||
147 | #ifdef CONFIG_HOTPLUG_CPU | ||
148 | unsigned int vectors_in_migration[NR_IRQS]; | ||
149 | |||
150 | /* | ||
151 | * Since cpu_online_map is already updated, we just need to check for | ||
152 |  * affinity masks that no longer contain any online CPU. | ||
153 | */ | ||
154 | static void migrate_irqs(void) | ||
155 | { | ||
156 | cpumask_t mask; | ||
157 | irq_desc_t *desc; | ||
158 | int irq, new_cpu; | ||
159 | |||
160 | for (irq=0; irq < NR_IRQS; irq++) { | ||
161 | desc = irq_descp(irq); | ||
162 | |||
163 | /* | ||
164 | * No handling for now. | ||
165 | 		 * TBD: Implement a disable function so we can | ||
166 | 		 * tell the CPU not to respond to these local interrupt | ||
167 | 		 * sources, such as ITV, CPEI, MCA, etc. | ||
168 | */ | ||
169 | 		if (desc->status & IRQ_PER_CPU) | ||
170 | continue; | ||
171 | |||
172 | cpus_and(mask, irq_affinity[irq], cpu_online_map); | ||
173 | if (any_online_cpu(mask) == NR_CPUS) { | ||
174 | /* | ||
175 | * Save it for phase 2 processing | ||
176 | */ | ||
177 | vectors_in_migration[irq] = irq; | ||
178 | |||
179 | new_cpu = any_online_cpu(cpu_online_map); | ||
180 | mask = cpumask_of_cpu(new_cpu); | ||
181 | |||
182 | /* | ||
183 | 			 * All three are essential; currently we WARN_ON, but maybe this should panic? | ||
184 | */ | ||
185 | if (desc->handler && desc->handler->disable && | ||
186 | desc->handler->enable && desc->handler->set_affinity) { | ||
187 | desc->handler->disable(irq); | ||
188 | desc->handler->set_affinity(irq, mask); | ||
189 | desc->handler->enable(irq); | ||
190 | } else { | ||
191 | WARN_ON((!(desc->handler) || !(desc->handler->disable) || | ||
192 | !(desc->handler->enable) || | ||
193 | !(desc->handler->set_affinity))); | ||
194 | } | ||
195 | } | ||
196 | } | ||
197 | } | ||
198 | |||
199 | void fixup_irqs(void) | ||
200 | { | ||
201 | unsigned int irq; | ||
202 | extern void ia64_process_pending_intr(void); | ||
203 | |||
204 | ia64_set_itv(1<<16); | ||
205 | /* | ||
206 | * Phase 1: Locate irq's bound to this cpu and | ||
207 | * relocate them for cpu removal. | ||
208 | */ | ||
209 | migrate_irqs(); | ||
210 | |||
211 | /* | ||
212 | * Phase 2: Perform interrupt processing for all entries reported in | ||
213 | * local APIC. | ||
214 | */ | ||
215 | ia64_process_pending_intr(); | ||
216 | |||
217 | /* | ||
218 | * Phase 3: Now handle any interrupts not captured in local APIC. | ||
219 | 	 * This accounts for cases where a device interrupted while the | ||
220 | 	 * RTE was being disabled and re-programmed. | ||
221 | */ | ||
222 | for (irq=0; irq < NR_IRQS; irq++) { | ||
223 | if (vectors_in_migration[irq]) { | ||
224 | vectors_in_migration[irq]=0; | ||
225 | __do_IRQ(irq, NULL); | ||
226 | } | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | 	 * Now let the processor die. We disable interrupts and use max_xtp() to | ||
231 | 	 * ensure that no more interrupts are routed to this processor. | ||
232 | 	 * The local timer interrupt can still have one pending, which is | ||
233 | 	 * taken care of in timer_interrupt(). | ||
234 | */ | ||
235 | max_xtp(); | ||
236 | local_irq_disable(); | ||
237 | } | ||
238 | #endif | ||
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c new file mode 100644 index 000000000000..5ba06ebe355b --- /dev/null +++ b/arch/ia64/kernel/irq_ia64.c | |||
@@ -0,0 +1,278 @@ | |||
1 | /* | ||
2 | * linux/arch/ia64/kernel/irq.c | ||
3 | * | ||
4 | * Copyright (C) 1998-2001 Hewlett-Packard Co | ||
5 | * Stephane Eranian <eranian@hpl.hp.com> | ||
6 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
7 | * | ||
8 | * 6/10/99: Updated to bring in sync with x86 version to facilitate | ||
9 | * support for SMP and different interrupt controllers. | ||
10 | * | ||
11 | * 09/15/00 Goutham Rao <goutham.rao@intel.com> Implemented pci_irq_to_vector | ||
12 | * PCI to vector allocation routine. | ||
13 | * 04/14/2004 Ashok Raj <ashok.raj@intel.com> | ||
14 | * Added CPU Hotplug handling for IPF. | ||
15 | */ | ||
16 | |||
17 | #include <linux/config.h> | ||
18 | #include <linux/module.h> | ||
19 | |||
20 | #include <linux/jiffies.h> | ||
21 | #include <linux/errno.h> | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/interrupt.h> | ||
24 | #include <linux/ioport.h> | ||
25 | #include <linux/kernel_stat.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/ptrace.h> | ||
28 | #include <linux/random.h> /* for rand_initialize_irq() */ | ||
29 | #include <linux/signal.h> | ||
30 | #include <linux/smp.h> | ||
31 | #include <linux/smp_lock.h> | ||
32 | #include <linux/threads.h> | ||
33 | #include <linux/bitops.h> | ||
34 | |||
35 | #include <asm/delay.h> | ||
36 | #include <asm/intrinsics.h> | ||
37 | #include <asm/io.h> | ||
38 | #include <asm/hw_irq.h> | ||
39 | #include <asm/machvec.h> | ||
40 | #include <asm/pgtable.h> | ||
41 | #include <asm/system.h> | ||
42 | |||
43 | #ifdef CONFIG_PERFMON | ||
44 | # include <asm/perfmon.h> | ||
45 | #endif | ||
46 | |||
47 | #define IRQ_DEBUG 0 | ||
48 | |||
49 | /* default base addr of IPI table */ | ||
50 | void __iomem *ipi_base_addr = ((void __iomem *) | ||
51 | (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR)); | ||
52 | |||
53 | /* | ||
54 | * Legacy IRQ to IA-64 vector translation table. | ||
55 | */ | ||
56 | __u8 isa_irq_to_vector_map[16] = { | ||
57 | /* 8259 IRQ translation, first 16 entries */ | ||
58 | 0x2f, 0x20, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, | ||
59 | 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21 | ||
60 | }; | ||
61 | EXPORT_SYMBOL(isa_irq_to_vector_map); | ||
62 | |||
63 | static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_NUM_DEVICE_VECTORS)]; | ||
64 | |||
65 | int | ||
66 | assign_irq_vector (int irq) | ||
67 | { | ||
68 | int pos, vector; | ||
69 | again: | ||
70 | pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS); | ||
71 | vector = IA64_FIRST_DEVICE_VECTOR + pos; | ||
72 | if (vector > IA64_LAST_DEVICE_VECTOR) | ||
73 | /* XXX could look for sharable vectors instead of panic'ing... */ | ||
74 | panic("assign_irq_vector: out of interrupt vectors!"); | ||
75 | if (test_and_set_bit(pos, ia64_vector_mask)) | ||
76 | goto again; | ||
77 | return vector; | ||
78 | } | ||
79 | |||
80 | void | ||
81 | free_irq_vector (int vector) | ||
82 | { | ||
83 | int pos; | ||
84 | |||
85 | if (vector < IA64_FIRST_DEVICE_VECTOR || vector > IA64_LAST_DEVICE_VECTOR) | ||
86 | return; | ||
87 | |||
88 | pos = vector - IA64_FIRST_DEVICE_VECTOR; | ||
89 | if (!test_and_clear_bit(pos, ia64_vector_mask)) | ||
90 | printk(KERN_WARNING "%s: double free!\n", __FUNCTION__); | ||
91 | } | ||
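The allocator above treats ia64_vector_mask as a bitmap indexed relative to IA64_FIRST_DEVICE_VECTOR; a minimal sketch of the two directions of that mapping (hypothetical helpers, not in the patch):

static inline int example_vector_to_pos(int vector)
{
	return vector - IA64_FIRST_DEVICE_VECTOR;	/* bit position, as in free_irq_vector() */
}

static inline int example_pos_to_vector(int pos)
{
	return IA64_FIRST_DEVICE_VECTOR + pos;	/* as in assign_irq_vector() */
}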
92 | |||
93 | #ifdef CONFIG_SMP | ||
94 | # define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE) | ||
95 | #else | ||
96 | # define IS_RESCHEDULE(vec) (0) | ||
97 | #endif | ||
98 | /* | ||
99 |  * This is where the IVT branches to when we get an external | ||
100 |  * interrupt. It then branches to the correct hardware IRQ handler via a | ||
101 |  * function ptr. | ||
102 | */ | ||
103 | void | ||
104 | ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) | ||
105 | { | ||
106 | unsigned long saved_tpr; | ||
107 | |||
108 | #if IRQ_DEBUG | ||
109 | { | ||
110 | unsigned long bsp, sp; | ||
111 | |||
112 | /* | ||
113 | * Note: if the interrupt happened while executing in | ||
114 | * the context switch routine (ia64_switch_to), we may | ||
115 | * get a spurious stack overflow here. This is | ||
116 | * because the register and the memory stack are not | ||
117 | * switched atomically. | ||
118 | */ | ||
119 | bsp = ia64_getreg(_IA64_REG_AR_BSP); | ||
120 | sp = ia64_getreg(_IA64_REG_SP); | ||
121 | |||
122 | if ((sp - bsp) < 1024) { | ||
123 | static unsigned char count; | ||
124 | static long last_time; | ||
125 | |||
126 | if (jiffies - last_time > 5*HZ) | ||
127 | count = 0; | ||
128 | if (++count < 5) { | ||
129 | last_time = jiffies; | ||
130 | printk("ia64_handle_irq: DANGER: less than " | ||
131 | "1KB of free stack space!!\n" | ||
132 | "(bsp=0x%lx, sp=%lx)\n", bsp, sp); | ||
133 | } | ||
134 | } | ||
135 | } | ||
136 | #endif /* IRQ_DEBUG */ | ||
137 | |||
138 | /* | ||
139 | * Always set TPR to limit maximum interrupt nesting depth to | ||
140 | * 16 (without this, it would be ~240, which could easily lead | ||
141 | * to kernel stack overflows). | ||
142 | */ | ||
143 | irq_enter(); | ||
144 | saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); | ||
145 | ia64_srlz_d(); | ||
146 | while (vector != IA64_SPURIOUS_INT_VECTOR) { | ||
147 | if (!IS_RESCHEDULE(vector)) { | ||
148 | ia64_setreg(_IA64_REG_CR_TPR, vector); | ||
149 | ia64_srlz_d(); | ||
150 | |||
151 | __do_IRQ(local_vector_to_irq(vector), regs); | ||
152 | |||
153 | /* | ||
154 | * Disable interrupts and send EOI: | ||
155 | */ | ||
156 | local_irq_disable(); | ||
157 | ia64_setreg(_IA64_REG_CR_TPR, saved_tpr); | ||
158 | } | ||
159 | ia64_eoi(); | ||
160 | vector = ia64_get_ivr(); | ||
161 | } | ||
162 | /* | ||
163 | * This must be done *after* the ia64_eoi(). For example, the keyboard softirq | ||
164 | * handler needs to be able to wait for further keyboard interrupts, which can't | ||
165 | * come through until ia64_eoi() has been done. | ||
166 | */ | ||
167 | irq_exit(); | ||
168 | } | ||
169 | |||
170 | #ifdef CONFIG_HOTPLUG_CPU | ||
171 | /* | ||
172 |  * This function emulates interrupt processing when a cpu is about to be | ||
173 | * brought down. | ||
174 | */ | ||
175 | void ia64_process_pending_intr(void) | ||
176 | { | ||
177 | ia64_vector vector; | ||
178 | unsigned long saved_tpr; | ||
179 | extern unsigned int vectors_in_migration[NR_IRQS]; | ||
180 | |||
181 | vector = ia64_get_ivr(); | ||
182 | |||
183 | irq_enter(); | ||
184 | saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); | ||
185 | ia64_srlz_d(); | ||
186 | |||
187 | /* | ||
188 | * Perform normal interrupt style processing | ||
189 | */ | ||
190 | while (vector != IA64_SPURIOUS_INT_VECTOR) { | ||
191 | if (!IS_RESCHEDULE(vector)) { | ||
192 | ia64_setreg(_IA64_REG_CR_TPR, vector); | ||
193 | ia64_srlz_d(); | ||
194 | |||
195 | /* | ||
196 | 			 * Now do what ia64_handle_irq() would have done had it been | ||
197 | 			 * called from a real interrupt handler: pass NULL for pt_regs | ||
198 | 			 * and hope that it works. | ||
199 | 			 * This could probably share code with ia64_handle_irq(). | ||
200 | */ | ||
201 | vectors_in_migration[local_vector_to_irq(vector)]=0; | ||
202 | __do_IRQ(local_vector_to_irq(vector), NULL); | ||
203 | |||
204 | /* | ||
205 | * Disable interrupts and send EOI | ||
206 | */ | ||
207 | local_irq_disable(); | ||
208 | ia64_setreg(_IA64_REG_CR_TPR, saved_tpr); | ||
209 | } | ||
210 | ia64_eoi(); | ||
211 | vector = ia64_get_ivr(); | ||
212 | } | ||
213 | irq_exit(); | ||
214 | } | ||
215 | #endif | ||
216 | |||
217 | |||
218 | #ifdef CONFIG_SMP | ||
219 | extern irqreturn_t handle_IPI (int irq, void *dev_id, struct pt_regs *regs); | ||
220 | |||
221 | static struct irqaction ipi_irqaction = { | ||
222 | .handler = handle_IPI, | ||
223 | .flags = SA_INTERRUPT, | ||
224 | .name = "IPI" | ||
225 | }; | ||
226 | #endif | ||
227 | |||
228 | void | ||
229 | register_percpu_irq (ia64_vector vec, struct irqaction *action) | ||
230 | { | ||
231 | irq_desc_t *desc; | ||
232 | unsigned int irq; | ||
233 | |||
234 | for (irq = 0; irq < NR_IRQS; ++irq) | ||
235 | if (irq_to_vector(irq) == vec) { | ||
236 | desc = irq_descp(irq); | ||
237 | desc->status |= IRQ_PER_CPU; | ||
238 | desc->handler = &irq_type_ia64_lsapic; | ||
239 | if (action) | ||
240 | setup_irq(irq, action); | ||
241 | } | ||
242 | } | ||
243 | |||
244 | void __init | ||
245 | init_IRQ (void) | ||
246 | { | ||
247 | register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL); | ||
248 | #ifdef CONFIG_SMP | ||
249 | register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction); | ||
250 | #endif | ||
251 | #ifdef CONFIG_PERFMON | ||
252 | pfm_init_percpu(); | ||
253 | #endif | ||
254 | platform_irq_init(); | ||
255 | } | ||
256 | |||
257 | void | ||
258 | ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect) | ||
259 | { | ||
260 | void __iomem *ipi_addr; | ||
261 | unsigned long ipi_data; | ||
262 | unsigned long phys_cpu_id; | ||
263 | |||
264 | #ifdef CONFIG_SMP | ||
265 | phys_cpu_id = cpu_physical_id(cpu); | ||
266 | #else | ||
267 | phys_cpu_id = (ia64_getreg(_IA64_REG_CR_LID) >> 16) & 0xffff; | ||
268 | #endif | ||
269 | |||
270 | /* | ||
271 | 	 * The cpu number is encoded as an 8-bit ID and an 8-bit EID. | ||
272 | */ | ||
273 | |||
274 | ipi_data = (delivery_mode << 8) | (vector & 0xff); | ||
275 | ipi_addr = ipi_base_addr + ((phys_cpu_id << 4) | ((redirect & 1) << 3)); | ||
276 | |||
277 | writeq(ipi_data, ipi_addr); | ||
278 | } | ||
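For illustration, the address arithmetic above factored into a hypothetical helper: each processor owns a 16-byte slot in the IPI block, and bit 3 of the offset selects the redirection variant:

static inline unsigned long example_ipi_offset(unsigned long phys_cpu_id, int redirect)
{
	/* 16 bytes per processor slot; bit 3 selects redirection */
	return (phys_cpu_id << 4) | ((redirect & 1) << 3);
}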
diff --git a/arch/ia64/kernel/irq_lsapic.c b/arch/ia64/kernel/irq_lsapic.c new file mode 100644 index 000000000000..ea14e6a04409 --- /dev/null +++ b/arch/ia64/kernel/irq_lsapic.c | |||
@@ -0,0 +1,37 @@ | |||
1 | /* | ||
2 | * LSAPIC Interrupt Controller | ||
3 | * | ||
4 | * This takes care of interrupts that are generated by the CPU's | ||
5 | * internal Streamlined Advanced Programmable Interrupt Controller | ||
6 | * (LSAPIC), such as the ITC and IPI interrupts. | ||
7 | * | ||
8 | * Copyright (C) 1999 VA Linux Systems | ||
9 | * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> | ||
10 | * Copyright (C) 2000 Hewlett-Packard Co | ||
11 | * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> | ||
12 | */ | ||
13 | |||
14 | #include <linux/sched.h> | ||
15 | #include <linux/irq.h> | ||
16 | |||
17 | static unsigned int | ||
18 | lsapic_noop_startup (unsigned int irq) | ||
19 | { | ||
20 | return 0; | ||
21 | } | ||
22 | |||
23 | static void | ||
24 | lsapic_noop (unsigned int irq) | ||
25 | { | ||
26 | 	/* nothing to do... */ | ||
27 | } | ||
28 | |||
29 | struct hw_interrupt_type irq_type_ia64_lsapic = { | ||
30 | .typename = "LSAPIC", | ||
31 | .startup = lsapic_noop_startup, | ||
32 | .shutdown = lsapic_noop, | ||
33 | .enable = lsapic_noop, | ||
34 | .disable = lsapic_noop, | ||
35 | .ack = lsapic_noop, | ||
36 | .end = lsapic_noop | ||
37 | }; | ||
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S new file mode 100644 index 000000000000..d9c05d53435b --- /dev/null +++ b/arch/ia64/kernel/ivt.S | |||
@@ -0,0 +1,1619 @@ | |||
1 | /* | ||
2 | * arch/ia64/kernel/ivt.S | ||
3 | * | ||
4 | * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co | ||
5 | * Stephane Eranian <eranian@hpl.hp.com> | ||
6 | * David Mosberger <davidm@hpl.hp.com> | ||
7 | * Copyright (C) 2000, 2002-2003 Intel Co | ||
8 | * Asit Mallick <asit.k.mallick@intel.com> | ||
9 | * Suresh Siddha <suresh.b.siddha@intel.com> | ||
10 | * Kenneth Chen <kenneth.w.chen@intel.com> | ||
11 | * Fenghua Yu <fenghua.yu@intel.com> | ||
12 | * | ||
13 | * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP | ||
14 | * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT. | ||
15 | */ | ||
16 | /* | ||
17 | * This file defines the interruption vector table used by the CPU. | ||
18 | * It does not include one entry per possible cause of interruption. | ||
19 | * | ||
20 | * The first 20 entries of the table contain 64 bundles each while the | ||
21 | * remaining 48 entries contain only 16 bundles each. | ||
22 | * | ||
23 | * The 64 bundles are used to allow inlining the whole handler for critical | ||
24 | * interruptions like TLB misses. | ||
25 | * | ||
26 | * For each entry, the comment is as follows: | ||
27 | * | ||
28 | * // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51) | ||
29 | * entry offset ----/ / / / / | ||
30 | * entry number ---------/ / / / | ||
31 | * size of the entry -------------/ / / | ||
32 | * vector name -------------------------------------/ / | ||
33 | * interruptions triggering this vector ----------------------/ | ||
34 | * | ||
35 |  * The table is 32KB in size and must be aligned on a 32KB boundary. | ||
36 | * (The CPU ignores the 15 lower bits of the address) | ||
37 | * | ||
38 | * Table is based upon EAS2.6 (Oct 1999) | ||
39 | */ | ||
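The entry offsets follow from the bundle counts given above, since an IA-64 bundle is 16 bytes; a sketch of the arithmetic (the macro names are invented):

#define EX_BUNDLE_BYTES	16
#define EX_BIG_ENTRY	(64 * EX_BUNDLE_BYTES)	/* 0x400 bytes, entries 0-19  */
#define EX_SMALL_ENTRY	(16 * EX_BUNDLE_BYTES)	/* 0x100 bytes, entries 20-67 */
/* Entry 7 thus starts at 7 * EX_BIG_ENTRY = 0x1c00 (matching the sample
 * comment above) and the table totals
 * 20 * EX_BIG_ENTRY + 48 * EX_SMALL_ENTRY = 0x8000 bytes = 32KB. */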
40 | |||
41 | #include <linux/config.h> | ||
42 | |||
43 | #include <asm/asmmacro.h> | ||
44 | #include <asm/break.h> | ||
45 | #include <asm/ia32.h> | ||
46 | #include <asm/kregs.h> | ||
47 | #include <asm/offsets.h> | ||
48 | #include <asm/pgtable.h> | ||
49 | #include <asm/processor.h> | ||
50 | #include <asm/ptrace.h> | ||
51 | #include <asm/system.h> | ||
52 | #include <asm/thread_info.h> | ||
53 | #include <asm/unistd.h> | ||
54 | #include <asm/errno.h> | ||
55 | |||
56 | #if 1 | ||
57 | # define PSR_DEFAULT_BITS psr.ac | ||
58 | #else | ||
59 | # define PSR_DEFAULT_BITS 0 | ||
60 | #endif | ||
61 | |||
62 | #if 0 | ||
63 | /* | ||
64 | * This lets you track the last eight faults that occurred on the CPU. Make sure ar.k2 isn't | ||
65 | * needed for something else before enabling this... | ||
66 | */ | ||
67 | # define DBG_FAULT(i) mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16 | ||
68 | #else | ||
69 | # define DBG_FAULT(i) | ||
70 | #endif | ||
71 | |||
72 | #define MINSTATE_VIRT /* needed by minstate.h */ | ||
73 | #include "minstate.h" | ||
74 | |||
75 | #define FAULT(n) \ | ||
76 | mov r31=pr; \ | ||
77 | mov r19=n;; /* prepare to save predicates */ \ | ||
78 | br.sptk.many dispatch_to_fault_handler | ||
79 | |||
80 | .section .text.ivt,"ax" | ||
81 | |||
82 | .align 32768 // align on 32KB boundary | ||
83 | .global ia64_ivt | ||
84 | ia64_ivt: | ||
85 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
86 | // 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47) | ||
87 | ENTRY(vhpt_miss) | ||
88 | DBG_FAULT(0) | ||
89 | /* | ||
90 | * The VHPT vector is invoked when the TLB entry for the virtual page table | ||
91 | * is missing. This happens only as a result of a previous | ||
92 | * (the "original") TLB miss, which may either be caused by an instruction | ||
93 | * fetch or a data access (or non-access). | ||
94 | * | ||
95 |  * What we do here is normal TLB miss handling for the _original_ miss, followed | ||
96 | * by inserting the TLB entry for the virtual page table page that the VHPT | ||
97 | * walker was attempting to access. The latter gets inserted as long | ||
98 | * as both L1 and L2 have valid mappings for the faulting address. | ||
99 | * The TLB entry for the original miss gets inserted only if | ||
100 | * the L3 entry indicates that the page is present. | ||
101 | * | ||
102 | * do_page_fault gets invoked in the following cases: | ||
103 | * - the faulting virtual address uses unimplemented address bits | ||
104 | * - the faulting virtual address has no L1, L2, or L3 mapping | ||
105 | */ | ||
106 | mov r16=cr.ifa // get address that caused the TLB miss | ||
107 | #ifdef CONFIG_HUGETLB_PAGE | ||
108 | movl r18=PAGE_SHIFT | ||
109 | mov r25=cr.itir | ||
110 | #endif | ||
111 | ;; | ||
112 | rsm psr.dt // use physical addressing for data | ||
113 | mov r31=pr // save the predicate registers | ||
114 | mov r19=IA64_KR(PT_BASE) // get page table base address | ||
115 | shl r21=r16,3 // shift bit 60 into sign bit | ||
116 | shr.u r17=r16,61 // get the region number into r17 | ||
117 | ;; | ||
118 | shr r22=r21,3 | ||
119 | #ifdef CONFIG_HUGETLB_PAGE | ||
120 | extr.u r26=r25,2,6 | ||
121 | ;; | ||
122 | cmp.ne p8,p0=r18,r26 | ||
123 | sub r27=r26,r18 | ||
124 | ;; | ||
125 | (p8) dep r25=r18,r25,2,6 | ||
126 | (p8) shr r22=r22,r27 | ||
127 | #endif | ||
128 | ;; | ||
129 | 	cmp.eq p6,p7=5,r17 // is IFA pointing into region 5? | ||
130 | shr.u r18=r22,PGDIR_SHIFT // get bits 33-63 of the faulting address | ||
131 | ;; | ||
132 | (p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place | ||
133 | |||
134 | srlz.d | ||
135 | LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir | ||
136 | |||
137 | .pred.rel "mutex", p6, p7 | ||
138 | (p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT | ||
139 | (p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 | ||
140 | ;; | ||
141 | (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 | ||
142 | (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) | ||
143 | cmp.eq p7,p6=0,r21 // unused address bits all zeroes? | ||
144 | shr.u r18=r22,PMD_SHIFT // shift L2 index into position | ||
145 | ;; | ||
146 | ld8 r17=[r17] // fetch the L1 entry (may be 0) | ||
147 | ;; | ||
148 | (p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? | ||
149 | dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry | ||
150 | ;; | ||
151 | (p7) ld8 r20=[r17] // fetch the L2 entry (may be 0) | ||
152 | shr.u r19=r22,PAGE_SHIFT // shift L3 index into position | ||
153 | ;; | ||
154 | (p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L2 entry NULL? | ||
155 | dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L3 page table entry | ||
156 | ;; | ||
157 | (p7) ld8 r18=[r21] // read the L3 PTE | ||
158 | mov r19=cr.isr // cr.isr bit 0 tells us if this is an insn miss | ||
159 | ;; | ||
160 | (p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared? | ||
161 | mov r22=cr.iha // get the VHPT address that caused the TLB miss | ||
162 | ;; // avoid RAW on p7 | ||
163 | (p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss? | ||
164 | dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address | ||
165 | ;; | ||
166 | (p10) itc.i r18 // insert the instruction TLB entry | ||
167 | (p11) itc.d r18 // insert the data TLB entry | ||
168 | (p6) br.cond.spnt.many page_fault // handle bad address/page not present (page fault) | ||
169 | mov cr.ifa=r22 | ||
170 | |||
171 | #ifdef CONFIG_HUGETLB_PAGE | ||
172 | (p8) mov cr.itir=r25 // change to default page-size for VHPT | ||
173 | #endif | ||
174 | |||
175 | /* | ||
176 | * Now compute and insert the TLB entry for the virtual page table. We never | ||
177 | * execute in a page table page so there is no need to set the exception deferral | ||
178 | * bit. | ||
179 | */ | ||
180 | adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23 | ||
181 | ;; | ||
182 | (p7) itc.d r24 | ||
183 | ;; | ||
184 | #ifdef CONFIG_SMP | ||
185 | /* | ||
186 | 	 * Tell the assembler's dependency-violation checker that the above "itc" instructions | ||
187 | * cannot possibly affect the following loads: | ||
188 | */ | ||
189 | dv_serialize_data | ||
190 | |||
191 | /* | ||
192 | * Re-check L2 and L3 pagetable. If they changed, we may have received a ptc.g | ||
193 | * between reading the pagetable and the "itc". If so, flush the entry we | ||
194 | * inserted and retry. | ||
195 | */ | ||
196 | ld8 r25=[r21] // read L3 PTE again | ||
197 | ld8 r26=[r17] // read L2 entry again | ||
198 | ;; | ||
199 | cmp.ne p6,p7=r26,r20 // did L2 entry change | ||
200 | mov r27=PAGE_SHIFT<<2 | ||
201 | ;; | ||
202 | (p6) ptc.l r22,r27 // purge PTE page translation | ||
203 | (p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L3 PTE change | ||
204 | ;; | ||
205 | (p6) ptc.l r16,r27 // purge translation | ||
206 | #endif | ||
207 | |||
208 | mov pr=r31,-1 // restore predicate registers | ||
209 | rfi | ||
210 | END(vhpt_miss) | ||
211 | |||
212 | .org ia64_ivt+0x400 | ||
213 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
214 | // 0x0400 Entry 1 (size 64 bundles) ITLB (21) | ||
215 | ENTRY(itlb_miss) | ||
216 | DBG_FAULT(1) | ||
217 | /* | ||
218 | * The ITLB handler accesses the L3 PTE via the virtually mapped linear | ||
219 | * page table. If a nested TLB miss occurs, we switch into physical | ||
220 | * mode, walk the page table, and then re-execute the L3 PTE read | ||
221 | * and go on normally after that. | ||
222 | */ | ||
223 | mov r16=cr.ifa // get virtual address | ||
224 | mov r29=b0 // save b0 | ||
225 | mov r31=pr // save predicates | ||
226 | .itlb_fault: | ||
227 | mov r17=cr.iha // get virtual address of L3 PTE | ||
228 | movl r30=1f // load nested fault continuation point | ||
229 | ;; | ||
230 | 1: ld8 r18=[r17] // read L3 PTE | ||
231 | ;; | ||
232 | mov b0=r29 | ||
233 | tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? | ||
234 | (p6) br.cond.spnt page_fault | ||
235 | ;; | ||
236 | itc.i r18 | ||
237 | ;; | ||
238 | #ifdef CONFIG_SMP | ||
239 | /* | ||
240 | 	 * Tell the assembler's dependency-violation checker that the above "itc" instructions | ||
241 | * cannot possibly affect the following loads: | ||
242 | */ | ||
243 | dv_serialize_data | ||
244 | |||
245 | ld8 r19=[r17] // read L3 PTE again and see if same | ||
246 | mov r20=PAGE_SHIFT<<2 // setup page size for purge | ||
247 | ;; | ||
248 | cmp.ne p7,p0=r18,r19 | ||
249 | ;; | ||
250 | (p7) ptc.l r16,r20 | ||
251 | #endif | ||
252 | mov pr=r31,-1 | ||
253 | rfi | ||
254 | END(itlb_miss) | ||
255 | |||
256 | .org ia64_ivt+0x0800 | ||
257 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
258 | // 0x0800 Entry 2 (size 64 bundles) DTLB (9,48) | ||
259 | ENTRY(dtlb_miss) | ||
260 | DBG_FAULT(2) | ||
261 | /* | ||
262 | * The DTLB handler accesses the L3 PTE via the virtually mapped linear | ||
263 | * page table. If a nested TLB miss occurs, we switch into physical | ||
264 | * mode, walk the page table, and then re-execute the L3 PTE read | ||
265 | * and go on normally after that. | ||
266 | */ | ||
267 | mov r16=cr.ifa // get virtual address | ||
268 | mov r29=b0 // save b0 | ||
269 | mov r31=pr // save predicates | ||
270 | dtlb_fault: | ||
271 | mov r17=cr.iha // get virtual address of L3 PTE | ||
272 | movl r30=1f // load nested fault continuation point | ||
273 | ;; | ||
274 | 1: ld8 r18=[r17] // read L3 PTE | ||
275 | ;; | ||
276 | mov b0=r29 | ||
277 | tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? | ||
278 | (p6) br.cond.spnt page_fault | ||
279 | ;; | ||
280 | itc.d r18 | ||
281 | ;; | ||
282 | #ifdef CONFIG_SMP | ||
283 | /* | ||
284 | 	 * Tell the assembler's dependency-violation checker that the above "itc" instructions | ||
285 | * cannot possibly affect the following loads: | ||
286 | */ | ||
287 | dv_serialize_data | ||
288 | |||
289 | ld8 r19=[r17] // read L3 PTE again and see if same | ||
290 | mov r20=PAGE_SHIFT<<2 // setup page size for purge | ||
291 | ;; | ||
292 | cmp.ne p7,p0=r18,r19 | ||
293 | ;; | ||
294 | (p7) ptc.l r16,r20 | ||
295 | #endif | ||
296 | mov pr=r31,-1 | ||
297 | rfi | ||
298 | END(dtlb_miss) | ||
299 | |||
300 | .org ia64_ivt+0x0c00 | ||
301 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
302 | // 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19) | ||
303 | ENTRY(alt_itlb_miss) | ||
304 | DBG_FAULT(3) | ||
305 | mov r16=cr.ifa // get address that caused the TLB miss | ||
306 | movl r17=PAGE_KERNEL | ||
307 | mov r21=cr.ipsr | ||
308 | movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) | ||
309 | mov r31=pr | ||
310 | ;; | ||
311 | #ifdef CONFIG_DISABLE_VHPT | ||
312 | 	shr.u r22=r16,61 // get the region number into r22 | ||
313 | ;; | ||
314 | cmp.gt p8,p0=6,r22 // user mode | ||
315 | ;; | ||
316 | (p8) thash r17=r16 | ||
317 | ;; | ||
318 | (p8) mov cr.iha=r17 | ||
319 | (p8) mov r29=b0 // save b0 | ||
320 | (p8) br.cond.dptk .itlb_fault | ||
321 | #endif | ||
322 | extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl | ||
323 | and r19=r19,r16 // clear ed, reserved bits, and PTE control bits | ||
324 | shr.u r18=r16,57 // move address bit 61 to bit 4 | ||
325 | ;; | ||
326 | andcm r18=0x10,r18 // bit 4=~address-bit(61) | ||
327 | cmp.ne p8,p0=r0,r23 // psr.cpl != 0? | ||
328 | or r19=r17,r19 // insert PTE control bits into r19 | ||
329 | ;; | ||
330 | or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 | ||
331 | (p8) br.cond.spnt page_fault | ||
332 | ;; | ||
333 | itc.i r19 // insert the TLB entry | ||
334 | mov pr=r31,-1 | ||
335 | rfi | ||
336 | END(alt_itlb_miss) | ||
337 | |||
338 | .org ia64_ivt+0x1000 | ||
339 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
340 | // 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46) | ||
341 | ENTRY(alt_dtlb_miss) | ||
342 | DBG_FAULT(4) | ||
343 | mov r16=cr.ifa // get address that caused the TLB miss | ||
344 | movl r17=PAGE_KERNEL | ||
345 | mov r20=cr.isr | ||
346 | movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) | ||
347 | mov r21=cr.ipsr | ||
348 | mov r31=pr | ||
349 | ;; | ||
350 | #ifdef CONFIG_DISABLE_VHPT | ||
351 | 	shr.u r22=r16,61 // get the region number into r22 | ||
352 | ;; | ||
353 | cmp.gt p8,p0=6,r22 // access to region 0-5 | ||
354 | ;; | ||
355 | (p8) thash r17=r16 | ||
356 | ;; | ||
357 | (p8) mov cr.iha=r17 | ||
358 | (p8) mov r29=b0 // save b0 | ||
359 | (p8) br.cond.dptk dtlb_fault | ||
360 | #endif | ||
361 | extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl | ||
362 | and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field | ||
363 | tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on? | ||
364 | shr.u r18=r16,57 // move address bit 61 to bit 4 | ||
365 | and r19=r19,r16 // clear ed, reserved bits, and PTE control bits | ||
366 | tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on? | ||
367 | ;; | ||
368 | andcm r18=0x10,r18 // bit 4=~address-bit(61) | ||
369 | cmp.ne p8,p0=r0,r23 | ||
370 | (p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field | ||
371 | (p8) br.cond.spnt page_fault | ||
372 | |||
373 | dep r21=-1,r21,IA64_PSR_ED_BIT,1 | ||
374 | or r19=r19,r17 // insert PTE control bits into r19 | ||
375 | ;; | ||
376 | or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 | ||
377 | (p6) mov cr.ipsr=r21 | ||
378 | ;; | ||
379 | (p7) itc.d r19 // insert the TLB entry | ||
380 | mov pr=r31,-1 | ||
381 | rfi | ||
382 | END(alt_dtlb_miss) | ||
383 | |||
384 | .org ia64_ivt+0x1400 | ||
385 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
386 | // 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45) | ||
387 | ENTRY(nested_dtlb_miss) | ||
388 | /* | ||
389 | * In the absence of kernel bugs, we get here when the virtually mapped linear | ||
390 | * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction | ||
391 | * Access-bit, or Data Access-bit faults). If the DTLB entry for the virtual page | ||
392 | * table is missing, a nested TLB miss fault is triggered and control is | ||
393 |  * transferred to this point. When this happens, we look up the pte for the | ||
394 | * faulting address by walking the page table in physical mode and return to the | ||
395 | * continuation point passed in register r30 (or call page_fault if the address is | ||
396 | * not mapped). | ||
397 | * | ||
398 | * Input: r16: faulting address | ||
399 | * r29: saved b0 | ||
400 | * r30: continuation address | ||
401 | * r31: saved pr | ||
402 | * | ||
403 | * Output: r17: physical address of L3 PTE of faulting address | ||
404 | * r29: saved b0 | ||
405 | * r30: continuation address | ||
406 | * r31: saved pr | ||
407 | * | ||
408 | * Clobbered: b0, r18, r19, r21, psr.dt (cleared) | ||
409 | */ | ||
410 | rsm psr.dt // switch to using physical data addressing | ||
411 | mov r19=IA64_KR(PT_BASE) // get the page table base address | ||
412 | shl r21=r16,3 // shift bit 60 into sign bit | ||
413 | ;; | ||
414 | shr.u r17=r16,61 // get the region number into r17 | ||
415 | ;; | ||
416 | cmp.eq p6,p7=5,r17 // is faulting address in region 5? | ||
417 | shr.u r18=r16,PGDIR_SHIFT // get bits 33-63 of faulting address | ||
418 | ;; | ||
419 | (p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place | ||
420 | |||
421 | srlz.d | ||
422 | LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir | ||
423 | |||
424 | .pred.rel "mutex", p6, p7 | ||
425 | (p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT | ||
426 | (p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 | ||
427 | ;; | ||
428 | (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 | ||
429 | (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) | ||
430 | cmp.eq p7,p6=0,r21 // unused address bits all zeroes? | ||
431 | shr.u r18=r16,PMD_SHIFT // shift L2 index into position | ||
432 | ;; | ||
433 | ld8 r17=[r17] // fetch the L1 entry (may be 0) | ||
434 | ;; | ||
435 | (p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? | ||
436 | dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry | ||
437 | ;; | ||
438 | (p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) | ||
439 | shr.u r19=r16,PAGE_SHIFT // shift L3 index into position | ||
440 | ;; | ||
441 | (p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? | ||
442 | dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry | ||
443 | (p6) br.cond.spnt page_fault | ||
444 | mov b0=r30 | ||
445 | br.sptk.many b0 // return to continuation point | ||
446 | END(nested_dtlb_miss) | ||
447 | |||
448 | .org ia64_ivt+0x1800 | ||
449 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
450 | // 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24) | ||
451 | ENTRY(ikey_miss) | ||
452 | DBG_FAULT(6) | ||
453 | FAULT(6) | ||
454 | END(ikey_miss) | ||
455 | |||
456 | //----------------------------------------------------------------------------------- | ||
457 | // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address) | ||
458 | ENTRY(page_fault) | ||
459 | ssm psr.dt | ||
460 | ;; | ||
461 | srlz.i | ||
462 | ;; | ||
463 | SAVE_MIN_WITH_COVER | ||
464 | alloc r15=ar.pfs,0,0,3,0 | ||
465 | mov out0=cr.ifa | ||
466 | mov out1=cr.isr | ||
467 | adds r3=8,r2 // set up second base pointer | ||
468 | ;; | ||
469 | ssm psr.ic | PSR_DEFAULT_BITS | ||
470 | ;; | ||
471 | 	srlz.i // guarantee that interruption collection is on | ||
472 | ;; | ||
473 | (p15) ssm psr.i // restore psr.i | ||
474 | movl r14=ia64_leave_kernel | ||
475 | ;; | ||
476 | SAVE_REST | ||
477 | mov rp=r14 | ||
478 | ;; | ||
479 | adds out2=16,r12 // out2 = pointer to pt_regs | ||
480 | br.call.sptk.many b6=ia64_do_page_fault // ignore return address | ||
481 | END(page_fault) | ||
482 | |||
483 | .org ia64_ivt+0x1c00 | ||
484 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
485 | // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51) | ||
486 | ENTRY(dkey_miss) | ||
487 | DBG_FAULT(7) | ||
488 | FAULT(7) | ||
489 | END(dkey_miss) | ||
490 | |||
491 | .org ia64_ivt+0x2000 | ||
492 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
493 | // 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54) | ||
494 | ENTRY(dirty_bit) | ||
495 | DBG_FAULT(8) | ||
496 | /* | ||
497 | * What we do here is to simply turn on the dirty bit in the PTE. We need to | ||
498 | * update both the page-table and the TLB entry. To efficiently access the PTE, | ||
499 | * we address it through the virtual page table. Most likely, the TLB entry for | ||
500 | * the relevant virtual page table page is still present in the TLB so we can | ||
501 | * normally do this without additional TLB misses. In case the necessary virtual | ||
502 | * page table TLB entry isn't present, we take a nested TLB miss hit where we look | ||
503 | * up the physical address of the L3 PTE and then continue at label 1 below. | ||
504 | */ | ||
505 | mov r16=cr.ifa // get the address that caused the fault | ||
506 | movl r30=1f // load continuation point in case of nested fault | ||
507 | ;; | ||
508 | thash r17=r16 // compute virtual address of L3 PTE | ||
509 | mov r29=b0 // save b0 in case of nested fault | ||
510 | mov r31=pr // save pr | ||
511 | #ifdef CONFIG_SMP | ||
512 | mov r28=ar.ccv // save ar.ccv | ||
513 | ;; | ||
514 | 1: ld8 r18=[r17] | ||
515 | ;; // avoid RAW on r18 | ||
516 | mov ar.ccv=r18 // set compare value for cmpxchg | ||
517 | or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits | ||
518 | ;; | ||
519 | cmpxchg8.acq r26=[r17],r25,ar.ccv | ||
520 | mov r24=PAGE_SHIFT<<2 | ||
521 | ;; | ||
522 | cmp.eq p6,p7=r26,r18 | ||
523 | ;; | ||
524 | (p6) itc.d r25 // install updated PTE | ||
525 | ;; | ||
526 | /* | ||
527 | 	 * Tell the assembler's dependency-violation checker that the above "itc" instructions | ||
528 | * cannot possibly affect the following loads: | ||
529 | */ | ||
530 | dv_serialize_data | ||
531 | |||
532 | ld8 r18=[r17] // read PTE again | ||
533 | ;; | ||
534 | cmp.eq p6,p7=r18,r25 // is it same as the newly installed | ||
535 | ;; | ||
536 | (p7) ptc.l r16,r24 | ||
537 | mov b0=r29 // restore b0 | ||
538 | mov ar.ccv=r28 | ||
539 | #else | ||
540 | ;; | ||
541 | 1: ld8 r18=[r17] | ||
542 | ;; // avoid RAW on r18 | ||
543 | or r18=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits | ||
544 | mov b0=r29 // restore b0 | ||
545 | ;; | ||
546 | st8 [r17]=r18 // store back updated PTE | ||
547 | itc.d r18 // install updated PTE | ||
548 | #endif | ||
549 | mov pr=r31,-1 // restore pr | ||
550 | rfi | ||
551 | END(dirty_bit) | ||
552 | |||
553 | .org ia64_ivt+0x2400 | ||
554 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
555 | // 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27) | ||
556 | ENTRY(iaccess_bit) | ||
557 | DBG_FAULT(9) | ||
558 | // Like Entry 8, except for instruction access | ||
559 | mov r16=cr.ifa // get the address that caused the fault | ||
560 | movl r30=1f // load continuation point in case of nested fault | ||
561 | mov r31=pr // save predicates | ||
562 | #ifdef CONFIG_ITANIUM | ||
563 | /* | ||
564 | * Erratum 10 (IFA may contain incorrect address) has "NoFix" status. | ||
565 | */ | ||
566 | mov r17=cr.ipsr | ||
567 | ;; | ||
568 | mov r18=cr.iip | ||
569 | tbit.z p6,p0=r17,IA64_PSR_IS_BIT // IA64 instruction set? | ||
570 | ;; | ||
571 | (p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa | ||
572 | #endif /* CONFIG_ITANIUM */ | ||
573 | ;; | ||
574 | thash r17=r16 // compute virtual address of L3 PTE | ||
575 | 	mov r29=b0 // save b0 in case of nested fault | ||
576 | #ifdef CONFIG_SMP | ||
577 | mov r28=ar.ccv // save ar.ccv | ||
578 | ;; | ||
579 | 1: ld8 r18=[r17] | ||
580 | ;; | ||
581 | mov ar.ccv=r18 // set compare value for cmpxchg | ||
582 | or r25=_PAGE_A,r18 // set the accessed bit | ||
583 | ;; | ||
584 | cmpxchg8.acq r26=[r17],r25,ar.ccv | ||
585 | mov r24=PAGE_SHIFT<<2 | ||
586 | ;; | ||
587 | cmp.eq p6,p7=r26,r18 | ||
588 | ;; | ||
589 | (p6) itc.i r25 // install updated PTE | ||
590 | ;; | ||
591 | /* | ||
592 | * Tell the assembler's dependency-violation checker that the above "itc" instruction | ||
593 | * cannot possibly affect the following loads: | ||
594 | */ | ||
595 | dv_serialize_data | ||
596 | |||
597 | ld8 r18=[r17] // read PTE again | ||
598 | ;; | ||
599 | cmp.eq p6,p7=r18,r25 // is it the same as the newly installed PTE? | ||
600 | ;; | ||
601 | (p7) ptc.l r16,r24 | ||
602 | mov b0=r29 // restore b0 | ||
603 | mov ar.ccv=r28 | ||
604 | #else /* !CONFIG_SMP */ | ||
605 | ;; | ||
606 | 1: ld8 r18=[r17] | ||
607 | ;; | ||
608 | or r18=_PAGE_A,r18 // set the accessed bit | ||
609 | mov b0=r29 // restore b0 | ||
610 | ;; | ||
611 | st8 [r17]=r18 // store back updated PTE | ||
612 | itc.i r18 // install updated PTE | ||
613 | #endif /* !CONFIG_SMP */ | ||
614 | mov pr=r31,-1 | ||
615 | rfi | ||
616 | END(iaccess_bit) | ||
617 | |||
618 | .org ia64_ivt+0x2800 | ||
619 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
620 | // 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55) | ||
621 | ENTRY(daccess_bit) | ||
622 | DBG_FAULT(10) | ||
623 | // Like Entry 8, except for data access | ||
624 | mov r16=cr.ifa // get the address that caused the fault | ||
625 | movl r30=1f // load continuation point in case of nested fault | ||
626 | ;; | ||
627 | thash r17=r16 // compute virtual address of L3 PTE | ||
628 | mov r31=pr | ||
629 | mov r29=b0 // save b0 in case of nested fault | ||
630 | #ifdef CONFIG_SMP | ||
631 | mov r28=ar.ccv // save ar.ccv | ||
632 | ;; | ||
633 | 1: ld8 r18=[r17] | ||
634 | ;; // avoid RAW on r18 | ||
635 | mov ar.ccv=r18 // set compare value for cmpxchg | ||
636 | or r25=_PAGE_A,r18 // set the accessed bit | ||
637 | ;; | ||
638 | cmpxchg8.acq r26=[r17],r25,ar.ccv | ||
639 | mov r24=PAGE_SHIFT<<2 | ||
640 | ;; | ||
641 | cmp.eq p6,p7=r26,r18 | ||
642 | ;; | ||
643 | (p6) itc.d r25 // install updated PTE | ||
644 | /* | ||
645 | * Tell the assembler's dependency-violation checker that the above "itc" instruction | ||
646 | * cannot possibly affect the following loads: | ||
647 | */ | ||
648 | dv_serialize_data | ||
649 | ;; | ||
650 | ld8 r18=[r17] // read PTE again | ||
651 | ;; | ||
652 | cmp.eq p6,p7=r18,r25 // is it the same as the newly installed PTE? | ||
653 | ;; | ||
654 | (p7) ptc.l r16,r24 | ||
655 | mov ar.ccv=r28 | ||
656 | #else | ||
657 | ;; | ||
658 | 1: ld8 r18=[r17] | ||
659 | ;; // avoid RAW on r18 | ||
660 | or r18=_PAGE_A,r18 // set the accessed bit | ||
661 | ;; | ||
662 | st8 [r17]=r18 // store back updated PTE | ||
663 | itc.d r18 // install updated PTE | ||
664 | #endif | ||
665 | mov b0=r29 // restore b0 | ||
666 | mov pr=r31,-1 | ||
667 | rfi | ||
668 | END(daccess_bit) | ||
669 | |||
670 | .org ia64_ivt+0x2c00 | ||
671 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
672 | // 0x2c00 Entry 11 (size 64 bundles) Break instruction (33) | ||
673 | ENTRY(break_fault) | ||
674 | /* | ||
675 | * The streamlined system call entry/exit paths only save/restore the initial part | ||
676 | * of pt_regs. This implies that the callers of system-calls must adhere to the | ||
677 | * normal procedure calling conventions. | ||
678 | * | ||
679 | * Registers to be saved & restored: | ||
680 | * CR registers: cr.ipsr, cr.iip, cr.ifs | ||
681 | * AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr | ||
682 | * others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15 | ||
683 | * Registers to be restored only: | ||
684 | * r8-r11: output value from the system call. | ||
685 | * | ||
686 | * During system call exit, scratch registers (including r15) are modified/cleared | ||
687 | * to prevent leaking bits from kernel to user level. | ||
688 | */ | ||
689 | DBG_FAULT(11) | ||
690 | mov r16=IA64_KR(CURRENT) // r16 = current task; 12 cycle read lat. | ||
691 | mov r17=cr.iim | ||
692 | mov r18=__IA64_BREAK_SYSCALL | ||
693 | mov r21=ar.fpsr | ||
694 | mov r29=cr.ipsr | ||
695 | mov r19=b6 | ||
696 | mov r25=ar.unat | ||
697 | mov r27=ar.rsc | ||
698 | mov r26=ar.pfs | ||
699 | mov r28=cr.iip | ||
700 | mov r31=pr // prepare to save predicates | ||
701 | mov r20=r1 | ||
702 | ;; | ||
703 | adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 | ||
704 | cmp.eq p0,p7=r18,r17 // is this a system call? (p7 <- false, if so) | ||
705 | (p7) br.cond.spnt non_syscall | ||
706 | ;; | ||
707 | ld1 r17=[r16] // load current->thread.on_ustack flag | ||
708 | st1 [r16]=r0 // clear current->thread.on_ustack flag | ||
709 | add r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // set r1 for MINSTATE_START_SAVE_MIN_VIRT | ||
710 | ;; | ||
711 | invala | ||
712 | |||
713 | /* adjust return address so we skip over the break instruction: */ | ||
714 | |||
715 | extr.u r8=r29,41,2 // extract ei field from cr.ipsr | ||
716 | ;; | ||
717 | cmp.eq p6,p7=2,r8 // ipsr.ei==2? | ||
718 | mov r2=r1 // setup r2 for ia64_syscall_setup | ||
719 | ;; | ||
720 | (p6) mov r8=0 // clear ei to 0 | ||
721 | (p6) adds r28=16,r28 // switch cr.iip to next bundle (cr.ipsr.ei wrapped to 0) | ||
722 | (p7) adds r8=1,r8 // increment ei to next slot | ||
723 | ;; | ||
724 | cmp.eq pKStk,pUStk=r0,r17 // are we in kernel mode already? | ||
725 | dep r29=r8,r29,41,2 // insert new ei into cr.ipsr | ||
726 | ;; | ||
727 | |||
728 | // switch from user to kernel RBS: | ||
729 | MINSTATE_START_SAVE_MIN_VIRT | ||
730 | br.call.sptk.many b7=ia64_syscall_setup | ||
731 | ;; | ||
732 | MINSTATE_END_SAVE_MIN_VIRT // switch to bank 1 | ||
733 | ssm psr.ic | PSR_DEFAULT_BITS | ||
734 | ;; | ||
735 | srlz.i // guarantee that interruption collection is on | ||
736 | mov r3=NR_syscalls - 1 | ||
737 | ;; | ||
738 | (p15) ssm psr.i // restore psr.i | ||
739 | // p10==true means the syscall used more than 8 out registers or r15's NaT bit is set | ||
740 | (p10) br.cond.spnt.many ia64_ret_from_syscall | ||
741 | ;; | ||
742 | movl r16=sys_call_table | ||
743 | |||
744 | adds r15=-1024,r15 // r15 contains the syscall number---subtract 1024 | ||
745 | movl r2=ia64_ret_from_syscall | ||
746 | ;; | ||
747 | shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024) | ||
748 | cmp.leu p6,p7=r15,r3 // (syscall >= 1024 && syscall < 1024 + NR_syscalls)? | ||
749 | mov rp=r2 // set the real return addr | ||
750 | ;; | ||
751 | (p6) ld8 r20=[r20] // load address of syscall entry point | ||
752 | (p7) movl r20=sys_ni_syscall | ||
753 | |||
754 | add r2=TI_FLAGS+IA64_TASK_SIZE,r13 | ||
755 | ;; | ||
756 | ld4 r2=[r2] // r2 = current_thread_info()->flags | ||
757 | ;; | ||
758 | and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit | ||
759 | ;; | ||
760 | cmp.eq p8,p0=r2,r0 | ||
761 | mov b6=r20 | ||
762 | ;; | ||
763 | (p8) br.call.sptk.many b6=b6 // ignore this return addr | ||
764 | br.cond.sptk ia64_trace_syscall | ||
765 | // NOT REACHED | ||
766 | END(break_fault) | ||
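
Two steps of this fast path restate cleanly in C: the ei-slot arithmetic that skips the break instruction (slot 2 wraps into the next 16-byte bundle), and the single unsigned compare that range-checks the syscall number, given that ia64 syscall numbers start at 1024. A sketch under those assumptions, with a placeholder table size:

	#define NR_syscalls 256			/* placeholder value */

	typedef long (*syscall_fn)(void);
	extern syscall_fn sys_call_table[NR_syscalls];
	extern long sys_ni_syscall(void);

	/* Advance ipsr.ei past the break: slots are 0..2 per bundle. */
	static void skip_break(unsigned long *iip, unsigned long *ipsr)
	{
		unsigned long ei = (*ipsr >> 41) & 3;	/* extr.u r8=r29,41,2 */

		if (ei == 2) {
			ei = 0;				/* (p6) mov r8=0 */
			*iip += 16;			/* (p6) adds r28=16,r28 */
		} else {
			ei++;				/* (p7) adds r8=1,r8 */
		}
		*ipsr = (*ipsr & ~(3UL << 41)) | (ei << 41);	/* dep r29=r8,r29,41,2 */
	}

	/* One unsigned compare covers both bounds: (num - 1024)
	 * underflows to a huge value whenever num < 1024. */
	static syscall_fn lookup_syscall(unsigned long num)
	{
		unsigned long idx = num - 1024;		/* adds r15=-1024,r15 */

		return idx <= NR_syscalls - 1		/* cmp.leu p6,p7=r15,r3 */
			? sys_call_table[idx] : sys_ni_syscall;
	}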
767 | |||
768 | .org ia64_ivt+0x3000 | ||
769 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
770 | // 0x3000 Entry 12 (size 64 bundles) External Interrupt (4) | ||
771 | ENTRY(interrupt) | ||
772 | DBG_FAULT(12) | ||
773 | mov r31=pr // prepare to save predicates | ||
774 | ;; | ||
775 | SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3 | ||
776 | ssm psr.ic | PSR_DEFAULT_BITS | ||
777 | ;; | ||
778 | adds r3=8,r2 // set up second base pointer for SAVE_REST | ||
779 | srlz.i // ensure everybody knows psr.ic is back on | ||
780 | ;; | ||
781 | SAVE_REST | ||
782 | ;; | ||
783 | alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group | ||
784 | mov out0=cr.ivr // pass cr.ivr as first arg | ||
785 | add out1=16,sp // pass pointer to pt_regs as second arg | ||
786 | ;; | ||
787 | srlz.d // make sure we see the effect of cr.ivr | ||
788 | movl r14=ia64_leave_kernel | ||
789 | ;; | ||
790 | mov rp=r14 | ||
791 | br.call.sptk.many b6=ia64_handle_irq | ||
792 | END(interrupt) | ||
793 | |||
794 | .org ia64_ivt+0x3400 | ||
795 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
796 | // 0x3400 Entry 13 (size 64 bundles) Reserved | ||
797 | DBG_FAULT(13) | ||
798 | FAULT(13) | ||
799 | |||
800 | .org ia64_ivt+0x3800 | ||
801 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
802 | // 0x3800 Entry 14 (size 64 bundles) Reserved | ||
803 | DBG_FAULT(14) | ||
804 | FAULT(14) | ||
805 | |||
806 | /* | ||
807 | * There is no particular reason for this code to be here, other than that | ||
808 | * there happens to be space here that would go unused otherwise. If this | ||
809 | * fault ever gets "unreserved", simply move the following code to a more | ||
810 | * suitable spot... | ||
811 | * | ||
812 | * ia64_syscall_setup() is a separate subroutine so that it can | ||
813 | * allocate stacked registers and thereby safely demine any | ||
814 | * potential NaT values from the input registers. | ||
815 | * | ||
816 | * On entry: | ||
817 | * - executing on bank 0 or bank 1 register set (doesn't matter) | ||
818 | * - r1: stack pointer | ||
819 | * - r2: current task pointer | ||
820 | * - r3: preserved | ||
821 | * - r11: original contents (saved ar.pfs to be saved) | ||
822 | * - r12: original contents (sp to be saved) | ||
823 | * - r13: original contents (tp to be saved) | ||
824 | * - r15: original contents (syscall # to be saved) | ||
825 | * - r18: saved bsp (after switching to kernel stack) | ||
826 | * - r19: saved b6 | ||
827 | * - r20: saved r1 (gp) | ||
828 | * - r21: saved ar.fpsr | ||
829 | * - r22: kernel's register backing store base (krbs_base) | ||
830 | * - r23: saved ar.bspstore | ||
831 | * - r24: saved ar.rnat | ||
832 | * - r25: saved ar.unat | ||
833 | * - r26: saved ar.pfs | ||
834 | * - r27: saved ar.rsc | ||
835 | * - r28: saved cr.iip | ||
836 | * - r29: saved cr.ipsr | ||
837 | * - r31: saved pr | ||
838 | * - b0: original contents (to be saved) | ||
839 | * On exit: | ||
840 | * - executing on bank 1 registers | ||
841 | * - psr.ic enabled, interrupts restored | ||
842 | * - p10: TRUE if syscall is invoked with more than 8 out | ||
843 | * registers or r15's Nat is true | ||
844 | * - r1: kernel's gp | ||
845 | * - r3: preserved (same as on entry) | ||
846 | * - r8: -EINVAL if p10 is true | ||
847 | * - r12: points to kernel stack | ||
848 | * - r13: points to current task | ||
849 | * - p15: TRUE if interrupts need to be re-enabled | ||
850 | * - ar.fpsr: set to kernel settings | ||
851 | */ | ||
852 | GLOBAL_ENTRY(ia64_syscall_setup) | ||
853 | #if PT(B6) != 0 | ||
854 | # error This code assumes that b6 is the first field in pt_regs. | ||
855 | #endif | ||
856 | st8 [r1]=r19 // save b6 | ||
857 | add r16=PT(CR_IPSR),r1 // initialize first base pointer | ||
858 | add r17=PT(R11),r1 // initialize second base pointer | ||
859 | ;; | ||
860 | alloc r19=ar.pfs,8,0,0,0 // ensure in0-in7 are writable | ||
861 | st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR) // save cr.ipsr | ||
862 | tnat.nz p8,p0=in0 | ||
863 | |||
864 | st8.spill [r17]=r11,PT(CR_IIP)-PT(R11) // save r11 | ||
865 | tnat.nz p9,p0=in1 | ||
866 | (pKStk) mov r18=r0 // make sure r18 isn't NaT | ||
867 | ;; | ||
868 | |||
869 | st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS) // save ar.pfs | ||
870 | st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP) // save cr.iip | ||
871 | mov r28=b0 // save b0 (2 cyc) | ||
872 | ;; | ||
873 | |||
874 | st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT) // save ar.unat | ||
875 | dep r19=0,r19,38,26 // clear all bits but 0..37 [I0] | ||
876 | (p8) mov in0=-1 | ||
877 | ;; | ||
878 | |||
879 | st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS) // store ar.pfs.pfm in cr.ifs | ||
880 | extr.u r11=r19,7,7 // I0 // get sol of ar.pfs | ||
881 | and r8=0x7f,r19 // A // get sof of ar.pfs | ||
882 | |||
883 | st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc | ||
884 | tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0 | ||
885 | (p9) mov in1=-1 | ||
886 | ;; | ||
887 | |||
888 | (pUStk) sub r18=r18,r22 // r18=RSE.ndirty*8 | ||
889 | tnat.nz p10,p0=in2 | ||
890 | add r11=8,r11 | ||
891 | ;; | ||
892 | (pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16 // skip over ar_rnat field | ||
893 | (pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17 // skip over ar_bspstore field | ||
894 | tnat.nz p11,p0=in3 | ||
895 | ;; | ||
896 | (p10) mov in2=-1 | ||
897 | tnat.nz p12,p0=in4 // [I0] | ||
898 | (p11) mov in3=-1 | ||
899 | ;; | ||
900 | (pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT) // save ar.rnat | ||
901 | (pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE) // save ar.bspstore | ||
902 | shl r18=r18,16 // compute ar.rsc to be used for "loadrs" | ||
903 | ;; | ||
904 | st8 [r16]=r31,PT(LOADRS)-PT(PR) // save predicates | ||
905 | st8 [r17]=r28,PT(R1)-PT(B0) // save b0 | ||
906 | tnat.nz p13,p0=in5 // [I0] | ||
907 | ;; | ||
908 | st8 [r16]=r18,PT(R12)-PT(LOADRS) // save ar.rsc value for "loadrs" | ||
909 | st8.spill [r17]=r20,PT(R13)-PT(R1) // save original r1 | ||
910 | (p12) mov in4=-1 | ||
911 | ;; | ||
912 | |||
913 | .mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12) // save r12 | ||
914 | .mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13) // save r13 | ||
915 | (p13) mov in5=-1 | ||
916 | ;; | ||
917 | st8 [r16]=r21,PT(R8)-PT(AR_FPSR) // save ar.fpsr | ||
918 | tnat.nz p14,p0=in6 | ||
919 | cmp.lt p10,p9=r11,r8 // frame size can't be more than local+8 | ||
920 | ;; | ||
921 | stf8 [r16]=f1 // ensure pt_regs.r8 != 0 (see handle_syscall_error) | ||
922 | (p9) tnat.nz p10,p0=r15 | ||
923 | adds r12=-16,r1 // switch to kernel memory stack (with 16 bytes of scratch) | ||
924 | |||
925 | st8.spill [r17]=r15 // save r15 | ||
926 | tnat.nz p8,p0=in7 | ||
927 | nop.i 0 | ||
928 | |||
929 | mov r13=r2 // establish `current' | ||
930 | movl r1=__gp // establish kernel global pointer | ||
931 | ;; | ||
932 | (p14) mov in6=-1 | ||
933 | (p8) mov in7=-1 | ||
934 | nop.i 0 | ||
935 | |||
936 | cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0 | ||
937 | movl r17=FPSR_DEFAULT | ||
938 | ;; | ||
939 | mov.m ar.fpsr=r17 // set ar.fpsr to kernel default value | ||
940 | (p10) mov r8=-EINVAL | ||
941 | br.ret.sptk.many b7 | ||
942 | END(ia64_syscall_setup) | ||
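
The "demine" step and the p10 check read more easily in C: every input register whose NaT bit is set is forced to -1 so no NaT leaks into the kernel, and the call is rejected with -EINVAL when ar.pfs declares more than eight output registers (sof - sol > 8) or r15 itself is NaT. A hedged sketch; arg_is_nat() is a made-up stand-in for the tnat.nz tests:

	#include <errno.h>
	#include <stdbool.h>

	extern bool arg_is_nat(int n);	/* models tnat.nz p8,p0=in<n> */

	static long check_syscall_frame(unsigned long ar_pfs, long args[8],
					bool r15_is_nat)
	{
		unsigned long sol = (ar_pfs >> 7) & 0x7f;	/* extr.u r11=r19,7,7 */
		unsigned long sof = ar_pfs & 0x7f;		/* and r8=0x7f,r19 */
		int i;

		for (i = 0; i < 8; i++)
			if (arg_is_nat(i))
				args[i] = -1;		/* (p8) mov in0=-1, ... */

		/* cmp.lt p10,p9=r11,r8 with r11 = sol+8, plus the r15 test */
		if (sof > sol + 8 || r15_is_nat)
			return -EINVAL;			/* (p10) mov r8=-EINVAL */
		return 0;
	}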
943 | |||
944 | .org ia64_ivt+0x3c00 | ||
945 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
946 | // 0x3c00 Entry 15 (size 64 bundles) Reserved | ||
947 | DBG_FAULT(15) | ||
948 | FAULT(15) | ||
949 | |||
950 | /* | ||
951 | * Squatting in this space ... | ||
952 | * | ||
953 | * This special case dispatcher for illegal operation faults allows preserved | ||
954 | * registers to be modified through a callback function (asm only) that is handed | ||
955 | * back from the fault handler in r8. Up to three arguments can be passed to the | ||
956 | * callback function by returning an aggregate with the callback as its first | ||
957 | * element, followed by the arguments. | ||
958 | */ | ||
959 | ENTRY(dispatch_illegal_op_fault) | ||
960 | .prologue | ||
961 | .body | ||
962 | SAVE_MIN_WITH_COVER | ||
963 | ssm psr.ic | PSR_DEFAULT_BITS | ||
964 | ;; | ||
965 | srlz.i // guarantee that interruption collection is on | ||
966 | ;; | ||
967 | (p15) ssm psr.i // restore psr.i | ||
968 | adds r3=8,r2 // set up second base pointer for SAVE_REST | ||
969 | ;; | ||
970 | alloc r14=ar.pfs,0,0,1,0 // must be first in insn group | ||
971 | mov out0=ar.ec | ||
972 | ;; | ||
973 | SAVE_REST | ||
974 | PT_REGS_UNWIND_INFO(0) | ||
975 | ;; | ||
976 | br.call.sptk.many rp=ia64_illegal_op_fault | ||
977 | .ret0: ;; | ||
978 | alloc r14=ar.pfs,0,0,3,0 // must be first in insn group | ||
979 | mov out0=r9 | ||
980 | mov out1=r10 | ||
981 | mov out2=r11 | ||
982 | movl r15=ia64_leave_kernel | ||
983 | ;; | ||
984 | mov rp=r15 | ||
985 | mov b6=r8 | ||
986 | ;; | ||
987 | cmp.ne p6,p0=0,r8 | ||
988 | (p6) br.call.dpnt.many b6=b6 // call returns to ia64_leave_kernel | ||
989 | br.sptk.many ia64_leave_kernel | ||
990 | END(dispatch_illegal_op_fault) | ||
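
The callback contract described in the comment above amounts to the fault handler returning a small aggregate: a fixup function in r8 (or 0) plus up to three arguments in r9-r11. A C model of that protocol; the struct and names are illustrative only:

	struct illegal_op_return {
		void (*fixup)(long a0, long a1, long a2);	/* r8, or NULL */
		long arg0, arg1, arg2;				/* r9-r11 */
	};

	extern struct illegal_op_return handle_illegal_op(void);	/* models ia64_illegal_op_fault */

	static void dispatch_illegal_op_model(void)
	{
		struct illegal_op_return ret = handle_illegal_op();

		if (ret.fixup)				/* cmp.ne p6,p0=0,r8 */
			ret.fixup(ret.arg0, ret.arg1, ret.arg2);
		/* ...then return through ia64_leave_kernel in the real code */
	}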
991 | |||
992 | .org ia64_ivt+0x4000 | ||
993 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
994 | // 0x4000 Entry 16 (size 64 bundles) Reserved | ||
995 | DBG_FAULT(16) | ||
996 | FAULT(16) | ||
997 | |||
998 | .org ia64_ivt+0x4400 | ||
999 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1000 | // 0x4400 Entry 17 (size 64 bundles) Reserved | ||
1001 | DBG_FAULT(17) | ||
1002 | FAULT(17) | ||
1003 | |||
1004 | ENTRY(non_syscall) | ||
1005 | SAVE_MIN_WITH_COVER | ||
1006 | |||
1007 | // There is no particular reason for this code to be here, other than that | ||
1008 | // there happens to be space here that would go unused otherwise. If this | ||
1009 | // fault ever gets "unreserved", simply move the following code to a more | ||
1010 | // suitable spot... | ||
1011 | |||
1012 | alloc r14=ar.pfs,0,0,2,0 | ||
1013 | mov out0=cr.iim | ||
1014 | add out1=16,sp | ||
1015 | adds r3=8,r2 // set up second base pointer for SAVE_REST | ||
1016 | |||
1017 | ssm psr.ic | PSR_DEFAULT_BITS | ||
1018 | ;; | ||
1019 | srlz.i // guarantee that interruption collection is on | ||
1020 | ;; | ||
1021 | (p15) ssm psr.i // restore psr.i | ||
1022 | movl r15=ia64_leave_kernel | ||
1023 | ;; | ||
1024 | SAVE_REST | ||
1025 | mov rp=r15 | ||
1026 | ;; | ||
1027 | br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr | ||
1028 | END(non_syscall) | ||
1029 | |||
1030 | .org ia64_ivt+0x4800 | ||
1031 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1032 | // 0x4800 Entry 18 (size 64 bundles) Reserved | ||
1033 | DBG_FAULT(18) | ||
1034 | FAULT(18) | ||
1035 | |||
1036 | /* | ||
1037 | * There is no particular reason for this code to be here, other than that | ||
1038 | * there happens to be space here that would go unused otherwise. If this | ||
1039 | * fault ever gets "unreserved", simply move the following code to a more | ||
1040 | * suitable spot... | ||
1041 | */ | ||
1042 | |||
1043 | ENTRY(dispatch_unaligned_handler) | ||
1044 | SAVE_MIN_WITH_COVER | ||
1045 | ;; | ||
1046 | alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!) | ||
1047 | mov out0=cr.ifa | ||
1048 | adds out1=16,sp | ||
1049 | |||
1050 | ssm psr.ic | PSR_DEFAULT_BITS | ||
1051 | ;; | ||
1052 | srlz.i // guarantee that interruption collection is on | ||
1053 | ;; | ||
1054 | (p15) ssm psr.i // restore psr.i | ||
1055 | adds r3=8,r2 // set up second base pointer | ||
1056 | ;; | ||
1057 | SAVE_REST | ||
1058 | movl r14=ia64_leave_kernel | ||
1059 | ;; | ||
1060 | mov rp=r14 | ||
1061 | br.sptk.many ia64_prepare_handle_unaligned | ||
1062 | END(dispatch_unaligned_handler) | ||
1063 | |||
1064 | .org ia64_ivt+0x4c00 | ||
1065 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1066 | // 0x4c00 Entry 19 (size 64 bundles) Reserved | ||
1067 | DBG_FAULT(19) | ||
1068 | FAULT(19) | ||
1069 | |||
1070 | /* | ||
1071 | * There is no particular reason for this code to be here, other than that | ||
1072 | * there happens to be space here that would go unused otherwise. If this | ||
1073 | * fault ever gets "unreserved", simply move the following code to a more | ||
1074 | * suitable spot... | ||
1075 | */ | ||
1076 | |||
1077 | ENTRY(dispatch_to_fault_handler) | ||
1078 | /* | ||
1079 | * Input: | ||
1080 | * psr.ic: off | ||
1081 | * r19: fault vector number (e.g., 24 for General Exception) | ||
1082 | * r31: contains saved predicates (pr) | ||
1083 | */ | ||
1084 | SAVE_MIN_WITH_COVER_R19 | ||
1085 | alloc r14=ar.pfs,0,0,5,0 | ||
1086 | mov out0=r15 | ||
1087 | mov out1=cr.isr | ||
1088 | mov out2=cr.ifa | ||
1089 | mov out3=cr.iim | ||
1090 | mov out4=cr.itir | ||
1091 | ;; | ||
1092 | ssm psr.ic | PSR_DEFAULT_BITS | ||
1093 | ;; | ||
1094 | srlz.i // guarantee that interruption collection is on | ||
1095 | ;; | ||
1096 | (p15) ssm psr.i // restore psr.i | ||
1097 | adds r3=8,r2 // set up second base pointer for SAVE_REST | ||
1098 | ;; | ||
1099 | SAVE_REST | ||
1100 | movl r14=ia64_leave_kernel | ||
1101 | ;; | ||
1102 | mov rp=r14 | ||
1103 | br.call.sptk.many b6=ia64_fault | ||
1104 | END(dispatch_to_fault_handler) | ||
1105 | |||
1106 | // | ||
1107 | // --- End of long entries, Beginning of short entries | ||
1108 | // | ||
1109 | |||
1110 | .org ia64_ivt+0x5000 | ||
1111 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1112 | // 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49) | ||
1113 | ENTRY(page_not_present) | ||
1114 | DBG_FAULT(20) | ||
1115 | mov r16=cr.ifa | ||
1116 | rsm psr.dt | ||
1117 | /* | ||
1118 | * The Linux page fault handler doesn't expect non-present pages to be in | ||
1119 | * the TLB. Flush the existing entry now, so we meet that expectation. | ||
1120 | */ | ||
1121 | mov r17=PAGE_SHIFT<<2 | ||
1122 | ;; | ||
1123 | ptc.l r16,r17 | ||
1124 | ;; | ||
1125 | mov r31=pr | ||
1126 | srlz.d | ||
1127 | br.sptk.many page_fault | ||
1128 | END(page_not_present) | ||
1129 | |||
1130 | .org ia64_ivt+0x5100 | ||
1131 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1132 | // 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52) | ||
1133 | ENTRY(key_permission) | ||
1134 | DBG_FAULT(21) | ||
1135 | mov r16=cr.ifa | ||
1136 | rsm psr.dt | ||
1137 | mov r31=pr | ||
1138 | ;; | ||
1139 | srlz.d | ||
1140 | br.sptk.many page_fault | ||
1141 | END(key_permission) | ||
1142 | |||
1143 | .org ia64_ivt+0x5200 | ||
1144 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1145 | // 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26) | ||
1146 | ENTRY(iaccess_rights) | ||
1147 | DBG_FAULT(22) | ||
1148 | mov r16=cr.ifa | ||
1149 | rsm psr.dt | ||
1150 | mov r31=pr | ||
1151 | ;; | ||
1152 | srlz.d | ||
1153 | br.sptk.many page_fault | ||
1154 | END(iaccess_rights) | ||
1155 | |||
1156 | .org ia64_ivt+0x5300 | ||
1157 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1158 | // 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53) | ||
1159 | ENTRY(daccess_rights) | ||
1160 | DBG_FAULT(23) | ||
1161 | mov r16=cr.ifa | ||
1162 | rsm psr.dt | ||
1163 | mov r31=pr | ||
1164 | ;; | ||
1165 | srlz.d | ||
1166 | br.sptk.many page_fault | ||
1167 | END(daccess_rights) | ||
1168 | |||
1169 | .org ia64_ivt+0x5400 | ||
1170 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1171 | // 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39) | ||
1172 | ENTRY(general_exception) | ||
1173 | DBG_FAULT(24) | ||
1174 | mov r16=cr.isr | ||
1175 | mov r31=pr | ||
1176 | ;; | ||
1177 | cmp4.eq p6,p0=0,r16 | ||
1178 | (p6) br.sptk.many dispatch_illegal_op_fault | ||
1179 | ;; | ||
1180 | mov r19=24 // fault number | ||
1181 | br.sptk.many dispatch_to_fault_handler | ||
1182 | END(general_exception) | ||
1183 | |||
1184 | .org ia64_ivt+0x5500 | ||
1185 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1186 | // 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35) | ||
1187 | ENTRY(disabled_fp_reg) | ||
1188 | DBG_FAULT(25) | ||
1189 | rsm psr.dfh // ensure we can access fph | ||
1190 | ;; | ||
1191 | srlz.d | ||
1192 | mov r31=pr | ||
1193 | mov r19=25 | ||
1194 | br.sptk.many dispatch_to_fault_handler | ||
1195 | END(disabled_fp_reg) | ||
1196 | |||
1197 | .org ia64_ivt+0x5600 | ||
1198 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1199 | // 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50) | ||
1200 | ENTRY(nat_consumption) | ||
1201 | DBG_FAULT(26) | ||
1202 | FAULT(26) | ||
1203 | END(nat_consumption) | ||
1204 | |||
1205 | .org ia64_ivt+0x5700 | ||
1206 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1207 | // 0x5700 Entry 27 (size 16 bundles) Speculation (40) | ||
1208 | ENTRY(speculation_vector) | ||
1209 | DBG_FAULT(27) | ||
1210 | /* | ||
1211 | * A [f]chk.[as] instruction needs to take the branch to the recovery code but | ||
1212 | * this part of the architecture is not implemented in hardware on some CPUs, such | ||
1213 | * as Itanium. Thus, in general we need to emulate the behavior. IIM contains | ||
1214 | * the relative target (not yet sign extended). So after sign extending it we | ||
1215 | * simply add it to IIP. We also need to reset the EI field of the IPSR to zero, | ||
1216 | * i.e., the slot to restart into. | ||
1217 | * | ||
1218 | * cr.iim contains zero_ext(imm21) | ||
1219 | */ | ||
1220 | mov r18=cr.iim | ||
1221 | ;; | ||
1222 | mov r17=cr.iip | ||
1223 | shl r18=r18,43 // put sign bit in position (43=64-21) | ||
1224 | ;; | ||
1225 | |||
1226 | mov r16=cr.ipsr | ||
1227 | shr r18=r18,39 // sign extend (39=43-4) | ||
1228 | ;; | ||
1229 | |||
1230 | add r17=r17,r18 // now add the offset | ||
1231 | ;; | ||
1232 | mov cr.iip=r17 | ||
1233 | dep r16=0,r16,41,2 // clear EI | ||
1234 | ;; | ||
1235 | |||
1236 | mov cr.ipsr=r16 | ||
1237 | ;; | ||
1238 | |||
1239 | rfi // and go back | ||
1240 | END(speculation_vector) | ||
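
The fixup arithmetic restates directly in C: shifting the 21-bit immediate up by 43 puts its sign bit at bit 63, and the arithmetic shift back down by only 39 leaves the offset scaled by 16, since branch targets are bundle-aligned. A sketch, assuming GCC's arithmetic right shift on signed longs:

	static unsigned long speculation_target(unsigned long iim,
						unsigned long iip,
						unsigned long *ipsr)
	{
		/* shl 43 (= 64-21), arithmetic shr 39 (= 43-4): the
		 * residual <<4 scales the offset to 16-byte bundles. */
		long off = (long)(iim << 43) >> 39;

		*ipsr &= ~(3UL << 41);	/* dep r16=0,r16,41,2: restart in slot 0 */
		return iip + off;	/* new cr.iip */
	}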
1241 | |||
1242 | .org ia64_ivt+0x5800 | ||
1243 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1244 | // 0x5800 Entry 28 (size 16 bundles) Reserved | ||
1245 | DBG_FAULT(28) | ||
1246 | FAULT(28) | ||
1247 | |||
1248 | .org ia64_ivt+0x5900 | ||
1249 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1250 | // 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56) | ||
1251 | ENTRY(debug_vector) | ||
1252 | DBG_FAULT(29) | ||
1253 | FAULT(29) | ||
1254 | END(debug_vector) | ||
1255 | |||
1256 | .org ia64_ivt+0x5a00 | ||
1257 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1258 | // 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57) | ||
1259 | ENTRY(unaligned_access) | ||
1260 | DBG_FAULT(30) | ||
1261 | mov r16=cr.ipsr | ||
1262 | mov r31=pr // prepare to save predicates | ||
1263 | ;; | ||
1264 | br.sptk.many dispatch_unaligned_handler | ||
1265 | END(unaligned_access) | ||
1266 | |||
1267 | .org ia64_ivt+0x5b00 | ||
1268 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1269 | // 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57) | ||
1270 | ENTRY(unsupported_data_reference) | ||
1271 | DBG_FAULT(31) | ||
1272 | FAULT(31) | ||
1273 | END(unsupported_data_reference) | ||
1274 | |||
1275 | .org ia64_ivt+0x5c00 | ||
1276 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1277 | // 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64) | ||
1278 | ENTRY(floating_point_fault) | ||
1279 | DBG_FAULT(32) | ||
1280 | FAULT(32) | ||
1281 | END(floating_point_fault) | ||
1282 | |||
1283 | .org ia64_ivt+0x5d00 | ||
1284 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1285 | // 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66) | ||
1286 | ENTRY(floating_point_trap) | ||
1287 | DBG_FAULT(33) | ||
1288 | FAULT(33) | ||
1289 | END(floating_point_trap) | ||
1290 | |||
1291 | .org ia64_ivt+0x5e00 | ||
1292 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1293 | // 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66) | ||
1294 | ENTRY(lower_privilege_trap) | ||
1295 | DBG_FAULT(34) | ||
1296 | FAULT(34) | ||
1297 | END(lower_privilege_trap) | ||
1298 | |||
1299 | .org ia64_ivt+0x5f00 | ||
1300 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1301 | // 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68) | ||
1302 | ENTRY(taken_branch_trap) | ||
1303 | DBG_FAULT(35) | ||
1304 | FAULT(35) | ||
1305 | END(taken_branch_trap) | ||
1306 | |||
1307 | .org ia64_ivt+0x6000 | ||
1308 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1309 | // 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69) | ||
1310 | ENTRY(single_step_trap) | ||
1311 | DBG_FAULT(36) | ||
1312 | FAULT(36) | ||
1313 | END(single_step_trap) | ||
1314 | |||
1315 | .org ia64_ivt+0x6100 | ||
1316 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1317 | // 0x6100 Entry 37 (size 16 bundles) Reserved | ||
1318 | DBG_FAULT(37) | ||
1319 | FAULT(37) | ||
1320 | |||
1321 | .org ia64_ivt+0x6200 | ||
1322 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1323 | // 0x6200 Entry 38 (size 16 bundles) Reserved | ||
1324 | DBG_FAULT(38) | ||
1325 | FAULT(38) | ||
1326 | |||
1327 | .org ia64_ivt+0x6300 | ||
1328 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1329 | // 0x6300 Entry 39 (size 16 bundles) Reserved | ||
1330 | DBG_FAULT(39) | ||
1331 | FAULT(39) | ||
1332 | |||
1333 | .org ia64_ivt+0x6400 | ||
1334 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1335 | // 0x6400 Entry 40 (size 16 bundles) Reserved | ||
1336 | DBG_FAULT(40) | ||
1337 | FAULT(40) | ||
1338 | |||
1339 | .org ia64_ivt+0x6500 | ||
1340 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1341 | // 0x6500 Entry 41 (size 16 bundles) Reserved | ||
1342 | DBG_FAULT(41) | ||
1343 | FAULT(41) | ||
1344 | |||
1345 | .org ia64_ivt+0x6600 | ||
1346 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1347 | // 0x6600 Entry 42 (size 16 bundles) Reserved | ||
1348 | DBG_FAULT(42) | ||
1349 | FAULT(42) | ||
1350 | |||
1351 | .org ia64_ivt+0x6700 | ||
1352 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1353 | // 0x6700 Entry 43 (size 16 bundles) Reserved | ||
1354 | DBG_FAULT(43) | ||
1355 | FAULT(43) | ||
1356 | |||
1357 | .org ia64_ivt+0x6800 | ||
1358 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1359 | // 0x6800 Entry 44 (size 16 bundles) Reserved | ||
1360 | DBG_FAULT(44) | ||
1361 | FAULT(44) | ||
1362 | |||
1363 | .org ia64_ivt+0x6900 | ||
1364 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1365 | // 0x6900 Entry 45 (size 16 bundles) IA-32 Exception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77) | ||
1366 | ENTRY(ia32_exception) | ||
1367 | DBG_FAULT(45) | ||
1368 | FAULT(45) | ||
1369 | END(ia32_exception) | ||
1370 | |||
1371 | .org ia64_ivt+0x6a00 | ||
1372 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1373 | // 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71) | ||
1374 | ENTRY(ia32_intercept) | ||
1375 | DBG_FAULT(46) | ||
1376 | #ifdef CONFIG_IA32_SUPPORT | ||
1377 | mov r31=pr | ||
1378 | mov r16=cr.isr | ||
1379 | ;; | ||
1380 | extr.u r17=r16,16,8 // get ISR.code | ||
1381 | mov r18=ar.eflag | ||
1382 | mov r19=cr.iim // old eflag value | ||
1383 | ;; | ||
1384 | cmp.ne p6,p0=2,r17 | ||
1385 | (p6) br.cond.spnt 1f // not a system flag fault | ||
1386 | xor r16=r18,r19 | ||
1387 | ;; | ||
1388 | extr.u r17=r16,18,1 // get the eflags.ac bit | ||
1389 | ;; | ||
1390 | cmp.eq p6,p0=0,r17 | ||
1391 | (p6) br.cond.spnt 1f // eflags.ac bit didn't change | ||
1392 | ;; | ||
1393 | mov pr=r31,-1 // restore predicate registers | ||
1394 | rfi | ||
1395 | |||
1396 | 1: | ||
1397 | #endif // CONFIG_IA32_SUPPORT | ||
1398 | FAULT(46) | ||
1399 | END(ia32_intercept) | ||
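
The #ifdef'd fast path above boils down to: a system-flag intercept (ISR.code == 2) whose only visible effect was flipping eflags.AC (bit 18) can be dismissed with an rfi; anything else falls through to FAULT(46). In C, under those assumptions:

	#include <stdbool.h>

	#define EFLAGS_AC_BIT 18

	/* Returns true when the intercept can simply be rfi'd away. */
	static bool only_ac_changed(unsigned long isr,
				    unsigned long new_eflags,	/* ar.eflag */
				    unsigned long old_eflags)	/* cr.iim */
	{
		unsigned long code = (isr >> 16) & 0xff;	/* extr.u r17=r16,16,8 */

		if (code != 2)
			return false;		/* not a system flag fault */

		/* xor + extract bit 18: did AC actually flip? */
		return ((new_eflags ^ old_eflags) >> EFLAGS_AC_BIT) & 1;
	}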
1400 | |||
1401 | .org ia64_ivt+0x6b00 | ||
1402 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1403 | // 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt (74) | ||
1404 | ENTRY(ia32_interrupt) | ||
1405 | DBG_FAULT(47) | ||
1406 | #ifdef CONFIG_IA32_SUPPORT | ||
1407 | mov r31=pr | ||
1408 | br.sptk.many dispatch_to_ia32_handler | ||
1409 | #else | ||
1410 | FAULT(47) | ||
1411 | #endif | ||
1412 | END(ia32_interrupt) | ||
1413 | |||
1414 | .org ia64_ivt+0x6c00 | ||
1415 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1416 | // 0x6c00 Entry 48 (size 16 bundles) Reserved | ||
1417 | DBG_FAULT(48) | ||
1418 | FAULT(48) | ||
1419 | |||
1420 | .org ia64_ivt+0x6d00 | ||
1421 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1422 | // 0x6d00 Entry 49 (size 16 bundles) Reserved | ||
1423 | DBG_FAULT(49) | ||
1424 | FAULT(49) | ||
1425 | |||
1426 | .org ia64_ivt+0x6e00 | ||
1427 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1428 | // 0x6e00 Entry 50 (size 16 bundles) Reserved | ||
1429 | DBG_FAULT(50) | ||
1430 | FAULT(50) | ||
1431 | |||
1432 | .org ia64_ivt+0x6f00 | ||
1433 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1434 | // 0x6f00 Entry 51 (size 16 bundles) Reserved | ||
1435 | DBG_FAULT(51) | ||
1436 | FAULT(51) | ||
1437 | |||
1438 | .org ia64_ivt+0x7000 | ||
1439 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1440 | // 0x7000 Entry 52 (size 16 bundles) Reserved | ||
1441 | DBG_FAULT(52) | ||
1442 | FAULT(52) | ||
1443 | |||
1444 | .org ia64_ivt+0x7100 | ||
1445 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1446 | // 0x7100 Entry 53 (size 16 bundles) Reserved | ||
1447 | DBG_FAULT(53) | ||
1448 | FAULT(53) | ||
1449 | |||
1450 | .org ia64_ivt+0x7200 | ||
1451 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1452 | // 0x7200 Entry 54 (size 16 bundles) Reserved | ||
1453 | DBG_FAULT(54) | ||
1454 | FAULT(54) | ||
1455 | |||
1456 | .org ia64_ivt+0x7300 | ||
1457 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1458 | // 0x7300 Entry 55 (size 16 bundles) Reserved | ||
1459 | DBG_FAULT(55) | ||
1460 | FAULT(55) | ||
1461 | |||
1462 | .org ia64_ivt+0x7400 | ||
1463 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1464 | // 0x7400 Entry 56 (size 16 bundles) Reserved | ||
1465 | DBG_FAULT(56) | ||
1466 | FAULT(56) | ||
1467 | |||
1468 | .org ia64_ivt+0x7500 | ||
1469 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1470 | // 0x7500 Entry 57 (size 16 bundles) Reserved | ||
1471 | DBG_FAULT(57) | ||
1472 | FAULT(57) | ||
1473 | |||
1474 | .org ia64_ivt+0x7600 | ||
1475 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1476 | // 0x7600 Entry 58 (size 16 bundles) Reserved | ||
1477 | DBG_FAULT(58) | ||
1478 | FAULT(58) | ||
1479 | |||
1480 | .org ia64_ivt+0x7700 | ||
1481 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1482 | // 0x7700 Entry 59 (size 16 bundles) Reserved | ||
1483 | DBG_FAULT(59) | ||
1484 | FAULT(59) | ||
1485 | |||
1486 | .org ia64_ivt+0x7800 | ||
1487 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1488 | // 0x7800 Entry 60 (size 16 bundles) Reserved | ||
1489 | DBG_FAULT(60) | ||
1490 | FAULT(60) | ||
1491 | |||
1492 | .org ia64_ivt+0x7900 | ||
1493 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1494 | // 0x7900 Entry 61 (size 16 bundles) Reserved | ||
1495 | DBG_FAULT(61) | ||
1496 | FAULT(61) | ||
1497 | |||
1498 | .org ia64_ivt+0x7a00 | ||
1499 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1500 | // 0x7a00 Entry 62 (size 16 bundles) Reserved | ||
1501 | DBG_FAULT(62) | ||
1502 | FAULT(62) | ||
1503 | |||
1504 | .org ia64_ivt+0x7b00 | ||
1505 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1506 | // 0x7b00 Entry 63 (size 16 bundles) Reserved | ||
1507 | DBG_FAULT(63) | ||
1508 | FAULT(63) | ||
1509 | |||
1510 | .org ia64_ivt+0x7c00 | ||
1511 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1512 | // 0x7c00 Entry 64 (size 16 bundles) Reserved | ||
1513 | DBG_FAULT(64) | ||
1514 | FAULT(64) | ||
1515 | |||
1516 | .org ia64_ivt+0x7d00 | ||
1517 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1518 | // 0x7d00 Entry 65 (size 16 bundles) Reserved | ||
1519 | DBG_FAULT(65) | ||
1520 | FAULT(65) | ||
1521 | |||
1522 | .org ia64_ivt+0x7e00 | ||
1523 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1524 | // 0x7e00 Entry 66 (size 16 bundles) Reserved | ||
1525 | DBG_FAULT(66) | ||
1526 | FAULT(66) | ||
1527 | |||
1528 | .org ia64_ivt+0x7f00 | ||
1529 | ///////////////////////////////////////////////////////////////////////////////////////// | ||
1530 | // 0x7f00 Entry 67 (size 16 bundles) Reserved | ||
1531 | DBG_FAULT(67) | ||
1532 | FAULT(67) | ||
1533 | |||
1534 | #ifdef CONFIG_IA32_SUPPORT | ||
1535 | |||
1536 | /* | ||
1537 | * There is no particular reason for this code to be here, other than that | ||
1538 | * there happens to be space here that would go unused otherwise. If this | ||
1539 | * fault ever gets "unreserved", simply move the following code to a more | ||
1540 | * suitable spot... | ||
1541 | */ | ||
1542 | |||
1543 | // IA32 interrupt entry point | ||
1544 | |||
1545 | ENTRY(dispatch_to_ia32_handler) | ||
1546 | SAVE_MIN | ||
1547 | ;; | ||
1548 | mov r14=cr.isr | ||
1549 | ssm psr.ic | PSR_DEFAULT_BITS | ||
1550 | ;; | ||
1551 | srlz.i // guarantee that interruption collection is on | ||
1552 | ;; | ||
1553 | (p15) ssm psr.i | ||
1554 | adds r3=8,r2 // Base pointer for SAVE_REST | ||
1555 | ;; | ||
1556 | SAVE_REST | ||
1557 | ;; | ||
1558 | mov r15=0x80 | ||
1559 | shr r14=r14,16 // Get interrupt number | ||
1560 | ;; | ||
1561 | cmp.ne p6,p0=r14,r15 | ||
1562 | (p6) br.call.dpnt.many b6=non_ia32_syscall | ||
1563 | |||
1564 | adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions | ||
1565 | adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp | ||
1566 | ;; | ||
1567 | cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0 | ||
1568 | ld8 r8=[r14] // get r8 | ||
1569 | ;; | ||
1570 | st8 [r15]=r8 // save original EAX in r1 (IA32 procs don't use the GP) | ||
1571 | ;; | ||
1572 | alloc r15=ar.pfs,0,0,6,0 // must be first in an insn group | ||
1573 | ;; | ||
1574 | ld4 r8=[r14],8 // r8 == eax (syscall number) | ||
1575 | mov r15=IA32_NR_syscalls | ||
1576 | ;; | ||
1577 | cmp.ltu.unc p6,p7=r8,r15 | ||
1578 | ld4 out1=[r14],8 // r9 == ecx | ||
1579 | ;; | ||
1580 | ld4 out2=[r14],8 // r10 == edx | ||
1581 | ;; | ||
1582 | ld4 out0=[r14] // r11 == ebx | ||
1583 | adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp | ||
1584 | ;; | ||
1585 | ld4 out5=[r14],PT(R14)-PT(R13) // r13 == ebp | ||
1586 | ;; | ||
1587 | ld4 out3=[r14],PT(R15)-PT(R14) // r14 == esi | ||
1588 | adds r2=TI_FLAGS+IA64_TASK_SIZE,r13 | ||
1589 | ;; | ||
1590 | ld4 out4=[r14] // r15 == edi | ||
1591 | movl r16=ia32_syscall_table | ||
1592 | ;; | ||
1593 | (p6) shladd r16=r8,3,r16 // force ni_syscall if not valid syscall number | ||
1594 | ld4 r2=[r2] // r2 = current_thread_info()->flags | ||
1595 | ;; | ||
1596 | ld8 r16=[r16] | ||
1597 | and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit | ||
1598 | ;; | ||
1599 | mov b6=r16 | ||
1600 | movl r15=ia32_ret_from_syscall | ||
1601 | cmp.eq p8,p0=r2,r0 | ||
1602 | ;; | ||
1603 | mov rp=r15 | ||
1604 | (p8) br.call.sptk.many b6=b6 | ||
1605 | br.cond.sptk ia32_trace_syscall | ||
1606 | |||
1607 | non_ia32_syscall: | ||
1608 | alloc r15=ar.pfs,0,0,2,0 | ||
1609 | mov out0=r14 // interrupt # | ||
1610 | add out1=16,sp // pointer to pt_regs | ||
1611 | ;; // avoid WAW on CFM | ||
1612 | br.call.sptk.many rp=ia32_bad_interrupt | ||
1613 | .ret1: movl r15=ia64_leave_kernel | ||
1614 | ;; | ||
1615 | mov rp=r15 | ||
1616 | br.ret.sptk.many rp | ||
1617 | END(dispatch_to_ia32_handler) | ||
1618 | |||
1619 | #endif /* CONFIG_IA32_SUPPORT */ | ||
diff --git a/arch/ia64/kernel/machvec.c b/arch/ia64/kernel/machvec.c new file mode 100644 index 000000000000..c3a04ee7f4f6 --- /dev/null +++ b/arch/ia64/kernel/machvec.c | |||
@@ -0,0 +1,70 @@ | |||
1 | #include <linux/config.h> | ||
2 | #include <linux/module.h> | ||
3 | |||
4 | #include <asm/machvec.h> | ||
5 | #include <asm/system.h> | ||
6 | |||
7 | #ifdef CONFIG_IA64_GENERIC | ||
8 | |||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/string.h> | ||
11 | |||
12 | #include <asm/page.h> | ||
13 | |||
14 | struct ia64_machine_vector ia64_mv; | ||
15 | EXPORT_SYMBOL(ia64_mv); | ||
16 | |||
17 | static struct ia64_machine_vector * | ||
18 | lookup_machvec (const char *name) | ||
19 | { | ||
20 | extern struct ia64_machine_vector machvec_start[]; | ||
21 | extern struct ia64_machine_vector machvec_end[]; | ||
22 | struct ia64_machine_vector *mv; | ||
23 | |||
24 | for (mv = machvec_start; mv < machvec_end; ++mv) | ||
25 | if (strcmp (mv->name, name) == 0) | ||
26 | return mv; | ||
27 | |||
28 | return NULL; | ||
29 | } | ||
30 | |||
31 | void | ||
32 | machvec_init (const char *name) | ||
33 | { | ||
34 | struct ia64_machine_vector *mv; | ||
35 | |||
36 | mv = lookup_machvec(name); | ||
37 | if (!mv) { | ||
38 | panic("generic kernel failed to find machine vector for platform %s!", name); | ||
39 | } | ||
40 | ia64_mv = *mv; | ||
41 | printk(KERN_INFO "booting generic kernel on platform %s\n", name); | ||
42 | } | ||
43 | |||
44 | #endif /* CONFIG_IA64_GENERIC */ | ||
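
lookup_machvec() above scans an array the linker builds by collecting one descriptor per platform into a dedicated section bracketed by machvec_start[] and machvec_end[]. A hedged sketch of that registry pattern; the section name, macro, and platform entry here are illustrative, not the real machvec headers:

	struct example_machvec {
		const char *name;
		void (*setup)(char **);
	};

	/* Drop an entry into the section the linker script gathers. */
	#define MACHVEC_DEFINE(n, fn)					\
		static const struct example_machvec __mv_##n		\
		__attribute__((used, section(".machvec"))) = { #n, fn }

	static void hpzx1_setup(char **arg) { (void)arg; }
	MACHVEC_DEFINE(hpzx1, hpzx1_setup);

	/* The linker script then defines machvec_start[]/machvec_end[]
	 * around ".machvec", and lookup_machvec() walks that range. */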
45 | |||
46 | void | ||
47 | machvec_setup (char **arg) | ||
48 | { | ||
49 | } | ||
50 | EXPORT_SYMBOL(machvec_setup); | ||
51 | |||
52 | void | ||
53 | machvec_timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) | ||
54 | { | ||
55 | } | ||
56 | EXPORT_SYMBOL(machvec_timer_interrupt); | ||
57 | |||
58 | void | ||
59 | machvec_dma_sync_single (struct device *hwdev, dma_addr_t dma_handle, size_t size, int dir) | ||
60 | { | ||
61 | mb(); | ||
62 | } | ||
63 | EXPORT_SYMBOL(machvec_dma_sync_single); | ||
64 | |||
65 | void | ||
66 | machvec_dma_sync_sg (struct device *hwdev, struct scatterlist *sg, int n, int dir) | ||
67 | { | ||
68 | mb(); | ||
69 | } | ||
70 | EXPORT_SYMBOL(machvec_dma_sync_sg); | ||
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c new file mode 100644 index 000000000000..4d6c7b8f667b --- /dev/null +++ b/arch/ia64/kernel/mca.c | |||
@@ -0,0 +1,1470 @@ | |||
1 | /* | ||
2 | * File: mca.c | ||
3 | * Purpose: Generic MCA handling layer | ||
4 | * | ||
5 | * Updated for latest kernel | ||
6 | * Copyright (C) 2003 Hewlett-Packard Co | ||
7 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
8 | * | ||
9 | * Copyright (C) 2002 Dell Inc. | ||
10 | * Copyright (C) Matt Domsch (Matt_Domsch@dell.com) | ||
11 | * | ||
12 | * Copyright (C) 2002 Intel | ||
13 | * Copyright (C) Jenna Hall (jenna.s.hall@intel.com) | ||
14 | * | ||
15 | * Copyright (C) 2001 Intel | ||
16 | * Copyright (C) Fred Lewis (frederick.v.lewis@intel.com) | ||
17 | * | ||
18 | * Copyright (C) 2000 Intel | ||
19 | * Copyright (C) Chuck Fleckenstein (cfleck@co.intel.com) | ||
20 | * | ||
21 | * Copyright (C) 1999, 2004 Silicon Graphics, Inc. | ||
22 | * Copyright (C) Vijay Chander(vijay@engr.sgi.com) | ||
23 | * | ||
24 | * 03/04/15 D. Mosberger Added INIT backtrace support. | ||
25 | * 02/03/25 M. Domsch GUID cleanups | ||
26 | * | ||
27 | * 02/01/04 J. Hall Aligned MCA stack to 16 bytes, added platform vs. CPU | ||
28 | * error flag, set SAL default return values, changed | ||
29 | * error record structure to linked list, added init call | ||
30 | * to sal_get_state_info_size(). | ||
31 | * | ||
32 | * 01/01/03 F. Lewis Added setup of CMCI and CPEI IRQs, logging of corrected | ||
33 | * platform errors, completed code for logging of | ||
34 | * corrected & uncorrected machine check errors, and | ||
35 | * updated for conformance with Nov. 2000 revision of the | ||
36 | * SAL 3.0 spec. | ||
37 | * 00/03/29 C. Fleckenstein Fixed PAL/SAL update issues, began MCA bug fixes, logging issues, | ||
38 | * added min save state dump, added INIT handler. | ||
39 | * | ||
40 | * 2003-12-08 Keith Owens <kaos@sgi.com> | ||
41 | * smp_call_function() must not be called from interrupt context (can | ||
42 | * deadlock on tasklist_lock). Use keventd to call smp_call_function(). | ||
43 | * | ||
44 | * 2004-02-01 Keith Owens <kaos@sgi.com> | ||
45 | * Avoid deadlock when using printk() for MCA and INIT records. | ||
46 | * Delete all record printing code, moved to salinfo_decode in user space. | ||
47 | * Mark variables and functions static where possible. | ||
48 | * Delete dead variables and functions. | ||
49 | * Reorder to remove the need for forward declarations and to consolidate | ||
50 | * related code. | ||
51 | */ | ||
52 | #include <linux/config.h> | ||
53 | #include <linux/types.h> | ||
54 | #include <linux/init.h> | ||
55 | #include <linux/sched.h> | ||
56 | #include <linux/interrupt.h> | ||
57 | #include <linux/irq.h> | ||
58 | #include <linux/kallsyms.h> | ||
59 | #include <linux/smp_lock.h> | ||
60 | #include <linux/bootmem.h> | ||
61 | #include <linux/acpi.h> | ||
62 | #include <linux/timer.h> | ||
63 | #include <linux/module.h> | ||
64 | #include <linux/kernel.h> | ||
65 | #include <linux/smp.h> | ||
66 | #include <linux/workqueue.h> | ||
67 | |||
68 | #include <asm/delay.h> | ||
69 | #include <asm/machvec.h> | ||
70 | #include <asm/meminit.h> | ||
71 | #include <asm/page.h> | ||
72 | #include <asm/ptrace.h> | ||
73 | #include <asm/system.h> | ||
74 | #include <asm/sal.h> | ||
75 | #include <asm/mca.h> | ||
76 | |||
77 | #include <asm/irq.h> | ||
78 | #include <asm/hw_irq.h> | ||
79 | |||
80 | #if defined(IA64_MCA_DEBUG_INFO) | ||
81 | # define IA64_MCA_DEBUG(fmt...) printk(fmt) | ||
82 | #else | ||
83 | # define IA64_MCA_DEBUG(fmt...) | ||
84 | #endif | ||
85 | |||
86 | /* Used by mca_asm.S */ | ||
87 | ia64_mca_sal_to_os_state_t ia64_sal_to_os_handoff_state; | ||
88 | ia64_mca_os_to_sal_state_t ia64_os_to_sal_handoff_state; | ||
89 | u64 ia64_mca_serialize; | ||
90 | DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */ | ||
91 | DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */ | ||
92 | DEFINE_PER_CPU(u64, ia64_mca_pal_pte); /* PTE to map PAL code */ | ||
93 | DEFINE_PER_CPU(u64, ia64_mca_pal_base); /* vaddr PAL code granule */ | ||
94 | |||
95 | unsigned long __per_cpu_mca[NR_CPUS]; | ||
96 | |||
97 | /* In mca_asm.S */ | ||
98 | extern void ia64_monarch_init_handler (void); | ||
99 | extern void ia64_slave_init_handler (void); | ||
100 | |||
101 | static ia64_mc_info_t ia64_mc_info; | ||
102 | |||
103 | #define MAX_CPE_POLL_INTERVAL (15*60*HZ) /* 15 minutes */ | ||
104 | #define MIN_CPE_POLL_INTERVAL (2*60*HZ) /* 2 minutes */ | ||
105 | #define CMC_POLL_INTERVAL (1*60*HZ) /* 1 minute */ | ||
106 | #define CPE_HISTORY_LENGTH 5 | ||
107 | #define CMC_HISTORY_LENGTH 5 | ||
108 | |||
109 | static struct timer_list cpe_poll_timer; | ||
110 | static struct timer_list cmc_poll_timer; | ||
111 | /* | ||
112 | * This variable tells whether we are currently in polling mode. | ||
113 | * Start with this in the wrong state so we won't play w/ timers | ||
114 | * before the system is ready. | ||
115 | */ | ||
116 | static int cmc_polling_enabled = 1; | ||
117 | |||
118 | /* | ||
119 | * Clearing this variable prevents CPE polling from getting activated | ||
120 | * in mca_late_init. Use it if your system doesn't provide a CPEI, | ||
121 | * but encounters problems retrieving CPE logs. This should only be | ||
122 | * necessary for debugging. | ||
123 | */ | ||
124 | static int cpe_poll_enabled = 1; | ||
125 | |||
126 | extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe); | ||
127 | |||
128 | static int mca_init; | ||
129 | |||
130 | /* | ||
131 | * IA64_MCA log support | ||
132 | */ | ||
133 | #define IA64_MAX_LOGS 2 /* Double-buffering for nested MCAs */ | ||
134 | #define IA64_MAX_LOG_TYPES 4 /* MCA, INIT, CMC, CPE */ | ||
135 | |||
136 | typedef struct ia64_state_log_s | ||
137 | { | ||
138 | spinlock_t isl_lock; | ||
139 | int isl_index; | ||
140 | unsigned long isl_count; | ||
141 | ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ | ||
142 | } ia64_state_log_t; | ||
143 | |||
144 | static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES]; | ||
145 | |||
146 | #define IA64_LOG_ALLOCATE(it, size) \ | ||
147 | {ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)] = \ | ||
148 | (ia64_err_rec_t *)alloc_bootmem(size); \ | ||
149 | ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)] = \ | ||
150 | (ia64_err_rec_t *)alloc_bootmem(size);} | ||
151 | #define IA64_LOG_LOCK_INIT(it) spin_lock_init(&ia64_state_log[it].isl_lock) | ||
152 | #define IA64_LOG_LOCK(it) spin_lock_irqsave(&ia64_state_log[it].isl_lock, s) | ||
153 | #define IA64_LOG_UNLOCK(it) spin_unlock_irqrestore(&ia64_state_log[it].isl_lock,s) | ||
154 | #define IA64_LOG_NEXT_INDEX(it) ia64_state_log[it].isl_index | ||
155 | #define IA64_LOG_CURR_INDEX(it) (1 - ia64_state_log[it].isl_index) | ||
156 | #define IA64_LOG_INDEX_INC(it) \ | ||
157 | {ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index; \ | ||
158 | ia64_state_log[it].isl_count++;} | ||
159 | #define IA64_LOG_INDEX_DEC(it) \ | ||
160 | ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index | ||
161 | #define IA64_LOG_NEXT_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)])) | ||
162 | #define IA64_LOG_CURR_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)])) | ||
163 | #define IA64_LOG_COUNT(it) ia64_state_log[it].isl_count | ||
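
These macros implement a two-entry flip buffer: isl_index always names the buffer the NEXT record will be written into, and 1 - isl_index the most recently filled (CURR) one, so a nested MCA can be captured without clobbering the record being consumed. A small C illustration of the same invariant:

	struct flip_log {
		int index;		/* buffer the next record goes into */
		unsigned long count;	/* records captured so far */
		void *buf[2];		/* double buffer for nested events */
	};

	static void *next_buffer(struct flip_log *l) { return l->buf[l->index]; }
	static void *curr_buffer(struct flip_log *l) { return l->buf[1 - l->index]; }

	static void commit_record(struct flip_log *l)
	{
		l->index = 1 - l->index;	/* IA64_LOG_INDEX_INC: flip... */
		l->count++;			/* ...and count the record */
	}

	/* After commit_record(), curr_buffer() is the record just written
	 * and next_buffer() is free for a nested event. */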
164 | |||
165 | /* | ||
166 | * ia64_log_init | ||
167 | * Reset the OS ia64 log buffer | ||
168 | * Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE}) | ||
169 | * Outputs : None | ||
170 | */ | ||
171 | static void | ||
172 | ia64_log_init(int sal_info_type) | ||
173 | { | ||
174 | u64 max_size = 0; | ||
175 | |||
176 | IA64_LOG_NEXT_INDEX(sal_info_type) = 0; | ||
177 | IA64_LOG_LOCK_INIT(sal_info_type); | ||
178 | |||
179 | // SAL will tell us the maximum size of any error record of this type | ||
180 | max_size = ia64_sal_get_state_info_size(sal_info_type); | ||
181 | if (!max_size) | ||
182 | /* alloc_bootmem() doesn't like zero-sized allocations! */ | ||
183 | return; | ||
184 | |||
185 | // set up OS data structures to hold error info | ||
186 | IA64_LOG_ALLOCATE(sal_info_type, max_size); | ||
187 | memset(IA64_LOG_CURR_BUFFER(sal_info_type), 0, max_size); | ||
188 | memset(IA64_LOG_NEXT_BUFFER(sal_info_type), 0, max_size); | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * ia64_log_get | ||
193 | * | ||
194 | * Get the current MCA log from SAL and copy it into the OS log buffer. | ||
195 | * | ||
196 | * Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE}) | ||
197 | * irq_safe whether you can use printk at this point | ||
198 | * Outputs : size (total record length) | ||
199 | * *buffer (ptr to error record) | ||
200 | * | ||
201 | */ | ||
202 | static u64 | ||
203 | ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe) | ||
204 | { | ||
205 | sal_log_record_header_t *log_buffer; | ||
206 | u64 total_len = 0; | ||
207 | unsigned long s; /* flags for IA64_LOG_LOCK/UNLOCK */ | ||
208 | |||
209 | IA64_LOG_LOCK(sal_info_type); | ||
210 | |||
211 | /* Get the process state information */ | ||
212 | log_buffer = IA64_LOG_NEXT_BUFFER(sal_info_type); | ||
213 | |||
214 | total_len = ia64_sal_get_state_info(sal_info_type, (u64 *)log_buffer); | ||
215 | |||
216 | if (total_len) { | ||
217 | IA64_LOG_INDEX_INC(sal_info_type); | ||
218 | IA64_LOG_UNLOCK(sal_info_type); | ||
219 | if (irq_safe) { | ||
220 | IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. " | ||
221 | "Record length = %ld\n", __FUNCTION__, sal_info_type, total_len); | ||
222 | } | ||
223 | *buffer = (u8 *) log_buffer; | ||
224 | return total_len; | ||
225 | } else { | ||
226 | IA64_LOG_UNLOCK(sal_info_type); | ||
227 | return 0; | ||
228 | } | ||
229 | } | ||
230 | |||
231 | /* | ||
232 | * ia64_mca_log_sal_error_record | ||
233 | * | ||
234 | * This function retrieves a specified error record type from SAL | ||
235 | * and wakes up any processes waiting for error records. | ||
236 | * | ||
237 | * Inputs : sal_info_type (Type of error record MCA/CMC/CPE/INIT) | ||
238 | */ | ||
239 | static void | ||
240 | ia64_mca_log_sal_error_record(int sal_info_type) | ||
241 | { | ||
242 | u8 *buffer; | ||
243 | sal_log_record_header_t *rh; | ||
244 | u64 size; | ||
245 | int irq_safe = sal_info_type != SAL_INFO_TYPE_MCA && sal_info_type != SAL_INFO_TYPE_INIT; | ||
246 | #ifdef IA64_MCA_DEBUG_INFO | ||
247 | static const char * const rec_name[] = { "MCA", "INIT", "CMC", "CPE" }; | ||
248 | #endif | ||
249 | |||
250 | size = ia64_log_get(sal_info_type, &buffer, irq_safe); | ||
251 | if (!size) | ||
252 | return; | ||
253 | |||
254 | salinfo_log_wakeup(sal_info_type, buffer, size, irq_safe); | ||
255 | |||
256 | if (irq_safe) | ||
257 | IA64_MCA_DEBUG("CPU %d: SAL log contains %s error record\n", | ||
258 | smp_processor_id(), | ||
259 | sal_info_type < ARRAY_SIZE(rec_name) ? rec_name[sal_info_type] : "UNKNOWN"); | ||
260 | |||
261 | /* Clear logs from corrected errors in case there's no user-level logger */ | ||
262 | rh = (sal_log_record_header_t *)buffer; | ||
263 | if (rh->severity == sal_log_severity_corrected) | ||
264 | ia64_sal_clear_state_info(sal_info_type); | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * platform dependent error handling | ||
269 | */ | ||
270 | #ifndef PLATFORM_MCA_HANDLERS | ||
271 | |||
272 | #ifdef CONFIG_ACPI | ||
273 | |||
274 | static int cpe_vector = -1; | ||
275 | |||
276 | static irqreturn_t | ||
277 | ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) | ||
278 | { | ||
279 | static unsigned long cpe_history[CPE_HISTORY_LENGTH]; | ||
280 | static int index; | ||
281 | static DEFINE_SPINLOCK(cpe_history_lock); | ||
282 | |||
283 | IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n", | ||
284 | __FUNCTION__, cpe_irq, smp_processor_id()); | ||
285 | |||
286 | /* SAL spec states this should run w/ interrupts enabled */ | ||
287 | local_irq_enable(); | ||
288 | |||
289 | /* Get the CPE error record and log it */ | ||
290 | ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE); | ||
291 | |||
292 | spin_lock(&cpe_history_lock); | ||
293 | if (!cpe_poll_enabled && cpe_vector >= 0) { | ||
294 | |||
295 | int i, count = 1; /* we know 1 happened now */ | ||
296 | unsigned long now = jiffies; | ||
297 | |||
298 | for (i = 0; i < CPE_HISTORY_LENGTH; i++) { | ||
299 | if (now - cpe_history[i] <= HZ) | ||
300 | count++; | ||
301 | } | ||
302 | |||
303 | IA64_MCA_DEBUG(KERN_INFO "CPE threshold %d/%d\n", count, CPE_HISTORY_LENGTH); | ||
304 | if (count >= CPE_HISTORY_LENGTH) { | ||
305 | |||
306 | cpe_poll_enabled = 1; | ||
307 | spin_unlock(&cpe_history_lock); | ||
308 | disable_irq_nosync(local_vector_to_irq(IA64_CPE_VECTOR)); | ||
309 | |||
310 | /* | ||
311 | * Corrected errors will still be corrected, but | ||
312 | * make sure there's a log somewhere that indicates | ||
313 | * something is generating more than we can handle. | ||
314 | */ | ||
315 | printk(KERN_WARNING "WARNING: Switching to polling CPE handler; error records may be lost\n"); | ||
316 | |||
317 | mod_timer(&cpe_poll_timer, jiffies + MIN_CPE_POLL_INTERVAL); | ||
318 | |||
319 | /* lock already released, get out now */ | ||
320 | return IRQ_HANDLED; | ||
321 | } else { | ||
322 | cpe_history[index++] = now; | ||
323 | if (index == CPE_HISTORY_LENGTH) | ||
324 | index = 0; | ||
325 | } | ||
326 | } | ||
327 | spin_unlock(&cpe_history_lock); | ||
328 | return IRQ_HANDLED; | ||
329 | } | ||
330 | |||
331 | #endif /* CONFIG_ACPI */ | ||
332 | |||
333 | static void | ||
334 | show_min_state (pal_min_state_area_t *minstate) | ||
335 | { | ||
336 | u64 iip = minstate->pmsa_iip + ((struct ia64_psr *)(&minstate->pmsa_ipsr))->ri; | ||
337 | u64 xip = minstate->pmsa_xip + ((struct ia64_psr *)(&minstate->pmsa_xpsr))->ri; | ||
338 | |||
339 | printk("NaT bits\t%016lx\n", minstate->pmsa_nat_bits); | ||
340 | printk("pr\t\t%016lx\n", minstate->pmsa_pr); | ||
341 | printk("b0\t\t%016lx ", minstate->pmsa_br0); print_symbol("%s\n", minstate->pmsa_br0); | ||
342 | printk("ar.rsc\t\t%016lx\n", minstate->pmsa_rsc); | ||
343 | printk("cr.iip\t\t%016lx ", iip); print_symbol("%s\n", iip); | ||
344 | printk("cr.ipsr\t\t%016lx\n", minstate->pmsa_ipsr); | ||
345 | printk("cr.ifs\t\t%016lx\n", minstate->pmsa_ifs); | ||
346 | printk("xip\t\t%016lx ", xip); print_symbol("%s\n", xip); | ||
347 | printk("xpsr\t\t%016lx\n", minstate->pmsa_xpsr); | ||
348 | printk("xfs\t\t%016lx\n", minstate->pmsa_xfs); | ||
349 | printk("b1\t\t%016lx ", minstate->pmsa_br1); | ||
350 | print_symbol("%s\n", minstate->pmsa_br1); | ||
351 | |||
352 | printk("\nstatic registers r0-r15:\n"); | ||
353 | printk(" r0- 3 %016lx %016lx %016lx %016lx\n", | ||
354 | 0UL, minstate->pmsa_gr[0], minstate->pmsa_gr[1], minstate->pmsa_gr[2]); | ||
355 | printk(" r4- 7 %016lx %016lx %016lx %016lx\n", | ||
356 | minstate->pmsa_gr[3], minstate->pmsa_gr[4], | ||
357 | minstate->pmsa_gr[5], minstate->pmsa_gr[6]); | ||
358 | printk(" r8-11 %016lx %016lx %016lx %016lx\n", | ||
359 | minstate->pmsa_gr[7], minstate->pmsa_gr[8], | ||
360 | minstate->pmsa_gr[9], minstate->pmsa_gr[10]); | ||
361 | printk("r12-15 %016lx %016lx %016lx %016lx\n", | ||
362 | minstate->pmsa_gr[11], minstate->pmsa_gr[12], | ||
363 | minstate->pmsa_gr[13], minstate->pmsa_gr[14]); | ||
364 | |||
365 | printk("\nbank 0:\n"); | ||
366 | printk("r16-19 %016lx %016lx %016lx %016lx\n", | ||
367 | minstate->pmsa_bank0_gr[0], minstate->pmsa_bank0_gr[1], | ||
368 | minstate->pmsa_bank0_gr[2], minstate->pmsa_bank0_gr[3]); | ||
369 | printk("r20-23 %016lx %016lx %016lx %016lx\n", | ||
370 | minstate->pmsa_bank0_gr[4], minstate->pmsa_bank0_gr[5], | ||
371 | minstate->pmsa_bank0_gr[6], minstate->pmsa_bank0_gr[7]); | ||
372 | printk("r24-27 %016lx %016lx %016lx %016lx\n", | ||
373 | minstate->pmsa_bank0_gr[8], minstate->pmsa_bank0_gr[9], | ||
374 | minstate->pmsa_bank0_gr[10], minstate->pmsa_bank0_gr[11]); | ||
375 | printk("r28-31 %016lx %016lx %016lx %016lx\n", | ||
376 | minstate->pmsa_bank0_gr[12], minstate->pmsa_bank0_gr[13], | ||
377 | minstate->pmsa_bank0_gr[14], minstate->pmsa_bank0_gr[15]); | ||
378 | |||
379 | printk("\nbank 1:\n"); | ||
380 | printk("r16-19 %016lx %016lx %016lx %016lx\n", | ||
381 | minstate->pmsa_bank1_gr[0], minstate->pmsa_bank1_gr[1], | ||
382 | minstate->pmsa_bank1_gr[2], minstate->pmsa_bank1_gr[3]); | ||
383 | printk("r20-23 %016lx %016lx %016lx %016lx\n", | ||
384 | minstate->pmsa_bank1_gr[4], minstate->pmsa_bank1_gr[5], | ||
385 | minstate->pmsa_bank1_gr[6], minstate->pmsa_bank1_gr[7]); | ||
386 | printk("r24-27 %016lx %016lx %016lx %016lx\n", | ||
387 | minstate->pmsa_bank1_gr[8], minstate->pmsa_bank1_gr[9], | ||
388 | minstate->pmsa_bank1_gr[10], minstate->pmsa_bank1_gr[11]); | ||
389 | printk("r28-31 %016lx %016lx %016lx %016lx\n", | ||
390 | minstate->pmsa_bank1_gr[12], minstate->pmsa_bank1_gr[13], | ||
391 | minstate->pmsa_bank1_gr[14], minstate->pmsa_bank1_gr[15]); | ||
392 | } | ||
393 | |||
394 | static void | ||
395 | fetch_min_state (pal_min_state_area_t *ms, struct pt_regs *pt, struct switch_stack *sw) | ||
396 | { | ||
397 | u64 *dst_banked, *src_banked, bit, shift, nat_bits; | ||
398 | int i; | ||
399 | |||
400 | /* | ||
401 | * First, update the pt-regs and switch-stack structures with the contents stored | ||
402 | * in the min-state area: | ||
403 | */ | ||
404 | if (((struct ia64_psr *) &ms->pmsa_ipsr)->ic == 0) { | ||
405 | pt->cr_ipsr = ms->pmsa_xpsr; | ||
406 | pt->cr_iip = ms->pmsa_xip; | ||
407 | pt->cr_ifs = ms->pmsa_xfs; | ||
408 | } else { | ||
409 | pt->cr_ipsr = ms->pmsa_ipsr; | ||
410 | pt->cr_iip = ms->pmsa_iip; | ||
411 | pt->cr_ifs = ms->pmsa_ifs; | ||
412 | } | ||
413 | pt->ar_rsc = ms->pmsa_rsc; | ||
414 | pt->pr = ms->pmsa_pr; | ||
415 | pt->r1 = ms->pmsa_gr[0]; | ||
416 | pt->r2 = ms->pmsa_gr[1]; | ||
417 | pt->r3 = ms->pmsa_gr[2]; | ||
418 | sw->r4 = ms->pmsa_gr[3]; | ||
419 | sw->r5 = ms->pmsa_gr[4]; | ||
420 | sw->r6 = ms->pmsa_gr[5]; | ||
421 | sw->r7 = ms->pmsa_gr[6]; | ||
422 | pt->r8 = ms->pmsa_gr[7]; | ||
423 | pt->r9 = ms->pmsa_gr[8]; | ||
424 | pt->r10 = ms->pmsa_gr[9]; | ||
425 | pt->r11 = ms->pmsa_gr[10]; | ||
426 | pt->r12 = ms->pmsa_gr[11]; | ||
427 | pt->r13 = ms->pmsa_gr[12]; | ||
428 | pt->r14 = ms->pmsa_gr[13]; | ||
429 | pt->r15 = ms->pmsa_gr[14]; | ||
430 | dst_banked = &pt->r16; /* r16-r31 are contiguous in struct pt_regs */ | ||
431 | src_banked = ms->pmsa_bank1_gr; | ||
432 | for (i = 0; i < 16; ++i) | ||
433 | dst_banked[i] = src_banked[i]; | ||
434 | pt->b0 = ms->pmsa_br0; | ||
435 | sw->b1 = ms->pmsa_br1; | ||
436 | |||
437 | /* construct the NaT bits for the pt-regs structure: */ | ||
438 | # define PUT_NAT_BIT(dst, addr) \ | ||
439 | do { \ | ||
440 | bit = nat_bits & 1; nat_bits >>= 1; \ | ||
441 | shift = ((unsigned long) addr >> 3) & 0x3f; \ | ||
442 | dst = ((dst) & ~(1UL << shift)) | (bit << shift); \ | ||
443 | } while (0) | ||
444 | |||
445 | /* Rotate the saved NaT bits such that bit 0 corresponds to pmsa_gr[0]: */ | ||
446 | shift = ((unsigned long) &ms->pmsa_gr[0] >> 3) & 0x3f; | ||
447 | nat_bits = (ms->pmsa_nat_bits >> shift) | (ms->pmsa_nat_bits << (64 - shift)); | ||
448 | |||
449 | PUT_NAT_BIT(sw->caller_unat, &pt->r1); | ||
450 | PUT_NAT_BIT(sw->caller_unat, &pt->r2); | ||
451 | PUT_NAT_BIT(sw->caller_unat, &pt->r3); | ||
452 | PUT_NAT_BIT(sw->ar_unat, &sw->r4); | ||
453 | PUT_NAT_BIT(sw->ar_unat, &sw->r5); | ||
454 | PUT_NAT_BIT(sw->ar_unat, &sw->r6); | ||
455 | PUT_NAT_BIT(sw->ar_unat, &sw->r7); | ||
456 | PUT_NAT_BIT(sw->caller_unat, &pt->r8); PUT_NAT_BIT(sw->caller_unat, &pt->r9); | ||
457 | PUT_NAT_BIT(sw->caller_unat, &pt->r10); PUT_NAT_BIT(sw->caller_unat, &pt->r11); | ||
458 | PUT_NAT_BIT(sw->caller_unat, &pt->r12); PUT_NAT_BIT(sw->caller_unat, &pt->r13); | ||
459 | PUT_NAT_BIT(sw->caller_unat, &pt->r14); PUT_NAT_BIT(sw->caller_unat, &pt->r15); | ||
460 | nat_bits >>= 16; /* skip over bank0 NaT bits */ | ||
461 | PUT_NAT_BIT(sw->caller_unat, &pt->r16); PUT_NAT_BIT(sw->caller_unat, &pt->r17); | ||
462 | PUT_NAT_BIT(sw->caller_unat, &pt->r18); PUT_NAT_BIT(sw->caller_unat, &pt->r19); | ||
463 | PUT_NAT_BIT(sw->caller_unat, &pt->r20); PUT_NAT_BIT(sw->caller_unat, &pt->r21); | ||
464 | PUT_NAT_BIT(sw->caller_unat, &pt->r22); PUT_NAT_BIT(sw->caller_unat, &pt->r23); | ||
465 | PUT_NAT_BIT(sw->caller_unat, &pt->r24); PUT_NAT_BIT(sw->caller_unat, &pt->r25); | ||
466 | PUT_NAT_BIT(sw->caller_unat, &pt->r26); PUT_NAT_BIT(sw->caller_unat, &pt->r27); | ||
467 | PUT_NAT_BIT(sw->caller_unat, &pt->r28); PUT_NAT_BIT(sw->caller_unat, &pt->r29); | ||
468 | PUT_NAT_BIT(sw->caller_unat, &pt->r30); PUT_NAT_BIT(sw->caller_unat, &pt->r31); | ||
469 | } | ||
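
The PUT_NAT_BIT() macro above deposits one NaT bit into whichever UNAT-style word guards the destination address, using bits 3-8 of that address as the bit position. A stand-alone sketch of the same deposit, with made-up register values:

    /* Userspace rendering of PUT_NAT_BIT() above. */
    #include <stdio.h>
    #include <stdint.h>

    /* Deposit the next NaT bit into the UNAT word guarding 'addr'. */
    static void put_nat_bit(uint64_t *unat, const void *addr, uint64_t *nat_bits)
    {
            uint64_t bit = *nat_bits & 1;
            unsigned shift = ((uintptr_t)addr >> 3) & 0x3f;

            *nat_bits >>= 1;
            *unat = (*unat & ~(1ULL << shift)) | (bit << shift);
    }

    int main(void)
    {
            uint64_t regs[4] = {0}, unat = 0;
            uint64_t nat_bits = 0x5;    /* pretend regs[0] and regs[2] are NaT */
            int i;

            for (i = 0; i < 4; i++)
                    put_nat_bit(&unat, &regs[i], &nat_bits);
            printf("unat = %#llx\n", (unsigned long long)unat);
            return 0;
    }
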
470 | |||
471 | static void | ||
472 | init_handler_platform (pal_min_state_area_t *ms, | ||
473 | struct pt_regs *pt, struct switch_stack *sw) | ||
474 | { | ||
475 | struct unw_frame_info info; | ||
476 | |||
477 | /* If a kernel debugger is available, call it here; else just dump the registers. */ | ||
478 | |||
479 | /* | ||
480 | * Wait for a bit. On some machines (e.g., HP's zx2000 and zx6000), INIT can be | ||
481 | * generated via the BMC's command-line interface, but since the console is on the | ||
482 | * same serial line, the user will need some time to switch out of the BMC before | ||
483 | * the dump begins. | ||
484 | */ | ||
485 | printk("Delaying for 5 seconds...\n"); | ||
486 | udelay(5*1000000); | ||
487 | show_min_state(ms); | ||
488 | |||
489 | printk("Backtrace of current task (pid %d, %s)\n", current->pid, current->comm); | ||
490 | fetch_min_state(ms, pt, sw); | ||
491 | unw_init_from_interruption(&info, current, pt, sw); | ||
492 | ia64_do_show_stack(&info, NULL); | ||
493 | |||
494 | #ifdef CONFIG_SMP | ||
495 | /* read_trylock() would be handy... */ | ||
496 | if (!tasklist_lock.write_lock) | ||
497 | read_lock(&tasklist_lock); | ||
498 | #endif | ||
499 | { | ||
500 | struct task_struct *g, *t; | ||
501 | do_each_thread (g, t) { | ||
502 | if (t == current) | ||
503 | continue; | ||
504 | |||
505 | printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); | ||
506 | show_stack(t, NULL); | ||
507 | } while_each_thread (g, t); | ||
508 | } | ||
509 | #ifdef CONFIG_SMP | ||
510 | if (!tasklist_lock.write_lock) | ||
511 | read_unlock(&tasklist_lock); | ||
512 | #endif | ||
513 | |||
514 | printk("\nINIT dump complete. Please reboot now.\n"); | ||
515 | while (1); /* hang city if no debugger */ | ||
516 | } | ||
517 | |||
518 | #ifdef CONFIG_ACPI | ||
519 | /* | ||
520 | * ia64_mca_register_cpev | ||
521 | * | ||
522 | * Register the corrected platform error vector with SAL. | ||
523 | * | ||
524 | * Inputs | ||
525 | * cpev Corrected Platform Error Vector number | ||
526 | * | ||
527 | * Outputs | ||
528 | * None | ||
529 | */ | ||
530 | static void | ||
531 | ia64_mca_register_cpev (int cpev) | ||
532 | { | ||
533 | /* Register the CPE interrupt vector with SAL */ | ||
534 | struct ia64_sal_retval isrv; | ||
535 | |||
536 | isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_CPE_INT, SAL_MC_PARAM_MECHANISM_INT, cpev, 0, 0); | ||
537 | if (isrv.status) { | ||
538 | printk(KERN_ERR "Failed to register Corrected Platform " | ||
539 | "Error interrupt vector with SAL (status %ld)\n", isrv.status); | ||
540 | return; | ||
541 | } | ||
542 | |||
543 | IA64_MCA_DEBUG("%s: corrected platform error " | ||
544 | "vector %#x registered\n", __FUNCTION__, cpev); | ||
545 | } | ||
546 | #endif /* CONFIG_ACPI */ | ||
547 | |||
548 | #endif /* PLATFORM_MCA_HANDLERS */ | ||
549 | |||
550 | /* | ||
551 | * ia64_mca_cmc_vector_setup | ||
552 | * | ||
553 | * Setup the corrected machine check vector register in the processor. | ||
554 | * (The interrupt is masked on boot; ia64_mca_late_init() unmasks it.) | ||
555 | * This function is invoked on a per-processor basis. | ||
556 | * | ||
557 | * Inputs | ||
558 | * None | ||
559 | * | ||
560 | * Outputs | ||
561 | * None | ||
562 | */ | ||
563 | void | ||
564 | ia64_mca_cmc_vector_setup (void) | ||
565 | { | ||
566 | cmcv_reg_t cmcv; | ||
567 | |||
568 | cmcv.cmcv_regval = 0; | ||
569 | cmcv.cmcv_mask = 1; /* Mask/disable interrupt at first */ | ||
570 | cmcv.cmcv_vector = IA64_CMC_VECTOR; | ||
571 | ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); | ||
572 | |||
573 | IA64_MCA_DEBUG("%s: CPU %d corrected " | ||
574 | "machine check vector %#x registered.\n", | ||
575 | __FUNCTION__, smp_processor_id(), IA64_CMC_VECTOR); | ||
576 | |||
577 | IA64_MCA_DEBUG("%s: CPU %d CMCV = %#016lx\n", | ||
578 | __FUNCTION__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV)); | ||
579 | } | ||
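
cmcv_reg_t itself is defined in asm/mca.h; the sketch below only illustrates the shape such an LVT-style register has, with the vector in the low bits and a mask bit above. The field layout here is an assumption for illustration, not the real definition.

    #include <stdio.h>
    #include <stdint.h>

    typedef union {
            uint64_t regval;
            struct {
                    uint64_t vector    : 8;     /* interrupt vector to raise (layout assumed) */
                    uint64_t reserved0 : 8;
                    uint64_t mask      : 1;     /* 1 = interrupt disabled */
                    uint64_t reserved1 : 47;
            } f;
    } cmcv_sketch_t;

    int main(void)
    {
            cmcv_sketch_t cmcv = { .regval = 0 };

            cmcv.f.mask   = 1;          /* start masked, as the setup above does */
            cmcv.f.vector = 0x1f;       /* stand-in for IA64_CMC_VECTOR */
            printf("CMCV = %#llx\n", (unsigned long long)cmcv.regval);
            return 0;
    }
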
580 | |||
581 | /* | ||
582 | * ia64_mca_cmc_vector_disable | ||
583 | * | ||
584 | * Mask the corrected machine check vector register in the processor. | ||
585 | * This function is invoked on a per-processor basis. | ||
586 | * | ||
587 | * Inputs | ||
588 | * dummy(unused) | ||
589 | * | ||
590 | * Outputs | ||
591 | * None | ||
592 | */ | ||
593 | static void | ||
594 | ia64_mca_cmc_vector_disable (void *dummy) | ||
595 | { | ||
596 | cmcv_reg_t cmcv; | ||
597 | |||
598 | cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV); | ||
599 | |||
600 | cmcv.cmcv_mask = 1; /* Mask/disable interrupt */ | ||
601 | ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); | ||
602 | |||
603 | IA64_MCA_DEBUG("%s: CPU %d corrected " | ||
604 | "machine check vector %#x disabled.\n", | ||
605 | __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector); | ||
606 | } | ||
607 | |||
608 | /* | ||
609 | * ia64_mca_cmc_vector_enable | ||
610 | * | ||
611 | * Unmask the corrected machine check vector register in the processor. | ||
612 | * This function is invoked on a per-processor basis. | ||
613 | * | ||
614 | * Inputs | ||
615 | * dummy(unused) | ||
616 | * | ||
617 | * Outputs | ||
618 | * None | ||
619 | */ | ||
620 | static void | ||
621 | ia64_mca_cmc_vector_enable (void *dummy) | ||
622 | { | ||
623 | cmcv_reg_t cmcv; | ||
624 | |||
625 | cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV); | ||
626 | |||
627 | cmcv.cmcv_mask = 0; /* Unmask/enable interrupt */ | ||
628 | ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); | ||
629 | |||
630 | IA64_MCA_DEBUG("%s: CPU %d corrected " | ||
631 | "machine check vector %#x enabled.\n", | ||
632 | __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector); | ||
633 | } | ||
634 | |||
635 | /* | ||
636 | * ia64_mca_cmc_vector_disable_keventd | ||
637 | * | ||
638 | * Called via keventd (smp_call_function() is not safe in interrupt context) to | ||
639 | * disable the cmc interrupt vector. | ||
640 | */ | ||
641 | static void | ||
642 | ia64_mca_cmc_vector_disable_keventd(void *unused) | ||
643 | { | ||
644 | on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 1, 0); | ||
645 | } | ||
646 | |||
647 | /* | ||
648 | * ia64_mca_cmc_vector_enable_keventd | ||
649 | * | ||
650 | * Called via keventd (smp_call_function() is not safe in interrupt context) to | ||
651 | * enable the cmc interrupt vector. | ||
652 | */ | ||
653 | static void | ||
654 | ia64_mca_cmc_vector_enable_keventd(void *unused) | ||
655 | { | ||
656 | on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 1, 0); | ||
657 | } | ||
658 | |||
659 | /* | ||
660 | * ia64_mca_wakeup_ipi_wait | ||
661 | * | ||
662 | * Wait for the inter-cpu interrupt to be sent by the | ||
663 | * monarch processor once it is done with handling the | ||
664 | * MCA. | ||
665 | * | ||
666 | * Inputs : None | ||
667 | * Outputs : None | ||
668 | */ | ||
669 | static void | ||
670 | ia64_mca_wakeup_ipi_wait(void) | ||
671 | { | ||
672 | int irr_num = (IA64_MCA_WAKEUP_VECTOR >> 6); | ||
673 | int irr_bit = (IA64_MCA_WAKEUP_VECTOR & 0x3f); | ||
674 | u64 irr = 0; | ||
675 | |||
676 | do { | ||
677 | switch(irr_num) { | ||
678 | case 0: | ||
679 | irr = ia64_getreg(_IA64_REG_CR_IRR0); | ||
680 | break; | ||
681 | case 1: | ||
682 | irr = ia64_getreg(_IA64_REG_CR_IRR1); | ||
683 | break; | ||
684 | case 2: | ||
685 | irr = ia64_getreg(_IA64_REG_CR_IRR2); | ||
686 | break; | ||
687 | case 3: | ||
688 | irr = ia64_getreg(_IA64_REG_CR_IRR3); | ||
689 | break; | ||
690 | } | ||
691 | cpu_relax(); | ||
692 | } while (!(irr & (1UL << irr_bit))) ; | ||
693 | } | ||
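
The IRR indexing above generalizes to any vector: with four 64-bit IRR registers, vector >> 6 selects the register and vector & 0x3f selects the bit within it. A tiny sketch (the vector value is made up):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            unsigned vector = 0x3e;     /* stand-in for IA64_MCA_WAKEUP_VECTOR */
            uint64_t irr[4] = {0};

            irr[vector >> 6] |= 1ULL << (vector & 0x3f);    /* interrupt arrives */

            int pending = (irr[vector >> 6] >> (vector & 0x3f)) & 1;
            printf("vector %#x: word %u, bit %u, pending=%d\n",
                   vector, vector >> 6, vector & 0x3f, pending);
            return 0;
    }
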
694 | |||
695 | /* | ||
696 | * ia64_mca_wakeup | ||
697 | * | ||
698 | * Send an inter-cpu interrupt to wake up a particular cpu | ||
699 | * and mark that cpu as being out of rendezvous. | ||
700 | * | ||
701 | * Inputs : cpuid | ||
702 | * Outputs : None | ||
703 | */ | ||
704 | static void | ||
705 | ia64_mca_wakeup(int cpu) | ||
706 | { | ||
707 | platform_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0); | ||
708 | ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; | ||
709 | |||
710 | } | ||
711 | |||
712 | /* | ||
713 | * ia64_mca_wakeup_all | ||
714 | * | ||
715 | * Wake up all the cpus which have previously checked in at the rendezvous. | ||
716 | * | ||
717 | * Inputs : None | ||
718 | * Outputs : None | ||
719 | */ | ||
720 | static void | ||
721 | ia64_mca_wakeup_all(void) | ||
722 | { | ||
723 | int cpu; | ||
724 | |||
725 | /* Wake up (and clear the rendezvous checkin flag of) each checked-in cpu */ | ||
726 | for(cpu = 0; cpu < NR_CPUS; cpu++) { | ||
727 | if (!cpu_online(cpu)) | ||
728 | continue; | ||
729 | if (ia64_mc_info.imi_rendez_checkin[cpu] == IA64_MCA_RENDEZ_CHECKIN_DONE) | ||
730 | ia64_mca_wakeup(cpu); | ||
731 | } | ||
732 | |||
733 | } | ||
734 | |||
735 | /* | ||
736 | * ia64_mca_rendez_interrupt_handler | ||
737 | * | ||
738 | * This is the handler used to put slave processors into a spin loop | ||
739 | * while the monarch processor does the MCA handling; each slave is | ||
740 | * woken up once the monarch is done. | ||
741 | * | ||
742 | * Inputs : None | ||
743 | * Outputs : None | ||
744 | */ | ||
745 | static irqreturn_t | ||
746 | ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs) | ||
747 | { | ||
748 | unsigned long flags; | ||
749 | int cpu = smp_processor_id(); | ||
750 | |||
751 | /* Mask all interrupts */ | ||
752 | local_irq_save(flags); | ||
753 | |||
754 | ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE; | ||
755 | /* Register with the SAL monarch that the slave has | ||
756 | * reached SAL | ||
757 | */ | ||
758 | ia64_sal_mc_rendez(); | ||
759 | |||
760 | /* Wait for the wakeup IPI from the monarch | ||
761 | * This waiting is done by polling on the wakeup-interrupt | ||
762 | * vector bit in the processor's IRRs | ||
763 | */ | ||
764 | ia64_mca_wakeup_ipi_wait(); | ||
765 | |||
766 | /* Enable all interrupts */ | ||
767 | local_irq_restore(flags); | ||
768 | return IRQ_HANDLED; | ||
769 | } | ||
770 | |||
771 | /* | ||
772 | * ia64_mca_wakeup_int_handler | ||
773 | * | ||
774 | * The interrupt handler for processing the inter-cpu interrupt sent to a | ||
775 | * slave cpu that was spinning in the rendezvous loop. | ||
776 | * Since that spinning is done with interrupts turned off, by | ||
777 | * polling on the wakeup-interrupt bit in the IRR, there is | ||
778 | * nothing useful to be done in the handler itself. | ||
779 | * | ||
780 | * Inputs : wakeup_irq (Wakeup-interrupt bit) | ||
781 | * arg (Interrupt handler specific argument) | ||
782 | * ptregs (Exception frame at the time of the interrupt) | ||
783 | * Outputs : None | ||
784 | * | ||
785 | */ | ||
786 | static irqreturn_t | ||
787 | ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg, struct pt_regs *ptregs) | ||
788 | { | ||
789 | return IRQ_HANDLED; | ||
790 | } | ||
791 | |||
792 | /* | ||
793 | * ia64_return_to_sal_check | ||
794 | * | ||
795 | * This function is called before returning from the OS_MCA handler | ||
796 | * to the OS_MCA dispatch code, which finally takes control back | ||
797 | * to SAL. | ||
798 | * Its main purpose is to set up the OS_MCA-to-SAL return state, | ||
799 | * which the OS_MCA dispatch code uses just before going | ||
800 | * back to SAL. | ||
801 | * | ||
802 | * Inputs : recover (non-zero if the MCA was corrected) | ||
803 | * Outputs : None | ||
804 | */ | ||
805 | |||
806 | static void | ||
807 | ia64_return_to_sal_check(int recover) | ||
808 | { | ||
809 | |||
810 | /* Copy the relevant state from the SAL-to-OS handoff area | ||
811 | * so that it is available at the time of the OS-to-SAL handoff | ||
812 | */ | ||
813 | ia64_os_to_sal_handoff_state.imots_sal_gp = | ||
814 | ia64_sal_to_os_handoff_state.imsto_sal_gp; | ||
815 | |||
816 | ia64_os_to_sal_handoff_state.imots_sal_check_ra = | ||
817 | ia64_sal_to_os_handoff_state.imsto_sal_check_ra; | ||
818 | |||
819 | if (recover) | ||
820 | ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_CORRECTED; | ||
821 | else | ||
822 | ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_COLD_BOOT; | ||
823 | |||
824 | /* Default = tell SAL to return to same context */ | ||
825 | ia64_os_to_sal_handoff_state.imots_context = IA64_MCA_SAME_CONTEXT; | ||
826 | |||
827 | ia64_os_to_sal_handoff_state.imots_new_min_state = | ||
828 | (u64 *)ia64_sal_to_os_handoff_state.pal_min_state; | ||
829 | |||
830 | } | ||
831 | |||
832 | /* Function pointer for extra MCA recovery */ | ||
833 | int (*ia64_mca_ucmc_extension) | ||
834 | (void*,ia64_mca_sal_to_os_state_t*,ia64_mca_os_to_sal_state_t*) | ||
835 | = NULL; | ||
836 | |||
837 | int | ||
838 | ia64_reg_MCA_extension(void *fn) | ||
839 | { | ||
840 | if (ia64_mca_ucmc_extension) | ||
841 | return 1; | ||
842 | |||
843 | ia64_mca_ucmc_extension = fn; | ||
844 | return 0; | ||
845 | } | ||
846 | |||
847 | void | ||
848 | ia64_unreg_MCA_extension(void) | ||
849 | { | ||
850 | if (ia64_mca_ucmc_extension) | ||
851 | ia64_mca_ucmc_extension = NULL; | ||
852 | } | ||
853 | |||
854 | EXPORT_SYMBOL(ia64_reg_MCA_extension); | ||
855 | EXPORT_SYMBOL(ia64_unreg_MCA_extension); | ||
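
The pair above is a plain one-slot callback registration: the first caller claims the slot, later callers are refused until it is released. A self-contained sketch with a hypothetical handler:

    #include <stdio.h>

    static int (*extension)(void *);

    static int reg_extension(int (*fn)(void *))
    {
            if (extension)
                    return 1;           /* slot already taken */
            extension = fn;
            return 0;
    }

    static void unreg_extension(void)
    {
            extension = NULL;
    }

    static int demo_handler(void *arg) { (void)arg; return 0; }

    int main(void)
    {
            printf("first:  %d\n", reg_extension(demo_handler)); /* 0: ok */
            printf("second: %d\n", reg_extension(demo_handler)); /* 1: busy */
            unreg_extension();
            printf("third:  %d\n", reg_extension(demo_handler)); /* 0: ok again */
            return 0;
    }
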
856 | |||
857 | /* | ||
858 | * ia64_mca_ucmc_handler | ||
859 | * | ||
860 | * This is the uncorrectable machine check handler, called from the | ||
861 | * OS_MCA dispatch code, which is in turn called from SAL_CHECK(). | ||
862 | * This is the place where the core of OS MCA handling is done. | ||
863 | * Right now the logs are extracted and displayed in a well-defined | ||
864 | * format. This handler code is supposed to be run only on the | ||
865 | * monarch processor. Once the monarch is done with MCA handling, | ||
866 | * further MCA logging is re-enabled by clearing the logs. | ||
867 | * The monarch also has the duty of sending wakeup IPIs to pull the | ||
868 | * slave processors out of the rendezvous spin loop. | ||
869 | * | ||
870 | * Inputs : None | ||
871 | * Outputs : None | ||
872 | */ | ||
873 | void | ||
874 | ia64_mca_ucmc_handler(void) | ||
875 | { | ||
876 | pal_processor_state_info_t *psp = (pal_processor_state_info_t *) | ||
877 | &ia64_sal_to_os_handoff_state.proc_state_param; | ||
878 | int recover; | ||
879 | |||
880 | /* Get the MCA error record and log it */ | ||
881 | ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); | ||
882 | |||
883 | /* Recoverable if a TLB error is the only error in this SAL error record */ | ||
884 | recover = (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc)) | ||
885 | /* other error recovery */ | ||
886 | || (ia64_mca_ucmc_extension | ||
887 | && ia64_mca_ucmc_extension( | ||
888 | IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA), | ||
889 | &ia64_sal_to_os_handoff_state, | ||
890 | &ia64_os_to_sal_handoff_state)); | ||
891 | |||
892 | if (recover) { | ||
893 | sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA); | ||
894 | rh->severity = sal_log_severity_corrected; | ||
895 | ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA); | ||
896 | } | ||
897 | /* | ||
898 | * Wakeup all the processors which are spinning in the rendezvous | ||
899 | * loop. | ||
900 | */ | ||
901 | ia64_mca_wakeup_all(); | ||
902 | |||
903 | /* Return to SAL */ | ||
904 | ia64_return_to_sal_check(recover); | ||
905 | } | ||
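
The recovery predicate above reads: recover only when the TLB-check bit is set and none of the cache, bus, register-file, or micro-architectural check bits are. The sketch below mirrors that test; the struct merely mimics pal_processor_state_info_t, and its field layout is illustrative.

    #include <stdio.h>

    struct psp_sketch {
            unsigned tc : 1;            /* TLB check */
            unsigned cc : 1;            /* cache check */
            unsigned bc : 1;            /* bus check */
            unsigned rc : 1;            /* register-file check */
            unsigned uc : 1;            /* micro-architectural check */
    };

    static int recoverable(const struct psp_sketch *psp)
    {
            return psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc);
    }

    int main(void)
    {
            struct psp_sketch tlb_only = { .tc = 1 };
            struct psp_sketch tlb_and_bus = { .tc = 1, .bc = 1 };

            printf("tlb only:  %d\n", recoverable(&tlb_only));      /* 1 */
            printf("tlb + bus: %d\n", recoverable(&tlb_and_bus));   /* 0 */
            return 0;
    }
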
906 | |||
907 | static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd, NULL); | ||
908 | static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd, NULL); | ||
909 | |||
910 | /* | ||
911 | * ia64_mca_cmc_int_handler | ||
912 | * | ||
913 | * This is the corrected machine check interrupt handler. | ||
914 | * Right now the logs are extracted and displayed in a well-defined | ||
915 | * format. | ||
916 | * | ||
917 | * Inputs | ||
918 | * interrupt number | ||
919 | * client data arg ptr | ||
920 | * saved registers ptr | ||
921 | * | ||
922 | * Outputs | ||
923 | * None | ||
924 | */ | ||
925 | static irqreturn_t | ||
926 | ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs) | ||
927 | { | ||
928 | static unsigned long cmc_history[CMC_HISTORY_LENGTH]; | ||
929 | static int index; | ||
930 | static DEFINE_SPINLOCK(cmc_history_lock); | ||
931 | |||
932 | IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n", | ||
933 | __FUNCTION__, cmc_irq, smp_processor_id()); | ||
934 | |||
935 | /* SAL spec states this should run w/ interrupts enabled */ | ||
936 | local_irq_enable(); | ||
937 | |||
938 | /* Get the CMC error record and log it */ | ||
939 | ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC); | ||
940 | |||
941 | spin_lock(&cmc_history_lock); | ||
942 | if (!cmc_polling_enabled) { | ||
943 | int i, count = 1; /* we know 1 happened now */ | ||
944 | unsigned long now = jiffies; | ||
945 | |||
946 | for (i = 0; i < CMC_HISTORY_LENGTH; i++) { | ||
947 | if (now - cmc_history[i] <= HZ) | ||
948 | count++; | ||
949 | } | ||
950 | |||
951 | IA64_MCA_DEBUG(KERN_INFO "CMC threshold %d/%d\n", count, CMC_HISTORY_LENGTH); | ||
952 | if (count >= CMC_HISTORY_LENGTH) { | ||
953 | |||
954 | cmc_polling_enabled = 1; | ||
955 | spin_unlock(&cmc_history_lock); | ||
956 | schedule_work(&cmc_disable_work); | ||
957 | |||
958 | /* | ||
959 | * Corrected errors will still be corrected, but | ||
960 | * make sure there's a log somewhere that indicates | ||
961 | * something is generating more than we can handle. | ||
962 | */ | ||
963 | printk(KERN_WARNING "WARNING: Switching to polling CMC handler; error records may be lost\n"); | ||
964 | |||
965 | mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL); | ||
966 | |||
967 | /* lock already released, get out now */ | ||
968 | return IRQ_HANDLED; | ||
969 | } else { | ||
970 | cmc_history[index++] = now; | ||
971 | if (index == CMC_HISTORY_LENGTH) | ||
972 | index = 0; | ||
973 | } | ||
974 | } | ||
975 | spin_unlock(&cmc_history_lock); | ||
976 | return IRQ_HANDLED; | ||
977 | } | ||
978 | |||
979 | /* | ||
980 | * ia64_mca_cmc_int_caller | ||
981 | * | ||
982 | * Triggered by sw interrupt from CMC polling routine. Calls | ||
983 | * real interrupt handler and either triggers a sw interrupt | ||
984 | * on the next cpu or does cleanup at the end. | ||
985 | * | ||
986 | * Inputs | ||
987 | * interrupt number | ||
988 | * client data arg ptr | ||
989 | * saved registers ptr | ||
990 | * Outputs | ||
991 | * handled | ||
992 | */ | ||
993 | static irqreturn_t | ||
994 | ia64_mca_cmc_int_caller(int cmc_irq, void *arg, struct pt_regs *ptregs) | ||
995 | { | ||
996 | static int start_count = -1; | ||
997 | unsigned int cpuid; | ||
998 | |||
999 | cpuid = smp_processor_id(); | ||
1000 | |||
1001 | /* If first cpu in the cascade, record the starting log count */ | ||
1002 | if (start_count == -1) | ||
1003 | start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC); | ||
1004 | |||
1005 | ia64_mca_cmc_int_handler(cmc_irq, arg, ptregs); | ||
1006 | |||
1007 | for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); | ||
1008 | |||
1009 | if (cpuid < NR_CPUS) { | ||
1010 | platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); | ||
1011 | } else { | ||
1012 | /* If no log record, switch out of polling mode */ | ||
1013 | if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) { | ||
1014 | |||
1015 | printk(KERN_WARNING "Returning to interrupt driven CMC handler\n"); | ||
1016 | schedule_work(&cmc_enable_work); | ||
1017 | cmc_polling_enabled = 0; | ||
1018 | |||
1019 | } else { | ||
1020 | |||
1021 | mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL); | ||
1022 | } | ||
1023 | |||
1024 | start_count = -1; | ||
1025 | } | ||
1026 | |||
1027 | return IRQ_HANDLED; | ||
1028 | } | ||
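
The caller above implements a cascade: each cpu handles its local records, then sends an IPI to the next online cpu, and the last cpu in the cascade decides whether to keep polling. A userspace sketch, with recursion standing in for the IPI and an invented online mask:

    #include <stdio.h>

    #define NCPUS 8

    static int cpu_online_sketch(int cpu)
    {
            return cpu == 0 || cpu == 2 || cpu == 5;    /* pretend 3 cpus online */
    }

    static void poll_on_cpu(int cpu)
    {
            int next;

            printf("cpu %d: handle local records\n", cpu);
            for (next = cpu + 1; next < NCPUS && !cpu_online_sketch(next); next++)
                    ;
            if (next < NCPUS)
                    poll_on_cpu(next);  /* the kernel sends an IPI instead */
            else
                    printf("cpu %d: last in cascade, re-arm timer or re-enable irq\n",
                           cpu);
    }

    int main(void)
    {
            poll_on_cpu(0);
            return 0;
    }
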
1029 | |||
1030 | /* | ||
1031 | * ia64_mca_cmc_poll | ||
1032 | * | ||
1033 | * Poll for Corrected Machine Checks (CMCs) | ||
1034 | * | ||
1035 | * Inputs : dummy(unused) | ||
1036 | * Outputs : None | ||
1037 | * | ||
1038 | */ | ||
1039 | static void | ||
1040 | ia64_mca_cmc_poll (unsigned long dummy) | ||
1041 | { | ||
1042 | /* Trigger a CMC interrupt cascade */ | ||
1043 | platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); | ||
1044 | } | ||
1045 | |||
1046 | /* | ||
1047 | * ia64_mca_cpe_int_caller | ||
1048 | * | ||
1049 | * Triggered by sw interrupt from CPE polling routine. Calls | ||
1050 | * real interrupt handler and either triggers a sw interrupt | ||
1051 | * on the next cpu or does cleanup at the end. | ||
1052 | * | ||
1053 | * Inputs | ||
1054 | * interrupt number | ||
1055 | * client data arg ptr | ||
1056 | * saved registers ptr | ||
1057 | * Outputs | ||
1058 | * handled | ||
1059 | */ | ||
1060 | #ifdef CONFIG_ACPI | ||
1061 | |||
1062 | static irqreturn_t | ||
1063 | ia64_mca_cpe_int_caller(int cpe_irq, void *arg, struct pt_regs *ptregs) | ||
1064 | { | ||
1065 | static int start_count = -1; | ||
1066 | static int poll_time = MIN_CPE_POLL_INTERVAL; | ||
1067 | unsigned int cpuid; | ||
1068 | |||
1069 | cpuid = smp_processor_id(); | ||
1070 | |||
1071 | /* If first cpu in the cascade, record the starting log count */ | ||
1072 | if (start_count == -1) | ||
1073 | start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE); | ||
1074 | |||
1075 | ia64_mca_cpe_int_handler(cpe_irq, arg, ptregs); | ||
1076 | |||
1077 | for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); | ||
1078 | |||
1079 | if (cpuid < NR_CPUS) { | ||
1080 | platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); | ||
1081 | } else { | ||
1082 | /* | ||
1083 | * If a log was recorded, increase our polling frequency, | ||
1084 | * otherwise back off or return to interrupt mode. | ||
1085 | */ | ||
1086 | if (start_count != IA64_LOG_COUNT(SAL_INFO_TYPE_CPE)) { | ||
1087 | poll_time = max(MIN_CPE_POLL_INTERVAL, poll_time / 2); | ||
1088 | } else if (cpe_vector < 0) { | ||
1089 | poll_time = min(MAX_CPE_POLL_INTERVAL, poll_time * 2); | ||
1090 | } else { | ||
1091 | poll_time = MIN_CPE_POLL_INTERVAL; | ||
1092 | |||
1093 | printk(KERN_WARNING "Returning to interrupt driven CPE handler\n"); | ||
1094 | enable_irq(local_vector_to_irq(IA64_CPE_VECTOR)); | ||
1095 | cpe_poll_enabled = 0; | ||
1096 | } | ||
1097 | |||
1098 | if (cpe_poll_enabled) | ||
1099 | mod_timer(&cpe_poll_timer, jiffies + poll_time); | ||
1100 | start_count = -1; | ||
1101 | } | ||
1102 | |||
1103 | return IRQ_HANDLED; | ||
1104 | } | ||
1105 | |||
1106 | #endif /* CONFIG_ACPI */ | ||
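
The interval tuning above is a simple multiplicative back-off clamped between the minimum and maximum poll intervals: halve while records keep appearing, double while they don't. A sketch with illustrative bounds rather than the kernel's MIN_CPE_POLL_INTERVAL/MAX_CPE_POLL_INTERVAL values:

    #include <stdio.h>

    #define MIN_POLL 100                /* e.g. jiffies; values invented */
    #define MAX_POLL 6000

    static int poll_time = MIN_POLL;

    static void tune_poll_interval(int saw_new_record)
    {
            if (saw_new_record)
                    poll_time = poll_time / 2 > MIN_POLL ? poll_time / 2 : MIN_POLL;
            else
                    poll_time = poll_time * 2 < MAX_POLL ? poll_time * 2 : MAX_POLL;
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 4; i++) {   /* quiet system: interval backs off */
                    tune_poll_interval(0);
                    printf("quiet:  poll_time=%d\n", poll_time);
            }
            tune_poll_interval(1);      /* a record shows up: tighten again */
            printf("record: poll_time=%d\n", poll_time);
            return 0;
    }
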
1107 | |||
1108 | /* | ||
1109 | * ia64_mca_cpe_poll | ||
1110 | * | ||
1111 | * Poll for Corrected Platform Errors (CPEs), trigger interrupt | ||
1112 | * on first cpu, from there it will trickle through all the cpus. | ||
1113 | * | ||
1114 | * Inputs : dummy(unused) | ||
1115 | * Outputs : None | ||
1116 | * | ||
1117 | */ | ||
1118 | static void | ||
1119 | ia64_mca_cpe_poll (unsigned long dummy) | ||
1120 | { | ||
1121 | /* Trigger a CPE interrupt cascade */ | ||
1122 | platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); | ||
1123 | } | ||
1124 | |||
1125 | /* | ||
1126 | * C portion of the OS INIT handler | ||
1127 | * | ||
1128 | * Called from ia64_monarch_init_handler | ||
1129 | * | ||
1130 | * Inputs: pointers to pt_regs and switch_stack where processor info was saved. | ||
1131 | * | ||
1132 | * Handoff status given back to SAL: | ||
1133 | * 0 if SAL must warm boot the System | ||
1134 | * 1 if SAL must return to interrupted context using PAL_MC_RESUME | ||
1135 | * | ||
1136 | */ | ||
1137 | void | ||
1138 | ia64_init_handler (struct pt_regs *pt, struct switch_stack *sw) | ||
1139 | { | ||
1140 | pal_min_state_area_t *ms; | ||
1141 | |||
1142 | oops_in_progress = 1; /* avoid deadlock in printk, but it makes recovery dodgy */ | ||
1143 | console_loglevel = 15; /* make sure printks make it to console */ | ||
1144 | |||
1145 | printk(KERN_INFO "Entered OS INIT handler. PSP=%lx\n", | ||
1146 | ia64_sal_to_os_handoff_state.proc_state_param); | ||
1147 | |||
1148 | /* | ||
1149 | * Address of minstate area provided by PAL is physical, | ||
1150 | * uncacheable (bit 63 set). Convert to Linux virtual | ||
1151 | * address in region 6. | ||
1152 | */ | ||
1153 | ms = (pal_min_state_area_t *)(ia64_sal_to_os_handoff_state.pal_min_state | (6ul<<61)); | ||
1154 | |||
1155 | init_handler_platform(ms, pt, sw); /* call platform specific routines */ | ||
1156 | } | ||
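
The region-6 conversion above relies on ia64 keeping a 3-bit region number in virtual address bits 63-61, with region 6 being the kernel's identity-mapped uncacheable window; OR-ing 6ul<<61 onto the physical address therefore yields a usable uncached virtual address. A sketch with a made-up physical address:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t phys = 0x0000000fe0000000ULL;  /* invented min-state addr */
            uint64_t virt = phys | (6ULL << 61);    /* region-6, uncacheable */

            printf("phys %#018llx -> virt %#018llx (region %llu)\n",
                   (unsigned long long)phys, (unsigned long long)virt,
                   (unsigned long long)(virt >> 61));
            return 0;
    }
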
1157 | |||
1158 | static int __init | ||
1159 | ia64_mca_disable_cpe_polling(char *str) | ||
1160 | { | ||
1161 | cpe_poll_enabled = 0; | ||
1162 | return 1; | ||
1163 | } | ||
1164 | |||
1165 | __setup("disable_cpe_poll", ia64_mca_disable_cpe_polling); | ||
1166 | |||
1167 | static struct irqaction cmci_irqaction = { | ||
1168 | .handler = ia64_mca_cmc_int_handler, | ||
1169 | .flags = SA_INTERRUPT, | ||
1170 | .name = "cmc_hndlr" | ||
1171 | }; | ||
1172 | |||
1173 | static struct irqaction cmcp_irqaction = { | ||
1174 | .handler = ia64_mca_cmc_int_caller, | ||
1175 | .flags = SA_INTERRUPT, | ||
1176 | .name = "cmc_poll" | ||
1177 | }; | ||
1178 | |||
1179 | static struct irqaction mca_rdzv_irqaction = { | ||
1180 | .handler = ia64_mca_rendez_int_handler, | ||
1181 | .flags = SA_INTERRUPT, | ||
1182 | .name = "mca_rdzv" | ||
1183 | }; | ||
1184 | |||
1185 | static struct irqaction mca_wkup_irqaction = { | ||
1186 | .handler = ia64_mca_wakeup_int_handler, | ||
1187 | .flags = SA_INTERRUPT, | ||
1188 | .name = "mca_wkup" | ||
1189 | }; | ||
1190 | |||
1191 | #ifdef CONFIG_ACPI | ||
1192 | static struct irqaction mca_cpe_irqaction = { | ||
1193 | .handler = ia64_mca_cpe_int_handler, | ||
1194 | .flags = SA_INTERRUPT, | ||
1195 | .name = "cpe_hndlr" | ||
1196 | }; | ||
1197 | |||
1198 | static struct irqaction mca_cpep_irqaction = { | ||
1199 | .handler = ia64_mca_cpe_int_caller, | ||
1200 | .flags = SA_INTERRUPT, | ||
1201 | .name = "cpe_poll" | ||
1202 | }; | ||
1203 | #endif /* CONFIG_ACPI */ | ||
1204 | |||
1205 | /* Do per-CPU MCA-related initialization. */ | ||
1206 | |||
1207 | void __devinit | ||
1208 | ia64_mca_cpu_init(void *cpu_data) | ||
1209 | { | ||
1210 | void *pal_vaddr; | ||
1211 | |||
1212 | if (smp_processor_id() == 0) { | ||
1213 | void *mca_data; | ||
1214 | int cpu; | ||
1215 | |||
1216 | mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu) | ||
1217 | * NR_CPUS); | ||
1218 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
1219 | __per_cpu_mca[cpu] = __pa(mca_data); | ||
1220 | mca_data += sizeof(struct ia64_mca_cpu); | ||
1221 | } | ||
1222 | } | ||
1223 | |||
1224 | /* | ||
1225 | * The MCA info structure was allocated earlier and its | ||
1226 | * physical address saved in __per_cpu_mca[cpu]. Copy that | ||
1227 | * address to ia64_mca_data so we can access it as a per-CPU | ||
1228 | * variable. | ||
1229 | */ | ||
1230 | __get_cpu_var(ia64_mca_data) = __per_cpu_mca[smp_processor_id()]; | ||
1231 | |||
1232 | /* | ||
1233 | * Stash away a copy of the PTE needed to map the per-CPU page. | ||
1234 | * We may need it during MCA recovery. | ||
1235 | */ | ||
1236 | __get_cpu_var(ia64_mca_per_cpu_pte) = | ||
1237 | pte_val(mk_pte_phys(__pa(cpu_data), PAGE_KERNEL)); | ||
1238 | |||
1239 | /* | ||
1240 | * Also, stash away a copy of the PAL address and the PTE | ||
1241 | * needed to map it. | ||
1242 | */ | ||
1243 | pal_vaddr = efi_get_pal_addr(); | ||
1244 | if (!pal_vaddr) | ||
1245 | return; | ||
1246 | __get_cpu_var(ia64_mca_pal_base) = | ||
1247 | GRANULEROUNDDOWN((unsigned long) pal_vaddr); | ||
1248 | __get_cpu_var(ia64_mca_pal_pte) = pte_val(mk_pte_phys(__pa(pal_vaddr), | ||
1249 | PAGE_KERNEL)); | ||
1250 | } | ||
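
The boot-cpu branch above carves one contiguous bootmem allocation into NR_CPUS equal slices, one save area per cpu. A userspace sketch of the carve-up, with malloc standing in for alloc_bootmem, the raw pointer standing in for __pa(), and an invented slice size:

    #include <stdio.h>
    #include <stdlib.h>

    #define NCPUS 4

    struct mca_cpu_sketch { char save_area[256]; }; /* size is illustrative */

    int main(void)
    {
            char *block = malloc(sizeof(struct mca_cpu_sketch) * NCPUS);
            void *per_cpu_mca[NCPUS];
            int cpu;

            for (cpu = 0; cpu < NCPUS; cpu++) {
                    per_cpu_mca[cpu] = block;       /* this cpu's slice */
                    block += sizeof(struct mca_cpu_sketch);
            }
            for (cpu = 0; cpu < NCPUS; cpu++)
                    printf("cpu %d -> slice at %p\n", cpu, per_cpu_mca[cpu]);
            free(per_cpu_mca[0]);
            return 0;
    }
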
1251 | |||
1252 | /* | ||
1253 | * ia64_mca_init | ||
1254 | * | ||
1255 | * Do all the system-level MCA-specific initialization. | ||
1256 | * | ||
1257 | * 1. Register spinloop and wakeup request interrupt vectors | ||
1258 | * | ||
1259 | * 2. Register OS_MCA handler entry point | ||
1260 | * | ||
1261 | * 3. Register OS_INIT handler entry point | ||
1262 | * | ||
1263 | * 4. Initialize MCA/CMC/INIT related log buffers maintained by the OS. | ||
1264 | * | ||
1265 | * Note that this initialization is done very early before some kernel | ||
1266 | * services are available. | ||
1267 | * | ||
1268 | * Inputs : None | ||
1269 | * | ||
1270 | * Outputs : None | ||
1271 | */ | ||
1272 | void __init | ||
1273 | ia64_mca_init(void) | ||
1274 | { | ||
1275 | ia64_fptr_t *mon_init_ptr = (ia64_fptr_t *)ia64_monarch_init_handler; | ||
1276 | ia64_fptr_t *slave_init_ptr = (ia64_fptr_t *)ia64_slave_init_handler; | ||
1277 | ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch; | ||
1278 | int i; | ||
1279 | s64 rc; | ||
1280 | struct ia64_sal_retval isrv; | ||
1281 | u64 timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */ | ||
1282 | |||
1283 | IA64_MCA_DEBUG("%s: begin\n", __FUNCTION__); | ||
1284 | |||
1285 | /* Clear the Rendez checkin flag for all cpus */ | ||
1286 | for(i = 0 ; i < NR_CPUS; i++) | ||
1287 | ia64_mc_info.imi_rendez_checkin[i] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; | ||
1288 | |||
1289 | /* | ||
1290 | * Register the rendezvous spinloop and wakeup mechanism with SAL | ||
1291 | */ | ||
1292 | |||
1293 | /* Register the rendezvous interrupt vector with SAL */ | ||
1294 | while (1) { | ||
1295 | isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_INT, | ||
1296 | SAL_MC_PARAM_MECHANISM_INT, | ||
1297 | IA64_MCA_RENDEZ_VECTOR, | ||
1298 | timeout, | ||
1299 | SAL_MC_PARAM_RZ_ALWAYS); | ||
1300 | rc = isrv.status; | ||
1301 | if (rc == 0) | ||
1302 | break; | ||
1303 | if (rc == -2) { | ||
1304 | printk(KERN_INFO "Increasing MCA rendezvous timeout from " | ||
1305 | "%ld to %ld milliseconds\n", timeout, isrv.v0); | ||
1306 | timeout = isrv.v0; | ||
1307 | continue; | ||
1308 | } | ||
1309 | printk(KERN_ERR "Failed to register rendezvous interrupt " | ||
1310 | "with SAL (status %ld)\n", rc); | ||
1311 | return; | ||
1312 | } | ||
1313 | |||
1314 | /* Register the wakeup interrupt vector with SAL */ | ||
1315 | isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_WAKEUP, | ||
1316 | SAL_MC_PARAM_MECHANISM_INT, | ||
1317 | IA64_MCA_WAKEUP_VECTOR, | ||
1318 | 0, 0); | ||
1319 | rc = isrv.status; | ||
1320 | if (rc) { | ||
1321 | printk(KERN_ERR "Failed to register wakeup interrupt with SAL " | ||
1322 | "(status %ld)\n", rc); | ||
1323 | return; | ||
1324 | } | ||
1325 | |||
1326 | IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __FUNCTION__); | ||
1327 | |||
1328 | ia64_mc_info.imi_mca_handler = ia64_tpa(mca_hldlr_ptr->fp); | ||
1329 | /* | ||
1330 | * XXX - disable SAL checksum by setting size to 0; should be | ||
1331 | * ia64_tpa(ia64_os_mca_dispatch_end) - ia64_tpa(ia64_os_mca_dispatch); | ||
1332 | */ | ||
1333 | ia64_mc_info.imi_mca_handler_size = 0; | ||
1334 | |||
1335 | /* Register the os mca handler with SAL */ | ||
1336 | if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_MCA, | ||
1337 | ia64_mc_info.imi_mca_handler, | ||
1338 | ia64_tpa(mca_hldlr_ptr->gp), | ||
1339 | ia64_mc_info.imi_mca_handler_size, | ||
1340 | 0, 0, 0))) | ||
1341 | { | ||
1342 | printk(KERN_ERR "Failed to register OS MCA handler with SAL " | ||
1343 | "(status %ld)\n", rc); | ||
1344 | return; | ||
1345 | } | ||
1346 | |||
1347 | IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __FUNCTION__, | ||
1348 | ia64_mc_info.imi_mca_handler, ia64_tpa(mca_hldlr_ptr->gp)); | ||
1349 | |||
1350 | /* | ||
1351 | * XXX - disable SAL checksum by setting size to 0, should be | ||
1352 | * size of the actual init handler in mca_asm.S. | ||
1353 | */ | ||
1354 | ia64_mc_info.imi_monarch_init_handler = ia64_tpa(mon_init_ptr->fp); | ||
1355 | ia64_mc_info.imi_monarch_init_handler_size = 0; | ||
1356 | ia64_mc_info.imi_slave_init_handler = ia64_tpa(slave_init_ptr->fp); | ||
1357 | ia64_mc_info.imi_slave_init_handler_size = 0; | ||
1358 | |||
1359 | IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __FUNCTION__, | ||
1360 | ia64_mc_info.imi_monarch_init_handler); | ||
1361 | |||
1362 | /* Register the os init handler with SAL */ | ||
1363 | if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, | ||
1364 | ia64_mc_info.imi_monarch_init_handler, | ||
1365 | ia64_tpa(ia64_getreg(_IA64_REG_GP)), | ||
1366 | ia64_mc_info.imi_monarch_init_handler_size, | ||
1367 | ia64_mc_info.imi_slave_init_handler, | ||
1368 | ia64_tpa(ia64_getreg(_IA64_REG_GP)), | ||
1369 | ia64_mc_info.imi_slave_init_handler_size))) | ||
1370 | { | ||
1371 | printk(KERN_ERR "Failed to register m/s INIT handlers with SAL " | ||
1372 | "(status %ld)\n", rc); | ||
1373 | return; | ||
1374 | } | ||
1375 | |||
1376 | IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __FUNCTION__); | ||
1377 | |||
1378 | /* | ||
1379 | * Configure the CMCI/P vector and handler. Interrupts for CMC are | ||
1380 | * per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c). | ||
1381 | */ | ||
1382 | register_percpu_irq(IA64_CMC_VECTOR, &cmci_irqaction); | ||
1383 | register_percpu_irq(IA64_CMCP_VECTOR, &cmcp_irqaction); | ||
1384 | ia64_mca_cmc_vector_setup(); /* Setup vector on BSP */ | ||
1385 | |||
1386 | /* Setup the MCA rendezvous interrupt vector */ | ||
1387 | register_percpu_irq(IA64_MCA_RENDEZ_VECTOR, &mca_rdzv_irqaction); | ||
1388 | |||
1389 | /* Setup the MCA wakeup interrupt vector */ | ||
1390 | register_percpu_irq(IA64_MCA_WAKEUP_VECTOR, &mca_wkup_irqaction); | ||
1391 | |||
1392 | #ifdef CONFIG_ACPI | ||
1393 | /* Setup the CPEI/P vector and handler */ | ||
1394 | cpe_vector = acpi_request_vector(ACPI_INTERRUPT_CPEI); | ||
1395 | register_percpu_irq(IA64_CPEP_VECTOR, &mca_cpep_irqaction); | ||
1396 | #endif | ||
1397 | |||
1398 | /* Initialize the areas set aside by the OS to buffer the | ||
1399 | * platform/processor error states for MCA/INIT/CMC | ||
1400 | * handling. | ||
1401 | */ | ||
1402 | ia64_log_init(SAL_INFO_TYPE_MCA); | ||
1403 | ia64_log_init(SAL_INFO_TYPE_INIT); | ||
1404 | ia64_log_init(SAL_INFO_TYPE_CMC); | ||
1405 | ia64_log_init(SAL_INFO_TYPE_CPE); | ||
1406 | |||
1407 | mca_init = 1; | ||
1408 | printk(KERN_INFO "MCA related initialization done\n"); | ||
1409 | } | ||
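
The rendezvous registration at the top of this function deserves isolating: SAL may reject the requested timeout with status -2 and propose a longer one in v0, in which case the call is simply retried with the proposal. A sketch against a fake SAL that insists on a 5000 ms floor (the floor is invented):

    #include <stdio.h>

    struct sal_ret_sketch { long status; unsigned long v0; };

    static struct sal_ret_sketch fake_sal_set_params(unsigned long timeout)
    {
            struct sal_ret_sketch r = { 0, 0 };

            if (timeout < 5000) {       /* pretend SAL wants at least 5000 ms */
                    r.status = -2;
                    r.v0 = 5000;
            }
            return r;
    }

    int main(void)
    {
            unsigned long timeout = 1000;
            struct sal_ret_sketch r;

            for (;;) {
                    r = fake_sal_set_params(timeout);
                    if (r.status == 0)
                            break;
                    if (r.status == -2) {
                            printf("raising timeout %lu -> %lu ms\n", timeout, r.v0);
                            timeout = r.v0;
                            continue;
                    }
                    printf("registration failed: %ld\n", r.status);
                    return 1;
            }
            printf("registered with timeout %lu ms\n", timeout);
            return 0;
    }
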
1410 | |||
1411 | /* | ||
1412 | * ia64_mca_late_init | ||
1413 | * | ||
1414 | * Opportunity to set up things that require initialization later | ||
1415 | * than ia64_mca_init. Sets up a timer to poll for CPEs if the | ||
1416 | * platform doesn't support an interrupt-driven mechanism. | ||
1417 | * | ||
1418 | * Inputs : None | ||
1419 | * Outputs : Status | ||
1420 | */ | ||
1421 | static int __init | ||
1422 | ia64_mca_late_init(void) | ||
1423 | { | ||
1424 | if (!mca_init) | ||
1425 | return 0; | ||
1426 | |||
1427 | /* Setup the CMC poll timer and re-enable the CMC vector */ | ||
1428 | init_timer(&cmc_poll_timer); | ||
1429 | cmc_poll_timer.function = ia64_mca_cmc_poll; | ||
1430 | |||
1431 | /* Unmask/enable the vector */ | ||
1432 | cmc_polling_enabled = 0; | ||
1433 | schedule_work(&cmc_enable_work); | ||
1434 | |||
1435 | IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __FUNCTION__); | ||
1436 | |||
1437 | #ifdef CONFIG_ACPI | ||
1438 | /* Setup the CPEI/P vector and handler */ | ||
1439 | init_timer(&cpe_poll_timer); | ||
1440 | cpe_poll_timer.function = ia64_mca_cpe_poll; | ||
1441 | |||
1442 | { | ||
1443 | irq_desc_t *desc; | ||
1444 | unsigned int irq; | ||
1445 | |||
1446 | if (cpe_vector >= 0) { | ||
1447 | /* If platform supports CPEI, enable the irq. */ | ||
1448 | cpe_poll_enabled = 0; | ||
1449 | for (irq = 0; irq < NR_IRQS; ++irq) | ||
1450 | if (irq_to_vector(irq) == cpe_vector) { | ||
1451 | desc = irq_descp(irq); | ||
1452 | desc->status |= IRQ_PER_CPU; | ||
1453 | setup_irq(irq, &mca_cpe_irqaction); | ||
1454 | } | ||
1455 | ia64_mca_register_cpev(cpe_vector); | ||
1456 | IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__); | ||
1457 | } else { | ||
1458 | /* If platform doesn't support CPEI, get the timer going. */ | ||
1459 | if (cpe_poll_enabled) { | ||
1460 | ia64_mca_cpe_poll(0UL); | ||
1461 | IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __FUNCTION__); | ||
1462 | } | ||
1463 | } | ||
1464 | } | ||
1465 | #endif | ||
1466 | |||
1467 | return 0; | ||
1468 | } | ||
1469 | |||
1470 | device_initcall(ia64_mca_late_init); | ||
diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S new file mode 100644 index 000000000000..cf3f8014f9ad --- /dev/null +++ b/arch/ia64/kernel/mca_asm.S | |||
@@ -0,0 +1,928 @@ | |||
1 | // | ||
2 | // assembly portion of the IA64 MCA handling | ||
3 | // | ||
4 | // Mods by cfleck to integrate into kernel build | ||
5 | // 00/03/15 davidm Added various stop bits to get a clean compile | ||
6 | // | ||
7 | // 00/03/29 cfleck Added code to save INIT handoff state in pt_regs format, switch to temp | ||
8 | // kstack, switch modes, jump to C INIT handler | ||
9 | // | ||
10 | // 02/01/04 J.Hall <jenna.s.hall@intel.com> | ||
11 | // Before entering virtual mode code: | ||
12 | // 1. Check for TLB CPU error | ||
13 | // 2. Restore current thread pointer to kr6 | ||
14 | // 3. Move stack ptr 16 bytes to conform to C calling convention | ||
15 | // | ||
16 | // 04/11/12 Russ Anderson <rja@sgi.com> | ||
17 | // Added per cpu MCA/INIT stack save areas. | ||
18 | // | ||
19 | #include <linux/config.h> | ||
20 | #include <linux/threads.h> | ||
21 | |||
22 | #include <asm/asmmacro.h> | ||
23 | #include <asm/pgtable.h> | ||
24 | #include <asm/processor.h> | ||
25 | #include <asm/mca_asm.h> | ||
26 | #include <asm/mca.h> | ||
27 | |||
28 | /* | ||
29 | * When we get a machine check, the kernel stack pointer is no longer | ||
30 | * valid, so we need to set a new stack pointer. | ||
31 | */ | ||
32 | #define MINSTATE_PHYS /* Make sure stack access is physical for MINSTATE */ | ||
33 | |||
34 | /* | ||
35 | * Needed for return context to SAL | ||
36 | */ | ||
37 | #define IA64_MCA_SAME_CONTEXT 0 | ||
38 | #define IA64_MCA_COLD_BOOT -2 | ||
39 | |||
40 | #include "minstate.h" | ||
41 | |||
42 | /* | ||
43 | * SAL_TO_OS_MCA_HANDOFF_STATE (SAL 3.0 spec) | ||
44 | * 1. GR1 = OS GP | ||
45 | * 2. GR8 = PAL_PROC physical address | ||
46 | * 3. GR9 = SAL_PROC physical address | ||
47 | * 4. GR10 = SAL GP (physical) | ||
48 | * 5. GR11 = Rendez state | ||
49 | * 6. GR12 = Return address to location within SAL_CHECK | ||
50 | */ | ||
51 | #define SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(_tmp) \ | ||
52 | LOAD_PHYSICAL(p0, _tmp, ia64_sal_to_os_handoff_state);; \ | ||
53 | st8 [_tmp]=r1,0x08;; \ | ||
54 | st8 [_tmp]=r8,0x08;; \ | ||
55 | st8 [_tmp]=r9,0x08;; \ | ||
56 | st8 [_tmp]=r10,0x08;; \ | ||
57 | st8 [_tmp]=r11,0x08;; \ | ||
58 | st8 [_tmp]=r12,0x08;; \ | ||
59 | st8 [_tmp]=r17,0x08;; \ | ||
60 | st8 [_tmp]=r18,0x08 | ||
61 | |||
62 | /* | ||
63 | * OS_MCA_TO_SAL_HANDOFF_STATE (SAL 3.0 spec) | ||
64 | * (p6) is executed if we never entered virtual mode (TLB error) | ||
65 | * (p7) is executed if we entered virtual mode as expected (normal case) | ||
66 | * 1. GR8 = OS_MCA return status | ||
67 | * 2. GR9 = SAL GP (physical) | ||
68 | * 3. GR10 = 0/1 returning same/new context | ||
69 | * 4. GR22 = New min state save area pointer | ||
70 | * returns ptr to SAL rtn save loc in _tmp | ||
71 | */ | ||
72 | #define OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(_tmp) \ | ||
73 | movl _tmp=ia64_os_to_sal_handoff_state;; \ | ||
74 | DATA_VA_TO_PA(_tmp);; \ | ||
75 | ld8 r8=[_tmp],0x08;; \ | ||
76 | ld8 r9=[_tmp],0x08;; \ | ||
77 | ld8 r10=[_tmp],0x08;; \ | ||
78 | ld8 r22=[_tmp],0x08;; | ||
79 | // now _tmp is pointing to SAL rtn save location | ||
80 | |||
81 | /* | ||
82 | * COLD_BOOT_HANDOFF_STATE() sets ia64_mca_os_to_sal_state | ||
83 | * imots_os_status=IA64_MCA_COLD_BOOT | ||
84 | * imots_sal_gp=SAL GP | ||
85 | * imots_context=IA64_MCA_SAME_CONTEXT | ||
86 | * imots_new_min_state=Min state save area pointer | ||
87 | * imots_sal_check_ra=Return address to location within SAL_CHECK | ||
88 | * | ||
89 | */ | ||
90 | #define COLD_BOOT_HANDOFF_STATE(sal_to_os_handoff,os_to_sal_handoff,tmp)\ | ||
91 | movl tmp=IA64_MCA_COLD_BOOT; \ | ||
92 | movl sal_to_os_handoff=__pa(ia64_sal_to_os_handoff_state); \ | ||
93 | movl os_to_sal_handoff=__pa(ia64_os_to_sal_handoff_state);; \ | ||
94 | st8 [os_to_sal_handoff]=tmp,8;; \ | ||
95 | ld8 tmp=[sal_to_os_handoff],48;; \ | ||
96 | st8 [os_to_sal_handoff]=tmp,8;; \ | ||
97 | movl tmp=IA64_MCA_SAME_CONTEXT;; \ | ||
98 | st8 [os_to_sal_handoff]=tmp,8;; \ | ||
99 | ld8 tmp=[sal_to_os_handoff],-8;; \ | ||
100 | st8 [os_to_sal_handoff]=tmp,8;; \ | ||
101 | ld8 tmp=[sal_to_os_handoff];; \ | ||
102 | st8 [os_to_sal_handoff]=tmp;; | ||
103 | |||
104 | #define GET_IA64_MCA_DATA(reg) \ | ||
105 | GET_THIS_PADDR(reg, ia64_mca_data) \ | ||
106 | ;; \ | ||
107 | ld8 reg=[reg] | ||
108 | |||
109 | .global ia64_os_mca_dispatch | ||
110 | .global ia64_os_mca_dispatch_end | ||
111 | .global ia64_sal_to_os_handoff_state | ||
112 | .global ia64_os_to_sal_handoff_state | ||
113 | |||
114 | .text | ||
115 | .align 16 | ||
116 | |||
117 | ia64_os_mca_dispatch: | ||
118 | |||
119 | // Serialize all MCA processing | ||
120 | mov r3=1;; | ||
121 | LOAD_PHYSICAL(p0,r2,ia64_mca_serialize);; | ||
122 | ia64_os_mca_spin: | ||
123 | xchg8 r4=[r2],r3;; | ||
124 | cmp.ne p6,p0=r4,r0 | ||
125 | (p6) br ia64_os_mca_spin | ||
126 | |||
127 | // Save the SAL to OS MCA handoff state as defined | ||
128 | // by SAL SPEC 3.0 | ||
129 | // NOTE : The order in which the state gets saved | ||
130 | // is dependent on the way the C-structure | ||
131 | // for ia64_mca_sal_to_os_state_t has been | ||
132 | // defined in include/asm/mca.h | ||
133 | SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(r2) | ||
134 | ;; | ||
135 | |||
136 | // LOG PROCESSOR STATE INFO FROM HERE ON.. | ||
137 | begin_os_mca_dump: | ||
138 | br ia64_os_mca_proc_state_dump;; | ||
139 | |||
140 | ia64_os_mca_done_dump: | ||
141 | |||
142 | LOAD_PHYSICAL(p0,r16,ia64_sal_to_os_handoff_state+56) | ||
143 | ;; | ||
144 | ld8 r18=[r16] // Get the processor state parameter passed on from PALE_CHECK. | ||
145 | ;; | ||
146 | tbit.nz p6,p7=r18,60 | ||
147 | (p7) br.spnt done_tlb_purge_and_reload | ||
148 | |||
149 | // The following code purges TC and TR entries. Then reload all TC entries. | ||
150 | // Purge percpu data TC entries. | ||
151 | begin_tlb_purge_and_reload: | ||
152 | |||
153 | #define O(member) IA64_CPUINFO_##member##_OFFSET | ||
154 | |||
155 | GET_THIS_PADDR(r2, cpu_info) // load phys addr of cpu_info into r2 | ||
156 | ;; | ||
157 | addl r17=O(PTCE_STRIDE),r2 | ||
158 | addl r2=O(PTCE_BASE),r2 | ||
159 | ;; | ||
160 | ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));; // r18=ptce_base | ||
161 | ld4 r19=[r2],4 // r19=ptce_count[0] | ||
162 | ld4 r21=[r17],4 // r21=ptce_stride[0] | ||
163 | ;; | ||
164 | ld4 r20=[r2] // r20=ptce_count[1] | ||
165 | ld4 r22=[r17] // r22=ptce_stride[1] | ||
166 | mov r24=0 | ||
167 | ;; | ||
168 | adds r20=-1,r20 | ||
169 | ;; | ||
170 | #undef O | ||
171 | |||
172 | 2: | ||
173 | cmp.ltu p6,p7=r24,r19 | ||
174 | (p7) br.cond.dpnt.few 4f | ||
175 | mov ar.lc=r20 | ||
176 | 3: | ||
177 | ptc.e r18 | ||
178 | ;; | ||
179 | add r18=r22,r18 | ||
180 | br.cloop.sptk.few 3b | ||
181 | ;; | ||
182 | add r18=r21,r18 | ||
183 | add r24=1,r24 | ||
184 | ;; | ||
185 | br.sptk.few 2b | ||
186 | 4: | ||
187 | srlz.i // srlz.i implies srlz.d | ||
188 | ;; | ||
189 | |||
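
In C, the purge loop above is the familiar two-level walk over the PAL purge parameters (base, count[2], stride[2]) that the ia64 local TLB flush uses. A sketch with invented parameter values and printf standing in for the ptc.e instruction:

    #include <stdio.h>

    static void ptc_e(unsigned long addr)       /* stand-in for the instruction */
    {
            printf("ptc.e %#lx\n", addr);
    }

    int main(void)
    {
            unsigned long base = 0x1000;        /* illustrative purge parameters */
            unsigned long count[2]  = { 2, 3 };
            unsigned long stride[2] = { 0x100000, 0x1000 };
            unsigned long addr = base;
            unsigned long i, j;

            for (i = 0; i < count[0]; i++) {
                    for (j = 0; j < count[1]; j++) {
                            ptc_e(addr);
                            addr += stride[1];
                    }
                    addr += stride[0];
            }
            return 0;
    }
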
190 | // Now purge addresses formerly mapped by TR registers | ||
191 | // 1. Purge ITR&DTR for kernel. | ||
192 | movl r16=KERNEL_START | ||
193 | mov r18=KERNEL_TR_PAGE_SHIFT<<2 | ||
194 | ;; | ||
195 | ptr.i r16, r18 | ||
196 | ptr.d r16, r18 | ||
197 | ;; | ||
198 | srlz.i | ||
199 | ;; | ||
200 | srlz.d | ||
201 | ;; | ||
202 | // 2. Purge DTR for PERCPU data. | ||
203 | movl r16=PERCPU_ADDR | ||
204 | mov r18=PERCPU_PAGE_SHIFT<<2 | ||
205 | ;; | ||
206 | ptr.d r16,r18 | ||
207 | ;; | ||
208 | srlz.d | ||
209 | ;; | ||
210 | // 3. Purge ITR for PAL code. | ||
211 | GET_THIS_PADDR(r2, ia64_mca_pal_base) | ||
212 | ;; | ||
213 | ld8 r16=[r2] | ||
214 | mov r18=IA64_GRANULE_SHIFT<<2 | ||
215 | ;; | ||
216 | ptr.i r16,r18 | ||
217 | ;; | ||
218 | srlz.i | ||
219 | ;; | ||
220 | // 4. Purge DTR for stack. | ||
221 | mov r16=IA64_KR(CURRENT_STACK) | ||
222 | ;; | ||
223 | shl r16=r16,IA64_GRANULE_SHIFT | ||
224 | movl r19=PAGE_OFFSET | ||
225 | ;; | ||
226 | add r16=r19,r16 | ||
227 | mov r18=IA64_GRANULE_SHIFT<<2 | ||
228 | ;; | ||
229 | ptr.d r16,r18 | ||
230 | ;; | ||
231 | srlz.i | ||
232 | ;; | ||
233 | // Finally reload the TR registers. | ||
234 | // 1. Reload DTR/ITR registers for kernel. | ||
235 | mov r18=KERNEL_TR_PAGE_SHIFT<<2 | ||
236 | movl r17=KERNEL_START | ||
237 | ;; | ||
238 | mov cr.itir=r18 | ||
239 | mov cr.ifa=r17 | ||
240 | mov r16=IA64_TR_KERNEL | ||
241 | mov r19=ip | ||
242 | movl r18=PAGE_KERNEL | ||
243 | ;; | ||
244 | dep r17=0,r19,0, KERNEL_TR_PAGE_SHIFT | ||
245 | ;; | ||
246 | or r18=r17,r18 | ||
247 | ;; | ||
248 | itr.i itr[r16]=r18 | ||
249 | ;; | ||
250 | itr.d dtr[r16]=r18 | ||
251 | ;; | ||
252 | srlz.i | ||
253 | srlz.d | ||
254 | ;; | ||
255 | // 2. Reload DTR register for PERCPU data. | ||
256 | GET_THIS_PADDR(r2, ia64_mca_per_cpu_pte) | ||
257 | ;; | ||
258 | movl r16=PERCPU_ADDR // vaddr | ||
259 | movl r18=PERCPU_PAGE_SHIFT<<2 | ||
260 | ;; | ||
261 | mov cr.itir=r18 | ||
262 | mov cr.ifa=r16 | ||
263 | ;; | ||
264 | ld8 r18=[r2] // load per-CPU PTE | ||
265 | mov r16=IA64_TR_PERCPU_DATA; | ||
266 | ;; | ||
267 | itr.d dtr[r16]=r18 | ||
268 | ;; | ||
269 | srlz.d | ||
270 | ;; | ||
271 | // 3. Reload ITR for PAL code. | ||
272 | GET_THIS_PADDR(r2, ia64_mca_pal_pte) | ||
273 | ;; | ||
274 | ld8 r18=[r2] // load PAL PTE | ||
275 | ;; | ||
276 | GET_THIS_PADDR(r2, ia64_mca_pal_base) | ||
277 | ;; | ||
278 | ld8 r16=[r2] // load PAL vaddr | ||
279 | mov r19=IA64_GRANULE_SHIFT<<2 | ||
280 | ;; | ||
281 | mov cr.itir=r19 | ||
282 | mov cr.ifa=r16 | ||
283 | mov r20=IA64_TR_PALCODE | ||
284 | ;; | ||
285 | itr.i itr[r20]=r18 | ||
286 | ;; | ||
287 | srlz.i | ||
288 | ;; | ||
289 | // 4. Reload DTR for stack. | ||
290 | mov r16=IA64_KR(CURRENT_STACK) | ||
291 | ;; | ||
292 | shl r16=r16,IA64_GRANULE_SHIFT | ||
293 | movl r19=PAGE_OFFSET | ||
294 | ;; | ||
295 | add r18=r19,r16 | ||
296 | movl r20=PAGE_KERNEL | ||
297 | ;; | ||
298 | add r16=r20,r16 | ||
299 | mov r19=IA64_GRANULE_SHIFT<<2 | ||
300 | ;; | ||
301 | mov cr.itir=r19 | ||
302 | mov cr.ifa=r18 | ||
303 | mov r20=IA64_TR_CURRENT_STACK | ||
304 | ;; | ||
305 | itr.d dtr[r20]=r16 | ||
306 | ;; | ||
307 | srlz.d | ||
308 | ;; | ||
309 | br.sptk.many done_tlb_purge_and_reload | ||
310 | err: | ||
311 | COLD_BOOT_HANDOFF_STATE(r20,r21,r22) | ||
312 | br.sptk.many ia64_os_mca_done_restore | ||
313 | |||
314 | done_tlb_purge_and_reload: | ||
315 | |||
316 | // Setup new stack frame for OS_MCA handling | ||
317 | GET_IA64_MCA_DATA(r2) | ||
318 | ;; | ||
319 | add r3 = IA64_MCA_CPU_STACKFRAME_OFFSET, r2 | ||
320 | add r2 = IA64_MCA_CPU_RBSTORE_OFFSET, r2 | ||
321 | ;; | ||
322 | rse_switch_context(r6,r3,r2);; // RSC management in this new context | ||
323 | |||
324 | GET_IA64_MCA_DATA(r2) | ||
325 | ;; | ||
326 | add r2 = IA64_MCA_CPU_STACK_OFFSET+IA64_MCA_STACK_SIZE-16, r2 | ||
327 | ;; | ||
328 | mov r12=r2 // establish new stack-pointer | ||
329 | |||
330 | // Enter virtual mode from physical mode | ||
331 | VIRTUAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_begin, r4) | ||
332 | ia64_os_mca_virtual_begin: | ||
333 | |||
334 | // Call virtual mode handler | ||
335 | movl r2=ia64_mca_ucmc_handler;; | ||
336 | mov b6=r2;; | ||
337 | br.call.sptk.many b0=b6;; | ||
338 | .ret0: | ||
339 | // Revert back to physical mode before going back to SAL | ||
340 | PHYSICAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_end, r4) | ||
341 | ia64_os_mca_virtual_end: | ||
342 | |||
343 | // restore the original stack frame here | ||
344 | GET_IA64_MCA_DATA(r2) | ||
345 | ;; | ||
346 | add r2 = IA64_MCA_CPU_STACKFRAME_OFFSET, r2 | ||
347 | ;; | ||
348 | movl r4=IA64_PSR_MC | ||
349 | ;; | ||
350 | rse_return_context(r4,r3,r2) // switch from interrupt context for RSE | ||
351 | |||
352 | // let us restore all the registers from our PSI structure | ||
353 | mov r8=gp | ||
354 | ;; | ||
355 | begin_os_mca_restore: | ||
356 | br ia64_os_mca_proc_state_restore;; | ||
357 | |||
358 | ia64_os_mca_done_restore: | ||
359 | OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(r2);; | ||
360 | // branch back to SALE_CHECK | ||
361 | ld8 r3=[r2];; | ||
362 | mov b0=r3;; // SAL_CHECK return address | ||
363 | |||
364 | // release lock | ||
365 | movl r3=ia64_mca_serialize;; | ||
366 | DATA_VA_TO_PA(r3);; | ||
367 | st8.rel [r3]=r0 | ||
368 | |||
369 | br b0 | ||
370 | ;; | ||
371 | ia64_os_mca_dispatch_end: | ||
372 | //EndMain////////////////////////////////////////////////////////////////////// | ||
373 | |||
374 | |||
375 | //++ | ||
376 | // Name: | ||
377 | // ia64_os_mca_proc_state_dump() | ||
378 | // | ||
379 | // Stub Description: | ||
380 | // | ||
381 | // This stub dumps the processor state during MCHK to a data area | ||
382 | // | ||
383 | //-- | ||
384 | |||
385 | ia64_os_mca_proc_state_dump: | ||
386 | // Save bank 1 GRs 16-31 which will be used by c-language code when we switch | ||
387 | // to virtual addressing mode. | ||
388 | GET_IA64_MCA_DATA(r2) | ||
389 | ;; | ||
390 | add r2 = IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, r2 | ||
391 | ;; | ||
392 | // save ar.NaT | ||
393 | mov r5=ar.unat // ar.unat | ||
394 | |||
395 | // save banked GRs 16-31 along with NaT bits | ||
396 | bsw.1;; | ||
397 | st8.spill [r2]=r16,8;; | ||
398 | st8.spill [r2]=r17,8;; | ||
399 | st8.spill [r2]=r18,8;; | ||
400 | st8.spill [r2]=r19,8;; | ||
401 | st8.spill [r2]=r20,8;; | ||
402 | st8.spill [r2]=r21,8;; | ||
403 | st8.spill [r2]=r22,8;; | ||
404 | st8.spill [r2]=r23,8;; | ||
405 | st8.spill [r2]=r24,8;; | ||
406 | st8.spill [r2]=r25,8;; | ||
407 | st8.spill [r2]=r26,8;; | ||
408 | st8.spill [r2]=r27,8;; | ||
409 | st8.spill [r2]=r28,8;; | ||
410 | st8.spill [r2]=r29,8;; | ||
411 | st8.spill [r2]=r30,8;; | ||
412 | st8.spill [r2]=r31,8;; | ||
413 | |||
414 | mov r4=ar.unat;; | ||
415 | st8 [r2]=r4,8 // save User NaT bits for r16-r31 | ||
416 | mov ar.unat=r5 // restore original unat | ||
417 | bsw.0;; | ||
418 | |||
419 | //save BRs | ||
420 | add r4=8,r2 // duplicate r2 in r4 | ||
421 | add r6=2*8,r2 // duplicate r2 in r6 | ||
422 | |||
423 | mov r3=b0 | ||
424 | mov r5=b1 | ||
425 | mov r7=b2;; | ||
426 | st8 [r2]=r3,3*8 | ||
427 | st8 [r4]=r5,3*8 | ||
428 | st8 [r6]=r7,3*8;; | ||
429 | |||
430 | mov r3=b3 | ||
431 | mov r5=b4 | ||
432 | mov r7=b5;; | ||
433 | st8 [r2]=r3,3*8 | ||
434 | st8 [r4]=r5,3*8 | ||
435 | st8 [r6]=r7,3*8;; | ||
436 | |||
437 | mov r3=b6 | ||
438 | mov r5=b7;; | ||
439 | st8 [r2]=r3,2*8 | ||
440 | st8 [r4]=r5,2*8;; | ||
441 | |||
442 | cSaveCRs: | ||
443 | // save CRs | ||
444 | add r4=8,r2 // duplicate r2 in r4 | ||
445 | add r6=2*8,r2 // duplicate r2 in r6 | ||
446 | |||
447 | mov r3=cr.dcr | ||
448 | mov r5=cr.itm | ||
449 | mov r7=cr.iva;; | ||
450 | |||
451 | st8 [r2]=r3,8*8 | ||
452 | st8 [r4]=r5,3*8 | ||
453 | st8 [r6]=r7,3*8;; // 48 byte increments | ||
454 | |||
455 | mov r3=cr.pta;; | ||
456 | st8 [r2]=r3,8*8;; // 64 byte increments | ||
457 | |||
458 | // if PSR.ic=1, reading the interruption registers causes an illegal operation fault | ||
459 | mov r3=psr;; | ||
460 | tbit.nz.unc p6,p0=r3,PSR_IC;; // PSI Valid Log bit pos. test | ||
461 | (p6) st8 [r2]=r0,9*8+160 // 232 byte (9*8+160) increment | ||
462 | begin_skip_intr_regs: | ||
463 | (p6) br SkipIntrRegs;; | ||
464 | |||
465 | add r4=8,r2 // duplicate r2 in r4 | ||
466 | add r6=2*8,r2 // duplicate r2 in r6 | ||
467 | |||
468 | mov r3=cr.ipsr | ||
469 | mov r5=cr.isr | ||
470 | mov r7=r0;; | ||
471 | st8 [r2]=r3,3*8 | ||
472 | st8 [r4]=r5,3*8 | ||
473 | st8 [r6]=r7,3*8;; | ||
474 | |||
475 | mov r3=cr.iip | ||
476 | mov r5=cr.ifa | ||
477 | mov r7=cr.itir;; | ||
478 | st8 [r2]=r3,3*8 | ||
479 | st8 [r4]=r5,3*8 | ||
480 | st8 [r6]=r7,3*8;; | ||
481 | |||
482 | mov r3=cr.iipa | ||
483 | mov r5=cr.ifs | ||
484 | mov r7=cr.iim;; | ||
485 | st8 [r2]=r3,3*8 | ||
486 | st8 [r4]=r5,3*8 | ||
487 | st8 [r6]=r7,3*8;; | ||
488 | |||
489 | mov r3=cr25;; // cr.iha | ||
490 | st8 [r2]=r3,160;; // 160 byte increment | ||
491 | |||
492 | SkipIntrRegs: | ||
493 | st8 [r2]=r0,152;; // another 152 byte increment | ||
494 | |||
495 | add r4=8,r2 // duplicate r2 in r4 | ||
496 | add r6=2*8,r2 // duplicate r2 in r6 | ||
497 | |||
498 | mov r3=cr.lid | ||
499 | // mov r5=cr.ivr // cr.ivr, don't read it | ||
500 | mov r7=cr.tpr;; | ||
501 | st8 [r2]=r3,3*8 | ||
502 | st8 [r4]=r5,3*8 | ||
503 | st8 [r6]=r7,3*8;; | ||
504 | |||
505 | mov r3=r0 // cr.eoi => cr67 | ||
506 | mov r5=r0 // cr.irr0 => cr68 | ||
507 | mov r7=r0;; // cr.irr1 => cr69 | ||
508 | st8 [r2]=r3,3*8 | ||
509 | st8 [r4]=r5,3*8 | ||
510 | st8 [r6]=r7,3*8;; | ||
511 | |||
512 | mov r3=r0 // cr.irr2 => cr70 | ||
513 | mov r5=r0 // cr.irr3 => cr71 | ||
514 | mov r7=cr.itv;; | ||
515 | st8 [r2]=r3,3*8 | ||
516 | st8 [r4]=r5,3*8 | ||
517 | st8 [r6]=r7,3*8;; | ||
518 | |||
519 | mov r3=cr.pmv | ||
520 | mov r5=cr.cmcv;; | ||
521 | st8 [r2]=r3,7*8 | ||
522 | st8 [r4]=r5,7*8;; | ||
523 | |||
524 | mov r3=r0 // cr.lrr0 => cr80 | ||
525 | mov r5=r0;; // cr.lrr1 => cr81 | ||
526 | st8 [r2]=r3,23*8 | ||
527 | st8 [r4]=r5,23*8;; | ||
528 | |||
529 | adds r2=25*8,r2;; | ||
530 | |||
531 | cSaveARs: | ||
532 | // save ARs | ||
533 | add r4=8,r2 // duplicate r2 in r4 | ||
534 | add r6=2*8,r2 // duplicate r2 in r6 | ||
535 | |||
536 | mov r3=ar.k0 | ||
537 | mov r5=ar.k1 | ||
538 | mov r7=ar.k2;; | ||
539 | st8 [r2]=r3,3*8 | ||
540 | st8 [r4]=r5,3*8 | ||
541 | st8 [r6]=r7,3*8;; | ||
542 | |||
543 | mov r3=ar.k3 | ||
544 | mov r5=ar.k4 | ||
545 | mov r7=ar.k5;; | ||
546 | st8 [r2]=r3,3*8 | ||
547 | st8 [r4]=r5,3*8 | ||
548 | st8 [r6]=r7,3*8;; | ||
549 | |||
550 | mov r3=ar.k6 | ||
551 | mov r5=ar.k7 | ||
552 | mov r7=r0;; // ar.kr8 | ||
553 | st8 [r2]=r3,10*8 | ||
554 | st8 [r4]=r5,10*8 | ||
555 | st8 [r6]=r7,10*8;; // increment by 80 bytes (10*8) | ||
556 | |||
557 | mov r3=ar.rsc | ||
558 | mov ar.rsc=r0 // put RSE in enforced lazy mode | ||
559 | mov r5=ar.bsp | ||
560 | ;; | ||
561 | mov r7=ar.bspstore;; | ||
562 | st8 [r2]=r3,3*8 | ||
563 | st8 [r4]=r5,3*8 | ||
564 | st8 [r6]=r7,3*8;; | ||
565 | |||
566 | mov r3=ar.rnat;; | ||
567 | st8 [r2]=r3,8*13 // increment by 13x8 bytes | ||
568 | |||
569 | mov r3=ar.ccv;; | ||
570 | st8 [r2]=r3,8*4 | ||
571 | |||
572 | mov r3=ar.unat;; | ||
573 | st8 [r2]=r3,8*4 | ||
574 | |||
575 | mov r3=ar.fpsr;; | ||
576 | st8 [r2]=r3,8*4 | ||
577 | |||
578 | mov r3=ar.itc;; | ||
579 | st8 [r2]=r3,160 // 160 byte increment | ||
580 | |||
581 | mov r3=ar.pfs;; | ||
582 | st8 [r2]=r3,8 | ||
583 | |||
584 | mov r3=ar.lc;; | ||
585 | st8 [r2]=r3,8 | ||
586 | |||
587 | mov r3=ar.ec;; | ||
588 | st8 [r2]=r3 | ||
589 | add r2=8*62,r2 // padding | ||
590 | |||
591 | // save RRs | ||
592 | mov ar.lc=0x08-1 | ||
593 | movl r4=0x00;; | ||
594 | |||
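| // Each region register is selected by bits 63:61 of the operand, so | ||
| // dep.z places the loop counter (region number 0-7) into that bit field. | ||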
595 | cStRR: | ||
596 | dep.z r5=r4,61,3;; | ||
597 | mov r3=rr[r5];; | ||
598 | st8 [r2]=r3,8 | ||
599 | add r4=1,r4 | ||
600 | br.cloop.sptk.few cStRR | ||
601 | ;; | ||
602 | end_os_mca_dump: | ||
603 | br ia64_os_mca_done_dump;; | ||
604 | |||
605 | //EndStub////////////////////////////////////////////////////////////////////// | ||
606 | |||
607 | |||
608 | //++ | ||
609 | // Name: | ||
610 | // ia64_os_mca_proc_state_restore() | ||
611 | // | ||
612 | // Stub Description: | ||
613 | // | ||
614 | // This is a stub to restore the saved processor state during MCHK | ||
615 | // | ||
616 | //-- | ||
617 | |||
618 | ia64_os_mca_proc_state_restore: | ||
619 | |||
620 | // Restore bank1 GR16-31 | ||
621 | GET_IA64_MCA_DATA(r2) | ||
622 | ;; | ||
623 | add r2 = IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, r2 | ||
624 | |||
625 | restore_GRs: // restore bank-1 GRs 16-31 | ||
626 | bsw.1;; | ||
627 | add r3=16*8,r2;; // to get to NaT of GR 16-31 | ||
628 | ld8 r3=[r3];; | ||
629 | mov ar.unat=r3;; // first restore NaT | ||
630 | |||
631 | ld8.fill r16=[r2],8;; | ||
632 | ld8.fill r17=[r2],8;; | ||
633 | ld8.fill r18=[r2],8;; | ||
634 | ld8.fill r19=[r2],8;; | ||
635 | ld8.fill r20=[r2],8;; | ||
636 | ld8.fill r21=[r2],8;; | ||
637 | ld8.fill r22=[r2],8;; | ||
638 | ld8.fill r23=[r2],8;; | ||
639 | ld8.fill r24=[r2],8;; | ||
640 | ld8.fill r25=[r2],8;; | ||
641 | ld8.fill r26=[r2],8;; | ||
642 | ld8.fill r27=[r2],8;; | ||
643 | ld8.fill r28=[r2],8;; | ||
644 | ld8.fill r29=[r2],8;; | ||
645 | ld8.fill r30=[r2],8;; | ||
646 | ld8.fill r31=[r2],8;; | ||
647 | |||
648 | ld8 r3=[r2],8;; // increment to skip NaT | ||
649 | bsw.0;; | ||
650 | |||
651 | restore_BRs: | ||
652 | add r4=8,r2 // duplicate r2 in r4 | ||
653 | add r6=2*8,r2;; // duplicate r2 in r6 | ||
654 | |||
655 | ld8 r3=[r2],3*8 | ||
656 | ld8 r5=[r4],3*8 | ||
657 | ld8 r7=[r6],3*8;; | ||
658 | mov b0=r3 | ||
659 | mov b1=r5 | ||
660 | mov b2=r7;; | ||
661 | |||
662 | ld8 r3=[r2],3*8 | ||
663 | ld8 r5=[r4],3*8 | ||
664 | ld8 r7=[r6],3*8;; | ||
665 | mov b3=r3 | ||
666 | mov b4=r5 | ||
667 | mov b5=r7;; | ||
668 | |||
669 | ld8 r3=[r2],2*8 | ||
670 | ld8 r5=[r4],2*8;; | ||
671 | mov b6=r3 | ||
672 | mov b7=r5;; | ||
673 | |||
674 | restore_CRs: | ||
675 | add r4=8,r2 // duplicate r2 in r4 | ||
676 | add r6=2*8,r2;; // duplicate r2 in r6 | ||
677 | |||
678 | ld8 r3=[r2],8*8 | ||
679 | ld8 r5=[r4],3*8 | ||
680 | ld8 r7=[r6],3*8;; // 48 byte increments | ||
681 | mov cr.dcr=r3 | ||
682 | mov cr.itm=r5 | ||
683 | mov cr.iva=r7;; | ||
684 | |||
685 | ld8 r3=[r2],8*8;; // 64 byte increments | ||
686 | // mov cr.pta=r3 | ||
687 | |||
688 | |||
689 | // if PSR.ic=1, writing the interruption registers causes an illegal operation fault | ||
690 | mov r3=psr;; | ||
691 | tbit.nz.unc p6,p0=r3,PSR_IC;; // PSI Valid Log bit pos. test | ||
692 | (p6) st8 [r2]=r0,9*8+160 // 232 byte (9*8+160) increment | ||
693 | |||
694 | begin_rskip_intr_regs: | ||
695 | (p6) br rSkipIntrRegs;; | ||
696 | |||
697 | add r4=8,r2 // duplicate r2 in r4 | ||
698 | add r6=2*8,r2;; // duplicate r2 in r6 | ||
699 | |||
700 | ld8 r3=[r2],3*8 | ||
701 | ld8 r5=[r4],3*8 | ||
702 | ld8 r7=[r6],3*8;; | ||
703 | mov cr.ipsr=r3 | ||
704 | // mov cr.isr=r5 // cr.isr is read only | ||
705 | |||
706 | ld8 r3=[r2],3*8 | ||
707 | ld8 r5=[r4],3*8 | ||
708 | ld8 r7=[r6],3*8;; | ||
709 | mov cr.iip=r3 | ||
710 | mov cr.ifa=r5 | ||
711 | mov cr.itir=r7;; | ||
712 | |||
713 | ld8 r3=[r2],3*8 | ||
714 | ld8 r5=[r4],3*8 | ||
715 | ld8 r7=[r6],3*8;; | ||
716 | mov cr.iipa=r3 | ||
717 | mov cr.ifs=r5 | ||
718 | mov cr.iim=r7 | ||
719 | |||
720 | ld8 r3=[r2],160;; // 160 byte increment | ||
721 | mov cr.iha=r3 | ||
722 | |||
723 | rSkipIntrRegs: | ||
724 | ld8 r3=[r2],152;; // another 152 byte inc. | ||
725 | |||
726 | add r4=8,r2 // duplicate r2 in r4 | ||
727 | add r6=2*8,r2;; // duplicate r2 in r6 | ||
728 | |||
729 | ld8 r3=[r2],8*3 | ||
730 | ld8 r5=[r4],8*3 | ||
731 | ld8 r7=[r6],8*3;; | ||
732 | mov cr.lid=r3 | ||
733 | // mov cr.ivr=r5 // cr.ivr is read only | ||
734 | mov cr.tpr=r7;; | ||
735 | |||
736 | ld8 r3=[r2],8*3 | ||
737 | ld8 r5=[r4],8*3 | ||
738 | ld8 r7=[r6],8*3;; | ||
739 | // mov cr.eoi=r3 | ||
740 | // mov cr.irr0=r5 // cr.irr0 is read only | ||
741 | // mov cr.irr1=r7;; // cr.irr1 is read only | ||
742 | |||
743 | ld8 r3=[r2],8*3 | ||
744 | ld8 r5=[r4],8*3 | ||
745 | ld8 r7=[r6],8*3;; | ||
746 | // mov cr.irr2=r3 // cr.irr2 is read only | ||
747 | // mov cr.irr3=r5 // cr.irr3 is read only | ||
748 | mov cr.itv=r7;; | ||
749 | |||
750 | ld8 r3=[r2],8*7 | ||
751 | ld8 r5=[r4],8*7;; | ||
752 | mov cr.pmv=r3 | ||
753 | mov cr.cmcv=r5;; | ||
754 | |||
755 | ld8 r3=[r2],8*23 | ||
756 | ld8 r5=[r4],8*23;; | ||
757 | adds r2=8*23,r2 | ||
758 | adds r4=8*23,r4;; | ||
759 | // mov cr.lrr0=r3 | ||
760 | // mov cr.lrr1=r5 | ||
761 | |||
762 | adds r2=8*2,r2;; | ||
763 | |||
764 | restore_ARs: | ||
765 | add r4=8,r2 // duplicate r2 in r4 | ||
766 | add r6=2*8,r2;; // duplicate r2 in r6 | ||
767 | |||
768 | ld8 r3=[r2],3*8 | ||
769 | ld8 r5=[r4],3*8 | ||
770 | ld8 r7=[r6],3*8;; | ||
771 | mov ar.k0=r3 | ||
772 | mov ar.k1=r5 | ||
773 | mov ar.k2=r7;; | ||
774 | |||
775 | ld8 r3=[r2],3*8 | ||
776 | ld8 r5=[r4],3*8 | ||
777 | ld8 r7=[r6],3*8;; | ||
778 | mov ar.k3=r3 | ||
779 | mov ar.k4=r5 | ||
780 | mov ar.k5=r7;; | ||
781 | |||
782 | ld8 r3=[r2],10*8 | ||
783 | ld8 r5=[r4],10*8 | ||
784 | ld8 r7=[r6],10*8;; | ||
785 | mov ar.k6=r3 | ||
786 | mov ar.k7=r5 | ||
787 | ;; | ||
788 | |||
789 | ld8 r3=[r2],3*8 | ||
790 | ld8 r5=[r4],3*8 | ||
791 | ld8 r7=[r6],3*8;; | ||
792 | // mov ar.rsc=r3 | ||
793 | // mov ar.bsp=r5 // ar.bsp is read only | ||
794 | mov ar.rsc=r0 // make sure that RSE is in enforced lazy mode | ||
795 | ;; | ||
796 | mov ar.bspstore=r7;; | ||
797 | |||
798 | ld8 r9=[r2],8*13;; | ||
799 | mov ar.rnat=r9 | ||
800 | |||
801 | mov ar.rsc=r3 | ||
802 | ld8 r3=[r2],8*4;; | ||
803 | mov ar.ccv=r3 | ||
804 | |||
805 | ld8 r3=[r2],8*4;; | ||
806 | mov ar.unat=r3 | ||
807 | |||
808 | ld8 r3=[r2],8*4;; | ||
809 | mov ar.fpsr=r3 | ||
810 | |||
811 | ld8 r3=[r2],160;; // 160 byte increment | ||
812 | // mov ar.itc=r3 | ||
813 | |||
814 | ld8 r3=[r2],8;; | ||
815 | mov ar.pfs=r3 | ||
816 | |||
817 | ld8 r3=[r2],8;; | ||
818 | mov ar.lc=r3 | ||
819 | |||
820 | ld8 r3=[r2];; | ||
821 | mov ar.ec=r3 | ||
822 | add r2=8*62,r2;; // padding | ||
823 | |||
824 | restore_RRs: | ||
825 | mov r5=ar.lc | ||
826 | mov ar.lc=0x08-1 | ||
827 | movl r4=0x00;; | ||
828 | cStRRr: | ||
829 | dep.z r7=r4,61,3 | ||
830 | ld8 r3=[r2],8;; | ||
831 | mov rr[r7]=r3 // what are its access privileges? | ||
832 | add r4=1,r4 | ||
833 | br.cloop.sptk.few cStRRr | ||
834 | ;; | ||
835 | mov ar.lc=r5 | ||
836 | ;; | ||
837 | end_os_mca_restore: | ||
838 | br ia64_os_mca_done_restore;; | ||
839 | |||
840 | //EndStub////////////////////////////////////////////////////////////////////// | ||
841 | |||
842 | |||
843 | // ok, the issue here is that we need to save state information so | ||
844 | // it can be usable by the kernel debugger and the show-regs routines. | ||
845 | // In order to do this, our best bet is to save the current state (plus | ||
846 | // the state information obtained from the MIN_STATE_AREA) into a pt_regs | ||
847 | // format. This way we can pass it on in a usable format. | ||
848 | // | ||
849 | |||
850 | // | ||
851 | // SAL to OS entry point for INIT on the monarch processor | ||
852 | // This has been defined for registration purposes with SAL | ||
853 | // as a part of ia64_mca_init. | ||
854 | // | ||
855 | // When we get here, the following registers have been | ||
856 | // set by the SAL for our use | ||
857 | // | ||
858 | // 1. GR1 = OS INIT GP | ||
859 | // 2. GR8 = PAL_PROC physical address | ||
860 | // 3. GR9 = SAL_PROC physical address | ||
861 | // 4. GR10 = SAL GP (physical) | ||
862 | // 5. GR11 = Init Reason | ||
863 | // 0 = Received INIT for event other than crash dump switch | ||
864 | // 1 = Received wakeup at the end of an OS_MCA corrected machine check | ||
865 | // 2 = Received INIT due to CrashDump switch assertion | ||
866 | // | ||
867 | // 6. GR12 = Return address to location within SAL_INIT procedure | ||
868 | |||
869 | |||
870 | GLOBAL_ENTRY(ia64_monarch_init_handler) | ||
871 | .prologue | ||
872 | // stash the information the SAL passed to the OS | ||
873 | SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(r2) | ||
874 | ;; | ||
875 | SAVE_MIN_WITH_COVER | ||
876 | ;; | ||
877 | mov r8=cr.ifa | ||
878 | mov r9=cr.isr | ||
879 | adds r3=8,r2 // set up second base pointer | ||
880 | ;; | ||
881 | SAVE_REST | ||
882 | |||
883 | // ok, enough should be saved at this point to be dangerous, and supply | ||
884 | // information for a dump | ||
885 | // We need to switch to Virtual mode before hitting the C functions. | ||
886 | |||
887 | movl r2=IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN | ||
888 | mov r3=psr // get the current psr, minimum enabled at this point | ||
889 | ;; | ||
890 | or r2=r2,r3 | ||
891 | ;; | ||
892 | movl r3=IVirtual_Switch | ||
893 | ;; | ||
894 | mov cr.iip=r3 // short return to set the appropriate bits | ||
895 | mov cr.ipsr=r2 // need to do an rfi to set appropriate bits | ||
896 | ;; | ||
897 | rfi | ||
898 | ;; | ||
899 | IVirtual_Switch: | ||
900 | // | ||
901 | // We should now be running virtual | ||
902 | // | ||
903 | // Let's call the C handler to get the rest of the state info | ||
904 | // | ||
905 | alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!) | ||
906 | ;; | ||
907 | adds out0=16,sp // out0 = pointer to pt_regs | ||
908 | ;; | ||
909 | DO_SAVE_SWITCH_STACK | ||
910 | .body | ||
911 | adds out1=16,sp // out1 = pointer to switch_stack | ||
912 | |||
913 | br.call.sptk.many rp=ia64_init_handler | ||
914 | .ret1: | ||
915 | |||
916 | return_from_init: | ||
917 | br.sptk return_from_init | ||
918 | END(ia64_monarch_init_handler) | ||
919 | |||
920 | // | ||
921 | // SAL to OS entry point for INIT on the slave processor | ||
922 | // This has been defined for registration purposes with SAL | ||
923 | // as a part of ia64_mca_init. | ||
924 | // | ||
925 | |||
926 | GLOBAL_ENTRY(ia64_slave_init_handler) | ||
927 | 1: br.sptk 1b | ||
928 | END(ia64_slave_init_handler) | ||
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c new file mode 100644 index 000000000000..ab478172c349 --- /dev/null +++ b/arch/ia64/kernel/mca_drv.c | |||
@@ -0,0 +1,639 @@ | |||
1 | /* | ||
2 | * File: mca_drv.c | ||
3 | * Purpose: Generic MCA handling layer | ||
4 | * | ||
5 | * Copyright (C) 2004 FUJITSU LIMITED | ||
6 | * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) | ||
7 | */ | ||
8 | #include <linux/config.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | #include <linux/irq.h> | ||
14 | #include <linux/kallsyms.h> | ||
15 | #include <linux/smp_lock.h> | ||
16 | #include <linux/bootmem.h> | ||
17 | #include <linux/acpi.h> | ||
18 | #include <linux/timer.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/smp.h> | ||
22 | #include <linux/workqueue.h> | ||
23 | #include <linux/mm.h> | ||
24 | |||
25 | #include <asm/delay.h> | ||
26 | #include <asm/machvec.h> | ||
27 | #include <asm/page.h> | ||
28 | #include <asm/ptrace.h> | ||
29 | #include <asm/system.h> | ||
30 | #include <asm/sal.h> | ||
31 | #include <asm/mca.h> | ||
32 | |||
33 | #include <asm/irq.h> | ||
34 | #include <asm/hw_irq.h> | ||
35 | |||
36 | #include "mca_drv.h" | ||
37 | |||
38 | /* max size of SAL error record (default) */ | ||
39 | static int sal_rec_max = 10000; | ||
40 | |||
41 | /* from mca.c */ | ||
42 | static ia64_mca_sal_to_os_state_t *sal_to_os_handoff_state; | ||
43 | static ia64_mca_os_to_sal_state_t *os_to_sal_handoff_state; | ||
44 | |||
45 | /* from mca_drv_asm.S */ | ||
46 | extern void *mca_handler_bhhook(void); | ||
47 | |||
48 | static DEFINE_SPINLOCK(mca_bh_lock); | ||
49 | |||
50 | typedef enum { | ||
51 | MCA_IS_LOCAL = 0, | ||
52 | MCA_IS_GLOBAL = 1 | ||
53 | } mca_type_t; | ||
54 | |||
55 | #define MAX_PAGE_ISOLATE 1024 | ||
56 | |||
57 | static struct page *page_isolate[MAX_PAGE_ISOLATE]; | ||
58 | static int num_page_isolate = 0; | ||
59 | |||
60 | typedef enum { | ||
61 | ISOLATE_NG = 0, | ||
62 | ISOLATE_OK = 1 | ||
63 | } isolate_status_t; | ||
64 | |||
65 | /* | ||
66 |  * This pool keeps pointers to the section parts of SAL error records | ||
67 | */ | ||
68 | static struct { | ||
69 | slidx_list_t *buffer; /* section pointer list pool */ | ||
70 | int cur_idx; /* Current index of section pointer list pool */ | ||
71 | int max_idx; /* Maximum index of section pointer list pool */ | ||
72 | } slidx_pool; | ||
73 | |||
74 | /** | ||
75 | * mca_page_isolate - isolate a poisoned page in order not to use it later | ||
76 | * @paddr: poisoned memory location | ||
77 | * | ||
78 | * Return value: | ||
79 | * ISOLATE_OK / ISOLATE_NG | ||
80 | */ | ||
81 | |||
82 | static isolate_status_t | ||
83 | mca_page_isolate(unsigned long paddr) | ||
84 | { | ||
85 | int i; | ||
86 | struct page *p; | ||
87 | |||
88 | /* whether physical address is valid or not */ | ||
89 | if ( !ia64_phys_addr_valid(paddr) ) | ||
90 | return ISOLATE_NG; | ||
91 | |||
92 | /* convert the physical address to its page descriptor */ | ||
93 | p = pfn_to_page(paddr>>PAGE_SHIFT); | ||
94 | |||
95 | /* check whether this page has already been registered */ | ||
96 | for( i = 0; i < num_page_isolate; i++ ) | ||
97 | if( page_isolate[i] == p ) | ||
98 | return ISOLATE_OK; /* already listed */ | ||
99 | |||
100 | /* limitation check */ | ||
101 | if( num_page_isolate == MAX_PAGE_ISOLATE ) | ||
102 | return ISOLATE_NG; | ||
103 | |||
104 | /* reject pages marked 'SLAB' or 'Reserved' */ | ||
105 | if( PageSlab(p) || PageReserved(p) ) | ||
106 | return ISOLATE_NG; | ||
107 | |||
108 | /* add attribute 'Reserved' and register the page */ | ||
109 | SetPageReserved(p); | ||
110 | page_isolate[num_page_isolate++] = p; | ||
111 | |||
112 | return ISOLATE_OK; | ||
113 | } | ||
114 | |||
115 | /** | ||
116 |  * mca_handler_bh - Kill the process that hit a memory read error | ||
117 | * @paddr: poisoned address received from MCA Handler | ||
118 | */ | ||
119 | |||
120 | void | ||
121 | mca_handler_bh(unsigned long paddr) | ||
122 | { | ||
123 | printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n", | ||
124 | current->pid, current->comm); | ||
125 | |||
126 | spin_lock(&mca_bh_lock); | ||
127 | if (mca_page_isolate(paddr) == ISOLATE_OK) { | ||
128 | printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr); | ||
129 | } else { | ||
130 | printk(KERN_DEBUG "Page isolation: ( %lx ) failure.\n", paddr); | ||
131 | } | ||
132 | spin_unlock(&mca_bh_lock); | ||
133 | |||
134 | /* This process is about to be killed itself */ | ||
135 | force_sig(SIGKILL, current); | ||
136 | schedule(); | ||
137 | } | ||
138 | |||
139 | /** | ||
140 | * mca_make_peidx - Make index of processor error section | ||
141 | * @slpi: pointer to record of processor error section | ||
142 | * @peidx: pointer to index of processor error section | ||
143 | */ | ||
144 | |||
145 | static void | ||
146 | mca_make_peidx(sal_log_processor_info_t *slpi, peidx_table_t *peidx) | ||
147 | { | ||
148 | /* | ||
149 | * calculate the start address of | ||
150 | * "struct cpuid_info" and "sal_processor_static_info_t". | ||
151 | */ | ||
152 | u64 total_check_num = slpi->valid.num_cache_check | ||
153 | + slpi->valid.num_tlb_check | ||
154 | + slpi->valid.num_bus_check | ||
155 | + slpi->valid.num_reg_file_check | ||
156 | + slpi->valid.num_ms_check; | ||
157 | u64 head_size = sizeof(sal_log_mod_error_info_t) * total_check_num | ||
158 | + sizeof(sal_log_processor_info_t); | ||
159 | u64 mid_size = slpi->valid.cpuid_info * sizeof(struct sal_cpuid_info); | ||
160 | |||
161 | peidx_head(peidx) = slpi; | ||
162 | peidx_mid(peidx) = (struct sal_cpuid_info *) | ||
163 | (slpi->valid.cpuid_info ? ((char*)slpi + head_size) : NULL); | ||
164 | peidx_bottom(peidx) = (sal_processor_static_info_t *) | ||
165 | (slpi->valid.psi_static_struct ? | ||
166 | ((char*)slpi + head_size + mid_size) : NULL); | ||
167 | } | ||
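| |||
| /* | ||
|  * Worked example (field counts assumed for illustration): a record with | ||
|  * num_cache_check=1, num_bus_check=1 and the other counts zero gives | ||
|  * total_check_num = 2, so | ||
|  *	head_size = 2 * sizeof(sal_log_mod_error_info_t) | ||
|  *		  + sizeof(sal_log_processor_info_t); | ||
|  * the cpuid block (when valid) then starts at (char*)slpi + head_size, | ||
|  * and the static info block right after it at head_size + mid_size. | ||
|  */ | ||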
168 | |||
169 | /** | ||
170 | * mca_make_slidx - Make index of SAL error record | ||
171 | * @buffer: pointer to SAL error record | ||
172 | * @slidx: pointer to index of SAL error record | ||
173 | * | ||
174 | * Return value: | ||
175 | * 1 if record has platform error / 0 if not | ||
176 | */ | ||
177 | #define LOG_INDEX_ADD_SECT_PTR(sect, ptr) \ | ||
178 | { slidx_list_t *hl = &slidx_pool.buffer[slidx_pool.cur_idx]; \ | ||
179 | hl->hdr = ptr; \ | ||
180 | list_add(&hl->list, &(sect)); \ | ||
181 | slidx_pool.cur_idx = (slidx_pool.cur_idx + 1)%slidx_pool.max_idx; } | ||
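| |||
| /* | ||
|  * For readability, the statement macro above behaves roughly like the | ||
|  * following function would (a sketch for illustration only, compiled out): | ||
|  */ | ||
| #if 0 | ||
| static void log_index_add_sect_ptr(struct list_head *sect, sal_log_section_hdr_t *ptr) | ||
| { | ||
| 	/* take the next preallocated node, treating the pool as a ring */ | ||
| 	slidx_list_t *hl = &slidx_pool.buffer[slidx_pool.cur_idx]; | ||
| |||
| 	hl->hdr = ptr; | ||
| 	list_add(&hl->list, sect); | ||
| 	slidx_pool.cur_idx = (slidx_pool.cur_idx + 1) % slidx_pool.max_idx; | ||
| } | ||
| #endif | ||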
182 | |||
183 | static int | ||
184 | mca_make_slidx(void *buffer, slidx_table_t *slidx) | ||
185 | { | ||
186 | int platform_err = 0; | ||
187 | int record_len = ((sal_log_record_header_t*)buffer)->len; | ||
188 | u32 ercd_pos; | ||
189 | int sects; | ||
190 | sal_log_section_hdr_t *sp; | ||
191 | |||
192 | /* | ||
193 |  * Initialize the index referring to the current record | ||
194 | */ | ||
195 | INIT_LIST_HEAD(&(slidx->proc_err)); | ||
196 | INIT_LIST_HEAD(&(slidx->mem_dev_err)); | ||
197 | INIT_LIST_HEAD(&(slidx->sel_dev_err)); | ||
198 | INIT_LIST_HEAD(&(slidx->pci_bus_err)); | ||
199 | INIT_LIST_HEAD(&(slidx->smbios_dev_err)); | ||
200 | INIT_LIST_HEAD(&(slidx->pci_comp_err)); | ||
201 | INIT_LIST_HEAD(&(slidx->plat_specific_err)); | ||
202 | INIT_LIST_HEAD(&(slidx->host_ctlr_err)); | ||
203 | INIT_LIST_HEAD(&(slidx->plat_bus_err)); | ||
204 | INIT_LIST_HEAD(&(slidx->unsupported)); | ||
205 | |||
206 | /* | ||
207 | * Extract a Record Header | ||
208 | */ | ||
209 | slidx->header = buffer; | ||
210 | |||
211 | /* | ||
212 |  * Extract the individual section records | ||
213 | * (arranged from "int ia64_log_platform_info_print()") | ||
214 | */ | ||
215 | for (ercd_pos = sizeof(sal_log_record_header_t), sects = 0; | ||
216 | ercd_pos < record_len; ercd_pos += sp->len, sects++) { | ||
217 | sp = (sal_log_section_hdr_t *)((char*)buffer + ercd_pos); | ||
218 | if (!efi_guidcmp(sp->guid, SAL_PROC_DEV_ERR_SECT_GUID)) { | ||
219 | LOG_INDEX_ADD_SECT_PTR(slidx->proc_err, sp); | ||
220 | } else if (!efi_guidcmp(sp->guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID)) { | ||
221 | platform_err = 1; | ||
222 | LOG_INDEX_ADD_SECT_PTR(slidx->mem_dev_err, sp); | ||
223 | } else if (!efi_guidcmp(sp->guid, SAL_PLAT_SEL_DEV_ERR_SECT_GUID)) { | ||
224 | platform_err = 1; | ||
225 | LOG_INDEX_ADD_SECT_PTR(slidx->sel_dev_err, sp); | ||
226 | } else if (!efi_guidcmp(sp->guid, SAL_PLAT_PCI_BUS_ERR_SECT_GUID)) { | ||
227 | platform_err = 1; | ||
228 | LOG_INDEX_ADD_SECT_PTR(slidx->pci_bus_err, sp); | ||
229 | } else if (!efi_guidcmp(sp->guid, SAL_PLAT_SMBIOS_DEV_ERR_SECT_GUID)) { | ||
230 | platform_err = 1; | ||
231 | LOG_INDEX_ADD_SECT_PTR(slidx->smbios_dev_err, sp); | ||
232 | } else if (!efi_guidcmp(sp->guid, SAL_PLAT_PCI_COMP_ERR_SECT_GUID)) { | ||
233 | platform_err = 1; | ||
234 | LOG_INDEX_ADD_SECT_PTR(slidx->pci_comp_err, sp); | ||
235 | } else if (!efi_guidcmp(sp->guid, SAL_PLAT_SPECIFIC_ERR_SECT_GUID)) { | ||
236 | platform_err = 1; | ||
237 | LOG_INDEX_ADD_SECT_PTR(slidx->plat_specific_err, sp); | ||
238 | } else if (!efi_guidcmp(sp->guid, SAL_PLAT_HOST_CTLR_ERR_SECT_GUID)) { | ||
239 | platform_err = 1; | ||
240 | LOG_INDEX_ADD_SECT_PTR(slidx->host_ctlr_err, sp); | ||
241 | } else if (!efi_guidcmp(sp->guid, SAL_PLAT_BUS_ERR_SECT_GUID)) { | ||
242 | platform_err = 1; | ||
243 | LOG_INDEX_ADD_SECT_PTR(slidx->plat_bus_err, sp); | ||
244 | } else { | ||
245 | LOG_INDEX_ADD_SECT_PTR(slidx->unsupported, sp); | ||
246 | } | ||
247 | } | ||
248 | slidx->n_sections = sects; | ||
249 | |||
250 | return platform_err; | ||
251 | } | ||
252 | |||
253 | /** | ||
254 | * init_record_index_pools - Initialize pool of lists for SAL record index | ||
255 | * | ||
256 | * Return value: | ||
257 | * 0 on Success / -ENOMEM on Failure | ||
258 | */ | ||
259 | static int | ||
260 | init_record_index_pools(void) | ||
261 | { | ||
262 | int i; | ||
263 | int rec_max_size; /* Maximum size of SAL error records */ | ||
264 | int sect_min_size; /* Minimum size of SAL error sections */ | ||
265 | /* minimum size table of each section */ | ||
266 | static int sal_log_sect_min_sizes[] = { | ||
267 | sizeof(sal_log_processor_info_t) + sizeof(sal_processor_static_info_t), | ||
268 | sizeof(sal_log_mem_dev_err_info_t), | ||
269 | sizeof(sal_log_sel_dev_err_info_t), | ||
270 | sizeof(sal_log_pci_bus_err_info_t), | ||
271 | sizeof(sal_log_smbios_dev_err_info_t), | ||
272 | sizeof(sal_log_pci_comp_err_info_t), | ||
273 | sizeof(sal_log_plat_specific_err_info_t), | ||
274 | sizeof(sal_log_host_ctlr_err_info_t), | ||
275 | sizeof(sal_log_plat_bus_err_info_t), | ||
276 | }; | ||
277 | |||
278 | /* | ||
279 |  * The MCA handler cannot allocate new memory on the fly, | ||
280 |  * so we preallocate enough memory to handle a SAL record. | ||
281 |  * | ||
282 |  * Initialize the slidx_pool as follows: | ||
283 |  * 	1. Pick the maximum size of SAL error records | ||
284 |  * 	2. Pick the minimum size of SAL error sections | ||
285 |  * 	3. Allocate a pool large enough for two SAL records | ||
286 |  * 	(from 1. and 2. we can bound the number of sections per record.) | ||
287 | */ | ||
288 | |||
289 | /* - 1 - */ | ||
290 | rec_max_size = sal_rec_max; | ||
291 | |||
292 | /* - 2 - */ | ||
293 | sect_min_size = sal_log_sect_min_sizes[0]; | ||
294 | for (i = 1; i < ARRAY_SIZE(sal_log_sect_min_sizes); i++) | ||
295 | if (sect_min_size > sal_log_sect_min_sizes[i]) | ||
296 | sect_min_size = sal_log_sect_min_sizes[i]; | ||
297 | |||
298 | /* - 3 - */ | ||
299 | slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1; | ||
300 | slidx_pool.buffer = (slidx_list_t *) kmalloc(slidx_pool.max_idx * sizeof(slidx_list_t), GFP_KERNEL); | ||
301 | |||
302 | return slidx_pool.buffer ? 0 : -ENOMEM; | ||
303 | } | ||
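| |||
| /* | ||
|  * Sizing example (the 48-byte minimum is assumed for illustration): | ||
|  * with the default sal_rec_max of 10000 bytes and a smallest section | ||
|  * of 48 bytes, max_idx = (10000 / 48) * 2 + 1 = 417 list nodes -- | ||
|  * enough index entries for two worst-case records plus one spare. | ||
|  */ | ||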
304 | |||
305 | |||
306 | /***************************************************************************** | ||
307 | * Recovery functions * | ||
308 | *****************************************************************************/ | ||
309 | |||
310 | /** | ||
311 | * is_mca_global - Check whether this MCA is global or not | ||
312 |  * @peidx: pointer to the index of the processor error section | ||
313 | * @pbci: pointer to pal_bus_check_info_t | ||
314 | * | ||
315 | * Return value: | ||
316 | * MCA_IS_LOCAL / MCA_IS_GLOBAL | ||
317 | */ | ||
318 | |||
319 | static mca_type_t | ||
320 | is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci) | ||
321 | { | ||
322 | pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx); | ||
323 | |||
324 | /* | ||
325 |  * PAL can request a rendezvous if the MCA has a global scope. | ||
326 |  * If the "rz_always" flag is set, SAL requests an MCA rendezvous | ||
327 |  * even for a local MCA. | ||
328 |  * Therefore, if no rendezvous was requested, the MCA is local. | ||
329 |  * If the rendezvous failed, the system must be brought down. | ||
330 | */ | ||
331 | switch (sal_to_os_handoff_state->imsto_rendez_state) { | ||
332 | case -1: /* SAL rendezvous unsuccessful */ | ||
333 | return MCA_IS_GLOBAL; | ||
334 | case 0: /* SAL rendezvous not required */ | ||
335 | return MCA_IS_LOCAL; | ||
336 | case 1: /* SAL rendezvous successful int */ | ||
337 | case 2: /* SAL rendezvous successful int with init */ | ||
338 | default: | ||
339 | break; | ||
340 | } | ||
341 | |||
342 | /* | ||
343 |  * If one or more Cache/TLB/Reg_File/Uarch checks are present, | ||
344 |  * it is a local MCA (i.e. a processor-internal error). | ||
345 | */ | ||
346 | if (psp->tc || psp->cc || psp->rc || psp->uc) | ||
347 | return MCA_IS_LOCAL; | ||
348 | |||
349 | /* | ||
350 | * Bus_Check structure with Bus_Check.ib (internal bus error) flag set | ||
351 | * would be a global MCA. (e.g. a system bus address parity error) | ||
352 | */ | ||
353 | if (!pbci || pbci->ib) | ||
354 | return MCA_IS_GLOBAL; | ||
355 | |||
356 | /* | ||
357 | * Bus_Check structure with Bus_Check.eb (external bus error) flag set | ||
358 | * could be either a local MCA or a global MCA. | ||
359 | * | ||
360 |  * Referring to Bus_Check.bsi: | ||
361 | * 0: Unknown/unclassified | ||
362 | * 1: BERR# | ||
363 | * 2: BINIT# | ||
364 | * 3: Hard Fail | ||
365 | * (FIXME: Are these SGI specific or generic bsi values?) | ||
366 | */ | ||
367 | if (pbci->eb) | ||
368 | switch (pbci->bsi) { | ||
369 | case 0: | ||
370 | /* e.g. a load from poisoned memory */ | ||
371 | return MCA_IS_LOCAL; | ||
372 | case 1: | ||
373 | case 2: | ||
374 | case 3: | ||
375 | return MCA_IS_GLOBAL; | ||
376 | } | ||
377 | |||
378 | return MCA_IS_GLOBAL; | ||
379 | } | ||
380 | |||
381 | /** | ||
382 |  * recover_from_read_error - Try to recover from errors of type "read". | ||
383 |  * @slidx: pointer to the index of the SAL error record | ||
384 |  * @peidx: pointer to the index of the processor error section | ||
385 |  * @pbci: pointer to the pal_bus_check_info | ||
386 | * | ||
387 | * Return value: | ||
388 | * 1 on Success / 0 on Failure | ||
389 | */ | ||
390 | |||
391 | static int | ||
392 | recover_from_read_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci) | ||
393 | { | ||
394 | sal_log_mod_error_info_t *smei; | ||
395 | pal_min_state_area_t *pmsa; | ||
396 | struct ia64_psr *psr1, *psr2; | ||
397 | ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook; | ||
398 | |||
399 | /* Is target address valid? */ | ||
400 | if (!pbci->tv) | ||
401 | return 0; | ||
402 | |||
403 | /* | ||
404 | * cpu read or memory-mapped io read | ||
405 | * | ||
406 | * offending process affected process OS MCA do | ||
407 | * kernel mode kernel mode down system | ||
408 | * kernel mode user mode kill the process | ||
409 | * user mode kernel mode down system (*) | ||
410 | * user mode user mode kill the process | ||
411 | * | ||
412 |  * (*) You could terminate the offending user-mode process | ||
413 |  *     if (pbci->pv && pbci->pl != 0) *and* if you are sure | ||
414 |  *     the process does not hold any kernel locks. | ||
415 | */ | ||
416 | |||
417 | psr1 = (struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr); | ||
418 | |||
419 | /* | ||
420 | * Check the privilege level of interrupted context. | ||
421 | * If it is user-mode, then terminate affected process. | ||
422 | */ | ||
423 | if (psr1->cpl != 0) { | ||
424 | smei = peidx_bus_check(peidx, 0); | ||
425 | if (smei->valid.target_identifier) { | ||
426 | /* | ||
427 | * setup for resume to bottom half of MCA, | ||
428 | * "mca_handler_bhhook" | ||
429 | */ | ||
430 | pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61)); | ||
431 | /* pass to bhhook as 1st argument (gr8) */ | ||
432 | pmsa->pmsa_gr[8-1] = smei->target_identifier; | ||
433 | /* save the interrupted iip as the return address (not actually used) */ | ||
434 | pmsa->pmsa_br0 = pmsa->pmsa_iip; | ||
435 | /* change resume address to bottom half */ | ||
436 | pmsa->pmsa_iip = mca_hdlr_bh->fp; | ||
437 | pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp; | ||
438 | /* set cpl to kernel mode */ | ||
439 | psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr; | ||
440 | psr2->cpl = 0; | ||
441 | psr2->ri = 0; | ||
442 | |||
443 | return 1; | ||
444 | } | ||
445 | |||
446 | } | ||
447 | |||
448 | return 0; | ||
449 | } | ||
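| |||
| /* | ||
|  * In effect, the min-state rewrite above makes the rfi that ends the | ||
|  * MCA handler resume as if the interrupted context had done | ||
|  * (conceptual sketch only): | ||
|  * | ||
|  *	gr8 = smei->target_identifier;	// poisoned address for the hook | ||
|  *	gr1 = gp of mca_handler_bhhook;	// callee's global pointer | ||
|  *	branch to mca_handler_bhhook at cpl 0, slot 0; | ||
|  * | ||
|  * so execution continues in mca_handler_bhhook() (mca_drv_asm.S), which | ||
|  * calls mca_handler_bh() to isolate the page and kill the process. | ||
|  */ | ||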
450 | |||
451 | /** | ||
452 | * recover_from_platform_error - Recover from platform error. | ||
453 |  * @slidx: pointer to the index of the SAL error record | ||
454 |  * @peidx: pointer to the index of the processor error section | ||
455 |  * @pbci: pointer to the pal_bus_check_info | ||
456 | * | ||
457 | * Return value: | ||
458 | * 1 on Success / 0 on Failure | ||
459 | */ | ||
460 | |||
461 | static int | ||
462 | recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci) | ||
463 | { | ||
464 | int status = 0; | ||
465 | pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx); | ||
466 | |||
467 | if (psp->bc && pbci->eb && pbci->bsi == 0) { | ||
468 | switch(pbci->type) { | ||
469 | case 1: /* partial read */ | ||
470 | case 3: /* full line(cpu) read */ | ||
471 | case 9: /* I/O space read */ | ||
472 | status = recover_from_read_error(slidx, peidx, pbci); | ||
473 | break; | ||
474 | case 0: /* unknown */ | ||
475 | case 2: /* partial write */ | ||
476 | case 4: /* full line write */ | ||
477 | case 5: /* implicit or explicit write-back operation */ | ||
478 | case 6: /* snoop probe */ | ||
479 | case 7: /* incoming or outgoing ptc.g */ | ||
480 | case 8: /* write coalescing transactions */ | ||
481 | case 10: /* I/O space write */ | ||
482 | case 11: /* inter-processor interrupt message(IPI) */ | ||
483 | case 12: /* interrupt acknowledge or external task priority cycle */ | ||
484 | default: | ||
485 | break; | ||
486 | } | ||
487 | } | ||
488 | |||
489 | return status; | ||
490 | } | ||
491 | |||
492 | /** | ||
493 | * recover_from_processor_error | ||
494 |  * @platform: whether any platform error sections are present | ||
495 |  * @slidx: pointer to the index of the SAL error record | ||
496 |  * @peidx: pointer to the index of the processor error section | ||
497 |  * @pbci: pointer to the pal_bus_check_info | ||
498 | * | ||
499 | * Return value: | ||
500 | * 1 on Success / 0 on Failure | ||
501 | */ | ||
502 | /* | ||
503 |  * We try to recover below only when all of these conditions hold: | ||
504 |  * 1. Exactly one processor error section exists. | ||
505 |  * 2. A BUS_CHECK exists and no other checks do (except TLB_CHECK). | ||
506 |  * 3. There is exactly one BUS_CHECK_INFO entry. | ||
507 |  * 4. The "external bus error" flag is set and the other flags are not. | ||
508 | */ | ||
509 | |||
510 | static int | ||
511 | recover_from_processor_error(int platform, slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci) | ||
512 | { | ||
513 | pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx); | ||
514 | |||
515 | /* | ||
516 |  * We cannot recover from errors other than bus checks. | ||
517 | */ | ||
518 | if (psp->cc || psp->rc || psp->uc) | ||
519 | return 0; | ||
520 | |||
521 | /* | ||
522 |  * If there is no bus error, the record is odd, but there is nothing to recover. | ||
523 | */ | ||
524 | if (psp->bc == 0 || pbci == NULL) | ||
525 | return 1; | ||
526 | |||
527 | /* | ||
528 |  * Sorry, we cannot handle more than one bus check. | ||
529 | */ | ||
530 | if (peidx_bus_check_num(peidx) > 1) | ||
531 | return 0; | ||
532 | /* | ||
533 | * Well, here is only one bus error. | ||
534 | */ | ||
535 | if (pbci->ib || pbci->cc) | ||
536 | return 0; | ||
537 | if (pbci->eb && pbci->bsi > 0) | ||
538 | return 0; | ||
539 | if (psp->ci == 0) | ||
540 | return 0; | ||
541 | |||
542 | /* | ||
543 |  * This is a local MCA, estimated to be a recoverable external bus | ||
544 |  * error (e.g. a load from poisoned memory), which implies that | ||
545 |  * there are platform error sections to look at. | ||
546 | */ | ||
547 | if (platform) | ||
548 | return recover_from_platform_error(slidx, peidx, pbci); | ||
549 | /* | ||
550 |  * The SAL error record looks malformed, so we cannot recover. | ||
551 | */ | ||
552 | return 0; | ||
553 | } | ||
554 | |||
555 | /** | ||
556 | * mca_try_to_recover - Try to recover from MCA | ||
557 | * @rec: pointer to a SAL error record | ||
558 | * | ||
559 | * Return value: | ||
560 | * 1 on Success / 0 on Failure | ||
561 | */ | ||
562 | |||
563 | static int | ||
564 | mca_try_to_recover(void *rec, | ||
565 | ia64_mca_sal_to_os_state_t *sal_to_os_state, | ||
566 | ia64_mca_os_to_sal_state_t *os_to_sal_state) | ||
567 | { | ||
568 | int platform_err; | ||
569 | int n_proc_err; | ||
570 | slidx_table_t slidx; | ||
571 | peidx_table_t peidx; | ||
572 | pal_bus_check_info_t pbci; | ||
573 | |||
574 | /* handoff state from/to mca.c */ | ||
575 | sal_to_os_handoff_state = sal_to_os_state; | ||
576 | os_to_sal_handoff_state = os_to_sal_state; | ||
577 | |||
578 | /* Make index of SAL error record */ | ||
579 | platform_err = mca_make_slidx(rec, &slidx); | ||
580 | |||
581 | /* Count processor error sections */ | ||
582 | n_proc_err = slidx_count(&slidx, proc_err); | ||
583 | |||
584 | /* For now, the OS can recover only when there is exactly one processor error section */ | ||
585 | if (n_proc_err > 1) | ||
586 | return 0; | ||
587 | else if (n_proc_err == 0) { | ||
588 | /* Odd SAL record ... nothing to recover here */ | ||
589 | |||
590 | return 1; | ||
591 | } | ||
592 | |||
593 | /* Make index of processor error section */ | ||
594 | mca_make_peidx((sal_log_processor_info_t*)slidx_first_entry(&slidx.proc_err)->hdr, &peidx); | ||
595 | |||
596 | /* Extract Processor BUS_CHECK[0] */ | ||
597 | *((u64*)&pbci) = peidx_check_info(&peidx, bus_check, 0); | ||
598 | |||
599 | /* Check whether MCA is global or not */ | ||
600 | if (is_mca_global(&peidx, &pbci)) | ||
601 | return 0; | ||
602 | |||
603 | /* Try to recover a processor error */ | ||
604 | return recover_from_processor_error(platform_err, &slidx, &peidx, &pbci); | ||
605 | } | ||
606 | |||
607 | /* | ||
608 | * ============================================================================= | ||
609 | */ | ||
610 | |||
611 | int __init mca_external_handler_init(void) | ||
612 | { | ||
613 | if (init_record_index_pools()) | ||
614 | return -ENOMEM; | ||
615 | |||
616 | /* register external mca handlers */ | ||
617 | if (ia64_reg_MCA_extension(mca_try_to_recover)){ | ||
618 | printk(KERN_ERR "ia64_reg_MCA_extension failed.\n"); | ||
619 | kfree(slidx_pool.buffer); | ||
620 | return -EFAULT; | ||
621 | } | ||
622 | return 0; | ||
623 | } | ||
624 | |||
625 | void __exit mca_external_handler_exit(void) | ||
626 | { | ||
627 | /* unregister external mca handlers */ | ||
628 | ia64_unreg_MCA_extension(); | ||
629 | kfree(slidx_pool.buffer); | ||
630 | } | ||
631 | |||
632 | module_init(mca_external_handler_init); | ||
633 | module_exit(mca_external_handler_exit); | ||
634 | |||
635 | module_param(sal_rec_max, int, 0644); | ||
636 | MODULE_PARM_DESC(sal_rec_max, "Max size of SAL error record"); | ||
637 | |||
638 | MODULE_DESCRIPTION("ia64 platform dependent mca handler driver"); | ||
639 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/ia64/kernel/mca_drv.h b/arch/ia64/kernel/mca_drv.h new file mode 100644 index 000000000000..0227b761f2c4 --- /dev/null +++ b/arch/ia64/kernel/mca_drv.h | |||
@@ -0,0 +1,113 @@ | |||
1 | /* | ||
2 | * File: mca_drv.h | ||
3 | * Purpose: Define helpers for Generic MCA handling | ||
4 | * | ||
5 | * Copyright (C) 2004 FUJITSU LIMITED | ||
6 | * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) | ||
7 | */ | ||
8 | /* | ||
9 | * Processor error section: | ||
10 | * | ||
11 | * +-sal_log_processor_info_t *info-------------+ | ||
12 | * | sal_log_section_hdr_t header; | | ||
13 | * | ... | | ||
14 | * | sal_log_mod_error_info_t info[0]; | | ||
15 | * +-+----------------+-------------------------+ | ||
16 | * | CACHE_CHECK | ^ num_cache_check v | ||
17 | * +----------------+ | ||
18 | * | TLB_CHECK | ^ num_tlb_check v | ||
19 | * +----------------+ | ||
20 | * | BUS_CHECK | ^ num_bus_check v | ||
21 | * +----------------+ | ||
22 | * | REG_FILE_CHECK | ^ num_reg_file_check v | ||
23 | * +----------------+ | ||
24 | * | MS_CHECK | ^ num_ms_check v | ||
25 | * +-struct cpuid_info *id----------------------+ | ||
26 | * | regs[5]; | | ||
27 | * | reserved; | | ||
28 | * +-sal_processor_static_info_t *regs----------+ | ||
29 | * | valid; | | ||
30 | * | ... | | ||
31 | * | fr[128]; | | ||
32 | * +--------------------------------------------+ | ||
33 | */ | ||
34 | |||
35 | /* peidx: index of processor error section */ | ||
36 | typedef struct peidx_table { | ||
37 | sal_log_processor_info_t *info; | ||
38 | struct sal_cpuid_info *id; | ||
39 | sal_processor_static_info_t *regs; | ||
40 | } peidx_table_t; | ||
41 | |||
42 | #define peidx_head(p) (((p)->info)) | ||
43 | #define peidx_mid(p) (((p)->id)) | ||
44 | #define peidx_bottom(p) (((p)->regs)) | ||
45 | |||
46 | #define peidx_psp(p) (&(peidx_head(p)->proc_state_parameter)) | ||
47 | #define peidx_field_valid(p) (&(peidx_head(p)->valid)) | ||
48 | #define peidx_minstate_area(p) (&(peidx_bottom(p)->min_state_area)) | ||
49 | |||
50 | #define peidx_cache_check_num(p) (peidx_head(p)->valid.num_cache_check) | ||
51 | #define peidx_tlb_check_num(p) (peidx_head(p)->valid.num_tlb_check) | ||
52 | #define peidx_bus_check_num(p) (peidx_head(p)->valid.num_bus_check) | ||
53 | #define peidx_reg_file_check_num(p) (peidx_head(p)->valid.num_reg_file_check) | ||
54 | #define peidx_ms_check_num(p) (peidx_head(p)->valid.num_ms_check) | ||
55 | |||
56 | #define peidx_cache_check_idx(p, n) (n) | ||
57 | #define peidx_tlb_check_idx(p, n) (peidx_cache_check_idx(p, peidx_cache_check_num(p)) + n) | ||
58 | #define peidx_bus_check_idx(p, n) (peidx_tlb_check_idx(p, peidx_tlb_check_num(p)) + n) | ||
59 | #define peidx_reg_file_check_idx(p, n) (peidx_bus_check_idx(p, peidx_bus_check_num(p)) + n) | ||
60 | #define peidx_ms_check_idx(p, n) (peidx_reg_file_check_idx(p, peidx_reg_file_check_num(p)) + n) | ||
61 | |||
62 | #define peidx_mod_error_info(p, name, n) \ | ||
63 | ({ int __idx = peidx_##name##_idx(p, n); \ | ||
64 | sal_log_mod_error_info_t *__ret = NULL; \ | ||
65 | if (peidx_##name##_num(p) > n) /*BUG*/ \ | ||
66 | __ret = &(peidx_head(p)->info[__idx]); \ | ||
67 | __ret; }) | ||
68 | |||
69 | #define peidx_cache_check(p, n) peidx_mod_error_info(p, cache_check, n) | ||
70 | #define peidx_tlb_check(p, n) peidx_mod_error_info(p, tlb_check, n) | ||
71 | #define peidx_bus_check(p, n) peidx_mod_error_info(p, bus_check, n) | ||
72 | #define peidx_reg_file_check(p, n) peidx_mod_error_info(p, reg_file_check, n) | ||
73 | #define peidx_ms_check(p, n) peidx_mod_error_info(p, ms_check, n) | ||
74 | |||
75 | #define peidx_check_info(proc, name, n) \ | ||
76 | ({ \ | ||
77 | sal_log_mod_error_info_t *__info = peidx_mod_error_info(proc, name, n);\ | ||
78 | u64 __temp = __info && __info->valid.check_info \ | ||
79 | ? __info->check_info : 0; \ | ||
80 | __temp; }) | ||
81 | |||
82 | /* slidx: index of SAL log error record */ | ||
83 | |||
84 | typedef struct slidx_list { | ||
85 | struct list_head list; | ||
86 | sal_log_section_hdr_t *hdr; | ||
87 | } slidx_list_t; | ||
88 | |||
89 | typedef struct slidx_table { | ||
90 | sal_log_record_header_t *header; | ||
91 | int n_sections; /* # of section headers */ | ||
92 | struct list_head proc_err; | ||
93 | struct list_head mem_dev_err; | ||
94 | struct list_head sel_dev_err; | ||
95 | struct list_head pci_bus_err; | ||
96 | struct list_head smbios_dev_err; | ||
97 | struct list_head pci_comp_err; | ||
98 | struct list_head plat_specific_err; | ||
99 | struct list_head host_ctlr_err; | ||
100 | struct list_head plat_bus_err; | ||
101 | struct list_head unsupported; /* list of unsupported sections */ | ||
102 | } slidx_table_t; | ||
103 | |||
104 | #define slidx_foreach_entry(pos, head) \ | ||
105 | list_for_each_entry(pos, head, list) | ||
106 | #define slidx_first_entry(head) \ | ||
107 | (((head)->next != (head)) ? list_entry((head)->next, typeof(slidx_list_t), list) : NULL) | ||
108 | #define slidx_count(slidx, sec) \ | ||
109 | ({ int __count = 0; \ | ||
110 | slidx_list_t *__pos; \ | ||
111 | slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\ | ||
112 | __count; }) | ||
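| |||
| /* | ||
|  * Usage sketch, mirroring mca_try_to_recover() in mca_drv.c: | ||
|  * | ||
|  *	slidx_table_t slidx; | ||
|  *	mca_make_slidx(rec, &slidx); | ||
|  *	if (slidx_count(&slidx, proc_err) == 1) { | ||
|  *		sal_log_processor_info_t *slpi = (sal_log_processor_info_t *) | ||
|  *			slidx_first_entry(&slidx.proc_err)->hdr; | ||
|  *		(then build the peidx from slpi) | ||
|  *	} | ||
|  */ | ||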
113 | |||
diff --git a/arch/ia64/kernel/mca_drv_asm.S b/arch/ia64/kernel/mca_drv_asm.S new file mode 100644 index 000000000000..bcfa05acc561 --- /dev/null +++ b/arch/ia64/kernel/mca_drv_asm.S | |||
@@ -0,0 +1,45 @@ | |||
1 | /* | ||
2 | * File: mca_drv_asm.S | ||
3 | * Purpose: Assembly portion of Generic MCA handling | ||
4 | * | ||
5 | * Copyright (C) 2004 FUJITSU LIMITED | ||
6 | * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) | ||
7 | */ | ||
8 | #include <linux/config.h> | ||
9 | #include <linux/threads.h> | ||
10 | |||
11 | #include <asm/asmmacro.h> | ||
12 | #include <asm/processor.h> | ||
13 | |||
14 | GLOBAL_ENTRY(mca_handler_bhhook) | ||
15 | invala // clear RSE ? | ||
16 | ;; // | ||
17 | cover // | ||
18 | ;; // | ||
19 | clrrrb // | ||
20 | ;; | ||
21 | alloc r16=ar.pfs,0,2,1,0 // make a new frame | ||
22 | ;; | ||
23 | mov r13=IA64_KR(CURRENT) // current task pointer | ||
24 | ;; | ||
25 | adds r12=IA64_TASK_THREAD_KSP_OFFSET,r13 | ||
26 | ;; | ||
27 | ld8 r12=[r12] // stack pointer | ||
28 | ;; | ||
29 | mov loc0=r16 | ||
30 | movl loc1=mca_handler_bh // recovery C function | ||
31 | ;; | ||
32 | mov out0=r8 // poisoned address | ||
33 | mov b6=loc1 | ||
34 | ;; | ||
35 | mov loc1=rp | ||
36 | ;; | ||
37 | br.call.sptk.many rp=b6 // does not return ... | ||
38 | ;; | ||
39 | mov ar.pfs=loc0 | ||
40 | mov rp=loc1 | ||
41 | ;; | ||
42 | mov r8=r0 | ||
43 | br.ret.sptk.many rp | ||
44 | ;; | ||
45 | END(mca_handler_bhhook) | ||
diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h new file mode 100644 index 000000000000..1dbc7b2497c9 --- /dev/null +++ b/arch/ia64/kernel/minstate.h | |||
@@ -0,0 +1,251 @@ | |||
1 | #include <linux/config.h> | ||
2 | |||
3 | #include <asm/cache.h> | ||
4 | |||
5 | #include "entry.h" | ||
6 | |||
7 | /* | ||
8 | * For ivt.s we want to access the stack virtually so we don't have to disable translation | ||
9 | * on interrupts. | ||
10 | * | ||
11 | * On entry: | ||
12 | * r1: pointer to current task (ar.k6) | ||
13 | */ | ||
14 | #define MINSTATE_START_SAVE_MIN_VIRT \ | ||
15 | (pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ | ||
16 | ;; \ | ||
17 | (pUStk) mov.m r24=ar.rnat; \ | ||
18 | (pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \ | ||
19 | (pKStk) mov r1=sp; /* get sp */ \ | ||
20 | ;; \ | ||
21 | (pUStk) lfetch.fault.excl.nt1 [r22]; \ | ||
22 | (pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ | ||
23 | (pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ | ||
24 | ;; \ | ||
25 | (pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ | ||
26 | (pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ | ||
27 | ;; \ | ||
28 | (pUStk) mov r18=ar.bsp; \ | ||
29 | (pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ | ||
30 | |||
31 | #define MINSTATE_END_SAVE_MIN_VIRT \ | ||
32 | bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ | ||
33 | ;; | ||
34 | |||
35 | /* | ||
36 | * For mca_asm.S we want to access the stack physically since the state is saved before we | ||
37 | * go virtual and don't want to destroy the iip or ipsr. | ||
38 | */ | ||
39 | #define MINSTATE_START_SAVE_MIN_PHYS \ | ||
40 | (pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \ | ||
41 | (pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \ | ||
42 | (pKStk) ld8 r3 = [r3];; \ | ||
43 | (pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \ | ||
44 | (pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \ | ||
45 | (pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ | ||
46 | (pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \ | ||
47 | ;; \ | ||
48 | (pUStk) mov r24=ar.rnat; \ | ||
49 | (pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ | ||
50 | (pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ | ||
51 | (pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \ | ||
52 | ;; \ | ||
53 | (pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ | ||
54 | (pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ | ||
55 | ;; \ | ||
56 | (pUStk) mov r18=ar.bsp; \ | ||
57 | (pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ | ||
58 | |||
59 | #define MINSTATE_END_SAVE_MIN_PHYS \ | ||
60 | dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \ | ||
61 | ;; | ||
62 | |||
63 | #ifdef MINSTATE_VIRT | ||
64 | # define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT) | ||
65 | # define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT | ||
66 | # define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT | ||
67 | #endif | ||
68 | |||
69 | #ifdef MINSTATE_PHYS | ||
70 | # define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg | ||
71 | # define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS | ||
72 | # define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS | ||
73 | #endif | ||
74 | |||
75 | /* | ||
76 | * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves | ||
77 | * the minimum state necessary that allows us to turn psr.ic back | ||
78 | * on. | ||
79 | * | ||
80 | * Assumed state upon entry: | ||
81 | * psr.ic: off | ||
82 | * r31: contains saved predicates (pr) | ||
83 | * | ||
84 | * Upon exit, the state is as follows: | ||
85 | * psr.ic: off | ||
86 | * r2 = points to &pt_regs.r16 | ||
87 | * r8 = contents of ar.ccv | ||
88 | * r9 = contents of ar.csd | ||
89 | * r10 = contents of ar.ssd | ||
90 | * r11 = FPSR_DEFAULT | ||
91 | * r12 = kernel sp (kernel virtual address) | ||
92 | * r13 = points to current task_struct (kernel virtual address) | ||
93 | * p15 = TRUE if psr.i is set in cr.ipsr | ||
94 | * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15: | ||
95 | * preserved | ||
96 | * | ||
97 | * Note that psr.ic is NOT turned on by this macro. This is so that | ||
98 | * we can pass interruption state as arguments to a handler. | ||
99 | */ | ||
100 | #define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \ | ||
101 | MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \ | ||
102 | mov r27=ar.rsc; /* M */ \ | ||
103 | mov r20=r1; /* A */ \ | ||
104 | mov r25=ar.unat; /* M */ \ | ||
105 | mov r29=cr.ipsr; /* M */ \ | ||
106 | mov r26=ar.pfs; /* I */ \ | ||
107 | mov r28=cr.iip; /* M */ \ | ||
108 | mov r21=ar.fpsr; /* M */ \ | ||
109 | COVER; /* B;; (or nothing) */ \ | ||
110 | ;; \ | ||
111 | adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \ | ||
112 | ;; \ | ||
113 | ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \ | ||
114 | st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \ | ||
115 | adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \ | ||
116 | /* switch from user to kernel RBS: */ \ | ||
117 | ;; \ | ||
118 | invala; /* M */ \ | ||
119 | SAVE_IFS; \ | ||
120 | cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \ | ||
121 | ;; \ | ||
122 | MINSTATE_START_SAVE_MIN \ | ||
123 | adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \ | ||
124 | adds r16=PT(CR_IPSR),r1; \ | ||
125 | ;; \ | ||
126 | lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \ | ||
127 | st8 [r16]=r29; /* save cr.ipsr */ \ | ||
128 | ;; \ | ||
129 | lfetch.fault.excl.nt1 [r17]; \ | ||
130 | tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \ | ||
131 | mov r29=b0 \ | ||
132 | ;; \ | ||
133 | adds r16=PT(R8),r1; /* initialize first base pointer */ \ | ||
134 | adds r17=PT(R9),r1; /* initialize second base pointer */ \ | ||
135 | (pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \ | ||
136 | ;; \ | ||
137 | .mem.offset 0,0; st8.spill [r16]=r8,16; \ | ||
138 | .mem.offset 8,0; st8.spill [r17]=r9,16; \ | ||
139 | ;; \ | ||
140 | .mem.offset 0,0; st8.spill [r16]=r10,24; \ | ||
141 | .mem.offset 8,0; st8.spill [r17]=r11,24; \ | ||
142 | ;; \ | ||
143 | st8 [r16]=r28,16; /* save cr.iip */ \ | ||
144 | st8 [r17]=r30,16; /* save cr.ifs */ \ | ||
145 | (pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \ | ||
146 | mov r8=ar.ccv; \ | ||
147 | mov r9=ar.csd; \ | ||
148 | mov r10=ar.ssd; \ | ||
149 | movl r11=FPSR_DEFAULT; /* L-unit */ \ | ||
150 | ;; \ | ||
151 | st8 [r16]=r25,16; /* save ar.unat */ \ | ||
152 | st8 [r17]=r26,16; /* save ar.pfs */ \ | ||
153 | shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \ | ||
154 | ;; \ | ||
155 | st8 [r16]=r27,16; /* save ar.rsc */ \ | ||
156 | (pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \ | ||
157 | (pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \ | ||
158 | ;; /* avoid RAW on r16 & r17 */ \ | ||
159 | (pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \ | ||
160 | st8 [r17]=r31,16; /* save predicates */ \ | ||
161 | (pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \ | ||
162 | ;; \ | ||
163 | st8 [r16]=r29,16; /* save b0 */ \ | ||
164 | st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \ | ||
165 | cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \ | ||
166 | ;; \ | ||
167 | .mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \ | ||
168 | .mem.offset 8,0; st8.spill [r17]=r12,16; \ | ||
169 | adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \ | ||
170 | ;; \ | ||
171 | .mem.offset 0,0; st8.spill [r16]=r13,16; \ | ||
172 | .mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \ | ||
173 | mov r13=IA64_KR(CURRENT); /* establish `current' */ \ | ||
174 | ;; \ | ||
175 | .mem.offset 0,0; st8.spill [r16]=r15,16; \ | ||
176 | .mem.offset 8,0; st8.spill [r17]=r14,16; \ | ||
177 | ;; \ | ||
178 | .mem.offset 0,0; st8.spill [r16]=r2,16; \ | ||
179 | .mem.offset 8,0; st8.spill [r17]=r3,16; \ | ||
180 | adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ | ||
181 | ;; \ | ||
182 | EXTRA; \ | ||
183 | movl r1=__gp; /* establish kernel global pointer */ \ | ||
184 | ;; \ | ||
185 | MINSTATE_END_SAVE_MIN | ||
186 | |||
187 | /* | ||
188 | * SAVE_REST saves the remainder of pt_regs (with psr.ic on). | ||
189 | * | ||
190 | * Assumed state upon entry: | ||
191 | * psr.ic: on | ||
192 | * r2: points to &pt_regs.r16 | ||
193 | * r3: points to &pt_regs.r17 | ||
194 | * r8: contents of ar.ccv | ||
195 | * r9: contents of ar.csd | ||
196 | * r10: contents of ar.ssd | ||
197 | * r11: FPSR_DEFAULT | ||
198 | * | ||
199 | * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST. | ||
200 | */ | ||
201 | #define SAVE_REST \ | ||
202 | .mem.offset 0,0; st8.spill [r2]=r16,16; \ | ||
203 | .mem.offset 8,0; st8.spill [r3]=r17,16; \ | ||
204 | ;; \ | ||
205 | .mem.offset 0,0; st8.spill [r2]=r18,16; \ | ||
206 | .mem.offset 8,0; st8.spill [r3]=r19,16; \ | ||
207 | ;; \ | ||
208 | .mem.offset 0,0; st8.spill [r2]=r20,16; \ | ||
209 | .mem.offset 8,0; st8.spill [r3]=r21,16; \ | ||
210 | mov r18=b6; \ | ||
211 | ;; \ | ||
212 | .mem.offset 0,0; st8.spill [r2]=r22,16; \ | ||
213 | .mem.offset 8,0; st8.spill [r3]=r23,16; \ | ||
214 | mov r19=b7; \ | ||
215 | ;; \ | ||
216 | .mem.offset 0,0; st8.spill [r2]=r24,16; \ | ||
217 | .mem.offset 8,0; st8.spill [r3]=r25,16; \ | ||
218 | ;; \ | ||
219 | .mem.offset 0,0; st8.spill [r2]=r26,16; \ | ||
220 | .mem.offset 8,0; st8.spill [r3]=r27,16; \ | ||
221 | ;; \ | ||
222 | .mem.offset 0,0; st8.spill [r2]=r28,16; \ | ||
223 | .mem.offset 8,0; st8.spill [r3]=r29,16; \ | ||
224 | ;; \ | ||
225 | .mem.offset 0,0; st8.spill [r2]=r30,16; \ | ||
226 | .mem.offset 8,0; st8.spill [r3]=r31,32; \ | ||
227 | ;; \ | ||
228 | mov ar.fpsr=r11; /* M-unit */ \ | ||
229 | st8 [r2]=r8,8; /* ar.ccv */ \ | ||
230 | adds r24=PT(B6)-PT(F7),r3; \ | ||
231 | ;; \ | ||
232 | stf.spill [r2]=f6,32; \ | ||
233 | stf.spill [r3]=f7,32; \ | ||
234 | ;; \ | ||
235 | stf.spill [r2]=f8,32; \ | ||
236 | stf.spill [r3]=f9,32; \ | ||
237 | ;; \ | ||
238 | stf.spill [r2]=f10; \ | ||
239 | stf.spill [r3]=f11; \ | ||
240 | adds r25=PT(B7)-PT(F11),r3; \ | ||
241 | ;; \ | ||
242 | st8 [r24]=r18,16; /* b6 */ \ | ||
243 | st8 [r25]=r19,16; /* b7 */ \ | ||
244 | ;; \ | ||
245 | st8 [r24]=r9; /* ar.csd */ \ | ||
246 | st8 [r25]=r10; /* ar.ssd */ \ | ||
247 | ;; | ||
248 | |||
249 | #define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,) | ||
250 | #define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19) | ||
251 | #define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, ) | ||
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c new file mode 100644 index 000000000000..febc091c2f02 --- /dev/null +++ b/arch/ia64/kernel/module.c | |||
@@ -0,0 +1,952 @@ | |||
1 | /* | ||
2 | * IA-64-specific support for kernel module loader. | ||
3 | * | ||
4 | * Copyright (C) 2003 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * Loosely based on patch by Rusty Russell. | ||
8 | */ | ||
9 | |||
10 | /* relocs tested so far: | ||
11 | |||
12 | DIR64LSB | ||
13 | FPTR64LSB | ||
14 | GPREL22 | ||
15 | LDXMOV | ||
16 | LDXMOV | ||
17 | LTOFF22 | ||
18 | LTOFF22X | ||
19 | LTOFF22X | ||
20 | LTOFF_FPTR22 | ||
21 | PCREL21B (for br.call only; br.cond is not supported out of modules!) | ||
22 | PCREL60B (for brl.cond only; brl.call is not supported for modules!) | ||
23 | PCREL64LSB | ||
24 | SECREL32LSB | ||
25 | SEGREL64LSB | ||
26 | */ | ||
27 | |||
28 | #include <linux/config.h> | ||
29 | |||
30 | #include <linux/kernel.h> | ||
31 | #include <linux/sched.h> | ||
32 | #include <linux/elf.h> | ||
33 | #include <linux/moduleloader.h> | ||
34 | #include <linux/string.h> | ||
35 | #include <linux/vmalloc.h> | ||
36 | |||
37 | #include <asm/patch.h> | ||
38 | #include <asm/unaligned.h> | ||
39 | |||
40 | #define ARCH_MODULE_DEBUG 0 | ||
41 | |||
42 | #if ARCH_MODULE_DEBUG | ||
43 | # define DEBUGP printk | ||
44 | # define inline | ||
45 | #else | ||
46 | # define DEBUGP(fmt , a...) | ||
47 | #endif | ||
48 | |||
49 | #ifdef CONFIG_ITANIUM | ||
50 | # define USE_BRL 0 | ||
51 | #else | ||
52 | # define USE_BRL 1 | ||
53 | #endif | ||
54 | |||
55 | #define MAX_LTOFF ((uint64_t) (1 << 22)) /* max. allowable linkage-table offset */ | ||
56 | |||
57 | /* Define some relocation helper macros/types: */ | ||
58 | |||
59 | #define FORMAT_SHIFT 0 | ||
60 | #define FORMAT_BITS 3 | ||
61 | #define FORMAT_MASK ((1 << FORMAT_BITS) - 1) | ||
62 | #define VALUE_SHIFT 3 | ||
63 | #define VALUE_BITS 5 | ||
64 | #define VALUE_MASK ((1 << VALUE_BITS) - 1) | ||
65 | |||
66 | enum reloc_target_format { | ||
67 | /* direct encoded formats: */ | ||
68 | RF_NONE = 0, | ||
69 | RF_INSN14 = 1, | ||
70 | RF_INSN22 = 2, | ||
71 | RF_INSN64 = 3, | ||
72 | RF_32MSB = 4, | ||
73 | RF_32LSB = 5, | ||
74 | RF_64MSB = 6, | ||
75 | RF_64LSB = 7, | ||
76 | |||
77 | /* formats that cannot be directly decoded: */ | ||
78 | RF_INSN60, | ||
79 | RF_INSN21B, /* imm21 form 1 */ | ||
80 | RF_INSN21M, /* imm21 form 2 */ | ||
81 | RF_INSN21F /* imm21 form 3 */ | ||
82 | }; | ||
83 | |||
84 | enum reloc_value_formula { | ||
85 | RV_DIRECT = 4, /* S + A */ | ||
86 | RV_GPREL = 5, /* @gprel(S + A) */ | ||
87 | RV_LTREL = 6, /* @ltoff(S + A) */ | ||
88 | RV_PLTREL = 7, /* @pltoff(S + A) */ | ||
89 | RV_FPTR = 8, /* @fptr(S + A) */ | ||
90 | RV_PCREL = 9, /* S + A - P */ | ||
91 | RV_LTREL_FPTR = 10, /* @ltoff(@fptr(S + A)) */ | ||
92 | RV_SEGREL = 11, /* @segrel(S + A) */ | ||
93 | RV_SECREL = 12, /* @secrel(S + A) */ | ||
94 | RV_BDREL = 13, /* BD + A */ | ||
95 | RV_LTV = 14, /* S + A (like RV_DIRECT, except frozen at static link-time) */ | ||
96 | RV_PCREL2 = 15, /* S + A - P */ | ||
97 | RV_SPECIAL = 16, /* various (see below) */ | ||
98 | RV_RSVD17 = 17, | ||
99 | RV_TPREL = 18, /* @tprel(S + A) */ | ||
100 | RV_LTREL_TPREL = 19, /* @ltoff(@tprel(S + A)) */ | ||
101 | RV_DTPMOD = 20, /* @dtpmod(S + A) */ | ||
102 | RV_LTREL_DTPMOD = 21, /* @ltoff(@dtpmod(S + A)) */ | ||
103 | RV_DTPREL = 22, /* @dtprel(S + A) */ | ||
104 | RV_LTREL_DTPREL = 23, /* @ltoff(@dtprel(S + A)) */ | ||
105 | RV_RSVD24 = 24, | ||
106 | RV_RSVD25 = 25, | ||
107 | RV_RSVD26 = 26, | ||
108 | RV_RSVD27 = 27 | ||
109 | /* 28-31 reserved for implementation-specific purposes. */ | ||
110 | }; | ||
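/*
 * Editorial sketch (not part of the original source): the FORMAT/VALUE
 * bit-fields above split each ELF ia64 reloc type into "where to patch"
 * and "what value to compute". For example, R_IA64_DIR64LSB (0x27)
 * decodes to format RF_64LSB (7) and formula RV_DIRECT (4).
 */
static inline void
decode_reloc_type (uint8_t r_type)
{
	enum reloc_target_format fmt = (r_type >> FORMAT_SHIFT) & FORMAT_MASK;
	enum reloc_value_formula rv = (r_type >> VALUE_SHIFT) & VALUE_MASK;

	DEBUGP("reloc %x: format=%d formula=%d\n", r_type, fmt, rv);
}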
111 | |||
112 | #define N(reloc) [R_IA64_##reloc] = #reloc | ||
113 | |||
114 | static const char *reloc_name[256] = { | ||
115 | N(NONE), N(IMM14), N(IMM22), N(IMM64), | ||
116 | N(DIR32MSB), N(DIR32LSB), N(DIR64MSB), N(DIR64LSB), | ||
117 | N(GPREL22), N(GPREL64I), N(GPREL32MSB), N(GPREL32LSB), | ||
118 | N(GPREL64MSB), N(GPREL64LSB), N(LTOFF22), N(LTOFF64I), | ||
119 | N(PLTOFF22), N(PLTOFF64I), N(PLTOFF64MSB), N(PLTOFF64LSB), | ||
120 | N(FPTR64I), N(FPTR32MSB), N(FPTR32LSB), N(FPTR64MSB), | ||
121 | N(FPTR64LSB), N(PCREL60B), N(PCREL21B), N(PCREL21M), | ||
122 | N(PCREL21F), N(PCREL32MSB), N(PCREL32LSB), N(PCREL64MSB), | ||
123 | N(PCREL64LSB), N(LTOFF_FPTR22), N(LTOFF_FPTR64I), N(LTOFF_FPTR32MSB), | ||
124 | N(LTOFF_FPTR32LSB), N(LTOFF_FPTR64MSB), N(LTOFF_FPTR64LSB), N(SEGREL32MSB), | ||
125 | N(SEGREL32LSB), N(SEGREL64MSB), N(SEGREL64LSB), N(SECREL32MSB), | ||
126 | N(SECREL32LSB), N(SECREL64MSB), N(SECREL64LSB), N(REL32MSB), | ||
127 | N(REL32LSB), N(REL64MSB), N(REL64LSB), N(LTV32MSB), | ||
128 | N(LTV32LSB), N(LTV64MSB), N(LTV64LSB), N(PCREL21BI), | ||
129 | N(PCREL22), N(PCREL64I), N(IPLTMSB), N(IPLTLSB), | ||
130 | N(COPY), N(LTOFF22X), N(LDXMOV), N(TPREL14), | ||
131 | N(TPREL22), N(TPREL64I), N(TPREL64MSB), N(TPREL64LSB), | ||
132 | N(LTOFF_TPREL22), N(DTPMOD64MSB), N(DTPMOD64LSB), N(LTOFF_DTPMOD22), | ||
133 | N(DTPREL14), N(DTPREL22), N(DTPREL64I), N(DTPREL32MSB), | ||
134 | N(DTPREL32LSB), N(DTPREL64MSB), N(DTPREL64LSB), N(LTOFF_DTPREL22) | ||
135 | }; | ||
136 | |||
137 | #undef N | ||
138 | |||
139 | struct got_entry { | ||
140 | uint64_t val; | ||
141 | }; | ||
142 | |||
143 | struct fdesc { | ||
144 | uint64_t ip; | ||
145 | uint64_t gp; | ||
146 | }; | ||
147 | |||
148 | /* Opaque struct for insns, to protect against derefs. */ | ||
149 | struct insn; | ||
150 | |||
151 | static inline uint64_t | ||
152 | bundle (const struct insn *insn) | ||
153 | { | ||
154 | return (uint64_t) insn & ~0xfUL; | ||
155 | } | ||
156 | |||
157 | static inline int | ||
158 | slot (const struct insn *insn) | ||
159 | { | ||
160 | return (uint64_t) insn & 0x3; | ||
161 | } | ||
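/*
 * Worked example (editorial): a struct insn pointer encodes a 16-byte
 * aligned bundle address in the upper bits and the slot number (0-2) in
 * the low bits. For insn == (struct insn *) 0xa00000010000f232,
 * bundle() yields 0xa00000010000f230 and slot() yields 2 -- the only
 * slot an X-unit imm60/imm64 operand can occupy.
 */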
162 | |||
163 | static int | ||
164 | apply_imm64 (struct module *mod, struct insn *insn, uint64_t val) | ||
165 | { | ||
166 | if (slot(insn) != 2) { | ||
167 | printk(KERN_ERR "%s: invalid slot number %d for IMM64\n", | ||
168 | mod->name, slot(insn)); | ||
169 | return 0; | ||
170 | } | ||
171 | ia64_patch_imm64((u64) insn, val); | ||
172 | return 1; | ||
173 | } | ||
174 | |||
175 | static int | ||
176 | apply_imm60 (struct module *mod, struct insn *insn, uint64_t val) | ||
177 | { | ||
178 | if (slot(insn) != 2) { | ||
179 | printk(KERN_ERR "%s: invalid slot number %d for IMM60\n", | ||
180 | mod->name, slot(insn)); | ||
181 | return 0; | ||
182 | } | ||
183 | if (val + ((uint64_t) 1 << 59) >= (1UL << 60)) { | ||
184 | printk(KERN_ERR "%s: value %ld out of IMM60 range\n", mod->name, (int64_t) val); | ||
185 | return 0; | ||
186 | } | ||
187 | ia64_patch_imm60((u64) insn, val); | ||
188 | return 1; | ||
189 | } | ||
190 | |||
191 | static int | ||
192 | apply_imm22 (struct module *mod, struct insn *insn, uint64_t val) | ||
193 | { | ||
194 | if (val + (1 << 21) >= (1 << 22)) { | ||
195 | printk(KERN_ERR "%s: value %li out of IMM22 range\n", mod->name, (int64_t)val); | ||
196 | return 0; | ||
197 | } | ||
198 | ia64_patch((u64) insn, 0x01fffcfe000UL, ( ((val & 0x200000UL) << 15) /* bit 21 -> 36 */ | ||
199 | | ((val & 0x1f0000UL) << 6) /* bit 16 -> 22 */ | ||
200 | | ((val & 0x00ff80UL) << 20) /* bit 7 -> 27 */ | ||
201 | | ((val & 0x00007fUL) << 13) /* bit 0 -> 13 */)); | ||
202 | return 1; | ||
203 | } | ||
204 | |||
205 | static int | ||
206 | apply_imm21b (struct module *mod, struct insn *insn, uint64_t val) | ||
207 | { | ||
208 | if (val + (1 << 20) >= (1 << 21)) { | ||
209 | printk(KERN_ERR "%s: value %li out of IMM21b range\n", mod->name, (int64_t)val); | ||
210 | return 0; | ||
211 | } | ||
212 | ia64_patch((u64) insn, 0x11ffffe000UL, ( ((val & 0x100000UL) << 16) /* bit 20 -> 36 */ | ||
213 | | ((val & 0x0fffffUL) << 13) /* bit 0 -> 13 */)); | ||
214 | return 1; | ||
215 | } | ||
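/*
 * Editorial note: the range checks above use the unsigned-wraparound
 * idiom -- a signed value v fits in N bits iff
 * (uint64_t) v + (1 << (N-1)) < (1 << N). A hedged helper form, for
 * N < 64:
 */
static inline int
fits_signed (int64_t v, unsigned int bits)
{
	return (uint64_t) v + (1UL << (bits - 1)) < (1UL << bits);
}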
216 | |||
217 | #if USE_BRL | ||
218 | |||
219 | struct plt_entry { | ||
220 | /* Two instruction bundles in PLT. */ | ||
221 | unsigned char bundle[2][16]; | ||
222 | }; | ||
223 | |||
224 | static const struct plt_entry ia64_plt_template = { | ||
225 | { | ||
226 | { | ||
227 | 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */ | ||
228 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /* movl gp=TARGET_GP */ | ||
229 | 0x00, 0x00, 0x00, 0x60 | ||
230 | }, | ||
231 | { | ||
232 | 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */ | ||
233 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* brl.many TARGET_IP */ | ||
234 | 0x08, 0x00, 0x00, 0xc0 | ||
235 | } | ||
236 | } | ||
237 | }; | ||
238 | |||
239 | static int | ||
240 | patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp) | ||
241 | { | ||
242 | if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_gp) | ||
243 | && apply_imm60(mod, (struct insn *) (plt->bundle[1] + 2), | ||
244 | (target_ip - (int64_t) plt->bundle[1]) / 16)) | ||
245 | return 1; | ||
246 | return 0; | ||
247 | } | ||
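/*
 * Editorial note: IP-relative branch immediates on ia64 count 16-byte
 * bundles, not bytes, which is why patch_plt() divides the byte
 * displacement by 16 before applying it as an imm60, and why
 * plt_target() below multiplies the decoded offset by 16 again.
 */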
248 | |||
249 | unsigned long | ||
250 | plt_target (struct plt_entry *plt) | ||
251 | { | ||
252 | uint64_t b0, b1, *b = (uint64_t *) plt->bundle[1]; | ||
253 | long off; | ||
254 | |||
255 | b0 = b[0]; b1 = b[1]; | ||
256 | off = ( ((b1 & 0x00fffff000000000UL) >> 36) /* imm20b -> bit 0 */ | ||
257 | | ((b0 >> 48) << 20) | ((b1 & 0x7fffffUL) << 36) /* imm39 -> bit 20 */ | ||
258 | | ((b1 & 0x0800000000000000UL) << 0)); /* i -> bit 59 */ | ||
259 | return (long) plt->bundle[1] + 16*off; | ||
260 | } | ||
261 | |||
262 | #else /* !USE_BRL */ | ||
263 | |||
264 | struct plt_entry { | ||
265 | /* Three instruction bundles in PLT. */ | ||
266 | unsigned char bundle[3][16]; | ||
267 | }; | ||
268 | |||
269 | static const struct plt_entry ia64_plt_template = { | ||
270 | { | ||
271 | { | ||
272 | 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */ | ||
273 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* movl r16=TARGET_IP */ | ||
274 | 0x02, 0x00, 0x00, 0x60 | ||
275 | }, | ||
276 | { | ||
277 | 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */ | ||
278 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /* movl gp=TARGET_GP */ | ||
279 | 0x00, 0x00, 0x00, 0x60 | ||
280 | }, | ||
281 | { | ||
282 | 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MIB] nop.m 0 */ | ||
283 | 0x60, 0x80, 0x04, 0x80, 0x03, 0x00, /* mov b6=r16 */ | ||
284 | 0x60, 0x00, 0x80, 0x00 /* br.few b6 */ | ||
285 | } | ||
286 | } | ||
287 | }; | ||
288 | |||
289 | static int | ||
290 | patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp) | ||
291 | { | ||
292 | if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_ip) | ||
293 | && apply_imm64(mod, (struct insn *) (plt->bundle[1] + 2), target_gp)) | ||
294 | return 1; | ||
295 | return 0; | ||
296 | } | ||
297 | |||
298 | unsigned long | ||
299 | plt_target (struct plt_entry *plt) | ||
300 | { | ||
301 | uint64_t b0, b1, *b = (uint64_t *) plt->bundle[0]; | ||
302 | |||
303 | b0 = b[0]; b1 = b[1]; | ||
304 | return ( ((b1 & 0x000007f000000000) >> 36) /* imm7b -> bit 0 */ | ||
305 | | ((b1 & 0x07fc000000000000) >> 43) /* imm9d -> bit 7 */ | ||
306 | | ((b1 & 0x0003e00000000000) >> 29) /* imm5c -> bit 16 */ | ||
307 | | ((b1 & 0x0000100000000000) >> 23) /* ic -> bit 21 */ | ||
308 | | ((b0 >> 46) << 22) | ((b1 & 0x7fffff) << 40) /* imm41 -> bit 22 */ | ||
309 | | ((b1 & 0x0800000000000000) << 4)); /* i -> bit 63 */ | ||
310 | } | ||
311 | |||
312 | #endif /* !USE_BRL */ | ||
313 | |||
314 | void * | ||
315 | module_alloc (unsigned long size) | ||
316 | { | ||
317 | if (!size) | ||
318 | return NULL; | ||
319 | return vmalloc(size); | ||
320 | } | ||
321 | |||
322 | void | ||
323 | module_free (struct module *mod, void *module_region) | ||
324 | { | ||
325 | if (mod->arch.init_unw_table && module_region == mod->module_init) { | ||
326 | unw_remove_unwind_table(mod->arch.init_unw_table); | ||
327 | mod->arch.init_unw_table = NULL; | ||
328 | } | ||
329 | vfree(module_region); | ||
330 | } | ||
331 | |||
332 | /* Have we already seen one of these relocations? */ | ||
333 | /* FIXME: we could look in other sections, too --RR */ | ||
334 | static int | ||
335 | duplicate_reloc (const Elf64_Rela *rela, unsigned int num) | ||
336 | { | ||
337 | unsigned int i; | ||
338 | |||
339 | for (i = 0; i < num; i++) { | ||
340 | if (rela[i].r_info == rela[num].r_info && rela[i].r_addend == rela[num].r_addend) | ||
341 | return 1; | ||
342 | } | ||
343 | return 0; | ||
344 | } | ||
345 | |||
346 | /* Count how many GOT entries we may need */ | ||
347 | static unsigned int | ||
348 | count_gots (const Elf64_Rela *rela, unsigned int num) | ||
349 | { | ||
350 | unsigned int i, ret = 0; | ||
351 | |||
352 | /* Sure, this is order(n^2), but it's usually short, and not | ||
353 | time critical */ | ||
354 | for (i = 0; i < num; i++) { | ||
355 | switch (ELF64_R_TYPE(rela[i].r_info)) { | ||
356 | case R_IA64_LTOFF22: | ||
357 | case R_IA64_LTOFF22X: | ||
358 | case R_IA64_LTOFF64I: | ||
359 | case R_IA64_LTOFF_FPTR22: | ||
360 | case R_IA64_LTOFF_FPTR64I: | ||
361 | case R_IA64_LTOFF_FPTR32MSB: | ||
362 | case R_IA64_LTOFF_FPTR32LSB: | ||
363 | case R_IA64_LTOFF_FPTR64MSB: | ||
364 | case R_IA64_LTOFF_FPTR64LSB: | ||
365 | if (!duplicate_reloc(rela, i)) | ||
366 | ret++; | ||
367 | break; | ||
368 | } | ||
369 | } | ||
370 | return ret; | ||
371 | } | ||
372 | |||
373 | /* Count how many PLT entries we may need */ | ||
374 | static unsigned int | ||
375 | count_plts (const Elf64_Rela *rela, unsigned int num) | ||
376 | { | ||
377 | unsigned int i, ret = 0; | ||
378 | |||
379 | /* Sure, this is order(n^2), but it's usually short, and not | ||
380 | time critical */ | ||
381 | for (i = 0; i < num; i++) { | ||
382 | switch (ELF64_R_TYPE(rela[i].r_info)) { | ||
383 | case R_IA64_PCREL21B: | ||
384 | case R_IA64_PLTOFF22: | ||
385 | case R_IA64_PLTOFF64I: | ||
386 | case R_IA64_PLTOFF64MSB: | ||
387 | case R_IA64_PLTOFF64LSB: | ||
388 | case R_IA64_IPLTMSB: | ||
389 | case R_IA64_IPLTLSB: | ||
390 | if (!duplicate_reloc(rela, i)) | ||
391 | ret++; | ||
392 | break; | ||
393 | } | ||
394 | } | ||
395 | return ret; | ||
396 | } | ||
397 | |||
398 | /* We need to create a function descriptor for any internal function | ||
399 | which is referenced. */ | ||
400 | static unsigned int | ||
401 | count_fdescs (const Elf64_Rela *rela, unsigned int num) | ||
402 | { | ||
403 | unsigned int i, ret = 0; | ||
404 | |||
405 | /* Sure, this is order(n^2), but it's usually short, and not time critical. */ | ||
406 | for (i = 0; i < num; i++) { | ||
407 | switch (ELF64_R_TYPE(rela[i].r_info)) { | ||
408 | case R_IA64_FPTR64I: | ||
409 | case R_IA64_FPTR32LSB: | ||
410 | case R_IA64_FPTR32MSB: | ||
411 | case R_IA64_FPTR64LSB: | ||
412 | case R_IA64_FPTR64MSB: | ||
413 | case R_IA64_LTOFF_FPTR22: | ||
414 | case R_IA64_LTOFF_FPTR32LSB: | ||
415 | case R_IA64_LTOFF_FPTR32MSB: | ||
416 | case R_IA64_LTOFF_FPTR64I: | ||
417 | case R_IA64_LTOFF_FPTR64LSB: | ||
418 | case R_IA64_LTOFF_FPTR64MSB: | ||
419 | case R_IA64_IPLTMSB: | ||
420 | case R_IA64_IPLTLSB: | ||
421 | /* | ||
422 | * Jumps to static functions sometimes go straight to their | ||
423 | * offset. Of course, that may not be possible if the jump is | ||
424 | * from init -> core or vice versa, so we need to generate an | ||
425 | * FDESC (and PLT etc) for that. | ||
426 | */ | ||
427 | case R_IA64_PCREL21B: | ||
428 | if (!duplicate_reloc(rela, i)) | ||
429 | ret++; | ||
430 | break; | ||
431 | } | ||
432 | } | ||
433 | return ret; | ||
434 | } | ||
435 | |||
436 | int | ||
437 | module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, | ||
438 | struct module *mod) | ||
439 | { | ||
440 | unsigned long core_plts = 0, init_plts = 0, gots = 0, fdescs = 0; | ||
441 | Elf64_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum; | ||
442 | |||
443 | /* | ||
444 | * To store the PLTs and function-descriptors, we expand the .text section for | ||
445 | * core module-code and the .init.text section for initialization code. | ||
446 | */ | ||
447 | for (s = sechdrs; s < sechdrs_end; ++s) | ||
448 | if (strcmp(".core.plt", secstrings + s->sh_name) == 0) | ||
449 | mod->arch.core_plt = s; | ||
450 | else if (strcmp(".init.plt", secstrings + s->sh_name) == 0) | ||
451 | mod->arch.init_plt = s; | ||
452 | else if (strcmp(".got", secstrings + s->sh_name) == 0) | ||
453 | mod->arch.got = s; | ||
454 | else if (strcmp(".opd", secstrings + s->sh_name) == 0) | ||
455 | mod->arch.opd = s; | ||
456 | else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0) | ||
457 | mod->arch.unwind = s; | ||
458 | |||
459 | if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) { | ||
460 | printk(KERN_ERR "%s: sections missing\n", mod->name); | ||
461 | return -ENOEXEC; | ||
462 | } | ||
463 | |||
464 | /* GOT and PLTs can occur in any relocated section... */ | ||
465 | for (s = sechdrs + 1; s < sechdrs_end; ++s) { | ||
466 | const Elf64_Rela *rels = (void *)ehdr + s->sh_offset; | ||
467 | unsigned long numrels = s->sh_size/sizeof(Elf64_Rela); | ||
468 | |||
469 | if (s->sh_type != SHT_RELA) | ||
470 | continue; | ||
471 | |||
472 | gots += count_gots(rels, numrels); | ||
473 | fdescs += count_fdescs(rels, numrels); | ||
474 | if (strstr(secstrings + s->sh_name, ".init")) | ||
475 | init_plts += count_plts(rels, numrels); | ||
476 | else | ||
477 | core_plts += count_plts(rels, numrels); | ||
478 | } | ||
479 | |||
480 | mod->arch.core_plt->sh_type = SHT_NOBITS; | ||
481 | mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; | ||
482 | mod->arch.core_plt->sh_addralign = 16; | ||
483 | mod->arch.core_plt->sh_size = core_plts * sizeof(struct plt_entry); | ||
484 | mod->arch.init_plt->sh_type = SHT_NOBITS; | ||
485 | mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; | ||
486 | mod->arch.init_plt->sh_addralign = 16; | ||
487 | mod->arch.init_plt->sh_size = init_plts * sizeof(struct plt_entry); | ||
488 | mod->arch.got->sh_type = SHT_NOBITS; | ||
489 | mod->arch.got->sh_flags = ARCH_SHF_SMALL | SHF_ALLOC; | ||
490 | mod->arch.got->sh_addralign = 8; | ||
491 | mod->arch.got->sh_size = gots * sizeof(struct got_entry); | ||
492 | mod->arch.opd->sh_type = SHT_NOBITS; | ||
493 | mod->arch.opd->sh_flags = SHF_ALLOC; | ||
494 | mod->arch.opd->sh_addralign = 8; | ||
495 | mod->arch.opd->sh_size = fdescs * sizeof(struct fdesc); | ||
496 | DEBUGP("%s: core.plt=%lx, init.plt=%lx, got=%lx, fdesc=%lx\n", | ||
497 | __FUNCTION__, mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size, | ||
498 | mod->arch.got->sh_size, mod->arch.opd->sh_size); | ||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | static inline int | ||
503 | in_init (const struct module *mod, uint64_t addr) | ||
504 | { | ||
505 | return addr - (uint64_t) mod->module_init < mod->init_size; | ||
506 | } | ||
507 | |||
508 | static inline int | ||
509 | in_core (const struct module *mod, uint64_t addr) | ||
510 | { | ||
511 | return addr - (uint64_t) mod->module_core < mod->core_size; | ||
512 | } | ||
513 | |||
514 | static inline int | ||
515 | is_internal (const struct module *mod, uint64_t value) | ||
516 | { | ||
517 | return in_init(mod, value) || in_core(mod, value); | ||
518 | } | ||
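/*
 * Editorial note: in_init()/in_core() rely on unsigned subtraction --
 * if addr is below the region base, (addr - base) wraps to a huge value
 * and the "< size" test fails, so a single compare covers both bounds.
 */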
519 | |||
520 | /* | ||
521 | * Get gp-relative offset for the linkage-table entry of VALUE. | ||
522 | */ | ||
523 | static uint64_t | ||
524 | get_ltoff (struct module *mod, uint64_t value, int *okp) | ||
525 | { | ||
526 | struct got_entry *got, *e; | ||
527 | |||
528 | if (!*okp) | ||
529 | return 0; | ||
530 | |||
531 | got = (void *) mod->arch.got->sh_addr; | ||
532 | for (e = got; e < got + mod->arch.next_got_entry; ++e) | ||
533 | if (e->val == value) | ||
534 | goto found; | ||
535 | |||
536 | /* Not enough GOT entries? */ | ||
537 | if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size)) | ||
538 | BUG(); | ||
539 | |||
540 | e->val = value; | ||
541 | ++mod->arch.next_got_entry; | ||
542 | found: | ||
543 | return (uint64_t) e - mod->arch.gp; | ||
544 | } | ||
545 | |||
546 | static inline int | ||
547 | gp_addressable (struct module *mod, uint64_t value) | ||
548 | { | ||
549 | return value - mod->arch.gp + MAX_LTOFF/2 < MAX_LTOFF; | ||
550 | } | ||
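/*
 * Editorial note: with MAX_LTOFF = 4MB, the test above is true exactly
 * when gp - 2MB <= value < gp + 2MB, i.e., when VALUE is reachable from
 * gp with a 22-bit signed add immediate and no linkage-table entry is
 * needed.
 */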
551 | |||
552 | /* Get PC-relative PLT entry for this value. Returns 0 on failure. */ | ||
553 | static uint64_t | ||
554 | get_plt (struct module *mod, const struct insn *insn, uint64_t value, int *okp) | ||
555 | { | ||
556 | struct plt_entry *plt, *plt_end; | ||
557 | uint64_t target_ip, target_gp; | ||
558 | |||
559 | if (!*okp) | ||
560 | return 0; | ||
561 | |||
562 | if (in_init(mod, (uint64_t) insn)) { | ||
563 | plt = (void *) mod->arch.init_plt->sh_addr; | ||
564 | plt_end = (void *) plt + mod->arch.init_plt->sh_size; | ||
565 | } else { | ||
566 | plt = (void *) mod->arch.core_plt->sh_addr; | ||
567 | plt_end = (void *) plt + mod->arch.core_plt->sh_size; | ||
568 | } | ||
569 | |||
570 | /* "value" is a pointer to a function-descriptor; fetch the target ip/gp from it: */ | ||
571 | target_ip = ((uint64_t *) value)[0]; | ||
572 | target_gp = ((uint64_t *) value)[1]; | ||
573 | |||
574 | /* Look for existing PLT entry. */ | ||
575 | while (plt->bundle[0][0]) { | ||
576 | if (plt_target(plt) == target_ip) | ||
577 | goto found; | ||
578 | if (++plt >= plt_end) | ||
579 | BUG(); | ||
580 | } | ||
581 | *plt = ia64_plt_template; | ||
582 | if (!patch_plt(mod, plt, target_ip, target_gp)) { | ||
583 | *okp = 0; | ||
584 | return 0; | ||
585 | } | ||
586 | #if ARCH_MODULE_DEBUG | ||
587 | if (plt_target(plt) != target_ip) { | ||
588 | printk("%s: mistargeted PLT: wanted %lx, got %lx\n", | ||
589 | __FUNCTION__, target_ip, plt_target(plt)); | ||
590 | *okp = 0; | ||
591 | return 0; | ||
592 | } | ||
593 | #endif | ||
594 | found: | ||
595 | return (uint64_t) plt; | ||
596 | } | ||
597 | |||
598 | /* Get function descriptor for VALUE. */ | ||
599 | static uint64_t | ||
600 | get_fdesc (struct module *mod, uint64_t value, int *okp) | ||
601 | { | ||
602 | struct fdesc *fdesc = (void *) mod->arch.opd->sh_addr; | ||
603 | |||
604 | if (!*okp) | ||
605 | return 0; | ||
606 | |||
607 | if (!value) { | ||
608 | printk(KERN_ERR "%s: fdesc for zero requested!\n", mod->name); | ||
609 | return 0; | ||
610 | } | ||
611 | |||
612 | if (!is_internal(mod, value)) | ||
613 | /* | ||
614 | * If it's not a module-local entry-point, "value" already points to a | ||
615 | * function-descriptor. | ||
616 | */ | ||
617 | return value; | ||
618 | |||
619 | /* Look for existing function descriptor. */ | ||
620 | while (fdesc->ip) { | ||
621 | if (fdesc->ip == value) | ||
622 | return (uint64_t)fdesc; | ||
623 | if ((uint64_t) ++fdesc >= mod->arch.opd->sh_addr + mod->arch.opd->sh_size) | ||
624 | BUG(); | ||
625 | } | ||
626 | |||
627 | /* Create new one */ | ||
628 | fdesc->ip = value; | ||
629 | fdesc->gp = mod->arch.gp; | ||
630 | return (uint64_t) fdesc; | ||
631 | } | ||
632 | |||
633 | static inline int | ||
634 | do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend, | ||
635 | Elf64_Shdr *sec, void *location) | ||
636 | { | ||
637 | enum reloc_target_format format = (r_type >> FORMAT_SHIFT) & FORMAT_MASK; | ||
638 | enum reloc_value_formula formula = (r_type >> VALUE_SHIFT) & VALUE_MASK; | ||
639 | uint64_t val; | ||
640 | int ok = 1; | ||
641 | |||
642 | val = sym->st_value + addend; | ||
643 | |||
644 | switch (formula) { | ||
645 | case RV_SEGREL: /* segment base is arbitrarily chosen to be 0 for kernel modules */ | ||
646 | case RV_DIRECT: | ||
647 | break; | ||
648 | |||
649 | case RV_GPREL: val -= mod->arch.gp; break; | ||
650 | case RV_LTREL: val = get_ltoff(mod, val, &ok); break; | ||
651 | case RV_PLTREL: val = get_plt(mod, location, val, &ok); break; | ||
652 | case RV_FPTR: val = get_fdesc(mod, val, &ok); break; | ||
653 | case RV_SECREL: val -= sec->sh_addr; break; | ||
654 | case RV_LTREL_FPTR: val = get_ltoff(mod, get_fdesc(mod, val, &ok), &ok); break; | ||
655 | |||
656 | case RV_PCREL: | ||
657 | switch (r_type) { | ||
658 | case R_IA64_PCREL21B: | ||
659 | if ((in_init(mod, val) && in_core(mod, (uint64_t)location)) || | ||
660 | (in_core(mod, val) && in_init(mod, (uint64_t)location))) { | ||
661 | /* | ||
662 | * The init section may have been allocated far away from core; | ||
663 | * if the branch won't reach, allocate a PLT entry for it. | ||
664 | */ | ||
665 | uint64_t delta = ((int64_t)val - (int64_t)location) / 16; | ||
666 | if (delta + (1 << 20) >= (1 << 21)) { | ||
667 | val = get_fdesc(mod, val, &ok); | ||
668 | val = get_plt(mod, location, val, &ok); | ||
669 | } | ||
670 | } else if (!is_internal(mod, val)) | ||
671 | val = get_plt(mod, location, val, &ok); | ||
672 | /* FALL THROUGH */ | ||
673 | default: | ||
674 | val -= bundle(location); | ||
675 | break; | ||
676 | |||
677 | case R_IA64_PCREL32MSB: | ||
678 | case R_IA64_PCREL32LSB: | ||
679 | case R_IA64_PCREL64MSB: | ||
680 | case R_IA64_PCREL64LSB: | ||
681 | val -= (uint64_t) location; | ||
682 | break; | ||
683 | |||
684 | } | ||
685 | switch (r_type) { | ||
686 | case R_IA64_PCREL60B: format = RF_INSN60; break; | ||
687 | case R_IA64_PCREL21B: format = RF_INSN21B; break; | ||
688 | case R_IA64_PCREL21M: format = RF_INSN21M; break; | ||
689 | case R_IA64_PCREL21F: format = RF_INSN21F; break; | ||
690 | default: break; | ||
691 | } | ||
692 | break; | ||
693 | |||
694 | case RV_BDREL: | ||
695 | val -= (uint64_t) (in_init(mod, val) ? mod->module_init : mod->module_core); | ||
696 | break; | ||
697 | |||
698 | case RV_LTV: | ||
699 | /* can link-time value relocs happen here? */ | ||
700 | BUG(); | ||
701 | break; | ||
702 | |||
703 | case RV_PCREL2: | ||
704 | if (r_type == R_IA64_PCREL21BI) { | ||
705 | if (!is_internal(mod, val)) { | ||
706 | printk(KERN_ERR "%s: %s reloc against non-local symbol (%lx)\n", | ||
707 | __FUNCTION__, reloc_name[r_type], val); | ||
708 | return -ENOEXEC; | ||
709 | } | ||
710 | format = RF_INSN21B; | ||
711 | } | ||
712 | val -= bundle(location); | ||
713 | break; | ||
714 | |||
715 | case RV_SPECIAL: | ||
716 | switch (r_type) { | ||
717 | case R_IA64_IPLTMSB: | ||
718 | case R_IA64_IPLTLSB: | ||
719 | val = get_fdesc(mod, get_plt(mod, location, val, &ok), &ok); | ||
720 | format = RF_64LSB; | ||
721 | if (r_type == R_IA64_IPLTMSB) | ||
722 | format = RF_64MSB; | ||
723 | break; | ||
724 | |||
725 | case R_IA64_SUB: | ||
726 | val = addend - sym->st_value; | ||
727 | format = RF_INSN64; | ||
728 | break; | ||
729 | |||
730 | case R_IA64_LTOFF22X: | ||
731 | if (gp_addressable(mod, val)) | ||
732 | val -= mod->arch.gp; | ||
733 | else | ||
734 | val = get_ltoff(mod, val, &ok); | ||
735 | format = RF_INSN22; | ||
736 | break; | ||
737 | |||
738 | case R_IA64_LDXMOV: | ||
739 | if (gp_addressable(mod, val)) { | ||
740 | /* turn "ld8" into "mov": */ | ||
741 | DEBUGP("%s: patching ld8 at %p to mov\n", __FUNCTION__, location); | ||
742 | ia64_patch((u64) location, 0x1fff80fe000UL, 0x10000000000UL); | ||
743 | } | ||
744 | return 0; | ||
745 | |||
746 | default: | ||
747 | if (reloc_name[r_type]) | ||
748 | printk(KERN_ERR "%s: special reloc %s not supported\n", | ||
749 | mod->name, reloc_name[r_type]); | ||
750 | else | ||
751 | printk(KERN_ERR "%s: unknown special reloc %x\n", | ||
752 | mod->name, r_type); | ||
753 | return -ENOEXEC; | ||
754 | } | ||
755 | break; | ||
756 | |||
757 | case RV_TPREL: | ||
758 | case RV_LTREL_TPREL: | ||
759 | case RV_DTPMOD: | ||
760 | case RV_LTREL_DTPMOD: | ||
761 | case RV_DTPREL: | ||
762 | case RV_LTREL_DTPREL: | ||
763 | printk(KERN_ERR "%s: %s reloc not supported\n", | ||
764 | mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?"); | ||
765 | return -ENOEXEC; | ||
766 | |||
767 | default: | ||
768 | printk(KERN_ERR "%s: unknown reloc %x\n", mod->name, r_type); | ||
769 | return -ENOEXEC; | ||
770 | } | ||
771 | |||
772 | if (!ok) | ||
773 | return -ENOEXEC; | ||
774 | |||
775 | DEBUGP("%s: [%p]<-%016lx = %s(%lx)\n", __FUNCTION__, location, val, | ||
776 | reloc_name[r_type] ? reloc_name[r_type] : "?", sym->st_value + addend); | ||
777 | |||
778 | switch (format) { | ||
779 | case RF_INSN21B: ok = apply_imm21b(mod, location, (int64_t) val / 16); break; | ||
780 | case RF_INSN22: ok = apply_imm22(mod, location, val); break; | ||
781 | case RF_INSN64: ok = apply_imm64(mod, location, val); break; | ||
782 | case RF_INSN60: ok = apply_imm60(mod, location, (int64_t) val / 16); break; | ||
783 | case RF_32LSB: put_unaligned(val, (uint32_t *) location); break; | ||
784 | case RF_64LSB: put_unaligned(val, (uint64_t *) location); break; | ||
785 | case RF_32MSB: /* ia64 Linux is little-endian... */ | ||
786 | case RF_64MSB: /* ia64 Linux is little-endian... */ | ||
787 | case RF_INSN14: /* must be within-module, i.e., resolved by "ld -r" */ | ||
788 | case RF_INSN21M: /* must be within-module, i.e., resolved by "ld -r" */ | ||
789 | case RF_INSN21F: /* must be within-module, i.e., resolved by "ld -r" */ | ||
790 | printk(KERN_ERR "%s: format %u needed by %s reloc is not supported\n", | ||
791 | mod->name, format, reloc_name[r_type] ? reloc_name[r_type] : "?"); | ||
792 | return -ENOEXEC; | ||
793 | |||
794 | default: | ||
795 | printk(KERN_ERR "%s: relocation %s resulted in unknown format %u\n", | ||
796 | mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?", format); | ||
797 | return -ENOEXEC; | ||
798 | } | ||
799 | return ok ? 0 : -ENOEXEC; | ||
800 | } | ||
801 | |||
802 | int | ||
803 | apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, | ||
804 | unsigned int relsec, struct module *mod) | ||
805 | { | ||
806 | unsigned int i, n = sechdrs[relsec].sh_size / sizeof(Elf64_Rela); | ||
807 | Elf64_Rela *rela = (void *) sechdrs[relsec].sh_addr; | ||
808 | Elf64_Shdr *target_sec; | ||
809 | int ret; | ||
810 | |||
811 | DEBUGP("%s: applying section %u (%u relocs) to %u\n", __FUNCTION__, | ||
812 | relsec, n, sechdrs[relsec].sh_info); | ||
813 | |||
814 | target_sec = sechdrs + sechdrs[relsec].sh_info; | ||
815 | |||
816 | if (target_sec->sh_entsize == ~0UL) | ||
817 | /* | ||
818 | * If target section wasn't allocated, we don't need to relocate it. | ||
819 | * Happens, e.g., for debug sections. | ||
820 | */ | ||
821 | return 0; | ||
822 | |||
823 | if (!mod->arch.gp) { | ||
824 | /* | ||
825 | * XXX Should have an arch-hook for running this after final section | ||
826 | * addresses have been selected... | ||
827 | */ | ||
828 | /* See if gp can cover the entire core module: */ | ||
829 | uint64_t gp = (uint64_t) mod->module_core + MAX_LTOFF / 2; | ||
830 | if (mod->core_size >= MAX_LTOFF) | ||
831 | /* | ||
832 | * This takes advantage of the fact that ARCH_SHF_SMALL sections get allocated | ||
833 | * at the end of the module. | ||
834 | */ | ||
835 | gp = (uint64_t) mod->module_core + mod->core_size - MAX_LTOFF / 2; | ||
836 | mod->arch.gp = gp; | ||
837 | DEBUGP("%s: placing gp at 0x%lx\n", __FUNCTION__, gp); | ||
838 | } | ||
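	/*
	 * Editorial example: with MAX_LTOFF = 4MB, a module whose core is
	 * smaller than 4MB gets gp = module_core + 2MB, putting every core
	 * byte within the +/-2MB gp window; a larger module instead anchors
	 * gp 2MB before the end, where the small-data sections (e.g. the
	 * ARCH_SHF_SMALL .got) were laid out.
	 */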
839 | |||
840 | for (i = 0; i < n; i++) { | ||
841 | ret = do_reloc(mod, ELF64_R_TYPE(rela[i].r_info), | ||
842 | ((Elf64_Sym *) sechdrs[symindex].sh_addr | ||
843 | + ELF64_R_SYM(rela[i].r_info)), | ||
844 | rela[i].r_addend, target_sec, | ||
845 | (void *) target_sec->sh_addr + rela[i].r_offset); | ||
846 | if (ret < 0) | ||
847 | return ret; | ||
848 | } | ||
849 | return 0; | ||
850 | } | ||
851 | |||
852 | int | ||
853 | apply_relocate (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, | ||
854 | unsigned int relsec, struct module *mod) | ||
855 | { | ||
856 | printk(KERN_ERR "module %s: REL relocs in section %u unsupported\n", mod->name, relsec); | ||
857 | return -ENOEXEC; | ||
858 | } | ||
859 | |||
860 | /* | ||
861 | * Modules contain a single unwind table which covers both the core and the init text | ||
862 | * sections, but since the two are not contiguous, we need to split this table up such that | ||
863 | * we can register (and unregister) each "segment" separately. Fortunately, this sounds | ||
864 | * more complicated than it really is. | ||
865 | */ | ||
866 | static void | ||
867 | register_unwind_table (struct module *mod) | ||
868 | { | ||
869 | struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr; | ||
870 | struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start); | ||
871 | struct unw_table_entry tmp, *e1, *e2, *core, *init; | ||
872 | unsigned long num_init = 0, num_core = 0; | ||
873 | |||
874 | /* First, count how many init and core unwind-table entries there are. */ | ||
875 | for (e1 = start; e1 < end; ++e1) | ||
876 | if (in_init(mod, e1->start_offset)) | ||
877 | ++num_init; | ||
878 | else | ||
879 | ++num_core; | ||
880 | /* | ||
881 | * Second, sort the table such that all unwind-table entries for the init and core | ||
882 | * text sections are nicely separated. We do this with a stupid bubble sort | ||
883 | * (unwind tables don't get ridiculously huge). | ||
884 | */ | ||
885 | for (e1 = start; e1 < end; ++e1) { | ||
886 | for (e2 = e1 + 1; e2 < end; ++e2) { | ||
887 | if (e2->start_offset < e1->start_offset) { | ||
888 | tmp = *e1; | ||
889 | *e1 = *e2; | ||
890 | *e2 = tmp; | ||
891 | } | ||
892 | } | ||
893 | } | ||
894 | /* | ||
895 | * Third, locate the init and core segments in the unwind table: | ||
896 | */ | ||
897 | if (in_init(mod, start->start_offset)) { | ||
898 | init = start; | ||
899 | core = start + num_init; | ||
900 | } else { | ||
901 | core = start; | ||
902 | init = start + num_core; | ||
903 | } | ||
904 | |||
905 | DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __FUNCTION__, | ||
906 | mod->name, mod->arch.gp, num_init, num_core); | ||
907 | |||
908 | /* | ||
909 | * Fourth, register both tables (if not empty). | ||
910 | */ | ||
911 | if (num_core > 0) { | ||
912 | mod->arch.core_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp, | ||
913 | core, core + num_core); | ||
914 | DEBUGP("%s: core: handle=%p [%p-%p)\n", __FUNCTION__, | ||
915 | mod->arch.core_unw_table, core, core + num_core); | ||
916 | } | ||
917 | if (num_init > 0) { | ||
918 | mod->arch.init_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp, | ||
919 | init, init + num_init); | ||
920 | DEBUGP("%s: init: handle=%p [%p-%p)\n", __FUNCTION__, | ||
921 | mod->arch.init_unw_table, init, init + num_init); | ||
922 | } | ||
923 | } | ||
924 | |||
925 | int | ||
926 | module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) | ||
927 | { | ||
928 | DEBUGP("%s: init: entry=%p\n", __FUNCTION__, mod->init); | ||
929 | if (mod->arch.unwind) | ||
930 | register_unwind_table(mod); | ||
931 | return 0; | ||
932 | } | ||
933 | |||
934 | void | ||
935 | module_arch_cleanup (struct module *mod) | ||
936 | { | ||
937 | if (mod->arch.init_unw_table) | ||
938 | unw_remove_unwind_table(mod->arch.init_unw_table); | ||
939 | if (mod->arch.core_unw_table) | ||
940 | unw_remove_unwind_table(mod->arch.core_unw_table); | ||
941 | } | ||
942 | |||
943 | #ifdef CONFIG_SMP | ||
944 | void | ||
945 | percpu_modcopy (void *pcpudst, const void *src, unsigned long size) | ||
946 | { | ||
947 | unsigned int i; | ||
948 | for (i = 0; i < NR_CPUS; i++) | ||
949 | if (cpu_possible(i)) | ||
950 | memcpy(pcpudst + __per_cpu_offset[i], src, size); | ||
951 | } | ||
952 | #endif /* CONFIG_SMP */ | ||
diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S new file mode 100644 index 000000000000..5018c7f2e7a8 --- /dev/null +++ b/arch/ia64/kernel/pal.S | |||
@@ -0,0 +1,302 @@ | |||
1 | /* | ||
2 | * PAL Firmware support | ||
3 | * IA-64 Processor Programmer's Reference Vol 2 | ||
4 | * | ||
5 | * Copyright (C) 1999 Don Dugger <don.dugger@intel.com> | ||
6 | * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> | ||
7 | * Copyright (C) 1999-2001, 2003 Hewlett-Packard Co | ||
8 | * David Mosberger <davidm@hpl.hp.com> | ||
9 | * Stephane Eranian <eranian@hpl.hp.com> | ||
10 | * | ||
11 | * 05/22/2000 eranian Added support for stacked register calls | ||
12 | * 05/24/2000 eranian Added support for physical mode static calls | ||
13 | */ | ||
14 | |||
15 | #include <asm/asmmacro.h> | ||
16 | #include <asm/processor.h> | ||
17 | |||
18 | .data | ||
19 | pal_entry_point: | ||
20 | data8 ia64_pal_default_handler | ||
21 | .text | ||
22 | |||
23 | /* | ||
24 | * Set the PAL entry point address. This could be written in C code, but we do it here | ||
25 | * to keep it all in one module (besides, it's so trivial that it's | ||
26 | * not a big deal). | ||
27 | * | ||
28 | * in0 Address of the PAL entry point (text address, NOT a function descriptor). | ||
29 | */ | ||
30 | GLOBAL_ENTRY(ia64_pal_handler_init) | ||
31 | alloc r3=ar.pfs,1,0,0,0 | ||
32 | movl r2=pal_entry_point | ||
33 | ;; | ||
34 | st8 [r2]=in0 | ||
35 | br.ret.sptk.many rp | ||
36 | END(ia64_pal_handler_init) | ||
37 | |||
38 | /* | ||
39 | * Default PAL call handler. This needs to be coded in assembly because it uses | ||
40 | * the static calling convention, i.e., the RSE may not be used and calls are | ||
41 | * done via "br.cond" (not "br.call"). | ||
42 | */ | ||
43 | GLOBAL_ENTRY(ia64_pal_default_handler) | ||
44 | mov r8=-1 | ||
45 | br.cond.sptk.many rp | ||
46 | END(ia64_pal_default_handler) | ||
47 | |||
48 | /* | ||
49 | * Make a PAL call using the static calling convention. | ||
50 | * | ||
51 | * in0 Index of PAL service | ||
52 | * in1 - in3 Remaining PAL arguments | ||
53 | * in4 1 ==> clear psr.ic, 0 ==> don't clear psr.ic | ||
54 | * | ||
55 | */ | ||
56 | GLOBAL_ENTRY(ia64_pal_call_static) | ||
57 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5) | ||
58 | alloc loc1 = ar.pfs,5,5,0,0 | ||
59 | movl loc2 = pal_entry_point | ||
60 | 1: { | ||
61 | mov r28 = in0 | ||
62 | mov r29 = in1 | ||
63 | mov r8 = ip | ||
64 | } | ||
65 | ;; | ||
66 | ld8 loc2 = [loc2] // loc2 <- entry point | ||
67 | tbit.nz p6,p7 = in4, 0 | ||
68 | adds r8 = 1f-1b,r8 | ||
69 | mov loc4=ar.rsc // save RSE configuration | ||
70 | ;; | ||
71 | mov ar.rsc=0 // put RSE in enforced lazy, LE mode | ||
72 | mov loc3 = psr | ||
73 | mov loc0 = rp | ||
74 | .body | ||
75 | mov r30 = in2 | ||
76 | |||
77 | (p6) rsm psr.i | psr.ic | ||
78 | mov r31 = in3 | ||
79 | mov b7 = loc2 | ||
80 | |||
81 | (p7) rsm psr.i | ||
82 | ;; | ||
83 | (p6) srlz.i | ||
84 | mov rp = r8 | ||
85 | br.cond.sptk.many b7 | ||
86 | 1: mov psr.l = loc3 | ||
87 | mov ar.rsc = loc4 // restore RSE configuration | ||
88 | mov ar.pfs = loc1 | ||
89 | mov rp = loc0 | ||
90 | ;; | ||
91 | srlz.d // serialize restoration of psr.l | ||
92 | br.ret.sptk.many b0 | ||
93 | END(ia64_pal_call_static) | ||
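/*
 * Editorial sketch: the C-side view of this entry point. The
 * declaration below is assumed from asm/pal.h in this tree; status
 * comes back in r8 and up to three values in r9-r11:
 *
 *	struct ia64_pal_retval {
 *		s64 status;
 *		u64 v0, v1, v2;
 *	};
 *	extern struct ia64_pal_retval
 *	ia64_pal_call_static (u64 index, u64 arg1, u64 arg2, u64 arg3,
 *			      u64 clear_psr_ic);
 */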
94 | |||
95 | /* | ||
96 | * Make a PAL call using the stacked registers calling convention. | ||
97 | * | ||
98 | * Inputs: | ||
99 | * in0 Index of PAL service | ||
100 | * in1 - in3 Remaining PAL arguments | ||
101 | */ | ||
102 | GLOBAL_ENTRY(ia64_pal_call_stacked) | ||
103 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4) | ||
104 | alloc loc1 = ar.pfs,4,4,4,0 | ||
105 | movl loc2 = pal_entry_point | ||
106 | |||
107 | mov r28 = in0 // Index MUST be copied to r28 | ||
108 | mov out0 = in0 // AND in0 of PAL function | ||
109 | mov loc0 = rp | ||
110 | .body | ||
111 | ;; | ||
112 | ld8 loc2 = [loc2] // loc2 <- entry point | ||
113 | mov out1 = in1 | ||
114 | mov out2 = in2 | ||
115 | mov out3 = in3 | ||
116 | mov loc3 = psr | ||
117 | ;; | ||
118 | rsm psr.i | ||
119 | mov b7 = loc2 | ||
120 | ;; | ||
121 | br.call.sptk.many rp=b7 // now make the call | ||
122 | .ret0: mov psr.l = loc3 | ||
123 | mov ar.pfs = loc1 | ||
124 | mov rp = loc0 | ||
125 | ;; | ||
126 | srlz.d // serialize restoration of psr.l | ||
127 | br.ret.sptk.many b0 | ||
128 | END(ia64_pal_call_stacked) | ||
129 | |||
130 | /* | ||
131 | * Make a physical mode PAL call using the static registers calling convention. | ||
132 | * | ||
133 | * Inputs: | ||
134 | * in0 Index of PAL service | ||
135 | * in1 - in3 Remaining PAL arguments | ||
136 | * | ||
137 | * PSR_LP, PSR_TB, PSR_ID, PSR_DA are never set by the kernel. | ||
138 | * So we don't need to clear them. | ||
139 | */ | ||
140 | #define PAL_PSR_BITS_TO_CLEAR \ | ||
141 | (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_DB | IA64_PSR_RT | \ | ||
142 | IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \ | ||
143 | IA64_PSR_DFL | IA64_PSR_DFH) | ||
144 | |||
145 | #define PAL_PSR_BITS_TO_SET \ | ||
146 | (IA64_PSR_BN) | ||
147 | |||
148 | |||
149 | GLOBAL_ENTRY(ia64_pal_call_phys_static) | ||
150 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4) | ||
151 | alloc loc1 = ar.pfs,4,7,0,0 | ||
152 | movl loc2 = pal_entry_point | ||
153 | 1: { | ||
154 | mov r28 = in0 // copy procedure index | ||
155 | mov r8 = ip // save ip to compute branch | ||
156 | mov loc0 = rp // save rp | ||
157 | } | ||
158 | .body | ||
159 | ;; | ||
160 | ld8 loc2 = [loc2] // loc2 <- entry point | ||
161 | mov r29 = in1 // first argument | ||
162 | mov r30 = in2 // copy arg2 | ||
163 | mov r31 = in3 // copy arg3 | ||
164 | ;; | ||
165 | mov loc3 = psr // save psr | ||
166 | adds r8 = 1f-1b,r8 // calculate return address for call | ||
167 | ;; | ||
168 | mov loc4=ar.rsc // save RSE configuration | ||
169 | dep.z loc2=loc2,0,61 // convert pal entry point to physical | ||
170 | tpa r8=r8 // convert rp to physical | ||
171 | ;; | ||
172 | mov b7 = loc2 // install target to branch reg | ||
173 | mov ar.rsc=0 // put RSE in enforced lazy, LE mode | ||
174 | movl r16=PAL_PSR_BITS_TO_CLEAR | ||
175 | movl r17=PAL_PSR_BITS_TO_SET | ||
176 | ;; | ||
177 | or loc3=loc3,r17 // add in psr the bits to set | ||
178 | ;; | ||
179 | andcm r16=loc3,r16 // removes bits to clear from psr | ||
180 | br.call.sptk.many rp=ia64_switch_mode_phys | ||
181 | .ret1: mov rp = r8 // install return address (physical) | ||
182 | mov loc5 = r19 | ||
183 | mov loc6 = r20 | ||
184 | br.cond.sptk.many b7 | ||
185 | 1: | ||
186 | mov ar.rsc=0 // put RSE in enforced lazy, LE mode | ||
187 | mov r16=loc3 // r16= original psr | ||
188 | mov r19=loc5 | ||
189 | mov r20=loc6 | ||
190 | br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode | ||
191 | .ret2: | ||
192 | mov psr.l = loc3 // restore init PSR | ||
193 | |||
194 | mov ar.pfs = loc1 | ||
195 | mov rp = loc0 | ||
196 | ;; | ||
197 | mov ar.rsc=loc4 // restore RSE configuration | ||
198 | srlz.d // serialize restoration of psr.l | ||
199 | br.ret.sptk.many b0 | ||
200 | END(ia64_pal_call_phys_static) | ||
201 | |||
202 | /* | ||
203 | * Make a PAL call using the stacked registers in physical mode. | ||
204 | * | ||
205 | * Inputs: | ||
206 | * in0 Index of PAL service | ||
207 | * in1 - in3 Remaining PAL arguments | ||
208 | */ | ||
209 | GLOBAL_ENTRY(ia64_pal_call_phys_stacked) | ||
210 | .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5) | ||
211 | alloc loc1 = ar.pfs,5,7,4,0 | ||
212 | movl loc2 = pal_entry_point | ||
213 | 1: { | ||
214 | mov r28 = in0 // copy procedure index | ||
215 | mov loc0 = rp // save rp | ||
216 | } | ||
217 | .body | ||
218 | ;; | ||
219 | ld8 loc2 = [loc2] // loc2 <- entry point | ||
220 | mov out0 = in0 // first argument | ||
221 | mov out1 = in1 // copy arg2 | ||
222 | mov out2 = in2 // copy arg3 | ||
223 | mov out3 = in3 // copy arg3 | ||
224 | ;; | ||
225 | mov loc3 = psr // save psr | ||
226 | ;; | ||
227 | mov loc4=ar.rsc // save RSE configuration | ||
228 | dep.z loc2=loc2,0,61 // convert pal entry point to physical | ||
229 | ;; | ||
230 | mov ar.rsc=0 // put RSE in enforced lazy, LE mode | ||
231 | movl r16=PAL_PSR_BITS_TO_CLEAR | ||
232 | movl r17=PAL_PSR_BITS_TO_SET | ||
233 | ;; | ||
234 | or loc3=loc3,r17 // add in psr the bits to set | ||
235 | mov b7 = loc2 // install target to branch reg | ||
236 | ;; | ||
237 | andcm r16=loc3,r16 // removes bits to clear from psr | ||
238 | br.call.sptk.many rp=ia64_switch_mode_phys | ||
239 | .ret6: | ||
240 | mov loc5 = r19 | ||
241 | mov loc6 = r20 | ||
242 | br.call.sptk.many rp=b7 // now make the call | ||
243 | .ret7: | ||
244 | mov ar.rsc=0 // put RSE in enforced lazy, LE mode | ||
245 | mov r16=loc3 // r16= original psr | ||
246 | mov r19=loc5 | ||
247 | mov r20=loc6 | ||
248 | br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode | ||
249 | |||
250 | .ret8: mov psr.l = loc3 // restore init PSR | ||
251 | mov ar.pfs = loc1 | ||
252 | mov rp = loc0 | ||
253 | ;; | ||
254 | mov ar.rsc=loc4 // restore RSE configuration | ||
255 | srlz.d // serialize restoration of psr.l | ||
256 | br.ret.sptk.many b0 | ||
257 | END(ia64_pal_call_phys_stacked) | ||
258 | |||
259 | /* | ||
260 | * Save scratch fp regs which aren't already saved in pt_regs (f10-f15). | ||
261 | * | ||
262 | * NOTE: We need to do this since firmware (SAL and PAL) may use any of the scratch | ||
263 | * regs in the fp-low partition. | ||
264 | * | ||
265 | * Inputs: | ||
266 | * in0 Address of stack storage for fp regs | ||
267 | */ | ||
268 | GLOBAL_ENTRY(ia64_save_scratch_fpregs) | ||
269 | alloc r3=ar.pfs,1,0,0,0 | ||
270 | add r2=16,in0 | ||
271 | ;; | ||
272 | stf.spill [in0] = f10,32 | ||
273 | stf.spill [r2] = f11,32 | ||
274 | ;; | ||
275 | stf.spill [in0] = f12,32 | ||
276 | stf.spill [r2] = f13,32 | ||
277 | ;; | ||
278 | stf.spill [in0] = f14,32 | ||
279 | stf.spill [r2] = f15,32 | ||
280 | br.ret.sptk.many rp | ||
281 | END(ia64_save_scratch_fpregs) | ||
282 | |||
283 | /* | ||
284 | * Load scratch fp regs (f10-f15) | ||
285 | * | ||
286 | * Inputs: | ||
287 | * in0 Address of stack storage for fp regs | ||
288 | */ | ||
289 | GLOBAL_ENTRY(ia64_load_scratch_fpregs) | ||
290 | alloc r3=ar.pfs,1,0,0,0 | ||
291 | add r2=16,in0 | ||
292 | ;; | ||
293 | ldf.fill f10 = [in0],32 | ||
294 | ldf.fill f11 = [r2],32 | ||
295 | ;; | ||
296 | ldf.fill f12 = [in0],32 | ||
297 | ldf.fill f13 = [r2],32 | ||
298 | ;; | ||
299 | ldf.fill f14 = [in0],32 | ||
300 | ldf.fill f15 = [r2],32 | ||
301 | br.ret.sptk.many rp | ||
302 | END(ia64_load_scratch_fpregs) | ||
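/*
 * Editorial sketch (hedged, assuming the PAL_CALL conventions of
 * asm/pal.h): callers bracket PAL calls with the two helpers above so
 * that firmware may clobber f10-f15 freely:
 *
 *	struct ia64_fpreg fr[6];
 *	struct ia64_pal_retval iprv;
 *
 *	ia64_save_scratch_fpregs(fr);
 *	iprv = ia64_pal_call_static(index, arg1, arg2, arg3, 0);
 *	ia64_load_scratch_fpregs(fr);
 */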
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c new file mode 100644 index 000000000000..25e7c8344564 --- /dev/null +++ b/arch/ia64/kernel/palinfo.c | |||
@@ -0,0 +1,1023 @@ | |||
1 | /* | ||
2 | * palinfo.c | ||
3 | * | ||
4 | * Prints processor specific information reported by PAL. | ||
5 | * This code is based on specification of PAL as of the | ||
6 | * Intel IA-64 Architecture Software Developer's Manual v1.0. | ||
7 | * | ||
8 | * | ||
9 | * Copyright (C) 2000-2001, 2003 Hewlett-Packard Co | ||
10 | * Stephane Eranian <eranian@hpl.hp.com> | ||
11 | * Copyright (C) 2004 Intel Corporation | ||
12 | * Ashok Raj <ashok.raj@intel.com> | ||
13 | * | ||
14 | * 05/26/2000 S.Eranian initial release | ||
15 | * 08/21/2000 S.Eranian updated to July 2000 PAL specs | ||
16 | * 02/05/2001 S.Eranian fixed module support | ||
17 | * 10/23/2001 S.Eranian updated pal_perf_mon_info bug fixes | ||
18 | * 03/24/2004 Ashok Raj updated to work with CPU Hotplug | ||
19 | */ | ||
20 | #include <linux/config.h> | ||
21 | #include <linux/types.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/proc_fs.h> | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/efi.h> | ||
28 | #include <linux/notifier.h> | ||
29 | #include <linux/cpu.h> | ||
30 | #include <linux/cpumask.h> | ||
31 | |||
32 | #include <asm/pal.h> | ||
33 | #include <asm/sal.h> | ||
34 | #include <asm/page.h> | ||
35 | #include <asm/processor.h> | ||
36 | #include <linux/smp.h> | ||
37 | |||
38 | MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); | ||
39 | MODULE_DESCRIPTION("/proc interface to IA-64 PAL"); | ||
40 | MODULE_LICENSE("GPL"); | ||
41 | |||
42 | #define PALINFO_VERSION "0.5" | ||
43 | |||
44 | typedef int (*palinfo_func_t)(char*); | ||
45 | |||
46 | typedef struct { | ||
47 | const char *name; /* name of the proc entry */ | ||
48 | palinfo_func_t proc_read; /* function to call for reading */ | ||
49 | struct proc_dir_entry *entry; /* registered entry (removal) */ | ||
50 | } palinfo_entry_t; | ||
51 | |||
52 | |||
53 | /* | ||
54 | * A bunch of string arrays for pretty printing | ||
55 | */ | ||
56 | |||
57 | static char *cache_types[] = { | ||
58 | "", /* not used */ | ||
59 | "Instruction", | ||
60 | "Data", | ||
61 | "Data/Instruction" /* unified */ | ||
62 | }; | ||
63 | |||
64 | static const char *cache_mattrib[]={ | ||
65 | "WriteThrough", | ||
66 | "WriteBack", | ||
67 | "", /* reserved */ | ||
68 | "" /* reserved */ | ||
69 | }; | ||
70 | |||
71 | static const char *cache_st_hints[]={ | ||
72 | "Temporal, level 1", | ||
73 | "Reserved", | ||
74 | "Reserved", | ||
75 | "Non-temporal, all levels", | ||
76 | "Reserved", | ||
77 | "Reserved", | ||
78 | "Reserved", | ||
79 | "Reserved" | ||
80 | }; | ||
81 | |||
82 | static const char *cache_ld_hints[]={ | ||
83 | "Temporal, level 1", | ||
84 | "Non-temporal, level 1", | ||
85 | "Reserved", | ||
86 | "Non-temporal, all levels", | ||
87 | "Reserved", | ||
88 | "Reserved", | ||
89 | "Reserved", | ||
90 | "Reserved" | ||
91 | }; | ||
92 | |||
93 | static const char *rse_hints[]={ | ||
94 | "enforced lazy", | ||
95 | "eager stores", | ||
96 | "eager loads", | ||
97 | "eager loads and stores" | ||
98 | }; | ||
99 | |||
100 | #define RSE_HINTS_COUNT ARRAY_SIZE(rse_hints) | ||
101 | |||
102 | static const char *mem_attrib[]={ | ||
103 | "WB", /* 000 */ | ||
104 | "SW", /* 001 */ | ||
105 | "010", /* 010 */ | ||
106 | "011", /* 011 */ | ||
107 | "UC", /* 100 */ | ||
108 | "UCE", /* 101 */ | ||
109 | "WC", /* 110 */ | ||
110 | "NaTPage" /* 111 */ | ||
111 | }; | ||
112 | |||
113 | /* | ||
114 | * Takes a 64-bit vector and produces a string such that | ||
115 | * if bit n is set then 2^n is printed in clear text. The adjustment | ||
116 | * to the right unit (K, M, G, T) is also done. | ||
117 | * | ||
118 | * Input: | ||
119 | * - a pointer to a buffer to hold the string | ||
120 | * - a 64-bit vector | ||
121 | * Output: | ||
122 | * - a pointer to the end of the buffer | ||
123 | * | ||
124 | */ | ||
125 | static char * | ||
126 | bitvector_process(char *p, u64 vector) | ||
127 | { | ||
128 | int i,j; | ||
129 | const char *units[]={ "", "K", "M", "G", "T" }; | ||
130 | |||
131 | for (i=0, j=0; i < 64; i++ , j=i/10) { | ||
132 | if (vector & 0x1) { | ||
133 | p += sprintf(p, "%d%s ", 1 << (i-j*10), units[j]); | ||
134 | } | ||
135 | vector >>= 1; | ||
136 | } | ||
137 | return p; | ||
138 | } | ||
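/*
 * Editorial example: for vector 0x10001000 (bits 12 and 28 set), the
 * loop above emits "4K 256M " -- bit n stands for a 2^n-byte size, and
 * j = n/10 selects the "", K, M, G, or T unit suffix. Only bits below
 * 50 have a matching entry in units[].
 */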
139 | |||
140 | /* | ||
141 | * Takes a 64-bit vector and produces a string such that | ||
142 | * if bit n is set then register n is present. The function | ||
143 | * takes into account consecutive registers and prints out ranges. | ||
144 | * | ||
145 | * Input: | ||
146 | * - a pointer to a buffer to hold the string | ||
147 | * - a 64-bit vector | ||
148 | * Output: | ||
149 | * - a pointer to the end of the buffer | ||
150 | * | ||
151 | */ | ||
152 | static char * | ||
153 | bitregister_process(char *p, u64 *reg_info, int max) | ||
154 | { | ||
155 | int i, begin, skip = 0; | ||
156 | u64 value = reg_info[0]; | ||
157 | |||
158 | value >>= i = begin = ffs(value) - 1; | ||
159 | |||
160 | for(; i < max; i++ ) { | ||
161 | |||
162 | if (i != 0 && (i%64) == 0) value = *++reg_info; | ||
163 | |||
164 | if ((value & 0x1) == 0 && skip == 0) { | ||
165 | if (begin <= i - 2) | ||
166 | p += sprintf(p, "%d-%d ", begin, i-1); | ||
167 | else | ||
168 | p += sprintf(p, "%d ", i-1); | ||
169 | skip = 1; | ||
170 | begin = -1; | ||
171 | } else if ((value & 0x1) && skip == 1) { | ||
172 | skip = 0; | ||
173 | begin = i; | ||
174 | } | ||
175 | value >>=1; | ||
176 | } | ||
177 | if (begin > -1) { | ||
178 | if (begin < 127) | ||
179 | p += sprintf(p, "%d-127", begin); | ||
180 | else | ||
181 | p += sprintf(p, "127"); | ||
182 | } | ||
183 | |||
184 | return p; | ||
185 | } | ||
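/*
 * Editorial example: for reg_info[0] = 0x1e (bits 1-4 set) and max =
 * 128, the loop above prints "1-4 ": runs of consecutive set bits are
 * coalesced into ranges, and a run still open at the end is closed
 * against register 127.
 */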
186 | |||
187 | static int | ||
188 | power_info(char *page) | ||
189 | { | ||
190 | s64 status; | ||
191 | char *p = page; | ||
192 | u64 halt_info_buffer[8]; | ||
193 | pal_power_mgmt_info_u_t *halt_info =(pal_power_mgmt_info_u_t *)halt_info_buffer; | ||
194 | int i; | ||
195 | |||
196 | status = ia64_pal_halt_info(halt_info); | ||
197 | if (status != 0) return 0; | ||
198 | |||
199 | for (i=0; i < 8 ; i++ ) { | ||
200 | if (halt_info[i].pal_power_mgmt_info_s.im == 1) { | ||
201 | p += sprintf(p, "Power level %d:\n" | ||
202 | "\tentry_latency : %d cycles\n" | ||
203 | "\texit_latency : %d cycles\n" | ||
204 | "\tpower consumption : %d mW\n" | ||
205 | "\tCache+TLB coherency : %s\n", i, | ||
206 | halt_info[i].pal_power_mgmt_info_s.entry_latency, | ||
207 | halt_info[i].pal_power_mgmt_info_s.exit_latency, | ||
208 | halt_info[i].pal_power_mgmt_info_s.power_consumption, | ||
209 | halt_info[i].pal_power_mgmt_info_s.co ? "Yes" : "No"); | ||
210 | } else { | ||
211 | p += sprintf(p,"Power level %d: not implemented\n",i); | ||
212 | } | ||
213 | } | ||
214 | return p - page; | ||
215 | } | ||
216 | |||
217 | static int | ||
218 | cache_info(char *page) | ||
219 | { | ||
220 | char *p = page; | ||
221 | u64 i, levels, unique_caches; | ||
222 | pal_cache_config_info_t cci; | ||
223 | int j, k; | ||
224 | s64 status; | ||
225 | |||
226 | if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) { | ||
227 | printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status); | ||
228 | return 0; | ||
229 | } | ||
230 | |||
231 | p += sprintf(p, "Cache levels : %ld\nUnique caches : %ld\n\n", levels, unique_caches); | ||
232 | |||
233 | for (i=0; i < levels; i++) { | ||
234 | |||
235 | for (j=2; j >0 ; j--) { | ||
236 | |||
237 | /* even without unification, some levels may not be present */ | ||
238 | if ((status=ia64_pal_cache_config_info(i,j, &cci)) != 0) { | ||
239 | continue; | ||
240 | } | ||
241 | p += sprintf(p, | ||
242 | "%s Cache level %lu:\n" | ||
243 | "\tSize : %lu bytes\n" | ||
244 | "\tAttributes : ", | ||
245 | cache_types[j+cci.pcci_unified], i+1, | ||
246 | cci.pcci_cache_size); | ||
247 | |||
248 | if (cci.pcci_unified) p += sprintf(p, "Unified "); | ||
249 | |||
250 | p += sprintf(p, "%s\n", cache_mattrib[cci.pcci_cache_attr]); | ||
251 | |||
252 | p += sprintf(p, | ||
253 | "\tAssociativity : %d\n" | ||
254 | "\tLine size : %d bytes\n" | ||
255 | "\tStride : %d bytes\n", | ||
256 | cci.pcci_assoc, 1<<cci.pcci_line_size, 1<<cci.pcci_stride); | ||
257 | if (j == 1) | ||
258 | p += sprintf(p, "\tStore latency : N/A\n"); | ||
259 | else | ||
260 | p += sprintf(p, "\tStore latency : %d cycle(s)\n", | ||
261 | cci.pcci_st_latency); | ||
262 | |||
263 | p += sprintf(p, | ||
264 | "\tLoad latency : %d cycle(s)\n" | ||
265 | "\tStore hints : ", cci.pcci_ld_latency); | ||
266 | |||
267 | for(k=0; k < 8; k++ ) { | ||
268 | if ( cci.pcci_st_hints & 0x1) | ||
269 | p += sprintf(p, "[%s]", cache_st_hints[k]); | ||
270 | cci.pcci_st_hints >>=1; | ||
271 | } | ||
272 | p += sprintf(p, "\n\tLoad hints : "); | ||
273 | |||
274 | for(k=0; k < 8; k++ ) { | ||
275 | if (cci.pcci_ld_hints & 0x1) | ||
276 | p += sprintf(p, "[%s]", cache_ld_hints[k]); | ||
277 | cci.pcci_ld_hints >>=1; | ||
278 | } | ||
279 | p += sprintf(p, | ||
280 | "\n\tAlias boundary : %d byte(s)\n" | ||
281 | "\tTag LSB : %d\n" | ||
282 | "\tTag MSB : %d\n", | ||
283 | 1<<cci.pcci_alias_boundary, cci.pcci_tag_lsb, | ||
284 | cci.pcci_tag_msb); | ||
285 | |||
286 | /* when unified, data (j=2) is enough */ | ||
287 | if (cci.pcci_unified) break; | ||
288 | } | ||
289 | } | ||
290 | return p - page; | ||
291 | } | ||
292 | |||
293 | |||
294 | static int | ||
295 | vm_info(char *page) | ||
296 | { | ||
297 | char *p = page; | ||
298 | u64 tr_pages =0, vw_pages=0, tc_pages; | ||
299 | u64 attrib; | ||
300 | pal_vm_info_1_u_t vm_info_1; | ||
301 | pal_vm_info_2_u_t vm_info_2; | ||
302 | pal_tc_info_u_t tc_info; | ||
303 | ia64_ptce_info_t ptce; | ||
304 | const char *sep; | ||
305 | int i, j; | ||
306 | s64 status; | ||
307 | |||
308 | if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) { | ||
309 | printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status); | ||
310 | return 0; | ||
311 | } | ||
312 | |||
313 | |||
314 | p += sprintf(p, | ||
315 | "Physical Address Space : %d bits\n" | ||
316 | "Virtual Address Space : %d bits\n" | ||
317 | "Protection Key Registers(PKR) : %d\n" | ||
318 | "Implemented bits in PKR.key : %d\n" | ||
319 | "Hash Tag ID : 0x%x\n" | ||
320 | "Size of RR.rid : %d\n", | ||
321 | vm_info_1.pal_vm_info_1_s.phys_add_size, | ||
322 | vm_info_2.pal_vm_info_2_s.impl_va_msb+1, vm_info_1.pal_vm_info_1_s.max_pkr+1, | ||
323 | vm_info_1.pal_vm_info_1_s.key_size, vm_info_1.pal_vm_info_1_s.hash_tag_id, | ||
324 | vm_info_2.pal_vm_info_2_s.rid_size); | ||
325 | |||
326 | if (ia64_pal_mem_attrib(&attrib) != 0) | ||
327 | return 0; | ||
328 | |||
329 | p += sprintf(p, "Supported memory attributes : "); | ||
330 | sep = ""; | ||
331 | for (i = 0; i < 8; i++) { | ||
332 | if (attrib & (1 << i)) { | ||
333 | p += sprintf(p, "%s%s", sep, mem_attrib[i]); | ||
334 | sep = ", "; | ||
335 | } | ||
336 | } | ||
337 | p += sprintf(p, "\n"); | ||
338 | |||
339 | if ((status = ia64_pal_vm_page_size(&tr_pages, &vw_pages)) !=0) { | ||
340 | printk(KERN_ERR "ia64_pal_vm_page_size=%ld\n", status); | ||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | p += sprintf(p, | ||
345 | "\nTLB walker : %simplemented\n" | ||
346 | "Number of DTR : %d\n" | ||
347 | "Number of ITR : %d\n" | ||
348 | "TLB insertable page sizes : ", | ||
349 | vm_info_1.pal_vm_info_1_s.vw ? "" : "not ", | ||
350 | vm_info_1.pal_vm_info_1_s.max_dtr_entry+1, | ||
351 | vm_info_1.pal_vm_info_1_s.max_itr_entry+1); | ||
352 | |||
353 | |||
354 | p = bitvector_process(p, tr_pages); | ||
355 | |||
356 | p += sprintf(p, "\nTLB purgeable page sizes : "); | ||
357 | |||
358 | p = bitvector_process(p, vw_pages); | ||
359 | |||
360 | if ((status=ia64_get_ptce(&ptce)) != 0) { | ||
361 | printk(KERN_ERR "ia64_get_ptce=%ld\n", status); | ||
362 | return 0; | ||
363 | } | ||
364 | |||
365 | p += sprintf(p, | ||
366 | "\nPurge base address : 0x%016lx\n" | ||
367 | "Purge outer loop count : %d\n" | ||
368 | "Purge inner loop count : %d\n" | ||
369 | "Purge outer loop stride : %d\n" | ||
370 | "Purge inner loop stride : %d\n", | ||
371 | ptce.base, ptce.count[0], ptce.count[1], ptce.stride[0], ptce.stride[1]); | ||
372 | |||
373 | p += sprintf(p, | ||
374 | "TC Levels : %d\n" | ||
375 | "Unique TC(s) : %d\n", | ||
376 | vm_info_1.pal_vm_info_1_s.num_tc_levels, | ||
377 | vm_info_1.pal_vm_info_1_s.max_unique_tcs); | ||
378 | |||
379 | for(i=0; i < vm_info_1.pal_vm_info_1_s.num_tc_levels; i++) { | ||
380 | for (j=2; j>0 ; j--) { | ||
381 | tc_pages = 0; /* just in case */ | ||
382 | |||
383 | |||
384 | /* even without unification, some levels may not be present */ | ||
385 | if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0) { | ||
386 | continue; | ||
387 | } | ||
388 | |||
389 | p += sprintf(p, | ||
390 | "\n%s Translation Cache Level %d:\n" | ||
391 | "\tHash sets : %d\n" | ||
392 | "\tAssociativity : %d\n" | ||
393 | "\tNumber of entries : %d\n" | ||
394 | "\tFlags : ", | ||
395 | cache_types[j+tc_info.tc_unified], i+1, tc_info.tc_num_sets, | ||
396 | tc_info.tc_associativity, tc_info.tc_num_entries); | ||
397 | |||
398 | if (tc_info.tc_pf) p += sprintf(p, "PreferredPageSizeOptimized "); | ||
399 | if (tc_info.tc_unified) p += sprintf(p, "Unified "); | ||
400 | if (tc_info.tc_reduce_tr) p += sprintf(p, "TCReduction"); | ||
401 | |||
402 | p += sprintf(p, "\n\tSupported page sizes: "); | ||
403 | |||
404 | p = bitvector_process(p, tc_pages); | ||
405 | |||
406 | /* when unified, data (j=2) is enough */ | ||
407 | if (tc_info.tc_unified) break; | ||
408 | } | ||
409 | } | ||
410 | p += sprintf(p, "\n"); | ||
411 | |||
412 | return p - page; | ||
413 | } | ||
414 | |||
415 | |||
416 | static int | ||
417 | register_info(char *page) | ||
418 | { | ||
419 | char *p = page; | ||
420 | u64 reg_info[2]; | ||
421 | u64 info; | ||
422 | u64 phys_stacked; | ||
423 | pal_hints_u_t hints; | ||
424 | u64 iregs, dregs; | ||
425 | char *info_type[]={ | ||
426 | "Implemented AR(s)", | ||
427 | "AR(s) with read side-effects", | ||
428 | "Implemented CR(s)", | ||
429 | "CR(s) with read side-effects", | ||
430 | }; | ||
431 | |||
432 | for(info=0; info < 4; info++) { | ||
433 | |||
434 | if (ia64_pal_register_info(info, ®_info[0], ®_info[1]) != 0) return 0; | ||
435 | |||
436 | p += sprintf(p, "%-32s : ", info_type[info]); | ||
437 | |||
438 | p = bitregister_process(p, reg_info, 128); | ||
439 | |||
440 | p += sprintf(p, "\n"); | ||
441 | } | ||
442 | |||
443 | if (ia64_pal_rse_info(&phys_stacked, &hints) != 0) return 0; | ||
444 | |||
445 | p += sprintf(p, | ||
446 | "RSE stacked physical registers : %ld\n" | ||
447 | "RSE load/store hints : %ld (%s)\n", | ||
448 | phys_stacked, hints.ph_data, | ||
449 | hints.ph_data < RSE_HINTS_COUNT ? rse_hints[hints.ph_data]: "(??)"); | ||
450 | |||
451 | if (ia64_pal_debug_info(&iregs, &dregs)) | ||
452 | return 0; | ||
453 | |||
454 | p += sprintf(p, | ||
455 | "Instruction debug register pairs : %ld\n" | ||
456 | "Data debug register pairs : %ld\n", iregs, dregs); | ||
457 | |||
458 | return p - page; | ||
459 | } | ||
460 | |||
461 | static const char *proc_features[]={ | ||
462 | NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, | ||
463 | NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL, | ||
464 | NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, | ||
465 | NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL, | ||
466 | NULL,NULL,NULL,NULL,NULL, | ||
467 | "XIP,XPSR,XFS implemented", | ||
468 | "XR1-XR3 implemented", | ||
469 | "Disable dynamic predicate prediction", | ||
470 | "Disable processor physical number", | ||
471 | "Disable dynamic data cache prefetch", | ||
472 | "Disable dynamic inst cache prefetch", | ||
473 | "Disable dynamic branch prediction", | ||
474 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
475 | "Disable BINIT on processor time-out", | ||
476 | "Disable dynamic power management (DPM)", | ||
477 | "Disable coherency", | ||
478 | "Disable cache", | ||
479 | "Enable CMCI promotion", | ||
480 | "Enable MCA to BINIT promotion", | ||
481 | "Enable MCA promotion", | ||
482 | "Enable BERR promotion" | ||
483 | }; | ||
484 | |||
485 | |||
486 | static int | ||
487 | processor_info(char *page) | ||
488 | { | ||
489 | char *p = page; | ||
490 | const char **v = proc_features; | ||
491 | u64 avail=1, status=1, control=1; | ||
492 | int i; | ||
493 | s64 ret; | ||
494 | |||
495 | if ((ret=ia64_pal_proc_get_features(&avail, &status, &control)) != 0) return 0; | ||
496 | |||
497 | for(i=0; i < 64; i++, v++,avail >>=1, status >>=1, control >>=1) { | ||
498 | if ( ! *v ) continue; | ||
499 | p += sprintf(p, "%-40s : %s%s %s\n", *v, | ||
500 | avail & 0x1 ? "" : "NotImpl", | ||
501 | avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "", | ||
502 | avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): ""); | ||
503 | } | ||
504 | return p - page; | ||
505 | } | ||
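Each feature line above decodes a triple of bits walked in lock-step: avail says whether the feature exists, status whether it is currently on, and control whether PAL lets the OS change it. One step of that decode, sketched with made-up values:

	/* Sketch: formatting one (avail, status, control) feature triple. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long avail = 1, status = 0, control = 1;	/* hypothetical */

		printf("%-40s : %s%s %s\n", "Some feature",
		       avail & 0x1 ? "" : "NotImpl",
		       avail & 0x1 ? (status & 0x1 ? "On" : "Off") : "",
		       avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl") : "");
		return 0;	/* prints: Some feature ... : Off Ctrl */
	}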
506 | |||
507 | static const char *bus_features[]={ | ||
508 | NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, | ||
509 | NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL, | ||
510 | NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, | ||
511 | NULL,NULL, | ||
512 | "Request Bus Parking", | ||
513 | "Bus Lock Mask", | ||
514 | "Enable Half Transfer", | ||
515 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
516 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
517 | NULL, NULL, NULL, NULL, | ||
518 | "Enable Cache Line Repl. Shared", | ||
519 | "Enable Cache Line Repl. Exclusive", | ||
520 | "Disable Transaction Queuing", | ||
521 | "Disable Response Error Checking", | ||
522 | "Disable Bus Error Checking", | ||
523 | "Disable Bus Requester Internal Error Signalling", | ||
524 | "Disable Bus Requester Error Signalling", | ||
525 | "Disable Bus Initialization Event Checking", | ||
526 | "Disable Bus Initialization Event Signalling", | ||
527 | "Disable Bus Address Error Checking", | ||
528 | "Disable Bus Address Error Signalling", | ||
529 | "Disable Bus Data Error Checking" | ||
530 | }; | ||
531 | |||
532 | |||
533 | static int | ||
534 | bus_info(char *page) | ||
535 | { | ||
536 | char *p = page; | ||
537 | const char **v = bus_features; | ||
538 | pal_bus_features_u_t av, st, ct; | ||
539 | u64 avail, status, control; | ||
540 | int i; | ||
541 | s64 ret; | ||
542 | |||
543 | if ((ret=ia64_pal_bus_get_features(&av, &st, &ct)) != 0) return 0; | ||
544 | |||
545 | avail = av.pal_bus_features_val; | ||
546 | status = st.pal_bus_features_val; | ||
547 | control = ct.pal_bus_features_val; | ||
548 | |||
549 | for(i=0; i < 64; i++, v++, avail >>=1, status >>=1, control >>=1) { | ||
550 | if ( ! *v ) continue; | ||
551 | p += sprintf(p, "%-48s : %s%s %s\n", *v, | ||
552 | avail & 0x1 ? "" : "NotImpl", | ||
553 | avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "", | ||
554 | avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): ""); | ||
555 | } | ||
556 | return p - page; | ||
557 | } | ||
558 | |||
559 | static int | ||
560 | version_info(char *page) | ||
561 | { | ||
562 | pal_version_u_t min_ver, cur_ver; | ||
563 | char *p = page; | ||
564 | |||
565 | /* The PAL_VERSION call is advertised as being able to support | ||
566 | * both physical and virtual mode calls, but this appears to be a | ||
567 | * documentation bug rather than a firmware bug: in fact it only supports | ||
568 | * physical mode. The code now reflects this and pal_version() has been | ||
569 | * updated accordingly. | ||
570 | */ | ||
571 | if (ia64_pal_version(&min_ver, &cur_ver) != 0) return 0; | ||
572 | |||
573 | p += sprintf(p, | ||
574 | "PAL_vendor : 0x%02x (min=0x%02x)\n" | ||
575 | "PAL_A : %x.%x.%x (min=%x.%x.%x)\n" | ||
576 | "PAL_B : %x.%x.%x (min=%x.%x.%x)\n", | ||
577 | cur_ver.pal_version_s.pv_pal_vendor, min_ver.pal_version_s.pv_pal_vendor, | ||
578 | |||
579 | cur_ver.pal_version_s.pv_pal_a_model>>4, | ||
580 | cur_ver.pal_version_s.pv_pal_a_model&0xf, cur_ver.pal_version_s.pv_pal_a_rev, | ||
581 | min_ver.pal_version_s.pv_pal_a_model>>4, | ||
582 | min_ver.pal_version_s.pv_pal_a_model&0xf, min_ver.pal_version_s.pv_pal_a_rev, | ||
583 | |||
584 | cur_ver.pal_version_s.pv_pal_b_model>>4, | ||
585 | cur_ver.pal_version_s.pv_pal_b_model&0xf, cur_ver.pal_version_s.pv_pal_b_rev, | ||
586 | min_ver.pal_version_s.pv_pal_b_model>>4, | ||
587 | min_ver.pal_version_s.pv_pal_b_model&0xf, min_ver.pal_version_s.pv_pal_b_rev); | ||
588 | return p - page; | ||
589 | } | ||
590 | |||
591 | static int | ||
592 | perfmon_info(char *page) | ||
593 | { | ||
594 | char *p = page; | ||
595 | u64 pm_buffer[16]; | ||
596 | pal_perf_mon_info_u_t pm_info; | ||
597 | |||
598 | if (ia64_pal_perf_mon_info(pm_buffer, &pm_info) != 0) return 0; | ||
599 | |||
600 | p += sprintf(p, | ||
601 | "PMC/PMD pairs : %d\n" | ||
602 | "Counter width : %d bits\n" | ||
603 | "Cycle event number : %d\n" | ||
604 | "Retired event number : %d\n" | ||
605 | "Implemented PMC : ", | ||
606 | pm_info.pal_perf_mon_info_s.generic, pm_info.pal_perf_mon_info_s.width, | ||
607 | pm_info.pal_perf_mon_info_s.cycles, pm_info.pal_perf_mon_info_s.retired); | ||
608 | |||
609 | p = bitregister_process(p, pm_buffer, 256); | ||
610 | p += sprintf(p, "\nImplemented PMD : "); | ||
611 | p = bitregister_process(p, pm_buffer+4, 256); | ||
612 | p += sprintf(p, "\nCycles count capable : "); | ||
613 | p = bitregister_process(p, pm_buffer+8, 256); | ||
614 | p += sprintf(p, "\nRetired bundles count capable : "); | ||
615 | |||
616 | #ifdef CONFIG_ITANIUM | ||
617 | /* | ||
618 | * PAL_PERF_MON_INFO reports that only PMC4 can be used to count CPU_CYCLES, | ||
619 | * which is wrong: both PMC4 and PMD5 support it. | ||
620 | */ | ||
621 | if (pm_buffer[12] == 0x10) pm_buffer[12]=0x30; | ||
622 | #endif | ||
623 | |||
624 | p = bitregister_process(p, pm_buffer+12, 256); | ||
625 | |||
626 | p += sprintf(p, "\n"); | ||
627 | |||
628 | return p - page; | ||
629 | } | ||
630 | |||
631 | static int | ||
632 | frequency_info(char *page) | ||
633 | { | ||
634 | char *p = page; | ||
635 | struct pal_freq_ratio proc, itc, bus; | ||
636 | u64 base; | ||
637 | |||
638 | if (ia64_pal_freq_base(&base) == -1) | ||
639 | p += sprintf(p, "Output clock : not implemented\n"); | ||
640 | else | ||
641 | p += sprintf(p, "Output clock : %ld ticks/s\n", base); | ||
642 | |||
643 | if (ia64_pal_freq_ratios(&proc, &bus, &itc) != 0) return 0; | ||
644 | |||
645 | p += sprintf(p, | ||
646 | "Processor/Clock ratio : %ld/%ld\n" | ||
647 | "Bus/Clock ratio : %ld/%ld\n" | ||
648 | "ITC/Clock ratio : %ld/%ld\n", | ||
649 | proc.num, proc.den, bus.num, bus.den, itc.num, itc.den); | ||
650 | |||
651 | return p - page; | ||
652 | } | ||
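ia64_pal_freq_ratios() reports each frequency as a rational multiple of the base (output) clock, so the actual rates follow as base * num / den. A quick worked sketch, with all values assumed:

	/* Sketch: turning PAL ratios into Hz (values are hypothetical). */
	unsigned long base = 200000000UL;		/* 200 MHz output clock */
	unsigned long num = 11, den = 2;		/* processor/clock = 11/2 */
	unsigned long cpu_hz = base / den * num;	/* 1100000000: 1.1 GHz */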
653 | |||
654 | static int | ||
655 | tr_info(char *page) | ||
656 | { | ||
657 | char *p = page; | ||
658 | s64 status; | ||
659 | pal_tr_valid_u_t tr_valid; | ||
660 | u64 tr_buffer[4]; | ||
661 | pal_vm_info_1_u_t vm_info_1; | ||
662 | pal_vm_info_2_u_t vm_info_2; | ||
663 | u64 i, j; | ||
664 | u64 max[3], pgm; | ||
665 | struct ifa_reg { | ||
666 | u64 valid:1; | ||
667 | u64 ig:11; | ||
668 | u64 vpn:52; | ||
669 | } *ifa_reg; | ||
670 | struct itir_reg { | ||
671 | u64 rv1:2; | ||
672 | u64 ps:6; | ||
673 | u64 key:24; | ||
674 | u64 rv2:32; | ||
675 | } *itir_reg; | ||
676 | struct gr_reg { | ||
677 | u64 p:1; | ||
678 | u64 rv1:1; | ||
679 | u64 ma:3; | ||
680 | u64 a:1; | ||
681 | u64 d:1; | ||
682 | u64 pl:2; | ||
683 | u64 ar:3; | ||
684 | u64 ppn:38; | ||
685 | u64 rv2:2; | ||
686 | u64 ed:1; | ||
687 | u64 ig:11; | ||
688 | } *gr_reg; | ||
689 | struct rid_reg { | ||
690 | u64 ig1:1; | ||
691 | u64 rv1:1; | ||
692 | u64 ig2:6; | ||
693 | u64 rid:24; | ||
694 | u64 rv2:32; | ||
695 | } *rid_reg; | ||
696 | |||
697 | if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) { | ||
698 | printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status); | ||
699 | return 0; | ||
700 | } | ||
701 | max[0] = vm_info_1.pal_vm_info_1_s.max_itr_entry+1; | ||
702 | max[1] = vm_info_1.pal_vm_info_1_s.max_dtr_entry+1; | ||
703 | |||
704 | for (i=0; i < 2; i++ ) { | ||
705 | for (j=0; j < max[i]; j++) { | ||
706 | |||
707 | status = ia64_pal_tr_read(j, i, tr_buffer, &tr_valid); | ||
708 | if (status != 0) { | ||
709 | printk(KERN_ERR "palinfo: pal call failed on tr[%lu:%lu]=%ld\n", | ||
710 | i, j, status); | ||
711 | continue; | ||
712 | } | ||
713 | |||
714 | ifa_reg = (struct ifa_reg *)&tr_buffer[2]; | ||
715 | |||
716 | if (ifa_reg->valid == 0) continue; | ||
717 | |||
718 | gr_reg = (struct gr_reg *)tr_buffer; | ||
719 | itir_reg = (struct itir_reg *)&tr_buffer[1]; | ||
720 | rid_reg = (struct rid_reg *)&tr_buffer[3]; | ||
721 | |||
722 | pgm = -1 << (itir_reg->ps - 12); | ||
723 | p += sprintf(p, | ||
724 | "%cTR%lu: av=%d pv=%d dv=%d mv=%d\n" | ||
725 | "\tppn : 0x%lx\n" | ||
726 | "\tvpn : 0x%lx\n" | ||
727 | "\tps : ", | ||
728 | "ID"[i], j, | ||
729 | tr_valid.pal_tr_valid_s.access_rights_valid, | ||
730 | tr_valid.pal_tr_valid_s.priv_level_valid, | ||
731 | tr_valid.pal_tr_valid_s.dirty_bit_valid, | ||
732 | tr_valid.pal_tr_valid_s.mem_attr_valid, | ||
733 | (gr_reg->ppn & pgm)<< 12, (ifa_reg->vpn & pgm)<< 12); | ||
734 | |||
735 | p = bitvector_process(p, 1UL << itir_reg->ps); | ||
736 | |||
737 | p += sprintf(p, | ||
738 | "\n\tpl : %d\n" | ||
739 | "\tar : %d\n" | ||
740 | "\trid : %x\n" | ||
741 | "\tp : %d\n" | ||
742 | "\tma : %d\n" | ||
743 | "\td : %d\n", | ||
744 | gr_reg->pl, gr_reg->ar, rid_reg->rid, gr_reg->p, gr_reg->ma, | ||
745 | gr_reg->d); | ||
746 | } | ||
747 | } | ||
748 | return p - page; | ||
749 | } | ||
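The pgm arithmetic above is worth a worked example: ppn and vpn are kept at 4KB granularity, so for a translation of page size 2^ps the low (ps - 12) bits are don't-care and must be cleared before shifting up into an address. With assumed values:

	/* Sketch: the page-size masking used in tr_info() (values made up). */
	unsigned long ps  = 14;				/* 16KB page */
	unsigned long pgm = -1UL << (ps - 12);		/* clears 2 low bits */
	unsigned long ppn = 0x12347;			/* 4KB-granule frame number */
	unsigned long pa  = (ppn & pgm) << 12;		/* 0x12344000, 16KB aligned */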
750 | |||
751 | |||
752 | |||
753 | /* | ||
754 | * List {name,function} pairs for every entry in /proc/palinfo/cpu* | ||
755 | */ | ||
756 | static palinfo_entry_t palinfo_entries[]={ | ||
757 | { "version_info", version_info, }, | ||
758 | { "vm_info", vm_info, }, | ||
759 | { "cache_info", cache_info, }, | ||
760 | { "power_info", power_info, }, | ||
761 | { "register_info", register_info, }, | ||
762 | { "processor_info", processor_info, }, | ||
763 | { "perfmon_info", perfmon_info, }, | ||
764 | { "frequency_info", frequency_info, }, | ||
765 | { "bus_info", bus_info }, | ||
766 | { "tr_info", tr_info, } | ||
767 | }; | ||
768 | |||
769 | #define NR_PALINFO_ENTRIES (int) ARRAY_SIZE(palinfo_entries) | ||
770 | |||
771 | /* | ||
772 | * This array is used to keep track of the proc entries we create. This is | ||
773 | * required in module mode, where we must remove all entries on unload. The | ||
774 | * procfs code does not recursively delete entries. | ||
775 | * | ||
776 | * Notes: | ||
777 | * - +1 accounts for the cpuN directory entry in /proc/pal | ||
778 | */ | ||
779 | #define NR_PALINFO_PROC_ENTRIES (NR_CPUS*(NR_PALINFO_ENTRIES+1)) | ||
780 | |||
781 | static struct proc_dir_entry *palinfo_proc_entries[NR_PALINFO_PROC_ENTRIES]; | ||
782 | static struct proc_dir_entry *palinfo_dir; | ||
783 | |||
784 | /* | ||
785 | * This data structure is used to pass which (cpu, function) pair is being | ||
786 | * requested. It must fit in a 64-bit quantity so it can be passed via the | ||
787 | * proc callback's data pointer. | ||
788 | * In SMP mode, when we get a request for another CPU, we must call that | ||
789 | * other CPU using an IPI and wait for the result before returning. | ||
790 | */ | ||
791 | typedef union { | ||
792 | u64 value; | ||
793 | struct { | ||
794 | unsigned req_cpu: 32; /* for which CPU this info is */ | ||
795 | unsigned func_id: 32; /* which function is requested */ | ||
796 | } pal_func_cpu; | ||
797 | } pal_func_cpu_u_t; | ||
798 | |||
799 | #define req_cpu pal_func_cpu.req_cpu | ||
800 | #define func_id pal_func_cpu.func_id | ||
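This is why the union must be exactly 64 bits: the (cpu, function) pair is smuggled through procfs as the opaque data pointer value itself, not as pointed-to storage. Both directions, sketched with hypothetical values:

	/* Sketch: round-tripping (cpu, function) through a void *. */
	pal_func_cpu_u_t f;
	void *data;

	f.req_cpu = 3;			/* entry is for CPU 3 */
	f.func_id = 1;			/* index into palinfo_entries[]: vm_info */
	data = (void *)f.value;		/* registration side */

	/* callback side: recover the pair from the pointer value */
	pal_func_cpu_u_t *g = (pal_func_cpu_u_t *)&data;
	/* now g->req_cpu == 3 and g->func_id == 1 */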
801 | |||
802 | #ifdef CONFIG_SMP | ||
803 | |||
804 | /* | ||
805 | * used to hold information about final function to call | ||
806 | */ | ||
807 | typedef struct { | ||
808 | palinfo_func_t func; /* pointer to function to call */ | ||
809 | char *page; /* buffer to store results */ | ||
810 | int ret; /* return value from call */ | ||
811 | } palinfo_smp_data_t; | ||
812 | |||
813 | |||
814 | /* | ||
815 | * this function does the actual final call and is invoked | ||
816 | * from the SMP code, i.e., this is the palinfo IPI callback routine | ||
817 | */ | ||
818 | static void | ||
819 | palinfo_smp_call(void *info) | ||
820 | { | ||
821 | palinfo_smp_data_t *data = (palinfo_smp_data_t *)info; | ||
822 | if (data == NULL) { | ||
823 | printk(KERN_ERR "palinfo: data pointer is NULL\n"); | ||
824 | /* cannot set data->ret on a NULL pointer; just bail */ | ||
825 | return; | ||
826 | } | ||
827 | /* do the actual call */ | ||
828 | data->ret = (*data->func)(data->page); | ||
829 | } | ||
830 | |||
831 | /* | ||
832 | * function called to trigger the IPI when we need to access a remote CPU | ||
833 | * Return: | ||
834 | * 0 : error or nothing to output | ||
835 | * otherwise : how many bytes were written into the "page" buffer | ||
836 | */ | ||
837 | static | ||
838 | int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page) | ||
839 | { | ||
840 | palinfo_smp_data_t ptr; | ||
841 | int ret; | ||
842 | |||
843 | ptr.func = palinfo_entries[f->func_id].proc_read; | ||
844 | ptr.page = page; | ||
845 | ptr.ret = 0; /* just in case */ | ||
846 | |||
847 | |||
848 | /* will send IPI to other CPU and wait for completion of remote call */ | ||
849 | if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 0, 1))) { | ||
850 | printk(KERN_ERR "palinfo: remote CPU call from %d to %d on function %d: " | ||
851 | "error %d\n", smp_processor_id(), f->req_cpu, f->func_id, ret); | ||
852 | return 0; | ||
853 | } | ||
854 | return ptr.ret; | ||
855 | } | ||
856 | #else /* ! CONFIG_SMP */ | ||
857 | static | ||
858 | int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page) | ||
859 | { | ||
860 | printk(KERN_ERR "palinfo: should not be called on a non-SMP kernel\n"); | ||
861 | return 0; | ||
862 | } | ||
863 | #endif /* CONFIG_SMP */ | ||
864 | |||
865 | /* | ||
866 | * Entry point routine: all calls go through this function | ||
867 | */ | ||
868 | static int | ||
869 | palinfo_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data) | ||
870 | { | ||
871 | int len=0; | ||
872 | pal_func_cpu_u_t *f = (pal_func_cpu_u_t *)&data; | ||
873 | |||
874 | /* | ||
875 | * in SMP mode, we may need to call another CPU to get correct | ||
876 | * information. PAL, by definition, is processor specific | ||
877 | */ | ||
878 | if (f->req_cpu == get_cpu()) | ||
879 | len = (*palinfo_entries[f->func_id].proc_read)(page); | ||
880 | else | ||
881 | len = palinfo_handle_smp(f, page); | ||
882 | |||
883 | put_cpu(); | ||
884 | |||
885 | if (len <= off+count) *eof = 1; | ||
886 | |||
887 | *start = page + off; | ||
888 | len -= off; | ||
889 | |||
890 | if (len>count) len = count; | ||
891 | if (len<0) len = 0; | ||
892 | |||
893 | return len; | ||
894 | } | ||
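For readers unfamiliar with the old (pre-seq_file) procfs read convention followed above: the handler fills "page", may set *start and *eof, and returns a byte count clamped to what the caller asked for. A minimal conforming handler, sketched:

	/* Sketch: minimal handler using the same read_proc convention. */
	static int
	example_read(char *page, char **start, off_t off, int count, int *eof, void *data)
	{
		int len = sprintf(page, "hello\n");

		if (len <= off + count) *eof = 1;	/* everything delivered */
		*start = page + off;			/* where the caller resumes */
		len -= off;
		if (len > count) len = count;
		if (len < 0) len = 0;
		return len;
	}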
895 | |||
896 | static void | ||
897 | create_palinfo_proc_entries(unsigned int cpu) | ||
898 | { | ||
899 | # define CPUSTR "cpu%d" | ||
900 | |||
901 | pal_func_cpu_u_t f; | ||
902 | struct proc_dir_entry **pdir; | ||
903 | struct proc_dir_entry *cpu_dir; | ||
904 | int j; | ||
905 | char cpustr[sizeof(CPUSTR)]; | ||
906 | |||
907 | |||
908 | /* | ||
909 | * we keep track of created entries in a depth-first order for | ||
910 | * cleanup purposes. Each entry is stored into palinfo_proc_entries | ||
911 | */ | ||
912 | sprintf(cpustr,CPUSTR, cpu); | ||
913 | |||
914 | cpu_dir = proc_mkdir(cpustr, palinfo_dir); | ||
915 | |||
916 | f.req_cpu = cpu; | ||
917 | |||
918 | /* | ||
919 | * Compute the location to store the per-cpu entries. | ||
920 | * We don't store the top-level entry in this list; it is | ||
921 | * removed last, after all per-cpu entries are gone. | ||
922 | */ | ||
923 | pdir = &palinfo_proc_entries[cpu*(NR_PALINFO_ENTRIES+1)]; | ||
924 | *pdir++ = cpu_dir; | ||
925 | for (j=0; j < NR_PALINFO_ENTRIES; j++) { | ||
926 | f.func_id = j; | ||
927 | *pdir = create_proc_read_entry( | ||
928 | palinfo_entries[j].name, 0, cpu_dir, | ||
929 | palinfo_read_entry, (void *)f.value); | ||
930 | if (*pdir) | ||
931 | (*pdir)->owner = THIS_MODULE; | ||
932 | pdir++; | ||
933 | } | ||
934 | } | ||
935 | |||
936 | static void | ||
937 | remove_palinfo_proc_entries(unsigned int hcpu) | ||
938 | { | ||
939 | int j; | ||
940 | struct proc_dir_entry *cpu_dir, **pdir; | ||
941 | |||
942 | pdir = &palinfo_proc_entries[hcpu*(NR_PALINFO_ENTRIES+1)]; | ||
943 | cpu_dir = *pdir; | ||
944 | *pdir++=NULL; | ||
945 | for (j=0; j < (NR_PALINFO_ENTRIES); j++) { | ||
946 | if ((*pdir)) { | ||
947 | remove_proc_entry ((*pdir)->name, cpu_dir); | ||
948 | *pdir++ = NULL; | ||
949 | } | ||
950 | } | ||
951 | |||
952 | if (cpu_dir) { | ||
953 | remove_proc_entry(cpu_dir->name, palinfo_dir); | ||
954 | } | ||
955 | } | ||
956 | |||
957 | static int __devinit palinfo_cpu_callback(struct notifier_block *nfb, | ||
958 | unsigned long action, | ||
959 | void *hcpu) | ||
960 | { | ||
961 | unsigned int hotcpu = (unsigned long)hcpu; | ||
962 | |||
963 | switch (action) { | ||
964 | case CPU_ONLINE: | ||
965 | create_palinfo_proc_entries(hotcpu); | ||
966 | break; | ||
967 | #ifdef CONFIG_HOTPLUG_CPU | ||
968 | case CPU_DEAD: | ||
969 | remove_palinfo_proc_entries(hotcpu); | ||
970 | break; | ||
971 | #endif | ||
972 | } | ||
973 | return NOTIFY_OK; | ||
974 | } | ||
975 | |||
976 | static struct notifier_block palinfo_cpu_notifier = | ||
977 | { | ||
978 | .notifier_call = palinfo_cpu_callback, | ||
979 | .priority = 0, | ||
980 | }; | ||
981 | |||
982 | static int __init | ||
983 | palinfo_init(void) | ||
984 | { | ||
985 | int i = 0; | ||
986 | |||
987 | printk(KERN_INFO "PAL Information Facility v%s\n", PALINFO_VERSION); | ||
988 | palinfo_dir = proc_mkdir("pal", NULL); | ||
989 | |||
990 | /* Create palinfo dirs in /proc for all online cpus */ | ||
991 | for_each_online_cpu(i) { | ||
992 | create_palinfo_proc_entries(i); | ||
993 | } | ||
994 | |||
995 | /* Register for future delivery via notify registration */ | ||
996 | register_cpu_notifier(&palinfo_cpu_notifier); | ||
997 | |||
998 | return 0; | ||
999 | } | ||
1000 | |||
1001 | static void __exit | ||
1002 | palinfo_exit(void) | ||
1003 | { | ||
1004 | int i = 0; | ||
1005 | |||
1006 | /* remove all nodes: depth first pass. Could optimize this */ | ||
1007 | for_each_online_cpu(i) { | ||
1008 | remove_palinfo_proc_entries(i); | ||
1009 | } | ||
1010 | |||
1011 | /* | ||
1012 | * Remove the top level entry finally | ||
1013 | */ | ||
1014 | remove_proc_entry(palinfo_dir->name, NULL); | ||
1015 | |||
1016 | /* | ||
1017 | * Unregister from cpu notifier callbacks | ||
1018 | */ | ||
1019 | unregister_cpu_notifier(&palinfo_cpu_notifier); | ||
1020 | } | ||
1021 | |||
1022 | module_init(palinfo_init); | ||
1023 | module_exit(palinfo_exit); | ||
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c new file mode 100644 index 000000000000..367804a605fa --- /dev/null +++ b/arch/ia64/kernel/patch.c | |||
@@ -0,0 +1,189 @@ | |||
1 | /* | ||
2 | * Instruction-patching support. | ||
3 | * | ||
4 | * Copyright (C) 2003 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | */ | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/string.h> | ||
9 | |||
10 | #include <asm/patch.h> | ||
11 | #include <asm/processor.h> | ||
12 | #include <asm/sections.h> | ||
13 | #include <asm/system.h> | ||
14 | #include <asm/unistd.h> | ||
15 | |||
16 | /* | ||
17 | * This was adapted from code written by Tony Luck: | ||
18 | * | ||
19 | * The 64-bit value in a "movl reg=value" is scattered between the two words of the bundle | ||
20 | * like this: | ||
21 | * | ||
22 | * 6 6 5 4 3 2 1 | ||
23 | * 3210987654321098765432109876543210987654321098765432109876543210 | ||
24 | * ABBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCDEEEEEFFFFFFFFFGGGGGGG | ||
25 | * | ||
26 | * CCCCCCCCCCCCCCCCCCxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx | ||
27 | * xxxxAFFFFFFFFFEEEEEDxGGGGGGGxxxxxxxxxxxxxBBBBBBBBBBBBBBBBBBBBBBB | ||
28 | */ | ||
29 | static u64 | ||
30 | get_imm64 (u64 insn_addr) | ||
31 | { | ||
32 | u64 *p = (u64 *) (insn_addr & -16); /* mask out slot number */ | ||
33 | |||
34 | return ( (p[1] & 0x0800000000000000UL) << 4) | /*A*/ | ||
35 | ((p[1] & 0x00000000007fffffUL) << 40) | /*B*/ | ||
36 | ((p[0] & 0xffffc00000000000UL) >> 24) | /*C*/ | ||
37 | ((p[1] & 0x0000100000000000UL) >> 23) | /*D*/ | ||
38 | ((p[1] & 0x0003e00000000000UL) >> 29) | /*E*/ | ||
39 | ((p[1] & 0x07fc000000000000UL) >> 43) | /*F*/ | ||
40 | ((p[1] & 0x000007f000000000UL) >> 36); /*G*/ | ||
41 | } | ||
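Since get_imm64() is the exact inverse of the scatter performed by ia64_patch_imm64() below, the bit tables are easy to sanity-check by round-tripping a value. A hedged sketch (check_movl_roundtrip is hypothetical; insn_addr must point at slot 2 of a bundle holding a movl):

	/* Hypothetical self-test: scatter an imm64, then gather it back. */
	static u64
	check_movl_roundtrip (u64 insn_addr, u64 val)
	{
		ia64_patch_imm64(insn_addr, val);	/* scatter */
		return get_imm64(insn_addr);		/* gather; expect == val */
	}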
42 | |||
43 | /* Patch instruction with "val" where "mask" has 1 bits. */ | ||
44 | void | ||
45 | ia64_patch (u64 insn_addr, u64 mask, u64 val) | ||
46 | { | ||
47 | u64 m0, m1, v0, v1, b0, b1, *b = (u64 *) (insn_addr & -16); | ||
48 | # define insn_mask ((1UL << 41) - 1) | ||
49 | unsigned long shift; | ||
50 | |||
51 | b0 = b[0]; b1 = b[1]; | ||
52 | shift = 5 + 41 * (insn_addr % 16); /* 5 bits of template, then 3 x 41-bit instructions */ | ||
53 | if (shift >= 64) { | ||
54 | m1 = mask << (shift - 64); | ||
55 | v1 = val << (shift - 64); | ||
56 | } else { | ||
57 | m0 = mask << shift; m1 = mask >> (64 - shift); | ||
58 | v0 = val << shift; v1 = val >> (64 - shift); | ||
59 | b[0] = (b0 & ~m0) | (v0 & m0); | ||
60 | } | ||
61 | b[1] = (b1 & ~m1) | (v1 & m1); | ||
62 | } | ||
63 | |||
64 | void | ||
65 | ia64_patch_imm64 (u64 insn_addr, u64 val) | ||
66 | { | ||
67 | ia64_patch(insn_addr, | ||
68 | 0x01fffefe000UL, ( ((val & 0x8000000000000000UL) >> 27) /* bit 63 -> 36 */ | ||
69 | | ((val & 0x0000000000200000UL) << 0) /* bit 21 -> 21 */ | ||
70 | | ((val & 0x00000000001f0000UL) << 6) /* bit 16 -> 22 */ | ||
71 | | ((val & 0x000000000000ff80UL) << 20) /* bit 7 -> 27 */ | ||
72 | | ((val & 0x000000000000007fUL) << 13) /* bit 0 -> 13 */)); | ||
73 | ia64_patch(insn_addr - 1, 0x1ffffffffffUL, val >> 22); | ||
74 | } | ||
75 | |||
76 | void | ||
77 | ia64_patch_imm60 (u64 insn_addr, u64 val) | ||
78 | { | ||
79 | ia64_patch(insn_addr, | ||
80 | 0x011ffffe000UL, ( ((val & 0x0800000000000000UL) >> 23) /* bit 59 -> 36 */ | ||
81 | | ((val & 0x00000000000fffffUL) << 13) /* bit 0 -> 13 */)); | ||
82 | ia64_patch(insn_addr - 1, 0x1fffffffffcUL, val >> 18); | ||
83 | } | ||
84 | |||
85 | /* | ||
86 | * We sometimes need to load the physical address of a kernel | ||
87 | * object. Often we can convert the virtual address to a physical one | ||
88 | * at execution time, but sometimes (either for performance reasons | ||
89 | * or during error recovery) we cannot do this. Patch the marked | ||
90 | * bundles to load the physical address instead. | ||
91 | */ | ||
92 | void __init | ||
93 | ia64_patch_vtop (unsigned long start, unsigned long end) | ||
94 | { | ||
95 | s32 *offp = (s32 *) start; | ||
96 | u64 ip; | ||
97 | |||
98 | while (offp < (s32 *) end) { | ||
99 | ip = (u64) offp + *offp; | ||
100 | |||
101 | /* replace virtual address with corresponding physical address: */ | ||
102 | ia64_patch_imm64(ip, ia64_tpa(get_imm64(ip))); | ||
103 | ia64_fc((void *) ip); | ||
104 | ++offp; | ||
105 | } | ||
106 | ia64_sync_i(); | ||
107 | ia64_srlz_i(); | ||
108 | } | ||
109 | |||
110 | void | ||
111 | ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) | ||
112 | { | ||
113 | static int first_time = 1; | ||
114 | int need_workaround; | ||
115 | s32 *offp = (s32 *) start; | ||
116 | u64 *wp; | ||
117 | |||
118 | need_workaround = (local_cpu_data->family == 0x1f && local_cpu_data->model == 0); | ||
119 | |||
120 | if (first_time) { | ||
121 | first_time = 0; | ||
122 | if (need_workaround) | ||
123 | printk(KERN_INFO "Leaving McKinley Errata 9 workaround enabled\n"); | ||
124 | else | ||
125 | printk(KERN_INFO "McKinley Errata 9 workaround not needed; " | ||
126 | "disabling it\n"); | ||
127 | } | ||
128 | if (need_workaround) | ||
129 | return; | ||
130 | |||
131 | while (offp < (s32 *) end) { | ||
132 | wp = (u64 *) ia64_imva((char *) offp + *offp); | ||
133 | wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */ | ||
134 | wp[1] = 0x0004000000000200UL; | ||
135 | wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ | ||
136 | wp[3] = 0x0084006880000200UL; | ||
137 | ia64_fc(wp); ia64_fc(wp + 2); | ||
138 | ++offp; | ||
139 | } | ||
140 | ia64_sync_i(); | ||
141 | ia64_srlz_i(); | ||
142 | } | ||
143 | |||
144 | static void | ||
145 | patch_fsyscall_table (unsigned long start, unsigned long end) | ||
146 | { | ||
147 | extern unsigned long fsyscall_table[NR_syscalls]; | ||
148 | s32 *offp = (s32 *) start; | ||
149 | u64 ip; | ||
150 | |||
151 | while (offp < (s32 *) end) { | ||
152 | ip = (u64) ia64_imva((char *) offp + *offp); | ||
153 | ia64_patch_imm64(ip, (u64) fsyscall_table); | ||
154 | ia64_fc((void *) ip); | ||
155 | ++offp; | ||
156 | } | ||
157 | ia64_sync_i(); | ||
158 | ia64_srlz_i(); | ||
159 | } | ||
160 | |||
161 | static void | ||
162 | patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) | ||
163 | { | ||
164 | extern char fsys_bubble_down[]; | ||
165 | s32 *offp = (s32 *) start; | ||
166 | u64 ip; | ||
167 | |||
168 | while (offp < (s32 *) end) { | ||
169 | ip = (u64) offp + *offp; | ||
170 | ia64_patch_imm60((u64) ia64_imva((void *) ip), | ||
171 | (u64) (fsys_bubble_down - (ip & -16)) / 16); | ||
172 | ia64_fc((void *) ip); | ||
173 | ++offp; | ||
174 | } | ||
175 | ia64_sync_i(); | ||
176 | ia64_srlz_i(); | ||
177 | } | ||
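The displacement fed to ia64_patch_imm60() above is bundle-relative: an IP-relative branch encodes (target - source bundle address) in units of 16-byte bundles, which is why ip is rounded down with "& -16" before subtracting. Worked through with made-up addresses:

	/* Sketch: brl displacement arithmetic (addresses are hypothetical). */
	unsigned long ip     = 0xa000000000001232UL;	/* brl in some bundle */
	unsigned long target = 0xa000000000004000UL;	/* fsys_bubble_down, say */
	long disp = (long)(target - (ip & -16UL)) / 16;	/* 0x2dd bundles */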
178 | |||
179 | void | ||
180 | ia64_patch_gate (void) | ||
181 | { | ||
182 | # define START(name) ((unsigned long) __start_gate_##name##_patchlist) | ||
183 | # define END(name) ((unsigned long)__end_gate_##name##_patchlist) | ||
184 | |||
185 | patch_fsyscall_table(START(fsyscall), END(fsyscall)); | ||
186 | patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down)); | ||
187 | ia64_patch_vtop(START(vtop), END(vtop)); | ||
188 | ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9)); | ||
189 | } | ||
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c new file mode 100644 index 000000000000..71147be3279c --- /dev/null +++ b/arch/ia64/kernel/perfmon.c | |||
@@ -0,0 +1,6676 @@ | |||
1 | /* | ||
2 | * This file implements the perfmon-2 subsystem which is used | ||
3 | * to program the IA-64 Performance Monitoring Unit (PMU). | ||
4 | * | ||
5 | * The initial version of perfmon.c was written by | ||
6 | * Ganesh Venkitachalam, IBM Corp. | ||
7 | * | ||
8 | * Then it was modified for perfmon-1.x by Stephane Eranian and | ||
9 | * David Mosberger, Hewlett Packard Co. | ||
10 | * | ||
11 | * Version Perfmon-2.x is a rewrite of perfmon-1.x | ||
12 | * by Stephane Eranian, Hewlett Packard Co. | ||
13 | * | ||
14 | * Copyright (C) 1999-2003, 2005 Hewlett Packard Co | ||
15 | * Stephane Eranian <eranian@hpl.hp.com> | ||
16 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
17 | * | ||
18 | * More information about perfmon available at: | ||
19 | * http://www.hpl.hp.com/research/linux/perfmon | ||
20 | */ | ||
21 | |||
22 | #include <linux/config.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/sched.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | #include <linux/smp_lock.h> | ||
28 | #include <linux/proc_fs.h> | ||
29 | #include <linux/seq_file.h> | ||
30 | #include <linux/init.h> | ||
31 | #include <linux/vmalloc.h> | ||
32 | #include <linux/mm.h> | ||
33 | #include <linux/sysctl.h> | ||
34 | #include <linux/list.h> | ||
35 | #include <linux/file.h> | ||
36 | #include <linux/poll.h> | ||
37 | #include <linux/vfs.h> | ||
38 | #include <linux/pagemap.h> | ||
39 | #include <linux/mount.h> | ||
40 | #include <linux/version.h> | ||
41 | #include <linux/bitops.h> | ||
42 | |||
43 | #include <asm/errno.h> | ||
44 | #include <asm/intrinsics.h> | ||
45 | #include <asm/page.h> | ||
46 | #include <asm/perfmon.h> | ||
47 | #include <asm/processor.h> | ||
48 | #include <asm/signal.h> | ||
49 | #include <asm/system.h> | ||
50 | #include <asm/uaccess.h> | ||
51 | #include <asm/delay.h> | ||
52 | |||
53 | #ifdef CONFIG_PERFMON | ||
54 | /* | ||
55 | * perfmon context state | ||
56 | */ | ||
57 | #define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */ | ||
58 | #define PFM_CTX_LOADED 2 /* context is loaded onto a task */ | ||
59 | #define PFM_CTX_MASKED 3 /* context is loaded but monitoring is masked due to overflow */ | ||
60 | #define PFM_CTX_ZOMBIE 4 /* owner of the context is closing it */ | ||
61 | |||
62 | #define PFM_INVALID_ACTIVATION (~0UL) | ||
63 | |||
64 | /* | ||
65 | * depth of message queue | ||
66 | */ | ||
67 | #define PFM_MAX_MSGS 32 | ||
68 | #define PFM_CTXQ_EMPTY(g) ((g)->ctx_msgq_head == (g)->ctx_msgq_tail) | ||
69 | |||
70 | /* | ||
71 | * type of a PMU register (bitmask). | ||
72 | * bitmask structure: | ||
73 | * bit0 : register implemented | ||
74 | * bit1 : end marker | ||
75 | * bit2-3 : reserved | ||
76 | * bit4 : pmc has pmc.pm | ||
77 | * bit5 : pmc controls a counter (has pmc.oi), pmd is used as counter | ||
78 | * bit6-7 : register type | ||
79 | * bit8-31: reserved | ||
80 | */ | ||
81 | #define PFM_REG_NOTIMPL 0x0 /* not implemented at all */ | ||
82 | #define PFM_REG_IMPL 0x1 /* register implemented */ | ||
83 | #define PFM_REG_END 0x2 /* end marker */ | ||
84 | #define PFM_REG_MONITOR (0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */ | ||
85 | #define PFM_REG_COUNTING (0x2<<4|PFM_REG_MONITOR) /* a monitor + pmc.oi+ PMD used as a counter */ | ||
86 | #define PFM_REG_CONTROL (0x4<<4|PFM_REG_IMPL) /* PMU control register */ | ||
87 | #define PFM_REG_CONFIG (0x8<<4|PFM_REG_IMPL) /* configuration register */ | ||
88 | #define PFM_REG_BUFFER (0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */ | ||
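Note how the composite types build on one another: COUNTING implies MONITOR, which implies IMPL, so the PMC_IS_* tests below can use mask-equality. The composition, checked in a standalone sketch:

	/* Sketch: the register-type lattice above, verified by mask-equality. */
	#include <assert.h>

	int main(void)
	{
		unsigned impl     = 0x1;
		unsigned monitor  = (0x1 << 4) | impl;		/* 0x11 */
		unsigned counting = (0x2 << 4) | monitor;	/* 0x31 */

		assert((counting & monitor) == monitor);	/* a counter is a monitor */
		assert((counting & impl) == impl);		/* ...and is implemented */
		return 0;
	}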
89 | |||
90 | #define PMC_IS_LAST(i) (pmu_conf->pmc_desc[i].type & PFM_REG_END) | ||
91 | #define PMD_IS_LAST(i) (pmu_conf->pmd_desc[i].type & PFM_REG_END) | ||
92 | |||
93 | #define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY) | ||
94 | |||
95 | /* i assumed unsigned */ | ||
96 | #define PMC_IS_IMPL(i) (i< PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL)) | ||
97 | #define PMD_IS_IMPL(i) (i< PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL)) | ||
98 | |||
99 | /* XXX: these assume that register i is implemented */ | ||
100 | #define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) | ||
101 | #define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) | ||
102 | #define PMC_IS_MONITOR(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR) | ||
103 | #define PMC_IS_CONTROL(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL) | ||
104 | |||
105 | #define PMC_DFL_VAL(i) pmu_conf->pmc_desc[i].default_value | ||
106 | #define PMC_RSVD_MASK(i) pmu_conf->pmc_desc[i].reserved_mask | ||
107 | #define PMD_PMD_DEP(i) pmu_conf->pmd_desc[i].dep_pmd[0] | ||
108 | #define PMC_PMD_DEP(i) pmu_conf->pmc_desc[i].dep_pmd[0] | ||
109 | |||
110 | #define PFM_NUM_IBRS IA64_NUM_DBG_REGS | ||
111 | #define PFM_NUM_DBRS IA64_NUM_DBG_REGS | ||
112 | |||
113 | #define CTX_OVFL_NOBLOCK(c) ((c)->ctx_fl_block == 0) | ||
114 | #define CTX_HAS_SMPL(c) ((c)->ctx_fl_is_sampling) | ||
115 | #define PFM_CTX_TASK(h) (h)->ctx_task | ||
116 | |||
117 | #define PMU_PMC_OI 5 /* position of pmc.oi bit */ | ||
118 | |||
119 | /* XXX: does not support more than 64 PMDs */ | ||
120 | #define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask) | ||
121 | #define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL) | ||
122 | |||
123 | #define CTX_USED_MONITOR(ctx, mask) (ctx)->ctx_used_monitors[0] |= (mask) | ||
124 | |||
125 | #define CTX_USED_IBR(ctx,n) (ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64) | ||
126 | #define CTX_USED_DBR(ctx,n) (ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64) | ||
127 | #define CTX_USES_DBREGS(ctx) (((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1) | ||
128 | #define PFM_CODE_RR 0 /* requesting code range restriction */ | ||
129 | #define PFM_DATA_RR 1 /* requesting data range restriction */ | ||
130 | |||
131 | #define PFM_CPUINFO_CLEAR(v) pfm_get_cpu_var(pfm_syst_info) &= ~(v) | ||
132 | #define PFM_CPUINFO_SET(v) pfm_get_cpu_var(pfm_syst_info) |= (v) | ||
133 | #define PFM_CPUINFO_GET() pfm_get_cpu_var(pfm_syst_info) | ||
134 | |||
135 | #define RDEP(x) (1UL<<(x)) | ||
136 | |||
137 | /* | ||
138 | * context protection macros | ||
139 | * in SMP: | ||
140 | * - we need to protect against CPU concurrency (spin_lock) | ||
141 | * - we need to protect against PMU overflow interrupts (local_irq_disable) | ||
142 | * in UP: | ||
143 | * - we need to protect against PMU overflow interrupts (local_irq_disable) | ||
144 | * | ||
145 | * spin_lock_irqsave()/spin_unlock_irqrestore(): | ||
146 | * in SMP: local_irq_disable + spin_lock | ||
147 | * in UP : local_irq_disable | ||
148 | * | ||
149 | * spin_lock()/spin_unlock(): | ||
150 | * in UP : removed automatically | ||
151 | * in SMP: protect against context accesses from other CPUs. Interrupts | ||
152 | * are not masked. This is useful for the PMU interrupt handler | ||
153 | * because we know we will not get PMU concurrency in that code. | ||
154 | */ | ||
155 | #define PROTECT_CTX(c, f) \ | ||
156 | do { \ | ||
157 | DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \ | ||
158 | spin_lock_irqsave(&(c)->ctx_lock, f); \ | ||
159 | DPRINT(("spinlocked ctx %p by [%d]\n", c, current->pid)); \ | ||
160 | } while(0) | ||
161 | |||
162 | #define UNPROTECT_CTX(c, f) \ | ||
163 | do { \ | ||
164 | DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \ | ||
165 | spin_unlock_irqrestore(&(c)->ctx_lock, f); \ | ||
166 | } while(0) | ||
167 | |||
168 | #define PROTECT_CTX_NOPRINT(c, f) \ | ||
169 | do { \ | ||
170 | spin_lock_irqsave(&(c)->ctx_lock, f); \ | ||
171 | } while(0) | ||
172 | |||
173 | |||
174 | #define UNPROTECT_CTX_NOPRINT(c, f) \ | ||
175 | do { \ | ||
176 | spin_unlock_irqrestore(&(c)->ctx_lock, f); \ | ||
177 | } while(0) | ||
178 | |||
179 | |||
180 | #define PROTECT_CTX_NOIRQ(c) \ | ||
181 | do { \ | ||
182 | spin_lock(&(c)->ctx_lock); \ | ||
183 | } while(0) | ||
184 | |||
185 | #define UNPROTECT_CTX_NOIRQ(c) \ | ||
186 | do { \ | ||
187 | spin_unlock(&(c)->ctx_lock); \ | ||
188 | } while(0) | ||
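Put together, the usual pattern is: the irqsave flavor anywhere the overflow interrupt could sneak in, and the NOIRQ flavor inside the interrupt handler itself, where interrupts are already off. The caller side, sketched (example_touch_ctx is hypothetical):

	/* Sketch: canonical use of the context protection macros above. */
	static void
	example_touch_ctx(pfm_context_t *ctx)
	{
		unsigned long flags;

		PROTECT_CTX(ctx, flags);	/* irqs masked + ctx_lock held */
		/* ... touch state that the PMU overflow handler also reads ... */
		UNPROTECT_CTX(ctx, flags);
	}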
189 | |||
190 | |||
191 | #ifdef CONFIG_SMP | ||
192 | |||
193 | #define GET_ACTIVATION() pfm_get_cpu_var(pmu_activation_number) | ||
194 | #define INC_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)++ | ||
195 | #define SET_ACTIVATION(c) (c)->ctx_last_activation = GET_ACTIVATION() | ||
196 | |||
197 | #else /* !CONFIG_SMP */ | ||
198 | #define SET_ACTIVATION(t) do {} while(0) | ||
199 | #define GET_ACTIVATION(t) do {} while(0) | ||
200 | #define INC_ACTIVATION(t) do {} while(0) | ||
201 | #endif /* CONFIG_SMP */ | ||
202 | |||
203 | #define SET_PMU_OWNER(t, c) do { pfm_get_cpu_var(pmu_owner) = (t); pfm_get_cpu_var(pmu_ctx) = (c); } while(0) | ||
204 | #define GET_PMU_OWNER() pfm_get_cpu_var(pmu_owner) | ||
205 | #define GET_PMU_CTX() pfm_get_cpu_var(pmu_ctx) | ||
206 | |||
207 | #define LOCK_PFS(g) spin_lock_irqsave(&pfm_sessions.pfs_lock, g) | ||
208 | #define UNLOCK_PFS(g) spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g) | ||
209 | |||
210 | #define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0) | ||
211 | |||
212 | /* | ||
213 | * cmp0 must be the value of pmc0 | ||
214 | */ | ||
215 | #define PMC0_HAS_OVFL(cmp0) (cmp0 & ~0x1UL) | ||
216 | |||
217 | #define PFMFS_MAGIC 0xa0b4d889 | ||
218 | |||
219 | /* | ||
220 | * debugging | ||
221 | */ | ||
222 | #define PFM_DEBUGGING 1 | ||
223 | #ifdef PFM_DEBUGGING | ||
224 | #define DPRINT(a) \ | ||
225 | do { \ | ||
226 | if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ | ||
227 | } while (0) | ||
228 | |||
229 | #define DPRINT_ovfl(a) \ | ||
230 | do { \ | ||
231 | if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ | ||
232 | } while (0) | ||
233 | #endif | ||
234 | |||
235 | /* | ||
236 | * 64-bit software counter structure | ||
237 | * | ||
238 | * the next_reset_type is applied to the next call to pfm_reset_regs() | ||
239 | */ | ||
240 | typedef struct { | ||
241 | unsigned long val; /* virtual 64bit counter value */ | ||
242 | unsigned long lval; /* last reset value */ | ||
243 | unsigned long long_reset; /* reset value on sampling overflow */ | ||
244 | unsigned long short_reset; /* reset value on overflow */ | ||
245 | unsigned long reset_pmds[4]; /* which other pmds to reset when this counter overflows */ | ||
246 | unsigned long smpl_pmds[4]; /* which pmds are accessed when counter overflow */ | ||
247 | unsigned long seed; /* seed for random-number generator */ | ||
248 | unsigned long mask; /* mask for random-number generator */ | ||
249 | unsigned int flags; /* notify/do not notify */ | ||
250 | unsigned long eventid; /* overflow event identifier */ | ||
251 | } pfm_counter_t; | ||
252 | |||
253 | /* | ||
254 | * context flags | ||
255 | */ | ||
256 | typedef struct { | ||
257 | unsigned int block:1; /* when 1, task will block on user notifications */ | ||
258 | unsigned int system:1; /* do system wide monitoring */ | ||
259 | unsigned int using_dbreg:1; /* using range restrictions (debug registers) */ | ||
260 | unsigned int is_sampling:1; /* true if using a custom format */ | ||
261 | unsigned int excl_idle:1; /* exclude idle task in system wide session */ | ||
262 | unsigned int going_zombie:1; /* context is zombie (MASKED+blocking) */ | ||
263 | unsigned int trap_reason:2; /* reason for going into pfm_handle_work() */ | ||
264 | unsigned int no_msg:1; /* no message sent on overflow */ | ||
265 | unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */ | ||
266 | unsigned int reserved:22; | ||
267 | } pfm_context_flags_t; | ||
268 | |||
269 | #define PFM_TRAP_REASON_NONE 0x0 /* default value */ | ||
270 | #define PFM_TRAP_REASON_BLOCK 0x1 /* we need to block on overflow */ | ||
271 | #define PFM_TRAP_REASON_RESET 0x2 /* we need to reset PMDs */ | ||
272 | |||
273 | |||
274 | /* | ||
275 | * perfmon context: encapsulates all the state of a monitoring session | ||
276 | */ | ||
277 | |||
278 | typedef struct pfm_context { | ||
279 | spinlock_t ctx_lock; /* context protection */ | ||
280 | |||
281 | pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */ | ||
282 | unsigned int ctx_state; /* state: active/inactive (no bitfield) */ | ||
283 | |||
284 | struct task_struct *ctx_task; /* task to which context is attached */ | ||
285 | |||
286 | unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */ | ||
287 | |||
288 | struct semaphore ctx_restart_sem; /* use for blocking notification mode */ | ||
289 | |||
290 | unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */ | ||
291 | unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */ | ||
292 | unsigned long ctx_reload_pmds[4]; /* bitmask of force reload PMD on ctxsw in */ | ||
293 | |||
294 | unsigned long ctx_all_pmcs[4]; /* bitmask of all accessible PMCs */ | ||
295 | unsigned long ctx_reload_pmcs[4]; /* bitmask of force reload PMC on ctxsw in */ | ||
296 | unsigned long ctx_used_monitors[4]; /* bitmask of monitor PMC being used */ | ||
297 | |||
298 | unsigned long ctx_pmcs[IA64_NUM_PMC_REGS]; /* saved copies of PMC values */ | ||
299 | |||
300 | unsigned int ctx_used_ibrs[1]; /* bitmask of used IBR (speedup ctxsw in) */ | ||
301 | unsigned int ctx_used_dbrs[1]; /* bitmask of used DBR (speedup ctxsw in) */ | ||
302 | unsigned long ctx_dbrs[IA64_NUM_DBG_REGS]; /* DBR values (cache) when not loaded */ | ||
303 | unsigned long ctx_ibrs[IA64_NUM_DBG_REGS]; /* IBR values (cache) when not loaded */ | ||
304 | |||
305 | pfm_counter_t ctx_pmds[IA64_NUM_PMD_REGS]; /* software state for PMDS */ | ||
306 | |||
307 | u64 ctx_saved_psr_up; /* only contains psr.up value */ | ||
308 | |||
309 | unsigned long ctx_last_activation; /* context last activation number for last_cpu */ | ||
310 | unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */ | ||
311 | unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */ | ||
312 | |||
313 | int ctx_fd; /* file descriptor used by this context */ | ||
314 | pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */ | ||
315 | |||
316 | pfm_buffer_fmt_t *ctx_buf_fmt; /* buffer format callbacks */ | ||
317 | void *ctx_smpl_hdr; /* points to sampling buffer header kernel vaddr */ | ||
318 | unsigned long ctx_smpl_size; /* size of sampling buffer */ | ||
319 | void *ctx_smpl_vaddr; /* user level virtual address of smpl buffer */ | ||
320 | |||
321 | wait_queue_head_t ctx_msgq_wait; | ||
322 | pfm_msg_t ctx_msgq[PFM_MAX_MSGS]; | ||
323 | int ctx_msgq_head; | ||
324 | int ctx_msgq_tail; | ||
325 | struct fasync_struct *ctx_async_queue; | ||
326 | |||
327 | wait_queue_head_t ctx_zombieq; /* termination cleanup wait queue */ | ||
328 | } pfm_context_t; | ||
329 | |||
330 | /* | ||
331 | * magic number used to verify that structure is really | ||
332 | * a perfmon context | ||
333 | */ | ||
334 | #define PFM_IS_FILE(f) ((f)->f_op == &pfm_file_ops) | ||
335 | |||
336 | #define PFM_GET_CTX(t) ((pfm_context_t *)(t)->thread.pfm_context) | ||
337 | |||
338 | #ifdef CONFIG_SMP | ||
339 | #define SET_LAST_CPU(ctx, v) (ctx)->ctx_last_cpu = (v) | ||
340 | #define GET_LAST_CPU(ctx) (ctx)->ctx_last_cpu | ||
341 | #else | ||
342 | #define SET_LAST_CPU(ctx, v) do {} while(0) | ||
343 | #define GET_LAST_CPU(ctx) do {} while(0) | ||
344 | #endif | ||
345 | |||
346 | |||
347 | #define ctx_fl_block ctx_flags.block | ||
348 | #define ctx_fl_system ctx_flags.system | ||
349 | #define ctx_fl_using_dbreg ctx_flags.using_dbreg | ||
350 | #define ctx_fl_is_sampling ctx_flags.is_sampling | ||
351 | #define ctx_fl_excl_idle ctx_flags.excl_idle | ||
352 | #define ctx_fl_going_zombie ctx_flags.going_zombie | ||
353 | #define ctx_fl_trap_reason ctx_flags.trap_reason | ||
354 | #define ctx_fl_no_msg ctx_flags.no_msg | ||
355 | #define ctx_fl_can_restart ctx_flags.can_restart | ||
356 | |||
357 | #define PFM_SET_WORK_PENDING(t, v) do { (t)->thread.pfm_needs_checking = v; } while(0); | ||
358 | #define PFM_GET_WORK_PENDING(t) (t)->thread.pfm_needs_checking | ||
359 | |||
360 | /* | ||
361 | * global information about all sessions | ||
362 | * mostly used to synchronize between system wide and per-process | ||
363 | */ | ||
364 | typedef struct { | ||
365 | spinlock_t pfs_lock; /* lock the structure */ | ||
366 | |||
367 | unsigned int pfs_task_sessions; /* number of per task sessions */ | ||
368 | unsigned int pfs_sys_sessions; /* number of per system wide sessions */ | ||
369 | unsigned int pfs_sys_use_dbregs; /* incremented when a system wide session uses debug regs */ | ||
370 | unsigned int pfs_ptrace_use_dbregs; /* incremented when a process uses debug regs */ | ||
371 | struct task_struct *pfs_sys_session[NR_CPUS]; /* points to the task owning a system-wide session */ | ||
372 | } pfm_session_t; | ||
373 | |||
374 | /* | ||
375 | * information about a PMC or PMD. | ||
376 | * dep_pmd[]: a bitmask of dependent PMD registers | ||
377 | * dep_pmc[]: a bitmask of dependent PMC registers | ||
378 | */ | ||
379 | typedef int (*pfm_reg_check_t)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); | ||
380 | typedef struct { | ||
381 | unsigned int type; | ||
382 | int pm_pos; | ||
383 | unsigned long default_value; /* power-on default value */ | ||
384 | unsigned long reserved_mask; /* bitmask of reserved bits */ | ||
385 | pfm_reg_check_t read_check; | ||
386 | pfm_reg_check_t write_check; | ||
387 | unsigned long dep_pmd[4]; | ||
388 | unsigned long dep_pmc[4]; | ||
389 | } pfm_reg_desc_t; | ||
390 | |||
391 | /* assume cnum is a valid monitor */ | ||
392 | #define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1) | ||
393 | |||
394 | /* | ||
395 | * This structure is initialized at boot time and contains | ||
396 | * a description of the PMU main characteristics. | ||
397 | * | ||
398 | * If the probe function is defined, detection is based | ||
399 | * on its return value: | ||
400 | * - 0 means recognized PMU | ||
401 | * - anything else means not supported | ||
402 | * When the probe function is not defined, then the pmu_family field | ||
403 | * is used and it must match the host CPU family such that: | ||
404 | * - (cpu->family & config->pmu_family) != 0 | ||
405 | */ | ||
406 | typedef struct { | ||
407 | unsigned long ovfl_val; /* overflow value for counters */ | ||
408 | |||
409 | pfm_reg_desc_t *pmc_desc; /* detailed PMC register dependencies descriptions */ | ||
410 | pfm_reg_desc_t *pmd_desc; /* detailed PMD register dependencies descriptions */ | ||
411 | |||
412 | unsigned int num_pmcs; /* number of PMCS: computed at init time */ | ||
413 | unsigned int num_pmds; /* number of PMDS: computed at init time */ | ||
414 | unsigned long impl_pmcs[4]; /* bitmask of implemented PMCS */ | ||
415 | unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */ | ||
416 | |||
417 | char *pmu_name; /* PMU family name */ | ||
418 | unsigned int pmu_family; /* cpuid family pattern used to identify pmu */ | ||
419 | unsigned int flags; /* pmu specific flags */ | ||
420 | unsigned int num_ibrs; /* number of IBRS: computed at init time */ | ||
421 | unsigned int num_dbrs; /* number of DBRS: computed at init time */ | ||
422 | unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */ | ||
423 | int (*probe)(void); /* customized probe routine */ | ||
424 | unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */ | ||
425 | } pmu_config_t; | ||
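Applied to a NULL-terminated table such as pmu_confs[] (defined later in this file), the detection rule described above comes down to a loop like this (example_find_pmu is a sketch, not the routine perfmon actually uses):

	/* Sketch: PMU detection following the rules described above. */
	static pmu_config_t *
	example_find_pmu(pmu_config_t **confs, unsigned int family)
	{
		pmu_config_t **p;

		for (p = confs; *p; p++) {
			if ((*p)->probe) {
				if ((*p)->probe() == 0)
					return *p;	/* probe recognized the PMU */
			} else if ((*p)->pmu_family & family) {
				return *p;		/* family pattern matched */
			}
		}
		return NULL;				/* unsupported PMU */
	}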
426 | /* | ||
427 | * PMU specific flags | ||
428 | */ | ||
429 | #define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */ | ||
430 | |||
431 | /* | ||
432 | * debug register related type definitions | ||
433 | */ | ||
434 | typedef struct { | ||
435 | unsigned long ibr_mask:56; | ||
436 | unsigned long ibr_plm:4; | ||
437 | unsigned long ibr_ig:3; | ||
438 | unsigned long ibr_x:1; | ||
439 | } ibr_mask_reg_t; | ||
440 | |||
441 | typedef struct { | ||
442 | unsigned long dbr_mask:56; | ||
443 | unsigned long dbr_plm:4; | ||
444 | unsigned long dbr_ig:2; | ||
445 | unsigned long dbr_w:1; | ||
446 | unsigned long dbr_r:1; | ||
447 | } dbr_mask_reg_t; | ||
448 | |||
449 | typedef union { | ||
450 | unsigned long val; | ||
451 | ibr_mask_reg_t ibr; | ||
452 | dbr_mask_reg_t dbr; | ||
453 | } dbreg_t; | ||
454 | |||
455 | |||
456 | /* | ||
457 | * perfmon command descriptions | ||
458 | */ | ||
459 | typedef struct { | ||
460 | int (*cmd_func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); | ||
461 | char *cmd_name; | ||
462 | int cmd_flags; | ||
463 | unsigned int cmd_narg; | ||
464 | size_t cmd_argsize; | ||
465 | int (*cmd_getsize)(void *arg, size_t *sz); | ||
466 | } pfm_cmd_desc_t; | ||
467 | |||
468 | #define PFM_CMD_FD 0x01 /* command requires a file descriptor */ | ||
469 | #define PFM_CMD_ARG_READ 0x02 /* command must read argument(s) */ | ||
470 | #define PFM_CMD_ARG_RW 0x04 /* command must read/write argument(s) */ | ||
471 | #define PFM_CMD_STOP 0x08 /* command does not work on zombie context */ | ||
472 | |||
473 | |||
474 | #define PFM_CMD_NAME(cmd) pfm_cmd_tab[(cmd)].cmd_name | ||
475 | #define PFM_CMD_READ_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ) | ||
476 | #define PFM_CMD_RW_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW) | ||
477 | #define PFM_CMD_USE_FD(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD) | ||
478 | #define PFM_CMD_STOPPED(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP) | ||
479 | |||
480 | #define PFM_CMD_ARG_MANY -1 /* cannot be zero */ | ||
481 | |||
482 | typedef struct { | ||
483 | int debug; /* turn on/off debugging via syslog */ | ||
484 | int debug_ovfl; /* turn on/off debug printk in overflow handler */ | ||
485 | int fastctxsw; /* turn on/off fast (insecure) ctxsw */ | ||
486 | int expert_mode; /* turn on/off value checking */ | ||
487 | int debug_pfm_read; | ||
488 | } pfm_sysctl_t; | ||
489 | |||
490 | typedef struct { | ||
491 | unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */ | ||
492 | unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */ | ||
493 | unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */ | ||
494 | unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */ | ||
495 | unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */ | ||
496 | unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */ | ||
497 | unsigned long pfm_smpl_handler_calls; | ||
498 | unsigned long pfm_smpl_handler_cycles; | ||
499 | char pad[SMP_CACHE_BYTES] ____cacheline_aligned; | ||
500 | } pfm_stats_t; | ||
501 | |||
502 | /* | ||
503 | * perfmon internal variables | ||
504 | */ | ||
505 | static pfm_stats_t pfm_stats[NR_CPUS]; | ||
506 | static pfm_session_t pfm_sessions; /* global sessions information */ | ||
507 | |||
508 | static struct proc_dir_entry *perfmon_dir; | ||
509 | static pfm_uuid_t pfm_null_uuid = {0,}; | ||
510 | |||
511 | static spinlock_t pfm_buffer_fmt_lock; | ||
512 | static LIST_HEAD(pfm_buffer_fmt_list); | ||
513 | |||
514 | static pmu_config_t *pmu_conf; | ||
515 | |||
516 | /* sysctl() controls */ | ||
517 | static pfm_sysctl_t pfm_sysctl; | ||
518 | int pfm_debug_var; | ||
519 | |||
520 | static ctl_table pfm_ctl_table[]={ | ||
521 | {1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,}, | ||
522 | {2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,}, | ||
523 | {3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,}, | ||
524 | {4, "expert_mode", &pfm_sysctl.expert_mode, sizeof(int), 0600, NULL, &proc_dointvec, NULL,}, | ||
525 | { 0, }, | ||
526 | }; | ||
527 | static ctl_table pfm_sysctl_dir[] = { | ||
528 | {1, "perfmon", NULL, 0, 0755, pfm_ctl_table, }, | ||
529 | {0,}, | ||
530 | }; | ||
531 | static ctl_table pfm_sysctl_root[] = { | ||
532 | {1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, }, | ||
533 | {0,}, | ||
534 | }; | ||
535 | static struct ctl_table_header *pfm_sysctl_header; | ||
536 | |||
537 | static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); | ||
538 | static int pfm_flush(struct file *filp); | ||
539 | |||
540 | #define pfm_get_cpu_var(v) __ia64_per_cpu_var(v) | ||
541 | #define pfm_get_cpu_data(a,b) per_cpu(a, b) | ||
542 | |||
543 | static inline void | ||
544 | pfm_put_task(struct task_struct *task) | ||
545 | { | ||
546 | if (task != current) put_task_struct(task); | ||
547 | } | ||
548 | |||
549 | static inline void | ||
550 | pfm_set_task_notify(struct task_struct *task) | ||
551 | { | ||
552 | struct thread_info *info; | ||
553 | |||
554 | info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE); | ||
555 | set_bit(TIF_NOTIFY_RESUME, &info->flags); | ||
556 | } | ||
557 | |||
558 | static inline void | ||
559 | pfm_clear_task_notify(void) | ||
560 | { | ||
561 | clear_thread_flag(TIF_NOTIFY_RESUME); | ||
562 | } | ||
563 | |||
564 | static inline void | ||
565 | pfm_reserve_page(unsigned long a) | ||
566 | { | ||
567 | SetPageReserved(vmalloc_to_page((void *)a)); | ||
568 | } | ||
569 | static inline void | ||
570 | pfm_unreserve_page(unsigned long a) | ||
571 | { | ||
572 | ClearPageReserved(vmalloc_to_page((void*)a)); | ||
573 | } | ||
574 | |||
575 | static inline unsigned long | ||
576 | pfm_protect_ctx_ctxsw(pfm_context_t *x) | ||
577 | { | ||
578 | spin_lock(&(x)->ctx_lock); | ||
579 | return 0UL; | ||
580 | } | ||
581 | |||
582 | static inline void | ||
583 | pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f) | ||
584 | { | ||
585 | spin_unlock(&(x)->ctx_lock); | ||
586 | } | ||
587 | |||
588 | static inline unsigned int | ||
589 | pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct) | ||
590 | { | ||
591 | return do_munmap(mm, addr, len); | ||
592 | } | ||
593 | |||
594 | static inline unsigned long | ||
595 | pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec) | ||
596 | { | ||
597 | return get_unmapped_area(file, addr, len, pgoff, flags); | ||
598 | } | ||
599 | |||
600 | |||
601 | static struct super_block * | ||
602 | pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) | ||
603 | { | ||
604 | return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC); | ||
605 | } | ||
606 | |||
607 | static struct file_system_type pfm_fs_type = { | ||
608 | .name = "pfmfs", | ||
609 | .get_sb = pfmfs_get_sb, | ||
610 | .kill_sb = kill_anon_super, | ||
611 | }; | ||
612 | |||
613 | DEFINE_PER_CPU(unsigned long, pfm_syst_info); | ||
614 | DEFINE_PER_CPU(struct task_struct *, pmu_owner); | ||
615 | DEFINE_PER_CPU(pfm_context_t *, pmu_ctx); | ||
616 | DEFINE_PER_CPU(unsigned long, pmu_activation_number); | ||
617 | |||
618 | |||
619 | /* forward declaration */ | ||
620 | static struct file_operations pfm_file_ops; | ||
621 | |||
622 | /* | ||
623 | * forward declarations | ||
624 | */ | ||
625 | #ifndef CONFIG_SMP | ||
626 | static void pfm_lazy_save_regs (struct task_struct *ta); | ||
627 | #endif | ||
628 | |||
629 | void dump_pmu_state(const char *); | ||
630 | static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); | ||
631 | |||
632 | #include "perfmon_itanium.h" | ||
633 | #include "perfmon_mckinley.h" | ||
634 | #include "perfmon_generic.h" | ||
635 | |||
636 | static pmu_config_t *pmu_confs[] = { | ||
637 | &pmu_conf_mck, | ||
638 | &pmu_conf_ita, | ||
639 | &pmu_conf_gen, /* must be last */ | ||
640 | NULL | ||
641 | }; | ||
642 | |||
643 | |||
644 | static int pfm_end_notify_user(pfm_context_t *ctx); | ||
645 | |||
646 | static inline void | ||
647 | pfm_clear_psr_pp(void) | ||
648 | { | ||
649 | ia64_rsm(IA64_PSR_PP); | ||
650 | ia64_srlz_i(); | ||
651 | } | ||
652 | |||
653 | static inline void | ||
654 | pfm_set_psr_pp(void) | ||
655 | { | ||
656 | ia64_ssm(IA64_PSR_PP); | ||
657 | ia64_srlz_i(); | ||
658 | } | ||
659 | |||
660 | static inline void | ||
661 | pfm_clear_psr_up(void) | ||
662 | { | ||
663 | ia64_rsm(IA64_PSR_UP); | ||
664 | ia64_srlz_i(); | ||
665 | } | ||
666 | |||
667 | static inline void | ||
668 | pfm_set_psr_up(void) | ||
669 | { | ||
670 | ia64_ssm(IA64_PSR_UP); | ||
671 | ia64_srlz_i(); | ||
672 | } | ||
673 | |||
674 | static inline unsigned long | ||
675 | pfm_get_psr(void) | ||
676 | { | ||
677 | unsigned long tmp; | ||
678 | tmp = ia64_getreg(_IA64_REG_PSR); | ||
679 | ia64_srlz_i(); | ||
680 | return tmp; | ||
681 | } | ||
682 | |||
683 | static inline void | ||
684 | pfm_set_psr_l(unsigned long val) | ||
685 | { | ||
686 | ia64_setreg(_IA64_REG_PSR_L, val); | ||
687 | ia64_srlz_i(); | ||
688 | } | ||
689 | |||
690 | static inline void | ||
691 | pfm_freeze_pmu(void) | ||
692 | { | ||
693 | ia64_set_pmc(0,1UL); | ||
694 | ia64_srlz_d(); | ||
695 | } | ||
696 | |||
697 | static inline void | ||
698 | pfm_unfreeze_pmu(void) | ||
699 | { | ||
700 | ia64_set_pmc(0,0UL); | ||
701 | ia64_srlz_d(); | ||
702 | } | ||
703 | |||
704 | static inline void | ||
705 | pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs) | ||
706 | { | ||
707 | int i; | ||
708 | |||
709 | for (i=0; i < nibrs; i++) { | ||
710 | ia64_set_ibr(i, ibrs[i]); | ||
711 | ia64_dv_serialize_instruction(); | ||
712 | } | ||
713 | ia64_srlz_i(); | ||
714 | } | ||
715 | |||
716 | static inline void | ||
717 | pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs) | ||
718 | { | ||
719 | int i; | ||
720 | |||
721 | for (i=0; i < ndbrs; i++) { | ||
722 | ia64_set_dbr(i, dbrs[i]); | ||
723 | ia64_dv_serialize_data(); | ||
724 | } | ||
725 | ia64_srlz_d(); | ||
726 | } | ||
727 | |||
728 | /* | ||
729 | * PMD[i] must be a counter; no check is made. | ||
730 | */ | ||
731 | static inline unsigned long | ||
732 | pfm_read_soft_counter(pfm_context_t *ctx, int i) | ||
733 | { | ||
734 | return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val); | ||
735 | } | ||
736 | |||
737 | /* | ||
738 | * PMD[i] must be a counter; no check is made. | ||
739 | */ | ||
740 | static inline void | ||
741 | pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val) | ||
742 | { | ||
743 | unsigned long ovfl_val = pmu_conf->ovfl_val; | ||
744 | |||
745 | ctx->ctx_pmds[i].val = val & ~ovfl_val; | ||
746 | /* | ||
747 | * writes to the unimplemented part are ignored, so we do not need | ||
748 | * to mask off the top part | ||
749 | */ | ||
750 | ia64_set_pmd(i, val & ovfl_val); | ||
751 | } | ||
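/*
 * Worked example of the split above, assuming a hypothetical 47-bit
 * implemented counter width, i.e. ovfl_val == (1UL << 47) - 1:
 * writing val = 0x800000000123UL yields
 *
 *	ctx_pmds[i].val	= val & ~ovfl_val = 0x800000000000 (software part)
 *	PMD[i]		= val &  ovfl_val = 0x123          (hardware part)
 *
 * pfm_read_soft_counter() reassembles the full 64-bit value by
 * adding the two parts back together.
 */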
752 | |||
753 | static pfm_msg_t * | ||
754 | pfm_get_new_msg(pfm_context_t *ctx) | ||
755 | { | ||
756 | int idx, next; | ||
757 | |||
758 | next = (ctx->ctx_msgq_tail+1) % PFM_MAX_MSGS; | ||
759 | |||
760 | DPRINT(("ctx_fd=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); | ||
761 | if (next == ctx->ctx_msgq_head) return NULL; | ||
762 | |||
763 | idx = ctx->ctx_msgq_tail; | ||
764 | ctx->ctx_msgq_tail = next; | ||
765 | |||
766 | DPRINT(("ctx=%p head=%d tail=%d msg=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, idx)); | ||
767 | |||
768 | return ctx->ctx_msgq+idx; | ||
769 | } | ||
770 | |||
771 | static pfm_msg_t * | ||
772 | pfm_get_next_msg(pfm_context_t *ctx) | ||
773 | { | ||
774 | pfm_msg_t *msg; | ||
775 | |||
776 | DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); | ||
777 | |||
778 | if (PFM_CTXQ_EMPTY(ctx)) return NULL; | ||
779 | |||
780 | /* | ||
781 | * get oldest message | ||
782 | */ | ||
783 | msg = ctx->ctx_msgq+ctx->ctx_msgq_head; | ||
784 | |||
785 | /* | ||
786 | * and move forward | ||
787 | */ | ||
788 | ctx->ctx_msgq_head = (ctx->ctx_msgq_head+1) % PFM_MAX_MSGS; | ||
789 | |||
790 | DPRINT(("ctx=%p head=%d tail=%d type=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, msg->pfm_gen_msg.msg_type)); | ||
791 | |||
792 | return msg; | ||
793 | } | ||
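/*
 * The two helpers above implement a fixed-size ring over
 * ctx_msgq[PFM_MAX_MSGS]: the queue counts as full when advancing
 * the tail would land on the head, so one slot is always sacrificed
 * and at most PFM_MAX_MSGS-1 messages can be pending. Overflow
 * notifications produce via pfm_get_new_msg(); pfm_read() consumes
 * via pfm_get_next_msg().
 */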
794 | |||
795 | static void | ||
796 | pfm_reset_msgq(pfm_context_t *ctx) | ||
797 | { | ||
798 | ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; | ||
799 | DPRINT(("ctx=%p msgq reset\n", ctx)); | ||
800 | } | ||
801 | |||
802 | static void * | ||
803 | pfm_rvmalloc(unsigned long size) | ||
804 | { | ||
805 | void *mem; | ||
806 | unsigned long addr; | ||
807 | |||
808 | size = PAGE_ALIGN(size); | ||
809 | mem = vmalloc(size); | ||
810 | if (mem) { | ||
811 | //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem); | ||
812 | memset(mem, 0, size); | ||
813 | addr = (unsigned long)mem; | ||
814 | while (size > 0) { | ||
815 | pfm_reserve_page(addr); | ||
816 | addr+=PAGE_SIZE; | ||
817 | size-=PAGE_SIZE; | ||
818 | } | ||
819 | } | ||
820 | return mem; | ||
821 | } | ||
822 | |||
823 | static void | ||
824 | pfm_rvfree(void *mem, unsigned long size) | ||
825 | { | ||
826 | unsigned long addr; | ||
827 | |||
828 | if (mem) { | ||
829 | DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size)); | ||
830 | addr = (unsigned long) mem; | ||
831 | while ((long) size > 0) { | ||
832 | pfm_unreserve_page(addr); | ||
833 | addr+=PAGE_SIZE; | ||
834 | size-=PAGE_SIZE; | ||
835 | } | ||
836 | vfree(mem); | ||
837 | } | ||
838 | return; | ||
839 | } | ||
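/*
 * Why the reserve/unreserve dance: the sampling buffer is vmalloc'ed
 * kernel memory that later gets mapped into user space page by page
 * (see pfm_remap_buffer()). Marking each page PG_reserved keeps the
 * VM from treating those pages as ordinary pageable memory while
 * user space has them mapped; pfm_rvfree() must clear the bit again
 * before vfree() releases the area.
 */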
840 | |||
841 | static pfm_context_t * | ||
842 | pfm_context_alloc(void) | ||
843 | { | ||
844 | pfm_context_t *ctx; | ||
845 | |||
846 | /* | ||
847 | * allocate context descriptor | ||
848 | * must be able to free with interrupts disabled | ||
849 | */ | ||
850 | ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL); | ||
851 | if (ctx) { | ||
852 | memset(ctx, 0, sizeof(pfm_context_t)); | ||
853 | DPRINT(("alloc ctx @%p\n", ctx)); | ||
854 | } | ||
855 | return ctx; | ||
856 | } | ||
857 | |||
858 | static void | ||
859 | pfm_context_free(pfm_context_t *ctx) | ||
860 | { | ||
861 | if (ctx) { | ||
862 | DPRINT(("free ctx @%p\n", ctx)); | ||
863 | kfree(ctx); | ||
864 | } | ||
865 | } | ||
866 | |||
867 | static void | ||
868 | pfm_mask_monitoring(struct task_struct *task) | ||
869 | { | ||
870 | pfm_context_t *ctx = PFM_GET_CTX(task); | ||
871 | struct thread_struct *th = &task->thread; | ||
872 | unsigned long mask, val, ovfl_mask; | ||
873 | int i; | ||
874 | |||
875 | DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid)); | ||
876 | |||
877 | ovfl_mask = pmu_conf->ovfl_val; | ||
878 | /* | ||
879 | * monitoring can only be masked as a result of a valid | ||
880 | * counter overflow. In UP, it means that the PMU still | ||
881 | * has an owner. Note that the owner can be different | ||
882 | * from the current task. However the PMU state belongs | ||
883 | * to the owner. | ||
884 | * In SMP, a valid overflow only happens when task is | ||
885 | * current. Therefore if we come here, we know that | ||
886 | * the PMU state belongs to the current task, therefore | ||
887 | * we can access the live registers. | ||
888 | * | ||
889 | * So in both cases, the live register contains the owner's | ||
890 | * state. We can ONLY touch the PMU registers and NOT the PSR. | ||
891 | * | ||
892 | * As a consequence to this call, the thread->pmds[] array | ||
893 | * contains stale information which must be ignored | ||
894 | * when context is reloaded AND monitoring is active (see | ||
895 | * pfm_restart). | ||
896 | */ | ||
897 | mask = ctx->ctx_used_pmds[0]; | ||
898 | for (i = 0; mask; i++, mask>>=1) { | ||
899 | /* skip unused pmds */ | ||
900 | if ((mask & 0x1) == 0) continue; | ||
901 | val = ia64_get_pmd(i); | ||
902 | |||
903 | if (PMD_IS_COUNTING(i)) { | ||
904 | /* | ||
905 | * we rebuild the full 64 bit value of the counter | ||
906 | */ | ||
907 | ctx->ctx_pmds[i].val += (val & ovfl_mask); | ||
908 | } else { | ||
909 | ctx->ctx_pmds[i].val = val; | ||
910 | } | ||
911 | DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", | ||
912 | i, | ||
913 | ctx->ctx_pmds[i].val, | ||
914 | val & ovfl_mask)); | ||
915 | } | ||
916 | /* | ||
917 | * mask monitoring by setting the privilege level to 0 | ||
918 | * we cannot use psr.pp/psr.up for this, it is controlled by | ||
919 | * the user | ||
920 | * | ||
921 | * if task is current, modify actual registers, otherwise modify | ||
922 | * thread save state, i.e., what will be restored in pfm_load_regs() | ||
923 | */ | ||
924 | mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; | ||
925 | for (i = PMU_FIRST_COUNTER; mask; i++, mask>>=1) { | ||
926 | if ((mask & 0x1) == 0UL) continue; | ||
927 | ia64_set_pmc(i, th->pmcs[i] & ~0xfUL); | ||
928 | th->pmcs[i] &= ~0xfUL; | ||
929 | DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i])); | ||
930 | } | ||
931 | /* | ||
932 | * make all of this visible | ||
933 | */ | ||
934 | ia64_srlz_d(); | ||
935 | } | ||
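/*
 * About "& ~0xfUL" above: bits 0-3 of a monitor PMC form the
 * privilege-level mask (pmc.plm). A cleared plm matches no privilege
 * level at all, so the counter simply stops counting - monitoring is
 * masked without touching psr.up/psr.pp, which belong to the user as
 * the comment above explains.
 */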
936 | |||
937 | /* | ||
938 | * must always be done with task == current | ||
939 | * | ||
940 | * context must be in MASKED state when calling | ||
941 | */ | ||
942 | static void | ||
943 | pfm_restore_monitoring(struct task_struct *task) | ||
944 | { | ||
945 | pfm_context_t *ctx = PFM_GET_CTX(task); | ||
946 | struct thread_struct *th = &task->thread; | ||
947 | unsigned long mask, ovfl_mask; | ||
948 | unsigned long psr, val; | ||
949 | int i, is_system; | ||
950 | |||
951 | is_system = ctx->ctx_fl_system; | ||
952 | ovfl_mask = pmu_conf->ovfl_val; | ||
953 | |||
954 | if (task != current) { | ||
955 | printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid); | ||
956 | return; | ||
957 | } | ||
958 | if (ctx->ctx_state != PFM_CTX_MASKED) { | ||
959 | printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__, | ||
960 | task->pid, current->pid, ctx->ctx_state); | ||
961 | return; | ||
962 | } | ||
963 | psr = pfm_get_psr(); | ||
964 | /* | ||
965 | * monitoring is masked via the PMC. | ||
966 | * As we restore their value, we do not want each counter to | ||
967 | * restart right away. We stop monitoring using the PSR, | ||
968 | * restore the PMC (and PMD) and then re-establish the psr | ||
969 | * as it was. Note that there can be no pending overflow at | ||
970 | * this point, because monitoring was MASKED. | ||
971 | * | ||
972 | * system-wide sessions are pinned and self-monitoring | ||
973 | */ | ||
974 | if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) { | ||
975 | /* disable dcr pp */ | ||
976 | ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP); | ||
977 | pfm_clear_psr_pp(); | ||
978 | } else { | ||
979 | pfm_clear_psr_up(); | ||
980 | } | ||
981 | /* | ||
982 | * first, we restore the PMD | ||
983 | */ | ||
984 | mask = ctx->ctx_used_pmds[0]; | ||
985 | for (i = 0; mask; i++, mask>>=1) { | ||
986 | /* skip unused pmds */ | ||
987 | if ((mask & 0x1) == 0) continue; | ||
988 | |||
989 | if (PMD_IS_COUNTING(i)) { | ||
990 | /* | ||
991 | * we split the 64bit value according to | ||
992 | * counter width | ||
993 | */ | ||
994 | val = ctx->ctx_pmds[i].val & ovfl_mask; | ||
995 | ctx->ctx_pmds[i].val &= ~ovfl_mask; | ||
996 | } else { | ||
997 | val = ctx->ctx_pmds[i].val; | ||
998 | } | ||
999 | ia64_set_pmd(i, val); | ||
1000 | |||
1001 | DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", | ||
1002 | i, | ||
1003 | ctx->ctx_pmds[i].val, | ||
1004 | val)); | ||
1005 | } | ||
1006 | /* | ||
1007 | * restore the PMCs | ||
1008 | */ | ||
1009 | mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; | ||
1010 | for (i = PMU_FIRST_COUNTER; mask; i++, mask>>=1) { | ||
1011 | if ((mask & 0x1) == 0UL) continue; | ||
1012 | th->pmcs[i] = ctx->ctx_pmcs[i]; | ||
1013 | ia64_set_pmc(i, th->pmcs[i]); | ||
1014 | DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, th->pmcs[i])); | ||
1015 | } | ||
1016 | ia64_srlz_d(); | ||
1017 | |||
1018 | /* | ||
1019 | * must restore DBR/IBR because could be modified while masked | ||
1020 | * XXX: need to optimize | ||
1021 | */ | ||
1022 | if (ctx->ctx_fl_using_dbreg) { | ||
1023 | pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); | ||
1024 | pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); | ||
1025 | } | ||
1026 | |||
1027 | /* | ||
1028 | * now restore PSR | ||
1029 | */ | ||
1030 | if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) { | ||
1031 | /* enable dcr pp */ | ||
1032 | ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP); | ||
1033 | ia64_srlz_i(); | ||
1034 | } | ||
1035 | pfm_set_psr_l(psr); | ||
1036 | } | ||
1037 | |||
1038 | static inline void | ||
1039 | pfm_save_pmds(unsigned long *pmds, unsigned long mask) | ||
1040 | { | ||
1041 | int i; | ||
1042 | |||
1043 | ia64_srlz_d(); | ||
1044 | |||
1045 | for (i=0; mask; i++, mask>>=1) { | ||
1046 | if (mask & 0x1) pmds[i] = ia64_get_pmd(i); | ||
1047 | } | ||
1048 | } | ||
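/*
 * The "for (i=0; mask; i++, mask>>=1)" idiom above (also used by the
 * restore/copy helpers below) walks a register bitmask: bit i set
 * means register i is live. The loop stops as soon as the remaining
 * mask is zero, so it costs only as many iterations as the highest
 * set bit.
 */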
1049 | |||
1050 | /* | ||
1051 | * reload from thread state (used for ctxsw only) | ||
1052 | */ | ||
1053 | static inline void | ||
1054 | pfm_restore_pmds(unsigned long *pmds, unsigned long mask) | ||
1055 | { | ||
1056 | int i; | ||
1057 | unsigned long val, ovfl_val = pmu_conf->ovfl_val; | ||
1058 | |||
1059 | for (i=0; mask; i++, mask>>=1) { | ||
1060 | if ((mask & 0x1) == 0) continue; | ||
1061 | val = PMD_IS_COUNTING(i) ? pmds[i] & ovfl_val : pmds[i]; | ||
1062 | ia64_set_pmd(i, val); | ||
1063 | } | ||
1064 | ia64_srlz_d(); | ||
1065 | } | ||
1066 | |||
1067 | /* | ||
1068 | * propagate PMD from context to thread-state | ||
1069 | */ | ||
1070 | static inline void | ||
1071 | pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx) | ||
1072 | { | ||
1073 | struct thread_struct *thread = &task->thread; | ||
1074 | unsigned long ovfl_val = pmu_conf->ovfl_val; | ||
1075 | unsigned long mask = ctx->ctx_all_pmds[0]; | ||
1076 | unsigned long val; | ||
1077 | int i; | ||
1078 | |||
1079 | DPRINT(("mask=0x%lx\n", mask)); | ||
1080 | |||
1081 | for (i=0; mask; i++, mask>>=1) { | ||
1082 | |||
1083 | val = ctx->ctx_pmds[i].val; | ||
1084 | |||
1085 | /* | ||
1086 | * We break up the 64 bit value into 2 pieces | ||
1087 | * the lower bits go to the machine state in the | ||
1088 | * thread (will be reloaded on ctxsw in). | ||
1089 | * The upper part stays in the soft-counter. | ||
1090 | */ | ||
1091 | if (PMD_IS_COUNTING(i)) { | ||
1092 | ctx->ctx_pmds[i].val = val & ~ovfl_val; | ||
1093 | val &= ovfl_val; | ||
1094 | } | ||
1095 | thread->pmds[i] = val; | ||
1096 | |||
1097 | DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n", | ||
1098 | i, | ||
1099 | thread->pmds[i], | ||
1100 | ctx->ctx_pmds[i].val)); | ||
1101 | } | ||
1102 | } | ||
1103 | |||
1104 | /* | ||
1105 | * propagate PMC from context to thread-state | ||
1106 | */ | ||
1107 | static inline void | ||
1108 | pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx) | ||
1109 | { | ||
1110 | struct thread_struct *thread = &task->thread; | ||
1111 | unsigned long mask = ctx->ctx_all_pmcs[0]; | ||
1112 | int i; | ||
1113 | |||
1114 | DPRINT(("mask=0x%lx\n", mask)); | ||
1115 | |||
1116 | for (i=0; mask; i++, mask>>=1) { | ||
1117 | /* masking 0 with ovfl_val yields 0 */ | ||
1118 | thread->pmcs[i] = ctx->ctx_pmcs[i]; | ||
1119 | DPRINT(("pmc[%d]=0x%lx\n", i, thread->pmcs[i])); | ||
1120 | } | ||
1121 | } | ||
1122 | |||
1123 | |||
1124 | |||
1125 | static inline void | ||
1126 | pfm_restore_pmcs(unsigned long *pmcs, unsigned long mask) | ||
1127 | { | ||
1128 | int i; | ||
1129 | |||
1130 | for (i=0; mask; i++, mask>>=1) { | ||
1131 | if ((mask & 0x1) == 0) continue; | ||
1132 | ia64_set_pmc(i, pmcs[i]); | ||
1133 | } | ||
1134 | ia64_srlz_d(); | ||
1135 | } | ||
1136 | |||
1137 | static inline int | ||
1138 | pfm_uuid_cmp(pfm_uuid_t a, pfm_uuid_t b) | ||
1139 | { | ||
1140 | return memcmp(a, b, sizeof(pfm_uuid_t)); | ||
1141 | } | ||
1142 | |||
1143 | static inline int | ||
1144 | pfm_buf_fmt_exit(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, struct pt_regs *regs) | ||
1145 | { | ||
1146 | int ret = 0; | ||
1147 | if (fmt->fmt_exit) ret = (*fmt->fmt_exit)(task, buf, regs); | ||
1148 | return ret; | ||
1149 | } | ||
1150 | |||
1151 | static inline int | ||
1152 | pfm_buf_fmt_getsize(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size) | ||
1153 | { | ||
1154 | int ret = 0; | ||
1155 | if (fmt->fmt_getsize) ret = (*fmt->fmt_getsize)(task, flags, cpu, arg, size); | ||
1156 | return ret; | ||
1157 | } | ||
1158 | |||
1159 | |||
1160 | static inline int | ||
1161 | pfm_buf_fmt_validate(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, | ||
1162 | int cpu, void *arg) | ||
1163 | { | ||
1164 | int ret = 0; | ||
1165 | if (fmt->fmt_validate) ret = (*fmt->fmt_validate)(task, flags, cpu, arg); | ||
1166 | return ret; | ||
1167 | } | ||
1168 | |||
1169 | static inline int | ||
1170 | pfm_buf_fmt_init(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, unsigned int flags, | ||
1171 | int cpu, void *arg) | ||
1172 | { | ||
1173 | int ret = 0; | ||
1174 | if (fmt->fmt_init) ret = (*fmt->fmt_init)(task, buf, flags, cpu, arg); | ||
1175 | return ret; | ||
1176 | } | ||
1177 | |||
1178 | static inline int | ||
1179 | pfm_buf_fmt_restart(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) | ||
1180 | { | ||
1181 | int ret = 0; | ||
1182 | if (fmt->fmt_restart) ret = (*fmt->fmt_restart)(task, ctrl, buf, regs); | ||
1183 | return ret; | ||
1184 | } | ||
1185 | |||
1186 | static inline int | ||
1187 | pfm_buf_fmt_restart_active(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) | ||
1188 | { | ||
1189 | int ret = 0; | ||
1190 | if (fmt->fmt_restart_active) ret = (*fmt->fmt_restart_active)(task, ctrl, buf, regs); | ||
1191 | return ret; | ||
1192 | } | ||
1193 | |||
1194 | static pfm_buffer_fmt_t * | ||
1195 | __pfm_find_buffer_fmt(pfm_uuid_t uuid) | ||
1196 | { | ||
1197 | struct list_head * pos; | ||
1198 | pfm_buffer_fmt_t * entry; | ||
1199 | |||
1200 | list_for_each(pos, &pfm_buffer_fmt_list) { | ||
1201 | entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); | ||
1202 | if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0) | ||
1203 | return entry; | ||
1204 | } | ||
1205 | return NULL; | ||
1206 | } | ||
1207 | |||
1208 | /* | ||
1209 | * find a buffer format based on its uuid | ||
1210 | */ | ||
1211 | static pfm_buffer_fmt_t * | ||
1212 | pfm_find_buffer_fmt(pfm_uuid_t uuid) | ||
1213 | { | ||
1214 | pfm_buffer_fmt_t * fmt; | ||
1215 | spin_lock(&pfm_buffer_fmt_lock); | ||
1216 | fmt = __pfm_find_buffer_fmt(uuid); | ||
1217 | spin_unlock(&pfm_buffer_fmt_lock); | ||
1218 | return fmt; | ||
1219 | } | ||
1220 | |||
1221 | int | ||
1222 | pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt) | ||
1223 | { | ||
1224 | int ret = 0; | ||
1225 | |||
1226 | /* some sanity checks */ | ||
1227 | if (fmt == NULL || fmt->fmt_name == NULL) return -EINVAL; | ||
1228 | |||
1229 | /* we need at least a handler */ | ||
1230 | if (fmt->fmt_handler == NULL) return -EINVAL; | ||
1231 | |||
1232 | /* | ||
1233 | * XXX: need check validity of fmt_arg_size | ||
1234 | */ | ||
1235 | |||
1236 | spin_lock(&pfm_buffer_fmt_lock); | ||
1237 | |||
1238 | if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) { | ||
1239 | printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name); | ||
1240 | ret = -EBUSY; | ||
1241 | goto out; | ||
1242 | } | ||
1243 | list_add(&fmt->fmt_list, &pfm_buffer_fmt_list); | ||
1244 | printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name); | ||
1245 | |||
1246 | out: | ||
1247 | spin_unlock(&pfm_buffer_fmt_lock); | ||
1248 | return ret; | ||
1249 | } | ||
1250 | EXPORT_SYMBOL(pfm_register_buffer_fmt); | ||
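/*
 * Usage sketch for a sampling-format module (names below are
 * illustrative, not part of this file):
 *
 *	static pfm_buffer_fmt_t my_fmt = {
 *		.fmt_name    = "my-sampling-format",
 *		.fmt_uuid    = MY_FMT_UUID,
 *		.fmt_handler = my_ovfl_handler,
 *	};
 *	ret = pfm_register_buffer_fmt(&my_fmt);
 *
 * fmt_name and fmt_handler are mandatory (see the checks above);
 * registration fails with -EBUSY when another format with the same
 * UUID is already on pfm_buffer_fmt_list. perfmon_default_smpl.c is
 * the in-tree user of this interface.
 */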
1251 | |||
1252 | int | ||
1253 | pfm_unregister_buffer_fmt(pfm_uuid_t uuid) | ||
1254 | { | ||
1255 | pfm_buffer_fmt_t *fmt; | ||
1256 | int ret = 0; | ||
1257 | |||
1258 | spin_lock(&pfm_buffer_fmt_lock); | ||
1259 | |||
1260 | fmt = __pfm_find_buffer_fmt(uuid); | ||
1261 | if (!fmt) { | ||
1262 | printk(KERN_ERR "perfmon: cannot unregister format, not found\n"); | ||
1263 | ret = -EINVAL; | ||
1264 | goto out; | ||
1265 | } | ||
1266 | list_del_init(&fmt->fmt_list); | ||
1267 | printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name); | ||
1268 | |||
1269 | out: | ||
1270 | spin_unlock(&pfm_buffer_fmt_lock); | ||
1271 | return ret; | ||
1272 | |||
1273 | } | ||
1274 | EXPORT_SYMBOL(pfm_unregister_buffer_fmt); | ||
1275 | |||
1276 | static int | ||
1277 | pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) | ||
1278 | { | ||
1279 | unsigned long flags; | ||
1280 | /* | ||
1281 | * validity checks on cpu_mask have been done upstream | ||
1282 | */ | ||
1283 | LOCK_PFS(flags); | ||
1284 | |||
1285 | DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", | ||
1286 | pfm_sessions.pfs_sys_sessions, | ||
1287 | pfm_sessions.pfs_task_sessions, | ||
1288 | pfm_sessions.pfs_sys_use_dbregs, | ||
1289 | is_syswide, | ||
1290 | cpu)); | ||
1291 | |||
1292 | if (is_syswide) { | ||
1293 | /* | ||
1294 | * cannot mix system wide and per-task sessions | ||
1295 | */ | ||
1296 | if (pfm_sessions.pfs_task_sessions > 0UL) { | ||
1297 | DPRINT(("system wide not possible, %u conflicting task_sessions\n", | ||
1298 | pfm_sessions.pfs_task_sessions)); | ||
1299 | goto abort; | ||
1300 | } | ||
1301 | |||
1302 | if (pfm_sessions.pfs_sys_session[cpu]) goto error_conflict; | ||
1303 | |||
1304 | DPRINT(("reserving system wide session on CPU%u currently on CPU%u\n", cpu, smp_processor_id())); | ||
1305 | |||
1306 | pfm_sessions.pfs_sys_session[cpu] = task; | ||
1307 | |||
1308 | pfm_sessions.pfs_sys_sessions++ ; | ||
1309 | |||
1310 | } else { | ||
1311 | if (pfm_sessions.pfs_sys_sessions) goto abort; | ||
1312 | pfm_sessions.pfs_task_sessions++; | ||
1313 | } | ||
1314 | |||
1315 | DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", | ||
1316 | pfm_sessions.pfs_sys_sessions, | ||
1317 | pfm_sessions.pfs_task_sessions, | ||
1318 | pfm_sessions.pfs_sys_use_dbregs, | ||
1319 | is_syswide, | ||
1320 | cpu)); | ||
1321 | |||
1322 | UNLOCK_PFS(flags); | ||
1323 | |||
1324 | return 0; | ||
1325 | |||
1326 | error_conflict: | ||
1327 | DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n", | ||
1328 | pfm_sessions.pfs_sys_session[cpu]->pid, | ||
1329 | smp_processor_id())); | ||
1330 | abort: | ||
1331 | UNLOCK_PFS(flags); | ||
1332 | |||
1333 | return -EBUSY; | ||
1334 | |||
1335 | } | ||
1336 | |||
1337 | static int | ||
1338 | pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu) | ||
1339 | { | ||
1340 | unsigned long flags; | ||
1341 | /* | ||
1342 | * validity checks on cpu_mask have been done upstream | ||
1343 | */ | ||
1344 | LOCK_PFS(flags); | ||
1345 | |||
1346 | DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", | ||
1347 | pfm_sessions.pfs_sys_sessions, | ||
1348 | pfm_sessions.pfs_task_sessions, | ||
1349 | pfm_sessions.pfs_sys_use_dbregs, | ||
1350 | is_syswide, | ||
1351 | cpu)); | ||
1352 | |||
1353 | |||
1354 | if (is_syswide) { | ||
1355 | pfm_sessions.pfs_sys_session[cpu] = NULL; | ||
1356 | /* | ||
1357 | * would not work with perfmon and more than one bit set in cpu_mask | ||
1358 | */ | ||
1359 | if (ctx && ctx->ctx_fl_using_dbreg) { | ||
1360 | if (pfm_sessions.pfs_sys_use_dbregs == 0) { | ||
1361 | printk(KERN_ERR "perfmon: invalid release for ctx %p sys_use_dbregs=0\n", ctx); | ||
1362 | } else { | ||
1363 | pfm_sessions.pfs_sys_use_dbregs--; | ||
1364 | } | ||
1365 | } | ||
1366 | pfm_sessions.pfs_sys_sessions--; | ||
1367 | } else { | ||
1368 | pfm_sessions.pfs_task_sessions--; | ||
1369 | } | ||
1370 | DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", | ||
1371 | pfm_sessions.pfs_sys_sessions, | ||
1372 | pfm_sessions.pfs_task_sessions, | ||
1373 | pfm_sessions.pfs_sys_use_dbregs, | ||
1374 | is_syswide, | ||
1375 | cpu)); | ||
1376 | |||
1377 | UNLOCK_PFS(flags); | ||
1378 | |||
1379 | return 0; | ||
1380 | } | ||
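/*
 * Session accounting invariants enforced by the reserve/unreserve
 * pair above: system-wide and per-task sessions are mutually
 * exclusive, and at most one system-wide session may claim a given
 * CPU (pfs_sys_session[cpu]). All counters are updated under the
 * pfm_sessions lock taken by LOCK_PFS()/UNLOCK_PFS().
 */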
1381 | |||
1382 | /* | ||
1383 | * removes virtual mapping of the sampling buffer. | ||
1384 | * IMPORTANT: cannot be called with interrupts disabled, e.g. inside | ||
1385 | * a PROTECT_CTX() section. | ||
1386 | */ | ||
1387 | static int | ||
1388 | pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long size) | ||
1389 | { | ||
1390 | int r; | ||
1391 | |||
1392 | /* sanity checks */ | ||
1393 | if (task->mm == NULL || size == 0UL || vaddr == NULL) { | ||
1394 | printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm); | ||
1395 | return -EINVAL; | ||
1396 | } | ||
1397 | |||
1398 | DPRINT(("smpl_vaddr=%p size=%lu\n", vaddr, size)); | ||
1399 | |||
1400 | /* | ||
1401 | * does the actual unmapping | ||
1402 | */ | ||
1403 | down_write(&task->mm->mmap_sem); | ||
1404 | |||
1405 | DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size)); | ||
1406 | |||
1407 | r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0); | ||
1408 | |||
1409 | up_write(&task->mm->mmap_sem); | ||
1410 | if (r != 0) { | ||
1411 | printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size); | ||
1412 | } | ||
1413 | |||
1414 | DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r)); | ||
1415 | |||
1416 | return 0; | ||
1417 | } | ||
1418 | |||
1419 | /* | ||
1420 | * free actual physical storage used by sampling buffer | ||
1421 | */ | ||
1422 | #if 0 | ||
1423 | static int | ||
1424 | pfm_free_smpl_buffer(pfm_context_t *ctx) | ||
1425 | { | ||
1426 | pfm_buffer_fmt_t *fmt; | ||
1427 | |||
1428 | if (ctx->ctx_smpl_hdr == NULL) goto invalid_free; | ||
1429 | |||
1430 | /* | ||
1431 | * we won't use the buffer format anymore | ||
1432 | */ | ||
1433 | fmt = ctx->ctx_buf_fmt; | ||
1434 | |||
1435 | DPRINT(("sampling buffer @%p size %lu vaddr=%p\n", | ||
1436 | ctx->ctx_smpl_hdr, | ||
1437 | ctx->ctx_smpl_size, | ||
1438 | ctx->ctx_smpl_vaddr)); | ||
1439 | |||
1440 | pfm_buf_fmt_exit(fmt, current, NULL, NULL); | ||
1441 | |||
1442 | /* | ||
1443 | * free the buffer | ||
1444 | */ | ||
1445 | pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size); | ||
1446 | |||
1447 | ctx->ctx_smpl_hdr = NULL; | ||
1448 | ctx->ctx_smpl_size = 0UL; | ||
1449 | |||
1450 | return 0; | ||
1451 | |||
1452 | invalid_free: | ||
1453 | printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid); | ||
1454 | return -EINVAL; | ||
1455 | } | ||
1456 | #endif | ||
1457 | |||
1458 | static inline void | ||
1459 | pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt) | ||
1460 | { | ||
1461 | if (fmt == NULL) return; | ||
1462 | |||
1463 | pfm_buf_fmt_exit(fmt, current, NULL, NULL); | ||
1464 | |||
1465 | } | ||
1466 | |||
1467 | /* | ||
1468 | * pfmfs should _never_ be mounted by userland - too much of a security hassle, | ||
1469 | * no real gain from having the whole whorehouse mounted. So we don't need | ||
1470 | * any operations on the root directory. However, we need a non-trivial | ||
1471 | * d_name - pfm: will go nicely and kill the special-casing in procfs. | ||
1472 | */ | ||
1473 | static struct vfsmount *pfmfs_mnt; | ||
1474 | |||
1475 | static int __init | ||
1476 | init_pfm_fs(void) | ||
1477 | { | ||
1478 | int err = register_filesystem(&pfm_fs_type); | ||
1479 | if (!err) { | ||
1480 | pfmfs_mnt = kern_mount(&pfm_fs_type); | ||
1481 | err = PTR_ERR(pfmfs_mnt); | ||
1482 | if (IS_ERR(pfmfs_mnt)) | ||
1483 | unregister_filesystem(&pfm_fs_type); | ||
1484 | else | ||
1485 | err = 0; | ||
1486 | } | ||
1487 | return err; | ||
1488 | } | ||
1489 | |||
1490 | static void __exit | ||
1491 | exit_pfm_fs(void) | ||
1492 | { | ||
1493 | unregister_filesystem(&pfm_fs_type); | ||
1494 | mntput(pfmfs_mnt); | ||
1495 | } | ||
1496 | |||
1497 | static ssize_t | ||
1498 | pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) | ||
1499 | { | ||
1500 | pfm_context_t *ctx; | ||
1501 | pfm_msg_t *msg; | ||
1502 | ssize_t ret; | ||
1503 | unsigned long flags; | ||
1504 | DECLARE_WAITQUEUE(wait, current); | ||
1505 | if (PFM_IS_FILE(filp) == 0) { | ||
1506 | printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); | ||
1507 | return -EINVAL; | ||
1508 | } | ||
1509 | |||
1510 | ctx = (pfm_context_t *)filp->private_data; | ||
1511 | if (ctx == NULL) { | ||
1512 | printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid); | ||
1513 | return -EINVAL; | ||
1514 | } | ||
1515 | |||
1516 | /* | ||
1517 | * check even when there is no message | ||
1518 | */ | ||
1519 | if (size < sizeof(pfm_msg_t)) { | ||
1520 | DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t))); | ||
1521 | return -EINVAL; | ||
1522 | } | ||
1523 | |||
1524 | PROTECT_CTX(ctx, flags); | ||
1525 | |||
1526 | /* | ||
1527 | * put ourselves on the wait queue | ||
1528 | */ | ||
1529 | add_wait_queue(&ctx->ctx_msgq_wait, &wait); | ||
1530 | |||
1531 | |||
1532 | for(;;) { | ||
1533 | /* | ||
1534 | * check wait queue | ||
1535 | */ | ||
1536 | |||
1537 | set_current_state(TASK_INTERRUPTIBLE); | ||
1538 | |||
1539 | DPRINT(("head=%d tail=%d\n", ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); | ||
1540 | |||
1541 | ret = 0; | ||
1542 | if (PFM_CTXQ_EMPTY(ctx) == 0) break; | ||
1543 | |||
1544 | UNPROTECT_CTX(ctx, flags); | ||
1545 | |||
1546 | /* | ||
1547 | * check non-blocking read | ||
1548 | */ | ||
1549 | ret = -EAGAIN; | ||
1550 | if (filp->f_flags & O_NONBLOCK) break; | ||
1551 | |||
1552 | /* | ||
1553 | * check pending signals | ||
1554 | */ | ||
1555 | if (signal_pending(current)) { | ||
1556 | ret = -EINTR; | ||
1557 | break; | ||
1558 | } | ||
1559 | /* | ||
1560 | * no message, so wait | ||
1561 | */ | ||
1562 | schedule(); | ||
1563 | |||
1564 | PROTECT_CTX(ctx, flags); | ||
1565 | } | ||
1566 | DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret)); | ||
1567 | set_current_state(TASK_RUNNING); | ||
1568 | remove_wait_queue(&ctx->ctx_msgq_wait, &wait); | ||
1569 | |||
1570 | if (ret < 0) goto abort; | ||
1571 | |||
1572 | ret = -EINVAL; | ||
1573 | msg = pfm_get_next_msg(ctx); | ||
1574 | if (msg == NULL) { | ||
1575 | printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid); | ||
1576 | goto abort_locked; | ||
1577 | } | ||
1578 | |||
1579 | DPRINT(("[%d] fd=%d type=%d\n", current->pid, msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type)); | ||
1580 | |||
1581 | ret = -EFAULT; | ||
1582 | if(copy_to_user(buf, msg, sizeof(pfm_msg_t)) == 0) ret = sizeof(pfm_msg_t); | ||
1583 | |||
1584 | abort_locked: | ||
1585 | UNPROTECT_CTX(ctx, flags); | ||
1586 | abort: | ||
1587 | return ret; | ||
1588 | } | ||
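/*
 * The loop in pfm_read() above is the classic open-coded wait-queue
 * pattern:
 *
 *	add_wait_queue(&q, &wait);
 *	for (;;) {
 *		set_current_state(TASK_INTERRUPTIBLE);
 *		if (condition) break;
 *		schedule();
 *	}
 *	set_current_state(TASK_RUNNING);
 *	remove_wait_queue(&q, &wait);
 *
 * Marking the task INTERRUPTIBLE before testing the condition closes
 * the race with the waker; O_NONBLOCK and pending signals bail out
 * with -EAGAIN and -EINTR respectively.
 */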
1589 | |||
1590 | static ssize_t | ||
1591 | pfm_write(struct file *file, const char __user *ubuf, | ||
1592 | size_t size, loff_t *ppos) | ||
1593 | { | ||
1594 | DPRINT(("pfm_write called\n")); | ||
1595 | return -EINVAL; | ||
1596 | } | ||
1597 | |||
1598 | static unsigned int | ||
1599 | pfm_poll(struct file *filp, poll_table * wait) | ||
1600 | { | ||
1601 | pfm_context_t *ctx; | ||
1602 | unsigned long flags; | ||
1603 | unsigned int mask = 0; | ||
1604 | |||
1605 | if (PFM_IS_FILE(filp) == 0) { | ||
1606 | printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); | ||
1607 | return 0; | ||
1608 | } | ||
1609 | |||
1610 | ctx = (pfm_context_t *)filp->private_data; | ||
1611 | if (ctx == NULL) { | ||
1612 | printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid); | ||
1613 | return 0; | ||
1614 | } | ||
1615 | |||
1616 | |||
1617 | DPRINT(("pfm_poll ctx_fd=%d before poll_wait\n", ctx->ctx_fd)); | ||
1618 | |||
1619 | poll_wait(filp, &ctx->ctx_msgq_wait, wait); | ||
1620 | |||
1621 | PROTECT_CTX(ctx, flags); | ||
1622 | |||
1623 | if (PFM_CTXQ_EMPTY(ctx) == 0) | ||
1624 | mask = POLLIN | POLLRDNORM; | ||
1625 | |||
1626 | UNPROTECT_CTX(ctx, flags); | ||
1627 | |||
1628 | DPRINT(("pfm_poll ctx_fd=%d mask=0x%x\n", ctx->ctx_fd, mask)); | ||
1629 | |||
1630 | return mask; | ||
1631 | } | ||
1632 | |||
1633 | static int | ||
1634 | pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) | ||
1635 | { | ||
1636 | DPRINT(("pfm_ioctl called\n")); | ||
1637 | return -EINVAL; | ||
1638 | } | ||
1639 | |||
1640 | /* | ||
1641 | * interrupt cannot be masked when coming here | ||
1642 | */ | ||
1643 | static inline int | ||
1644 | pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on) | ||
1645 | { | ||
1646 | int ret; | ||
1647 | |||
1648 | ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue); | ||
1649 | |||
1650 | DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n", | ||
1651 | current->pid, | ||
1652 | fd, | ||
1653 | on, | ||
1654 | ctx->ctx_async_queue, ret)); | ||
1655 | |||
1656 | return ret; | ||
1657 | } | ||
1658 | |||
1659 | static int | ||
1660 | pfm_fasync(int fd, struct file *filp, int on) | ||
1661 | { | ||
1662 | pfm_context_t *ctx; | ||
1663 | int ret; | ||
1664 | |||
1665 | if (PFM_IS_FILE(filp) == 0) { | ||
1666 | printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid); | ||
1667 | return -EBADF; | ||
1668 | } | ||
1669 | |||
1670 | ctx = (pfm_context_t *)filp->private_data; | ||
1671 | if (ctx == NULL) { | ||
1672 | printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid); | ||
1673 | return -EBADF; | ||
1674 | } | ||
1675 | /* | ||
1676 | * we cannot mask interrupts during this call because it may | ||
1677 | * go to sleep if memory is not readily available. | ||
1678 | * | ||
1679 | * We are protected from the context disappearing by the get_fd()/put_fd() | ||
1680 | * done in the caller. Serialization of this function is ensured by the caller. | ||
1681 | */ | ||
1682 | ret = pfm_do_fasync(fd, filp, ctx, on); | ||
1683 | |||
1684 | |||
1685 | DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n", | ||
1686 | fd, | ||
1687 | on, | ||
1688 | ctx->ctx_async_queue, ret)); | ||
1689 | |||
1690 | return ret; | ||
1691 | } | ||
1692 | |||
1693 | #ifdef CONFIG_SMP | ||
1694 | /* | ||
1695 | * this function is exclusively called from pfm_close(). | ||
1696 | * The context is not protected at that time, nor are interrupts | ||
1697 | * on the remote CPU. That's necessary to avoid deadlocks. | ||
1698 | */ | ||
1699 | static void | ||
1700 | pfm_syswide_force_stop(void *info) | ||
1701 | { | ||
1702 | pfm_context_t *ctx = (pfm_context_t *)info; | ||
1703 | struct pt_regs *regs = ia64_task_regs(current); | ||
1704 | struct task_struct *owner; | ||
1705 | unsigned long flags; | ||
1706 | int ret; | ||
1707 | |||
1708 | if (ctx->ctx_cpu != smp_processor_id()) { | ||
1709 | printk(KERN_ERR "perfmon: pfm_syswide_force_stop for CPU%d but on CPU%d\n", | ||
1710 | ctx->ctx_cpu, | ||
1711 | smp_processor_id()); | ||
1712 | return; | ||
1713 | } | ||
1714 | owner = GET_PMU_OWNER(); | ||
1715 | if (owner != ctx->ctx_task) { | ||
1716 | printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n", | ||
1717 | smp_processor_id(), | ||
1718 | owner->pid, ctx->ctx_task->pid); | ||
1719 | return; | ||
1720 | } | ||
1721 | if (GET_PMU_CTX() != ctx) { | ||
1722 | printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected ctx %p instead of %p\n", | ||
1723 | smp_processor_id(), | ||
1724 | GET_PMU_CTX(), ctx); | ||
1725 | return; | ||
1726 | } | ||
1727 | |||
1728 | DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid)); | ||
1729 | /* | ||
1730 | * the context is already protected in pfm_close(), we simply | ||
1731 | * need to mask interrupts to avoid a PMU interrupt race on | ||
1732 | * this CPU | ||
1733 | */ | ||
1734 | local_irq_save(flags); | ||
1735 | |||
1736 | ret = pfm_context_unload(ctx, NULL, 0, regs); | ||
1737 | if (ret) { | ||
1738 | DPRINT(("context_unload returned %d\n", ret)); | ||
1739 | } | ||
1740 | |||
1741 | /* | ||
1742 | * unmask interrupts, PMU interrupts are now spurious here | ||
1743 | */ | ||
1744 | local_irq_restore(flags); | ||
1745 | } | ||
1746 | |||
1747 | static void | ||
1748 | pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx) | ||
1749 | { | ||
1750 | int ret; | ||
1751 | |||
1752 | DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu)); | ||
1753 | ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1); | ||
1754 | DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret)); | ||
1755 | } | ||
1756 | #endif /* CONFIG_SMP */ | ||
1757 | |||
1758 | /* | ||
1759 | * called for each close(). Partially free resources. | ||
1760 | * When caller is self-monitoring, the context is unloaded. | ||
1761 | */ | ||
1762 | static int | ||
1763 | pfm_flush(struct file *filp) | ||
1764 | { | ||
1765 | pfm_context_t *ctx; | ||
1766 | struct task_struct *task; | ||
1767 | struct pt_regs *regs; | ||
1768 | unsigned long flags; | ||
1769 | unsigned long smpl_buf_size = 0UL; | ||
1770 | void *smpl_buf_vaddr = NULL; | ||
1771 | int state, is_system; | ||
1772 | |||
1773 | if (PFM_IS_FILE(filp) == 0) { | ||
1774 | DPRINT(("bad magic for\n")); | ||
1775 | return -EBADF; | ||
1776 | } | ||
1777 | |||
1778 | ctx = (pfm_context_t *)filp->private_data; | ||
1779 | if (ctx == NULL) { | ||
1780 | printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid); | ||
1781 | return -EBADF; | ||
1782 | } | ||
1783 | |||
1784 | /* | ||
1785 | * remove our file from the async queue, if we use this mode. | ||
1786 | * This can be done without the context being protected. We come | ||
1787 | * here when the context has become unreachable by other tasks. | ||
1788 | * | ||
1789 | * We may still have active monitoring at this point and we may | ||
1790 | * end up in pfm_overflow_handler(). However, fasync_helper() | ||
1791 | * operates with interrupts disabled and it cleans up the | ||
1792 | * queue. If the PMU handler is called prior to entering | ||
1793 | * fasync_helper() then it will send a signal. If it is | ||
1794 | * invoked after, it will find an empty queue and no | ||
1795 | * signal will be sent. In both cases, we are safe. | ||
1796 | */ | ||
1797 | if (filp->f_flags & FASYNC) { | ||
1798 | DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue)); | ||
1799 | pfm_do_fasync (-1, filp, ctx, 0); | ||
1800 | } | ||
1801 | |||
1802 | PROTECT_CTX(ctx, flags); | ||
1803 | |||
1804 | state = ctx->ctx_state; | ||
1805 | is_system = ctx->ctx_fl_system; | ||
1806 | |||
1807 | task = PFM_CTX_TASK(ctx); | ||
1808 | regs = ia64_task_regs(task); | ||
1809 | |||
1810 | DPRINT(("ctx_state=%d is_current=%d\n", | ||
1811 | state, | ||
1812 | task == current ? 1 : 0)); | ||
1813 | |||
1814 | /* | ||
1815 | * if state == UNLOADED, then task is NULL | ||
1816 | */ | ||
1817 | |||
1818 | /* | ||
1819 | * we must stop and unload because we are losing access to the context. | ||
1820 | */ | ||
1821 | if (task == current) { | ||
1822 | #ifdef CONFIG_SMP | ||
1823 | /* | ||
1824 | * the task IS the owner but it migrated to another CPU: that's bad | ||
1825 | * but we must handle this cleanly. Unfortunately, the kernel does | ||
1826 | * not provide a mechanism to block migration (while the context is loaded). | ||
1827 | * | ||
1828 | * We need to release the resource on the ORIGINAL cpu. | ||
1829 | */ | ||
1830 | if (is_system && ctx->ctx_cpu != smp_processor_id()) { | ||
1831 | |||
1832 | DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); | ||
1833 | /* | ||
1834 | * keep context protected but unmask interrupts for the IPI | ||
1835 | */ | ||
1836 | local_irq_restore(flags); | ||
1837 | |||
1838 | pfm_syswide_cleanup_other_cpu(ctx); | ||
1839 | |||
1840 | /* | ||
1841 | * restore interrupt masking | ||
1842 | */ | ||
1843 | local_irq_save(flags); | ||
1844 | |||
1845 | /* | ||
1846 | * context is unloaded at this point | ||
1847 | */ | ||
1848 | } else | ||
1849 | #endif /* CONFIG_SMP */ | ||
1850 | { | ||
1851 | |||
1852 | DPRINT(("forcing unload\n")); | ||
1853 | /* | ||
1854 | * stop and unload, returning with state UNLOADED | ||
1855 | * and session unreserved. | ||
1856 | */ | ||
1857 | pfm_context_unload(ctx, NULL, 0, regs); | ||
1858 | |||
1859 | DPRINT(("ctx_state=%d\n", ctx->ctx_state)); | ||
1860 | } | ||
1861 | } | ||
1862 | |||
1863 | /* | ||
1864 | * remove virtual mapping, if any, for the calling task. | ||
1865 | * cannot reset ctx field until last user is calling close(). | ||
1866 | * | ||
1867 | * ctx_smpl_vaddr must never be cleared because it is needed | ||
1868 | * by every task with access to the context | ||
1869 | * | ||
1870 | * When called from do_exit(), the mm context is gone already, therefore | ||
1871 | * mm is NULL, i.e., the VMA is already gone and we do not have to | ||
1872 | * do anything here | ||
1873 | */ | ||
1874 | if (ctx->ctx_smpl_vaddr && current->mm) { | ||
1875 | smpl_buf_vaddr = ctx->ctx_smpl_vaddr; | ||
1876 | smpl_buf_size = ctx->ctx_smpl_size; | ||
1877 | } | ||
1878 | |||
1879 | UNPROTECT_CTX(ctx, flags); | ||
1880 | |||
1881 | /* | ||
1882 | * if there was a mapping, then we systematically remove it | ||
1883 | * at this point. Cannot be done inside critical section | ||
1884 | * because some VM functions re-enable interrupts. | ||
1885 | * | ||
1886 | */ | ||
1887 | if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size); | ||
1888 | |||
1889 | return 0; | ||
1890 | } | ||
1891 | /* | ||
1892 | * called either on explicit close() or from exit_files(). | ||
1893 | * Only the LAST user of the file gets to this point, i.e., it is | ||
1894 | * called only ONCE. | ||
1895 | * | ||
1896 | * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero | ||
1897 | * (fput()), i.e., the last task to access the file. Nobody else can access the | ||
1898 | * file at this point. | ||
1899 | * | ||
1900 | * When called from exit_files(), the VMA has been freed because exit_mm() | ||
1901 | * is executed before exit_files(). | ||
1902 | * | ||
1903 | * When called from exit_files(), the current task is not yet ZOMBIE but we | ||
1904 | * flush the PMU state to the context. | ||
1905 | */ | ||
1906 | static int | ||
1907 | pfm_close(struct inode *inode, struct file *filp) | ||
1908 | { | ||
1909 | pfm_context_t *ctx; | ||
1910 | struct task_struct *task; | ||
1911 | struct pt_regs *regs; | ||
1912 | DECLARE_WAITQUEUE(wait, current); | ||
1913 | unsigned long flags; | ||
1914 | unsigned long smpl_buf_size = 0UL; | ||
1915 | void *smpl_buf_addr = NULL; | ||
1916 | int free_possible = 1; | ||
1917 | int state, is_system; | ||
1918 | |||
1919 | DPRINT(("pfm_close called private=%p\n", filp->private_data)); | ||
1920 | |||
1921 | if (PFM_IS_FILE(filp) == 0) { | ||
1922 | DPRINT(("bad magic\n")); | ||
1923 | return -EBADF; | ||
1924 | } | ||
1925 | |||
1926 | ctx = (pfm_context_t *)filp->private_data; | ||
1927 | if (ctx == NULL) { | ||
1928 | printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid); | ||
1929 | return -EBADF; | ||
1930 | } | ||
1931 | |||
1932 | PROTECT_CTX(ctx, flags); | ||
1933 | |||
1934 | state = ctx->ctx_state; | ||
1935 | is_system = ctx->ctx_fl_system; | ||
1936 | |||
1937 | task = PFM_CTX_TASK(ctx); | ||
1938 | regs = ia64_task_regs(task); | ||
1939 | |||
1940 | DPRINT(("ctx_state=%d is_current=%d\n", | ||
1941 | state, | ||
1942 | task == current ? 1 : 0)); | ||
1943 | |||
1944 | /* | ||
1945 | * if task == current, then pfm_flush() unloaded the context | ||
1946 | */ | ||
1947 | if (state == PFM_CTX_UNLOADED) goto doit; | ||
1948 | |||
1949 | /* | ||
1950 | * context is loaded/masked and task != current, we need to | ||
1951 | * either force an unload or go zombie | ||
1952 | */ | ||
1953 | |||
1954 | /* | ||
1955 | * The task is currently blocked or will block after an overflow. | ||
1956 | * we must force it to wakeup to get out of the | ||
1957 | * MASKED state and transition to the unloaded state by itself. | ||
1958 | * | ||
1959 | * This situation is only possible for per-task mode | ||
1960 | */ | ||
1961 | if (state == PFM_CTX_MASKED && CTX_OVFL_NOBLOCK(ctx) == 0) { | ||
1962 | |||
1963 | /* | ||
1964 | * set a "partial" zombie state to be checked | ||
1965 | * upon return from down() in pfm_handle_work(). | ||
1966 | * | ||
1967 | * We cannot use the ZOMBIE state, because it is checked | ||
1968 | * by pfm_load_regs() which is called upon wakeup from down(). | ||
1969 | * In such case, it would free the context and then we would | ||
1970 | * return to pfm_handle_work() which would access the | ||
1971 | * stale context. Instead, we set a flag invisible to pfm_load_regs() | ||
1972 | * but visible to pfm_handle_work(). | ||
1973 | * | ||
1974 | * For some window of time, we have a zombie context with | ||
1975 | * ctx_state = MASKED and not ZOMBIE | ||
1976 | */ | ||
1977 | ctx->ctx_fl_going_zombie = 1; | ||
1978 | |||
1979 | /* | ||
1980 | * force task to wake up from MASKED state | ||
1981 | */ | ||
1982 | up(&ctx->ctx_restart_sem); | ||
1983 | |||
1984 | DPRINT(("waking up ctx_state=%d\n", state)); | ||
1985 | |||
1986 | /* | ||
1987 | * put ourself to sleep waiting for the other | ||
1988 | * task to report completion | ||
1989 | * | ||
1990 | * the context is protected by mutex, therefore there | ||
1991 | * is no risk of being notified of completion before | ||
1992 | * being actually on the waitq. | ||
1993 | */ | ||
1994 | set_current_state(TASK_INTERRUPTIBLE); | ||
1995 | add_wait_queue(&ctx->ctx_zombieq, &wait); | ||
1996 | |||
1997 | UNPROTECT_CTX(ctx, flags); | ||
1998 | |||
1999 | /* | ||
2000 | * XXX: check for signals : | ||
2001 | * - ok for explicit close | ||
2002 | * - not ok when coming from exit_files() | ||
2003 | */ | ||
2004 | schedule(); | ||
2005 | |||
2006 | |||
2007 | PROTECT_CTX(ctx, flags); | ||
2008 | |||
2009 | |||
2010 | remove_wait_queue(&ctx->ctx_zombieq, &wait); | ||
2011 | set_current_state(TASK_RUNNING); | ||
2012 | |||
2013 | /* | ||
2014 | * context is unloaded at this point | ||
2015 | */ | ||
2016 | DPRINT(("after zombie wakeup ctx_state=%d for\n", state)); | ||
2017 | } | ||
2018 | else if (task != current) { | ||
2019 | #ifdef CONFIG_SMP | ||
2020 | /* | ||
2021 | * switch context to zombie state | ||
2022 | */ | ||
2023 | ctx->ctx_state = PFM_CTX_ZOMBIE; | ||
2024 | |||
2025 | DPRINT(("zombie ctx for [%d]\n", task->pid)); | ||
2026 | /* | ||
2027 | * cannot free the context on the spot. deferred until | ||
2028 | * the task notices the ZOMBIE state | ||
2029 | */ | ||
2030 | free_possible = 0; | ||
2031 | #else | ||
2032 | pfm_context_unload(ctx, NULL, 0, regs); | ||
2033 | #endif | ||
2034 | } | ||
2035 | |||
2036 | doit: | ||
2037 | /* reload state, may have changed during opening of critical section */ | ||
2038 | state = ctx->ctx_state; | ||
2039 | |||
2040 | /* | ||
2041 | * the context is still attached to a task (possibly current); | ||
2042 | * we cannot destroy it right now | ||
2043 | */ | ||
2044 | |||
2045 | /* | ||
2046 | * we must free the sampling buffer right here because | ||
2047 | * we cannot rely on it being cleaned up later by the | ||
2048 | * monitored task. It is not possible to free vmalloc'ed | ||
2049 | * memory in pfm_load_regs(). Instead, we remove the buffer | ||
2050 | * now. Should there be a subsequent PMU overflow originally | ||
2051 | * meant for sampling, it will be converted to spurious | ||
2052 | * and that's fine because the monitoring tool is gone anyway. | ||
2053 | */ | ||
2054 | if (ctx->ctx_smpl_hdr) { | ||
2055 | smpl_buf_addr = ctx->ctx_smpl_hdr; | ||
2056 | smpl_buf_size = ctx->ctx_smpl_size; | ||
2057 | /* no more sampling */ | ||
2058 | ctx->ctx_smpl_hdr = NULL; | ||
2059 | ctx->ctx_fl_is_sampling = 0; | ||
2060 | } | ||
2061 | |||
2062 | DPRINT(("ctx_state=%d free_possible=%d addr=%p size=%lu\n", | ||
2063 | state, | ||
2064 | free_possible, | ||
2065 | smpl_buf_addr, | ||
2066 | smpl_buf_size)); | ||
2067 | |||
2068 | if (smpl_buf_addr) pfm_exit_smpl_buffer(ctx->ctx_buf_fmt); | ||
2069 | |||
2070 | /* | ||
2071 | * UNLOADED means that the session has already been unreserved. | ||
2072 | */ | ||
2073 | if (state == PFM_CTX_ZOMBIE) { | ||
2074 | pfm_unreserve_session(ctx, ctx->ctx_fl_system, ctx->ctx_cpu); | ||
2075 | } | ||
2076 | |||
2077 | /* | ||
2078 | * disconnecting the file descriptor from the context must be done | ||
2079 | * before we unlock. | ||
2080 | */ | ||
2081 | filp->private_data = NULL; | ||
2082 | |||
2083 | /* | ||
2084 | * if we free on the spot, the context is now completely unreachable | ||
2085 | * from the caller's side. The monitored task side is also cut, so we | ||
2086 | * can free it safely. | ||
2087 | * | ||
2088 | * If we have a deferred free, only the caller side is disconnected. | ||
2089 | */ | ||
2090 | UNPROTECT_CTX(ctx, flags); | ||
2091 | |||
2092 | /* | ||
2093 | * All memory free operations (especially for vmalloc'ed memory) | ||
2094 | * MUST be done with interrupts ENABLED. | ||
2095 | */ | ||
2096 | if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size); | ||
2097 | |||
2098 | /* | ||
2099 | * return the memory used by the context | ||
2100 | */ | ||
2101 | if (free_possible) pfm_context_free(ctx); | ||
2102 | |||
2103 | return 0; | ||
2104 | } | ||
2105 | |||
2106 | static int | ||
2107 | pfm_no_open(struct inode *irrelevant, struct file *dontcare) | ||
2108 | { | ||
2109 | DPRINT(("pfm_no_open called\n")); | ||
2110 | return -ENXIO; | ||
2111 | } | ||
2112 | |||
2113 | |||
2114 | |||
2115 | static struct file_operations pfm_file_ops = { | ||
2116 | .llseek = no_llseek, | ||
2117 | .read = pfm_read, | ||
2118 | .write = pfm_write, | ||
2119 | .poll = pfm_poll, | ||
2120 | .ioctl = pfm_ioctl, | ||
2121 | .open = pfm_no_open, /* special open code to disallow open via /proc */ | ||
2122 | .fasync = pfm_fasync, | ||
2123 | .release = pfm_close, | ||
2124 | .flush = pfm_flush | ||
2125 | }; | ||
2126 | |||
2127 | static int | ||
2128 | pfmfs_delete_dentry(struct dentry *dentry) | ||
2129 | { | ||
2130 | return 1; | ||
2131 | } | ||
2132 | |||
2133 | static struct dentry_operations pfmfs_dentry_operations = { | ||
2134 | .d_delete = pfmfs_delete_dentry, | ||
2135 | }; | ||
2136 | |||
2137 | |||
2138 | static int | ||
2139 | pfm_alloc_fd(struct file **cfile) | ||
2140 | { | ||
2141 | int fd, ret = 0; | ||
2142 | struct file *file = NULL; | ||
2143 | struct inode * inode; | ||
2144 | char name[32]; | ||
2145 | struct qstr this; | ||
2146 | |||
2147 | fd = get_unused_fd(); | ||
2148 | if (fd < 0) return -ENFILE; | ||
2149 | |||
2150 | ret = -ENFILE; | ||
2151 | |||
2152 | file = get_empty_filp(); | ||
2153 | if (!file) goto out; | ||
2154 | |||
2155 | /* | ||
2156 | * allocate a new inode | ||
2157 | */ | ||
2158 | inode = new_inode(pfmfs_mnt->mnt_sb); | ||
2159 | if (!inode) goto out; | ||
2160 | |||
2161 | DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode)); | ||
2162 | |||
2163 | inode->i_mode = S_IFCHR|S_IRUGO; | ||
2164 | inode->i_uid = current->fsuid; | ||
2165 | inode->i_gid = current->fsgid; | ||
2166 | |||
2167 | sprintf(name, "[%lu]", inode->i_ino); | ||
2168 | this.name = name; | ||
2169 | this.len = strlen(name); | ||
2170 | this.hash = inode->i_ino; | ||
2171 | |||
2172 | ret = -ENOMEM; | ||
2173 | |||
2174 | /* | ||
2175 | * allocate a new dcache entry | ||
2176 | */ | ||
2177 | file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); | ||
2178 | if (!file->f_dentry) goto out; | ||
2179 | |||
2180 | file->f_dentry->d_op = &pfmfs_dentry_operations; | ||
2181 | |||
2182 | d_add(file->f_dentry, inode); | ||
2183 | file->f_vfsmnt = mntget(pfmfs_mnt); | ||
2184 | file->f_mapping = inode->i_mapping; | ||
2185 | |||
2186 | file->f_op = &pfm_file_ops; | ||
2187 | file->f_mode = FMODE_READ; | ||
2188 | file->f_flags = O_RDONLY; | ||
2189 | file->f_pos = 0; | ||
2190 | |||
2191 | /* | ||
2192 | * may have to delay until context is attached? | ||
2193 | */ | ||
2194 | fd_install(fd, file); | ||
2195 | |||
2196 | /* | ||
2197 | * the file structure we will use | ||
2198 | */ | ||
2199 | *cfile = file; | ||
2200 | |||
2201 | return fd; | ||
2202 | out: | ||
2203 | if (file) put_filp(file); | ||
2204 | put_unused_fd(fd); | ||
2205 | return ret; | ||
2206 | } | ||
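/*
 * pfm_alloc_fd() hand-builds an anonymous file: an unused descriptor,
 * an empty struct file, a fresh inode on the internal pfmfs mount and
 * a dentry named "[ino]" that pfmfs_delete_dentry() lets the dcache
 * drop as soon as it is unused. From userland the result looks like
 * a read-only character-special file (S_IFCHR|S_IRUGO).
 */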
2207 | |||
2208 | static void | ||
2209 | pfm_free_fd(int fd, struct file *file) | ||
2210 | { | ||
2211 | struct files_struct *files = current->files; | ||
2212 | |||
2213 | /* | ||
2214 | * there is no fd_uninstall(), so we do it here | ||
2215 | */ | ||
2216 | spin_lock(&files->file_lock); | ||
2217 | files->fd[fd] = NULL; | ||
2218 | spin_unlock(&files->file_lock); | ||
2219 | |||
2220 | if (file) put_filp(file); | ||
2221 | put_unused_fd(fd); | ||
2222 | } | ||
2223 | |||
2224 | static int | ||
2225 | pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size) | ||
2226 | { | ||
2227 | DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size)); | ||
2228 | |||
2229 | while (size > 0) { | ||
2230 | unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT; | ||
2231 | |||
2232 | |||
2233 | if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY)) | ||
2234 | return -ENOMEM; | ||
2235 | |||
2236 | addr += PAGE_SIZE; | ||
2237 | buf += PAGE_SIZE; | ||
2238 | size -= PAGE_SIZE; | ||
2239 | } | ||
2240 | return 0; | ||
2241 | } | ||
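/*
 * The per-page loop above is needed because the buffer comes from
 * vmalloc() and is therefore virtually but not physically
 * contiguous: each page is translated individually with ia64_tpa()
 * and mapped read-only into the user VMA by its own
 * remap_pfn_range() call.
 */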
2242 | |||
2243 | /* | ||
2244 | * allocate a sampling buffer and remaps it into the user address space of the task | ||
2245 | */ | ||
2246 | static int | ||
2247 | pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr) | ||
2248 | { | ||
2249 | struct mm_struct *mm = task->mm; | ||
2250 | struct vm_area_struct *vma = NULL; | ||
2251 | unsigned long size; | ||
2252 | void *smpl_buf; | ||
2253 | |||
2254 | |||
2255 | /* | ||
2256 | * the fixed header + requested size and align to page boundary | ||
2257 | */ | ||
2258 | size = PAGE_ALIGN(rsize); | ||
2259 | |||
2260 | DPRINT(("sampling buffer rsize=%lu size=%lu bytes\n", rsize, size)); | ||
2261 | |||
2262 | /* | ||
2263 | * check requested size to avoid Denial-of-service attacks | ||
2264 | * XXX: may have to refine this test | ||
2265 | * Check against address space limit. | ||
2266 | * | ||
2267 | * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur) | ||
2268 | * return -ENOMEM; | ||
2269 | */ | ||
2270 | if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) | ||
2271 | return -ENOMEM; | ||
2272 | |||
2273 | /* | ||
2274 | * We do the easy to undo allocations first. | ||
2275 | * | ||
2276 | * pfm_rvmalloc(), clears the buffer, so there is no leak | ||
2277 | */ | ||
2278 | smpl_buf = pfm_rvmalloc(size); | ||
2279 | if (smpl_buf == NULL) { | ||
2280 | DPRINT(("Can't allocate sampling buffer\n")); | ||
2281 | return -ENOMEM; | ||
2282 | } | ||
2283 | |||
2284 | DPRINT(("smpl_buf @%p\n", smpl_buf)); | ||
2285 | |||
2286 | /* allocate vma */ | ||
2287 | vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | ||
2288 | if (!vma) { | ||
2289 | DPRINT(("Cannot allocate vma\n")); | ||
2290 | goto error_kmem; | ||
2291 | } | ||
2292 | memset(vma, 0, sizeof(*vma)); | ||
2293 | |||
2294 | /* | ||
2295 | * partially initialize the vma for the sampling buffer | ||
2296 | */ | ||
2297 | vma->vm_mm = mm; | ||
2298 | vma->vm_flags = VM_READ | VM_MAYREAD | VM_RESERVED; | ||
2299 | vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ | ||
2300 | |||
2301 | /* | ||
2302 | * Now we have everything we need and we can initialize | ||
2303 | * and connect all the data structures | ||
2304 | */ | ||
2305 | |||
2306 | ctx->ctx_smpl_hdr = smpl_buf; | ||
2307 | ctx->ctx_smpl_size = size; /* aligned size */ | ||
2308 | |||
2309 | /* | ||
2310 | * Let's do the difficult operations next. | ||
2311 | * | ||
2312 | * now we atomically find some area in the address space and | ||
2313 | * remap the buffer in it. | ||
2314 | */ | ||
2315 | down_write(&task->mm->mmap_sem); | ||
2316 | |||
2317 | /* find some free area in address space, must have mmap sem held */ | ||
2318 | vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0); | ||
2319 | if (vma->vm_start == 0UL) { | ||
2320 | DPRINT(("Cannot find unmapped area for size %ld\n", size)); | ||
2321 | up_write(&task->mm->mmap_sem); | ||
2322 | goto error; | ||
2323 | } | ||
2324 | vma->vm_end = vma->vm_start + size; | ||
2325 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; | ||
2326 | |||
2327 | DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start)); | ||
2328 | |||
2329 | /* can only be applied to current task, need to have the mm semaphore held when called */ | ||
2330 | if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) { | ||
2331 | DPRINT(("Can't remap buffer\n")); | ||
2332 | up_write(&task->mm->mmap_sem); | ||
2333 | goto error; | ||
2334 | } | ||
2335 | |||
2336 | /* | ||
2337 | * now insert the vma in the vm list for the process, must be | ||
2338 | * done with mmap lock held | ||
2339 | */ | ||
2340 | insert_vm_struct(mm, vma); | ||
2341 | |||
2342 | mm->total_vm += size >> PAGE_SHIFT; | ||
2343 | vm_stat_account(vma); | ||
2344 | up_write(&task->mm->mmap_sem); | ||
2345 | |||
2346 | /* | ||
2347 | * keep track of user level virtual address | ||
2348 | */ | ||
2349 | ctx->ctx_smpl_vaddr = (void *)vma->vm_start; | ||
2350 | *(unsigned long *)user_vaddr = vma->vm_start; | ||
2351 | |||
2352 | return 0; | ||
2353 | |||
2354 | error: | ||
2355 | kmem_cache_free(vm_area_cachep, vma); | ||
2356 | error_kmem: | ||
2357 | pfm_rvfree(smpl_buf, size); | ||
2358 | |||
2359 | return -ENOMEM; | ||
2360 | } | ||
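The allocation path above pairs a pfm_rvmalloc()'d buffer, which is only virtually contiguous, with the page-by-page remap_pfn_range() loop of pfm_remap_buffer(), using ia64_tpa() to resolve each page's physical address. Below is a minimal user-level sketch of how the resulting mapping is consumed; it is an illustration only, not part of this file, and it assumes a shim for the ia64 perfmonctl() system call (which has no glibc wrapper) while relying on the pfarg_context_t fields written back by the code above.

	/* hypothetical user-level counterpart: create a context with a
	 * sampling buffer and locate the read-only mapping installed by
	 * pfm_smpl_buffer_alloc(); error handling elided */
	#include <string.h>
	#include <stdio.h>
	#include <asm/perfmon.h>	/* pfarg_context_t, pfm_uuid_t, PFM_* */

	/* assumed shim over syscall(2); no glibc wrapper exists */
	extern long perfmonctl(int fd, int cmd, void *arg, int narg);

	int create_sampling_context(pfm_uuid_t fmt_uuid)
	{
		pfarg_context_t ctx;

		memset(&ctx, 0, sizeof(ctx));
		memcpy(ctx.ctx_smpl_buf_id, fmt_uuid, sizeof(pfm_uuid_t));

		if (perfmonctl(0, PFM_CREATE_CONTEXT, &ctx, 1) < 0)
			return -1;

		/* the kernel wrote back the fd and the buffer's user address */
		printf("ctx fd=%d smpl buffer @%p (read-only)\n",
		       ctx.ctx_fd, ctx.ctx_smpl_vaddr);
		return ctx.ctx_fd;
	}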
2361 | |||
2362 | /* | ||
2363 | * XXX: do something better here | ||
2364 | */ | ||
2365 | static int | ||
2366 | pfm_bad_permissions(struct task_struct *task) | ||
2367 | { | ||
2368 | /* inspired by ptrace_attach() */ | ||
2369 | DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n", | ||
2370 | current->uid, | ||
2371 | current->gid, | ||
2372 | task->euid, | ||
2373 | task->suid, | ||
2374 | task->uid, | ||
2375 | task->egid, | ||
2376 | task->sgid)); | ||
2377 | |||
2378 | return ((current->uid != task->euid) | ||
2379 | || (current->uid != task->suid) | ||
2380 | || (current->uid != task->uid) | ||
2381 | || (current->gid != task->egid) | ||
2382 | || (current->gid != task->sgid) | ||
2383 | || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE); | ||
2384 | } | ||
2385 | |||
2386 | static int | ||
2387 | pfarg_is_sane(struct task_struct *task, pfarg_context_t *pfx) | ||
2388 | { | ||
2389 | int ctx_flags; | ||
2390 | |||
2391 | /* valid signal */ | ||
2392 | |||
2393 | ctx_flags = pfx->ctx_flags; | ||
2394 | |||
2395 | if (ctx_flags & PFM_FL_SYSTEM_WIDE) { | ||
2396 | |||
2397 | /* | ||
2398 | * cannot block in this mode | ||
2399 | */ | ||
2400 | if (ctx_flags & PFM_FL_NOTIFY_BLOCK) { | ||
2401 | DPRINT(("cannot use blocking mode when in system wide monitoring\n")); | ||
2402 | return -EINVAL; | ||
2403 | } | ||
2404 | } else { | ||
2405 | } | ||
2406 | /* probably more to add here */ | ||
2407 | |||
2408 | return 0; | ||
2409 | } | ||
2410 | |||
2411 | static int | ||
2412 | pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int ctx_flags, | ||
2413 | unsigned int cpu, pfarg_context_t *arg) | ||
2414 | { | ||
2415 | pfm_buffer_fmt_t *fmt = NULL; | ||
2416 | unsigned long size = 0UL; | ||
2417 | void *uaddr = NULL; | ||
2418 | void *fmt_arg = NULL; | ||
2419 | int ret = 0; | ||
2420 | #define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1) | ||
2421 | |||
2422 | /* look up the buffer format, if registered */ | ||
2423 | fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id); | ||
2424 | if (fmt == NULL) { | ||
2425 | DPRINT(("[%d] cannot find buffer format\n", task->pid)); | ||
2426 | return -EINVAL; | ||
2427 | } | ||
2428 | |||
2429 | /* | ||
2430 | * buffer argument MUST be contiguous to pfarg_context_t | ||
2431 | */ | ||
2432 | if (fmt->fmt_arg_size) fmt_arg = PFM_CTXARG_BUF_ARG(arg); | ||
2433 | |||
2434 | ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg); | ||
2435 | |||
2436 | DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret)); | ||
2437 | |||
2438 | if (ret) goto error; | ||
2439 | |||
2440 | /* link buffer format and context */ | ||
2441 | ctx->ctx_buf_fmt = fmt; | ||
2442 | |||
2443 | /* | ||
2444 | * check if buffer format wants to use perfmon buffer allocation/mapping service | ||
2445 | */ | ||
2446 | ret = pfm_buf_fmt_getsize(fmt, task, ctx_flags, cpu, fmt_arg, &size); | ||
2447 | if (ret) goto error; | ||
2448 | |||
2449 | if (size) { | ||
2450 | /* | ||
2451 | * buffer is always remapped into the caller's address space | ||
2452 | */ | ||
2453 | ret = pfm_smpl_buffer_alloc(current, ctx, size, &uaddr); | ||
2454 | if (ret) goto error; | ||
2455 | |||
2456 | /* keep track of user address of buffer */ | ||
2457 | arg->ctx_smpl_vaddr = uaddr; | ||
2458 | } | ||
2459 | ret = pfm_buf_fmt_init(fmt, task, ctx->ctx_smpl_hdr, ctx_flags, cpu, fmt_arg); | ||
2460 | |||
2461 | error: | ||
2462 | return ret; | ||
2463 | } | ||
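As the comment above notes, the buffer-format argument must sit immediately after pfarg_context_t in the single blob copied in from user level; PFM_CTXARG_BUF_ARG() simply points one structure past the request. A hedged sketch of the layout a caller would build follows, where pfm_default_smpl_arg_t (the default buffer format's argument type) serves purely as an example:

	/* one contiguous argument blob, as required by the code above */
	#include <asm/perfmon.h>		   /* pfarg_context_t */
	#include <asm/perfmon_default_smpl.h>	   /* pfm_default_smpl_arg_t */

	struct ctx_plus_fmt_arg {
		pfarg_context_t        ctx;  /* parsed by pfm_context_create()   */
		pfm_default_smpl_arg_t fmt;  /* reached via PFM_CTXARG_BUF_ARG() */
	};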
2464 | |||
2465 | static void | ||
2466 | pfm_reset_pmu_state(pfm_context_t *ctx) | ||
2467 | { | ||
2468 | int i; | ||
2469 | |||
2470 | /* | ||
2471 | * install reset values for PMC. | ||
2472 | */ | ||
2473 | for (i=1; PMC_IS_LAST(i) == 0; i++) { | ||
2474 | if (PMC_IS_IMPL(i) == 0) continue; | ||
2475 | ctx->ctx_pmcs[i] = PMC_DFL_VAL(i); | ||
2476 | DPRINT(("pmc[%d]=0x%lx\n", i, ctx->ctx_pmcs[i])); | ||
2477 | } | ||
2478 | /* | ||
2479 | * PMD registers are set to 0UL when the context is memset() | ||
2480 | */ | ||
2481 | |||
2482 | /* | ||
2483 | * On context switched restore, we must restore ALL pmc and ALL pmd even | ||
2484 | * when they are not actively used by the task. In UP, the incoming process | ||
2485 | * may otherwise pick up left over PMC, PMD state from the previous process. | ||
2486 | * As opposed to PMDs, stale PMCs can cause harm to the incoming | ||
2487 | * process because they may change what is being measured. | ||
2488 | * Therefore, we must systematically reinstall the entire | ||
2489 | * PMC state. In SMP, the same thing is possible on the | ||
2490 | * same CPU but also between 2 CPUs. | ||
2491 | * | ||
2492 | * The problem with PMDs is information leakage, especially | ||
2493 | * to user level when psr.sp=0 | ||
2494 | * | ||
2495 | * There is unfortunately no easy way to avoid this problem | ||
2496 | * on either UP or SMP. This definitely slows down the | ||
2497 | * pfm_load_regs() function. | ||
2498 | */ | ||
2499 | |||
2500 | /* | ||
2501 | * bitmask of all PMCs accessible to this context | ||
2502 | * | ||
2503 | * PMC0 is treated differently. | ||
2504 | */ | ||
2505 | ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1; | ||
2506 | |||
2507 | /* | ||
2508 | * bitmask of all PMDs that are accessible to this context | ||
2509 | */ | ||
2510 | ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0]; | ||
2511 | |||
2512 | DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0])); | ||
2513 | |||
2514 | /* | ||
2515 | * useful in case of re-enable after disable | ||
2516 | */ | ||
2517 | ctx->ctx_used_ibrs[0] = 0UL; | ||
2518 | ctx->ctx_used_dbrs[0] = 0UL; | ||
2519 | } | ||
2520 | |||
2521 | static int | ||
2522 | pfm_ctx_getsize(void *arg, size_t *sz) | ||
2523 | { | ||
2524 | pfarg_context_t *req = (pfarg_context_t *)arg; | ||
2525 | pfm_buffer_fmt_t *fmt; | ||
2526 | |||
2527 | *sz = 0; | ||
2528 | |||
2529 | if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0; | ||
2530 | |||
2531 | fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id); | ||
2532 | if (fmt == NULL) { | ||
2533 | DPRINT(("cannot find buffer format\n")); | ||
2534 | return -EINVAL; | ||
2535 | } | ||
2536 | /* get just enough to copy in user parameters */ | ||
2537 | *sz = fmt->fmt_arg_size; | ||
2538 | DPRINT(("arg_size=%lu\n", *sz)); | ||
2539 | |||
2540 | return 0; | ||
2541 | } | ||
2542 | |||
2543 | |||
2544 | |||
2545 | /* | ||
2546 | * cannot attach if : | ||
2547 | * - kernel task | ||
2548 | * - task not owned by caller | ||
2549 | * - task incompatible with context mode | ||
2550 | */ | ||
2551 | static int | ||
2552 | pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) | ||
2553 | { | ||
2554 | /* | ||
2555 | * no kernel task or task not owned by caller | ||
2556 | */ | ||
2557 | if (task->mm == NULL) { | ||
2558 | DPRINT(("task [%d] has not memory context (kernel thread)\n", task->pid)); | ||
2559 | return -EPERM; | ||
2560 | } | ||
2561 | if (pfm_bad_permissions(task)) { | ||
2562 | DPRINT(("no permission to attach to [%d]\n", task->pid)); | ||
2563 | return -EPERM; | ||
2564 | } | ||
2565 | /* | ||
2566 | * cannot block in self-monitoring mode | ||
2567 | */ | ||
2568 | if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) { | ||
2569 | DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid)); | ||
2570 | return -EINVAL; | ||
2571 | } | ||
2572 | |||
2573 | if (task->exit_state == EXIT_ZOMBIE) { | ||
2574 | DPRINT(("cannot attach to zombie task [%d]\n", task->pid)); | ||
2575 | return -EBUSY; | ||
2576 | } | ||
2577 | |||
2578 | /* | ||
2579 | * always ok for self | ||
2580 | */ | ||
2581 | if (task == current) return 0; | ||
2582 | |||
2583 | if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { | ||
2584 | DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state)); | ||
2585 | return -EBUSY; | ||
2586 | } | ||
2587 | /* | ||
2588 | * make sure the task is off any CPU | ||
2589 | */ | ||
2590 | wait_task_inactive(task); | ||
2591 | |||
2592 | /* more to come... */ | ||
2593 | |||
2594 | return 0; | ||
2595 | } | ||
2596 | |||
2597 | static int | ||
2598 | pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task) | ||
2599 | { | ||
2600 | struct task_struct *p = current; | ||
2601 | int ret; | ||
2602 | |||
2603 | /* XXX: need to add more checks here */ | ||
2604 | if (pid < 2) return -EPERM; | ||
2605 | |||
2606 | if (pid != current->pid) { | ||
2607 | |||
2608 | read_lock(&tasklist_lock); | ||
2609 | |||
2610 | p = find_task_by_pid(pid); | ||
2611 | |||
2612 | /* make sure task cannot go away while we operate on it */ | ||
2613 | if (p) get_task_struct(p); | ||
2614 | |||
2615 | read_unlock(&tasklist_lock); | ||
2616 | |||
2617 | if (p == NULL) return -ESRCH; | ||
2618 | } | ||
2619 | |||
2620 | ret = pfm_task_incompatible(ctx, p); | ||
2621 | if (ret == 0) { | ||
2622 | *task = p; | ||
2623 | } else if (p != current) { | ||
2624 | pfm_put_task(p); | ||
2625 | } | ||
2626 | return ret; | ||
2627 | } | ||
2628 | |||
2629 | |||
2630 | |||
2631 | static int | ||
2632 | pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
2633 | { | ||
2634 | pfarg_context_t *req = (pfarg_context_t *)arg; | ||
2635 | struct file *filp; | ||
2636 | int ctx_flags; | ||
2637 | int ret; | ||
2638 | |||
2639 | /* let's check the arguments first */ | ||
2640 | ret = pfarg_is_sane(current, req); | ||
2641 | if (ret < 0) return ret; | ||
2642 | |||
2643 | ctx_flags = req->ctx_flags; | ||
2644 | |||
2645 | ret = -ENOMEM; | ||
2646 | |||
2647 | ctx = pfm_context_alloc(); | ||
2648 | if (!ctx) goto error; | ||
2649 | |||
2650 | ret = pfm_alloc_fd(&filp); | ||
2651 | if (ret < 0) goto error_file; | ||
2652 | |||
2653 | req->ctx_fd = ctx->ctx_fd = ret; | ||
2654 | |||
2655 | /* | ||
2656 | * attach context to file | ||
2657 | */ | ||
2658 | filp->private_data = ctx; | ||
2659 | |||
2660 | /* | ||
2661 | * does the user want to sample? | ||
2662 | */ | ||
2663 | if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) { | ||
2664 | ret = pfm_setup_buffer_fmt(current, ctx, ctx_flags, 0, req); | ||
2665 | if (ret) goto buffer_error; | ||
2666 | } | ||
2667 | |||
2668 | /* | ||
2669 | * init context protection lock | ||
2670 | */ | ||
2671 | spin_lock_init(&ctx->ctx_lock); | ||
2672 | |||
2673 | /* | ||
2674 | * context is unloaded | ||
2675 | */ | ||
2676 | ctx->ctx_state = PFM_CTX_UNLOADED; | ||
2677 | |||
2678 | /* | ||
2679 | * initialization of context's flags | ||
2680 | */ | ||
2681 | ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; | ||
2682 | ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; | ||
2683 | ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */ | ||
2684 | ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; | ||
2685 | /* | ||
2686 | * will move to set properties | ||
2687 | * ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0; | ||
2688 | */ | ||
2689 | |||
2690 | /* | ||
2691 | * init restart semaphore to locked | ||
2692 | */ | ||
2693 | sema_init(&ctx->ctx_restart_sem, 0); | ||
2694 | |||
2695 | /* | ||
2696 | * activation is used in SMP only | ||
2697 | */ | ||
2698 | ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; | ||
2699 | SET_LAST_CPU(ctx, -1); | ||
2700 | |||
2701 | /* | ||
2702 | * initialize notification message queue | ||
2703 | */ | ||
2704 | ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; | ||
2705 | init_waitqueue_head(&ctx->ctx_msgq_wait); | ||
2706 | init_waitqueue_head(&ctx->ctx_zombieq); | ||
2707 | |||
2708 | DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n", | ||
2709 | ctx, | ||
2710 | ctx_flags, | ||
2711 | ctx->ctx_fl_system, | ||
2712 | ctx->ctx_fl_block, | ||
2713 | ctx->ctx_fl_excl_idle, | ||
2714 | ctx->ctx_fl_no_msg, | ||
2715 | ctx->ctx_fd)); | ||
2716 | |||
2717 | /* | ||
2718 | * initialize soft PMU state | ||
2719 | */ | ||
2720 | pfm_reset_pmu_state(ctx); | ||
2721 | |||
2722 | return 0; | ||
2723 | |||
2724 | buffer_error: | ||
2725 | pfm_free_fd(ctx->ctx_fd, filp); | ||
2726 | |||
2727 | if (ctx->ctx_buf_fmt) { | ||
2728 | pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs); | ||
2729 | } | ||
2730 | error_file: | ||
2731 | pfm_context_free(ctx); | ||
2732 | |||
2733 | error: | ||
2734 | return ret; | ||
2735 | } | ||
2736 | |||
2737 | static inline unsigned long | ||
2738 | pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset) | ||
2739 | { | ||
2740 | unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset; | ||
2741 | unsigned long new_seed, old_seed = reg->seed, mask = reg->mask; | ||
2742 | extern unsigned long carta_random32 (unsigned long seed); | ||
2743 | |||
2744 | if (reg->flags & PFM_REGFL_RANDOM) { | ||
2745 | new_seed = carta_random32(old_seed); | ||
2746 | val -= (old_seed & mask); /* counter values are negative numbers! */ | ||
2747 | if ((mask >> 32) != 0) | ||
2748 | /* construct a full 64-bit random value: */ | ||
2749 | new_seed |= carta_random32(old_seed >> 32) << 32; | ||
2750 | reg->seed = new_seed; | ||
2751 | } | ||
2752 | reg->lval = val; | ||
2753 | return val; | ||
2754 | } | ||
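A hedged illustration of the reset-value arithmetic above: sampling PMDs are primed with the two's complement of the desired period, so the counter overflows after exactly that many events, and PFM_REGFL_RANDOM subtracts a masked pseudo-random offset so successive sampling periods vary within [period, period + mask]. The helper below is a sketch only, not part of this file:

	/* mirrors the subtraction performed in pfm_new_counter_value() */
	static unsigned long primed_counter(unsigned long period,
					    unsigned long seed,
					    unsigned long mask)
	{
		unsigned long val = 0UL - period;   /* "negative" reset value */

		return val - (seed & mask);         /* randomized variant     */
	}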
2755 | |||
2756 | static void | ||
2757 | pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) | ||
2758 | { | ||
2759 | unsigned long mask = ovfl_regs[0]; | ||
2760 | unsigned long reset_others = 0UL; | ||
2761 | unsigned long val; | ||
2762 | int i; | ||
2763 | |||
2764 | /* | ||
2765 | * now restore the reset value on overflowed sampling counters | ||
2766 | */ | ||
2767 | mask >>= PMU_FIRST_COUNTER; | ||
2768 | for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) { | ||
2769 | |||
2770 | if ((mask & 0x1UL) == 0UL) continue; | ||
2771 | |||
2772 | ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); | ||
2773 | reset_others |= ctx->ctx_pmds[i].reset_pmds[0]; | ||
2774 | |||
2775 | DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val)); | ||
2776 | } | ||
2777 | |||
2778 | /* | ||
2779 | * Now take care of resetting the other registers | ||
2780 | */ | ||
2781 | for(i = 0; reset_others; i++, reset_others >>= 1) { | ||
2782 | |||
2783 | if ((reset_others & 0x1) == 0) continue; | ||
2784 | |||
2785 | ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); | ||
2786 | |||
2787 | DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", | ||
2788 | is_long_reset ? "long" : "short", i, val)); | ||
2789 | } | ||
2790 | } | ||
2791 | |||
2792 | static void | ||
2793 | pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) | ||
2794 | { | ||
2795 | unsigned long mask = ovfl_regs[0]; | ||
2796 | unsigned long reset_others = 0UL; | ||
2797 | unsigned long val; | ||
2798 | int i; | ||
2799 | |||
2800 | DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset)); | ||
2801 | |||
2802 | if (ctx->ctx_state == PFM_CTX_MASKED) { | ||
2803 | pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset); | ||
2804 | return; | ||
2805 | } | ||
2806 | |||
2807 | /* | ||
2808 | * now restore the reset value on overflowed sampling counters | ||
2809 | */ | ||
2810 | mask >>= PMU_FIRST_COUNTER; | ||
2811 | for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) { | ||
2812 | |||
2813 | if ((mask & 0x1UL) == 0UL) continue; | ||
2814 | |||
2815 | val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); | ||
2816 | reset_others |= ctx->ctx_pmds[i].reset_pmds[0]; | ||
2817 | |||
2818 | DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val)); | ||
2819 | |||
2820 | pfm_write_soft_counter(ctx, i, val); | ||
2821 | } | ||
2822 | |||
2823 | /* | ||
2824 | * Now take care of resetting the other registers | ||
2825 | */ | ||
2826 | for(i = 0; reset_others; i++, reset_others >>= 1) { | ||
2827 | |||
2828 | if ((reset_others & 0x1) == 0) continue; | ||
2829 | |||
2830 | val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); | ||
2831 | |||
2832 | if (PMD_IS_COUNTING(i)) { | ||
2833 | pfm_write_soft_counter(ctx, i, val); | ||
2834 | } else { | ||
2835 | ia64_set_pmd(i, val); | ||
2836 | } | ||
2837 | DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", | ||
2838 | is_long_reset ? "long" : "short", i, val)); | ||
2839 | } | ||
2840 | ia64_srlz_d(); | ||
2841 | } | ||
2842 | |||
2843 | static int | ||
2844 | pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
2845 | { | ||
2846 | struct thread_struct *thread = NULL; | ||
2847 | struct task_struct *task; | ||
2848 | pfarg_reg_t *req = (pfarg_reg_t *)arg; | ||
2849 | unsigned long value, pmc_pm; | ||
2850 | unsigned long smpl_pmds, reset_pmds, impl_pmds; | ||
2851 | unsigned int cnum, reg_flags, flags, pmc_type; | ||
2852 | int i, can_access_pmu = 0, is_loaded, is_system, expert_mode; | ||
2853 | int is_monitor, is_counting, state; | ||
2854 | int ret = -EINVAL; | ||
2855 | pfm_reg_check_t wr_func; | ||
2856 | #define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z)) | ||
2857 | |||
2858 | state = ctx->ctx_state; | ||
2859 | is_loaded = state == PFM_CTX_LOADED ? 1 : 0; | ||
2860 | is_system = ctx->ctx_fl_system; | ||
2861 | task = ctx->ctx_task; | ||
2862 | impl_pmds = pmu_conf->impl_pmds[0]; | ||
2863 | |||
2864 | if (state == PFM_CTX_ZOMBIE) return -EINVAL; | ||
2865 | |||
2866 | if (is_loaded) { | ||
2867 | thread = &task->thread; | ||
2868 | /* | ||
2869 | * In system wide and when the context is loaded, access can only happen | ||
2870 | * when the caller is running on the CPU being monitored by the session. | ||
2871 | * It does not have to be the owner (ctx_task) of the context per se. | ||
2872 | */ | ||
2873 | if (is_system && ctx->ctx_cpu != smp_processor_id()) { | ||
2874 | DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); | ||
2875 | return -EBUSY; | ||
2876 | } | ||
2877 | can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; | ||
2878 | } | ||
2879 | expert_mode = pfm_sysctl.expert_mode; | ||
2880 | |||
2881 | for (i = 0; i < count; i++, req++) { | ||
2882 | |||
2883 | cnum = req->reg_num; | ||
2884 | reg_flags = req->reg_flags; | ||
2885 | value = req->reg_value; | ||
2886 | smpl_pmds = req->reg_smpl_pmds[0]; | ||
2887 | reset_pmds = req->reg_reset_pmds[0]; | ||
2888 | flags = 0; | ||
2889 | |||
2890 | |||
2891 | if (cnum >= PMU_MAX_PMCS) { | ||
2892 | DPRINT(("pmc%u is invalid\n", cnum)); | ||
2893 | goto error; | ||
2894 | } | ||
2895 | |||
2896 | pmc_type = pmu_conf->pmc_desc[cnum].type; | ||
2897 | pmc_pm = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1; | ||
2898 | is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0; | ||
2899 | is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0; | ||
2900 | |||
2901 | /* | ||
2902 | * we reject all non-implemented PMCs as well | ||
2903 | * as attempts to modify PMC[0-3] which are used | ||
2904 | * as status registers by the PMU | ||
2905 | */ | ||
2906 | if ((pmc_type & PFM_REG_IMPL) == 0 || (pmc_type & PFM_REG_CONTROL) == PFM_REG_CONTROL) { | ||
2907 | DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type)); | ||
2908 | goto error; | ||
2909 | } | ||
2910 | wr_func = pmu_conf->pmc_desc[cnum].write_check; | ||
2911 | /* | ||
2912 | * If the PMC is a monitor, then if the value is not the default: | ||
2913 | * - system-wide session: PMCx.pm=1 (privileged monitor) | ||
2914 | * - per-task : PMCx.pm=0 (user monitor) | ||
2915 | */ | ||
2916 | if (is_monitor && value != PMC_DFL_VAL(cnum) && is_system ^ pmc_pm) { | ||
2917 | DPRINT(("pmc%u pmc_pm=%lu is_system=%d\n", | ||
2918 | cnum, | ||
2919 | pmc_pm, | ||
2920 | is_system)); | ||
2921 | goto error; | ||
2922 | } | ||
2923 | |||
2924 | if (is_counting) { | ||
2925 | /* | ||
2926 | * enforce generation of overflow interrupt. Necessary on all | ||
2927 | * CPUs. | ||
2928 | */ | ||
2929 | value |= 1 << PMU_PMC_OI; | ||
2930 | |||
2931 | if (reg_flags & PFM_REGFL_OVFL_NOTIFY) { | ||
2932 | flags |= PFM_REGFL_OVFL_NOTIFY; | ||
2933 | } | ||
2934 | |||
2935 | if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM; | ||
2936 | |||
2937 | /* verify validity of smpl_pmds */ | ||
2938 | if ((smpl_pmds & impl_pmds) != smpl_pmds) { | ||
2939 | DPRINT(("invalid smpl_pmds 0x%lx for pmc%u\n", smpl_pmds, cnum)); | ||
2940 | goto error; | ||
2941 | } | ||
2942 | |||
2943 | /* verify validity of reset_pmds */ | ||
2944 | if ((reset_pmds & impl_pmds) != reset_pmds) { | ||
2945 | DPRINT(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum)); | ||
2946 | goto error; | ||
2947 | } | ||
2948 | } else { | ||
2949 | if (reg_flags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) { | ||
2950 | DPRINT(("cannot set ovfl_notify or random on pmc%u\n", cnum)); | ||
2951 | goto error; | ||
2952 | } | ||
2953 | /* the eventid on non-counting monitors is ignored */ | ||
2954 | } | ||
2955 | |||
2956 | /* | ||
2957 | * execute write checker, if any | ||
2958 | */ | ||
2959 | if (likely(expert_mode == 0 && wr_func)) { | ||
2960 | ret = (*wr_func)(task, ctx, cnum, &value, regs); | ||
2961 | if (ret) goto error; | ||
2962 | ret = -EINVAL; | ||
2963 | } | ||
2964 | |||
2965 | /* | ||
2966 | * no error on this register | ||
2967 | */ | ||
2968 | PFM_REG_RETFLAG_SET(req->reg_flags, 0); | ||
2969 | |||
2970 | /* | ||
2971 | * Now we commit the changes to the software state | ||
2972 | */ | ||
2973 | |||
2974 | /* | ||
2975 | * update overflow information | ||
2976 | */ | ||
2977 | if (is_counting) { | ||
2978 | /* | ||
2979 | * full flag update each time a register is programmed | ||
2980 | */ | ||
2981 | ctx->ctx_pmds[cnum].flags = flags; | ||
2982 | |||
2983 | ctx->ctx_pmds[cnum].reset_pmds[0] = reset_pmds; | ||
2984 | ctx->ctx_pmds[cnum].smpl_pmds[0] = smpl_pmds; | ||
2985 | ctx->ctx_pmds[cnum].eventid = req->reg_smpl_eventid; | ||
2986 | |||
2987 | /* | ||
2988 | * Mark all PMDs to be accessed as used. | ||
2989 | * | ||
2990 | * We do not keep track of PMCs because we have to | ||
2991 | * systematically restore ALL of them. | ||
2992 | * | ||
2993 | * We do not update the used_monitors mask, because | ||
2994 | * if we have not programmed them, then they will be in | ||
2995 | * a quiescent state, therefore we will not need to | ||
2996 | * mask/restore them when the context is MASKED. | ||
2997 | */ | ||
2998 | CTX_USED_PMD(ctx, reset_pmds); | ||
2999 | CTX_USED_PMD(ctx, smpl_pmds); | ||
3000 | /* | ||
3001 | * make sure we do not try to reset on | ||
3002 | * restart because we have established new values | ||
3003 | */ | ||
3004 | if (state == PFM_CTX_MASKED) ctx->ctx_ovfl_regs[0] &= ~(1UL << cnum); | ||
3005 | } | ||
3006 | /* | ||
3007 | * Needed in case the user does not initialize the equivalent | ||
3008 | * PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no | ||
3009 | * possible leak here. | ||
3010 | */ | ||
3011 | CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]); | ||
3012 | |||
3013 | /* | ||
3014 | * keep track of the monitor PMC that we are using. | ||
3015 | * we save the value of the pmc in ctx_pmcs[] and if | ||
3016 | * the monitoring is not stopped for the context we also | ||
3017 | * place it in the saved state area so that it will be | ||
3018 | * picked up later by the context switch code. | ||
3019 | * | ||
3020 | * The value in ctx_pmcs[] can only be changed in pfm_write_pmcs(). | ||
3021 | * | ||
3022 | * The value in thread->pmcs[] may be modified on overflow, i.e., when | ||
3023 | * monitoring needs to be stopped. | ||
3024 | */ | ||
3025 | if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum); | ||
3026 | |||
3027 | /* | ||
3028 | * update context state | ||
3029 | */ | ||
3030 | ctx->ctx_pmcs[cnum] = value; | ||
3031 | |||
3032 | if (is_loaded) { | ||
3033 | /* | ||
3034 | * write thread state | ||
3035 | */ | ||
3036 | if (is_system == 0) thread->pmcs[cnum] = value; | ||
3037 | |||
3038 | /* | ||
3039 | * write hardware register if we can | ||
3040 | */ | ||
3041 | if (can_access_pmu) { | ||
3042 | ia64_set_pmc(cnum, value); | ||
3043 | } | ||
3044 | #ifdef CONFIG_SMP | ||
3045 | else { | ||
3046 | /* | ||
3047 | * per-task SMP only here | ||
3048 | * | ||
3049 | * we are guaranteed that the task is not running on the other CPU, | ||
3050 | * we indicate that this PMD will need to be reloaded if the task | ||
3051 | * is rescheduled on the CPU it ran last on. | ||
3052 | */ | ||
3053 | ctx->ctx_reload_pmcs[0] |= 1UL << cnum; | ||
3054 | } | ||
3055 | #endif | ||
3056 | } | ||
3057 | |||
3058 | DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n", | ||
3059 | cnum, | ||
3060 | value, | ||
3061 | is_loaded, | ||
3062 | can_access_pmu, | ||
3063 | flags, | ||
3064 | ctx->ctx_all_pmcs[0], | ||
3065 | ctx->ctx_used_pmds[0], | ||
3066 | ctx->ctx_pmds[cnum].eventid, | ||
3067 | smpl_pmds, | ||
3068 | reset_pmds, | ||
3069 | ctx->ctx_reload_pmcs[0], | ||
3070 | ctx->ctx_used_monitors[0], | ||
3071 | ctx->ctx_ovfl_regs[0])); | ||
3072 | } | ||
3073 | |||
3074 | /* | ||
3075 | * make sure the changes are visible | ||
3076 | */ | ||
3077 | if (can_access_pmu) ia64_srlz_d(); | ||
3078 | |||
3079 | return 0; | ||
3080 | error: | ||
3081 | PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); | ||
3082 | return ret; | ||
3083 | } | ||
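A hypothetical user-level counterpart to the loop above: programming one counting monitor with overflow notification and an automatic reset of its paired PMD. The register number and event encoding here are PMU-model-specific placeholders, and perfmonctl() is again an assumed syscall shim; this is a sketch, not a definitive recipe.

	/* sketch only: program a counting PMC through PFM_WRITE_PMCS */
	#include <string.h>
	#include <asm/perfmon.h>	/* pfarg_reg_t, PFM_WRITE_PMCS */

	extern long perfmonctl(int fd, int cmd, void *arg, int narg); /* shim */

	static int program_one_pmc(int ctx_fd, unsigned long event_encoding)
	{
		pfarg_reg_t pc;

		memset(&pc, 0, sizeof(pc));
		pc.reg_num           = 4;		     /* placeholder PMC    */
		pc.reg_value         = event_encoding;	     /* model specific     */
		pc.reg_flags         = PFM_REGFL_OVFL_NOTIFY;/* notify on overflow */
		pc.reg_reset_pmds[0] = 1UL << 4;	     /* also reset pmd4    */

		return perfmonctl(ctx_fd, PFM_WRITE_PMCS, &pc, 1);
	}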
3084 | |||
3085 | static int | ||
3086 | pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
3087 | { | ||
3088 | struct thread_struct *thread = NULL; | ||
3089 | struct task_struct *task; | ||
3090 | pfarg_reg_t *req = (pfarg_reg_t *)arg; | ||
3091 | unsigned long value, hw_value, ovfl_mask; | ||
3092 | unsigned int cnum; | ||
3093 | int i, can_access_pmu = 0, state; | ||
3094 | int is_counting, is_loaded, is_system, expert_mode; | ||
3095 | int ret = -EINVAL; | ||
3096 | pfm_reg_check_t wr_func; | ||
3097 | |||
3098 | |||
3099 | state = ctx->ctx_state; | ||
3100 | is_loaded = state == PFM_CTX_LOADED ? 1 : 0; | ||
3101 | is_system = ctx->ctx_fl_system; | ||
3102 | ovfl_mask = pmu_conf->ovfl_val; | ||
3103 | task = ctx->ctx_task; | ||
3104 | |||
3105 | if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL; | ||
3106 | |||
3107 | /* | ||
3108 | * on both UP and SMP, we can only write to the PMC when the task is | ||
3109 | * the owner of the local PMU. | ||
3110 | */ | ||
3111 | if (likely(is_loaded)) { | ||
3112 | thread = &task->thread; | ||
3113 | /* | ||
3114 | * In system wide and when the context is loaded, access can only happen | ||
3115 | * when the caller is running on the CPU being monitored by the session. | ||
3116 | * It does not have to be the owner (ctx_task) of the context per se. | ||
3117 | */ | ||
3118 | if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { | ||
3119 | DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); | ||
3120 | return -EBUSY; | ||
3121 | } | ||
3122 | can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; | ||
3123 | } | ||
3124 | expert_mode = pfm_sysctl.expert_mode; | ||
3125 | |||
3126 | for (i = 0; i < count; i++, req++) { | ||
3127 | |||
3128 | cnum = req->reg_num; | ||
3129 | value = req->reg_value; | ||
3130 | |||
3131 | if (!PMD_IS_IMPL(cnum)) { | ||
3132 | DPRINT(("pmd[%u] is unimplemented or invalid\n", cnum)); | ||
3133 | goto abort_mission; | ||
3134 | } | ||
3135 | is_counting = PMD_IS_COUNTING(cnum); | ||
3136 | wr_func = pmu_conf->pmd_desc[cnum].write_check; | ||
3137 | |||
3138 | /* | ||
3139 | * execute write checker, if any | ||
3140 | */ | ||
3141 | if (unlikely(expert_mode == 0 && wr_func)) { | ||
3142 | unsigned long v = value; | ||
3143 | |||
3144 | ret = (*wr_func)(task, ctx, cnum, &v, regs); | ||
3145 | if (ret) goto abort_mission; | ||
3146 | |||
3147 | value = v; | ||
3148 | ret = -EINVAL; | ||
3149 | } | ||
3150 | |||
3151 | /* | ||
3152 | * no error on this register | ||
3153 | */ | ||
3154 | PFM_REG_RETFLAG_SET(req->reg_flags, 0); | ||
3155 | |||
3156 | /* | ||
3157 | * now commit changes to software state | ||
3158 | */ | ||
3159 | hw_value = value; | ||
3160 | |||
3161 | /* | ||
3162 | * update virtualized (64bits) counter | ||
3163 | */ | ||
3164 | if (is_counting) { | ||
3165 | /* | ||
3166 | * write context state | ||
3167 | */ | ||
3168 | ctx->ctx_pmds[cnum].lval = value; | ||
3169 | |||
3170 | /* | ||
3171 | * when the context is loaded we use the split value | ||
3172 | */ | ||
3173 | if (is_loaded) { | ||
3174 | hw_value = value & ovfl_mask; | ||
3175 | value = value & ~ovfl_mask; | ||
3176 | } | ||
3177 | } | ||
3178 | /* | ||
3179 | * update reset values (not just for counters) | ||
3180 | */ | ||
3181 | ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset; | ||
3182 | ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset; | ||
3183 | |||
3184 | /* | ||
3185 | * update randomization parameters (not just for counters) | ||
3186 | */ | ||
3187 | ctx->ctx_pmds[cnum].seed = req->reg_random_seed; | ||
3188 | ctx->ctx_pmds[cnum].mask = req->reg_random_mask; | ||
3189 | |||
3190 | /* | ||
3191 | * update context value | ||
3192 | */ | ||
3193 | ctx->ctx_pmds[cnum].val = value; | ||
3194 | |||
3195 | /* | ||
3196 | * Keep track of what we use | ||
3197 | * | ||
3198 | * We do not keep track of PMCs because we have to | ||
3199 | * systematically restore ALL of them. | ||
3200 | */ | ||
3201 | CTX_USED_PMD(ctx, PMD_PMD_DEP(cnum)); | ||
3202 | |||
3203 | /* | ||
3204 | * mark this PMD register used as well | ||
3205 | */ | ||
3206 | CTX_USED_PMD(ctx, RDEP(cnum)); | ||
3207 | |||
3208 | /* | ||
3209 | * make sure we do not try to reset on | ||
3210 | * restart because we have established new values | ||
3211 | */ | ||
3212 | if (is_counting && state == PFM_CTX_MASKED) { | ||
3213 | ctx->ctx_ovfl_regs[0] &= ~(1UL << cnum); | ||
3214 | } | ||
3215 | |||
3216 | if (is_loaded) { | ||
3217 | /* | ||
3218 | * write thread state | ||
3219 | */ | ||
3220 | if (is_system == 0) thread->pmds[cnum] = hw_value; | ||
3221 | |||
3222 | /* | ||
3223 | * write hardware register if we can | ||
3224 | */ | ||
3225 | if (can_access_pmu) { | ||
3226 | ia64_set_pmd(cnum, hw_value); | ||
3227 | } else { | ||
3228 | #ifdef CONFIG_SMP | ||
3229 | /* | ||
3230 | * we are guaranteed that the task is not running on the other CPU, | ||
3231 | * we indicate that this PMD will need to be reloaded if the task | ||
3232 | * is rescheduled on the CPU it ran last on. | ||
3233 | */ | ||
3234 | ctx->ctx_reload_pmds[0] |= 1UL << cnum; | ||
3235 | #endif | ||
3236 | } | ||
3237 | } | ||
3238 | |||
3239 | DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx " | ||
3240 | "long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n", | ||
3241 | cnum, | ||
3242 | value, | ||
3243 | is_loaded, | ||
3244 | can_access_pmu, | ||
3245 | hw_value, | ||
3246 | ctx->ctx_pmds[cnum].val, | ||
3247 | ctx->ctx_pmds[cnum].short_reset, | ||
3248 | ctx->ctx_pmds[cnum].long_reset, | ||
3249 | PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N', | ||
3250 | ctx->ctx_pmds[cnum].seed, | ||
3251 | ctx->ctx_pmds[cnum].mask, | ||
3252 | ctx->ctx_used_pmds[0], | ||
3253 | ctx->ctx_pmds[cnum].reset_pmds[0], | ||
3254 | ctx->ctx_reload_pmds[0], | ||
3255 | ctx->ctx_all_pmds[0], | ||
3256 | ctx->ctx_ovfl_regs[0])); | ||
3257 | } | ||
3258 | |||
3259 | /* | ||
3260 | * make changes visible | ||
3261 | */ | ||
3262 | if (can_access_pmu) ia64_srlz_d(); | ||
3263 | |||
3264 | return 0; | ||
3265 | |||
3266 | abort_mission: | ||
3267 | /* | ||
3268 | * for now, we have only one possibility for error | ||
3269 | */ | ||
3270 | PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); | ||
3271 | return ret; | ||
3272 | } | ||
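The split above is what virtualizes narrow hardware counters to 64 bits: when the context is loaded, only the low bits covered by ovfl_mask are placed in the hardware PMD, while the upper bits stay in ctx_pmds[].val and advance as overflows are accounted. A minimal sketch of that split (the assumption here is ovfl_mask = 2^w - 1 for a w-bit counting PMD):

	/* mirrors the is_counting && is_loaded path above */
	static void split_pmd(unsigned long value, unsigned long ovfl_mask,
			      unsigned long *hw, unsigned long *soft)
	{
		*hw   = value & ovfl_mask;    /* written via ia64_set_pmd() */
		*soft = value & ~ovfl_mask;   /* kept in ctx_pmds[].val     */
	}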
3273 | |||
3274 | /* | ||
3275 | * By way of PROTECT_CONTEXT(), interrupts are masked while we are in this function. | ||
3276 | * Therefore we know we do not have to worry about the PMU overflow interrupt. If an | ||
3277 | * interrupt is delivered during the call, it will be kept pending until we leave, making | ||
3278 | * it appear as if it had been generated at the UNPROTECT_CONTEXT(). At least we are | ||
3279 | * guaranteed to return consistent data to the user; it may simply be old. It is not | ||
3280 | * trivial to handle the overflow while inside the call because we may end up in | ||
3281 | * some module's sampling buffer code, causing deadlocks. | ||
3282 | */ | ||
3283 | static int | ||
3284 | pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
3285 | { | ||
3286 | struct thread_struct *thread = NULL; | ||
3287 | struct task_struct *task; | ||
3288 | unsigned long val = 0UL, lval, ovfl_mask, sval; | ||
3289 | pfarg_reg_t *req = (pfarg_reg_t *)arg; | ||
3290 | unsigned int cnum, reg_flags = 0; | ||
3291 | int i, can_access_pmu = 0, state; | ||
3292 | int is_loaded, is_system, is_counting, expert_mode; | ||
3293 | int ret = -EINVAL; | ||
3294 | pfm_reg_check_t rd_func; | ||
3295 | |||
3296 | /* | ||
3297 | * access is possible when loaded only for | ||
3298 | * self-monitoring tasks or in UP mode | ||
3299 | */ | ||
3300 | |||
3301 | state = ctx->ctx_state; | ||
3302 | is_loaded = state == PFM_CTX_LOADED ? 1 : 0; | ||
3303 | is_system = ctx->ctx_fl_system; | ||
3304 | ovfl_mask = pmu_conf->ovfl_val; | ||
3305 | task = ctx->ctx_task; | ||
3306 | |||
3307 | if (state == PFM_CTX_ZOMBIE) return -EINVAL; | ||
3308 | |||
3309 | if (likely(is_loaded)) { | ||
3310 | thread = &task->thread; | ||
3311 | /* | ||
3312 | * In system wide and when the context is loaded, access can only happen | ||
3313 | * when the caller is running on the CPU being monitored by the session. | ||
3314 | * It does not have to be the owner (ctx_task) of the context per se. | ||
3315 | */ | ||
3316 | if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { | ||
3317 | DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); | ||
3318 | return -EBUSY; | ||
3319 | } | ||
3320 | /* | ||
3321 | * when not self-monitoring, this can only be true in UP | ||
3322 | */ | ||
3323 | can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; | ||
3324 | |||
3325 | if (can_access_pmu) ia64_srlz_d(); | ||
3326 | } | ||
3327 | expert_mode = pfm_sysctl.expert_mode; | ||
3328 | |||
3329 | DPRINT(("ld=%d apmu=%d ctx_state=%d\n", | ||
3330 | is_loaded, | ||
3331 | can_access_pmu, | ||
3332 | state)); | ||
3333 | |||
3334 | /* | ||
3335 | * on both UP and SMP, we can only read the PMD from the hardware register when | ||
3336 | * the task is the owner of the local PMU. | ||
3337 | */ | ||
3338 | |||
3339 | for (i = 0; i < count; i++, req++) { | ||
3340 | |||
3341 | cnum = req->reg_num; | ||
3342 | reg_flags = req->reg_flags; | ||
3343 | |||
3344 | if (unlikely(!PMD_IS_IMPL(cnum))) goto error; | ||
3345 | /* | ||
3346 | * we can only read the registers that we use. That includes | ||
3347 | * the ones we explicitly initialize AND the ones we want included | ||
3348 | * in the sampling buffer (smpl_regs). | ||
3349 | * | ||
3350 | * Having this restriction allows optimization in the ctxsw routine | ||
3351 | * without compromising security (leaks) | ||
3352 | */ | ||
3353 | if (unlikely(!CTX_IS_USED_PMD(ctx, cnum))) goto error; | ||
3354 | |||
3355 | sval = ctx->ctx_pmds[cnum].val; | ||
3356 | lval = ctx->ctx_pmds[cnum].lval; | ||
3357 | is_counting = PMD_IS_COUNTING(cnum); | ||
3358 | |||
3359 | /* | ||
3360 | * If the task is not the current one, then we check if the | ||
3361 | * PMU state is still in the local live register due to lazy ctxsw. | ||
3362 | * If true, then we read directly from the registers. | ||
3363 | */ | ||
3364 | if (can_access_pmu) { | ||
3365 | val = ia64_get_pmd(cnum); | ||
3366 | } else { | ||
3367 | /* | ||
3368 | * context has been saved | ||
3369 | * if context is zombie, then task does not exist anymore. | ||
3370 | * In this case, we use the full value saved in the context (pfm_flush_regs()). | ||
3371 | */ | ||
3372 | val = is_loaded ? thread->pmds[cnum] : 0UL; | ||
3373 | } | ||
3374 | rd_func = pmu_conf->pmd_desc[cnum].read_check; | ||
3375 | |||
3376 | if (is_counting) { | ||
3377 | /* | ||
3378 | * XXX: need to check for overflow when loaded | ||
3379 | */ | ||
3380 | val &= ovfl_mask; | ||
3381 | val += sval; | ||
3382 | } | ||
3383 | |||
3384 | /* | ||
3385 | * execute read checker, if any | ||
3386 | */ | ||
3387 | if (unlikely(expert_mode == 0 && rd_func)) { | ||
3388 | unsigned long v = val; | ||
3389 | ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs); | ||
3390 | if (ret) goto error; | ||
3391 | val = v; | ||
3392 | ret = -EINVAL; | ||
3393 | } | ||
3394 | |||
3395 | PFM_REG_RETFLAG_SET(reg_flags, 0); | ||
3396 | |||
3397 | DPRINT(("pmd[%u]=0x%lx\n", cnum, val)); | ||
3398 | |||
3399 | /* | ||
3400 | * update the register return value; abort all if a problem occurs during copy. | ||
3401 | * we only modify the reg_flags field. No access check is needed here because | ||
3402 | * access has been verified upfront in sys_perfmonctl(). | ||
3403 | */ | ||
3404 | req->reg_value = val; | ||
3405 | req->reg_flags = reg_flags; | ||
3406 | req->reg_last_reset_val = lval; | ||
3407 | } | ||
3408 | |||
3409 | return 0; | ||
3410 | |||
3411 | error: | ||
3412 | PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); | ||
3413 | return ret; | ||
3414 | } | ||
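Reading a counting PMD performs the inverse of the split done in pfm_write_pmds(): the live (or saved) hardware bits under ovfl_mask are added to the software value accumulated across overflows, reconstructing the full 64-bit count. A minimal sketch of that merge, under the same width assumption as before:

	/* mirrors the is_counting path above */
	static unsigned long merged_pmd(unsigned long hw, unsigned long sval,
					unsigned long ovfl_mask)
	{
		return (hw & ovfl_mask) + sval;
	}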
3415 | |||
3416 | int | ||
3417 | pfm_mod_write_pmcs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) | ||
3418 | { | ||
3419 | pfm_context_t *ctx; | ||
3420 | |||
3421 | if (req == NULL) return -EINVAL; | ||
3422 | |||
3423 | ctx = GET_PMU_CTX(); | ||
3424 | |||
3425 | if (ctx == NULL) return -EINVAL; | ||
3426 | |||
3427 | /* | ||
3428 | * for now limit to current task, which is enough when calling | ||
3429 | * from overflow handler | ||
3430 | */ | ||
3431 | if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; | ||
3432 | |||
3433 | return pfm_write_pmcs(ctx, req, nreq, regs); | ||
3434 | } | ||
3435 | EXPORT_SYMBOL(pfm_mod_write_pmcs); | ||
3436 | |||
3437 | int | ||
3438 | pfm_mod_read_pmds(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) | ||
3439 | { | ||
3440 | pfm_context_t *ctx; | ||
3441 | |||
3442 | if (req == NULL) return -EINVAL; | ||
3443 | |||
3444 | ctx = GET_PMU_CTX(); | ||
3445 | |||
3446 | if (ctx == NULL) return -EINVAL; | ||
3447 | |||
3448 | /* | ||
3449 | * for now limit to current task, which is enough when calling | ||
3450 | * from overflow handler | ||
3451 | */ | ||
3452 | if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; | ||
3453 | |||
3454 | return pfm_read_pmds(ctx, req, nreq, regs); | ||
3455 | } | ||
3456 | EXPORT_SYMBOL(pfm_mod_read_pmds); | ||
3457 | |||
3458 | /* | ||
3459 | * Only call this function when a process is trying to | ||
3460 | * write the debug registers (reading is always allowed) | ||
3461 | */ | ||
3462 | int | ||
3463 | pfm_use_debug_registers(struct task_struct *task) | ||
3464 | { | ||
3465 | pfm_context_t *ctx = task->thread.pfm_context; | ||
3466 | unsigned long flags; | ||
3467 | int ret = 0; | ||
3468 | |||
3469 | if (pmu_conf->use_rr_dbregs == 0) return 0; | ||
3470 | |||
3471 | DPRINT(("called for [%d]\n", task->pid)); | ||
3472 | |||
3473 | /* | ||
3474 | * do it only once | ||
3475 | */ | ||
3476 | if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0; | ||
3477 | |||
3478 | /* | ||
3479 | * Even on SMP, we do not need to use an atomic here because | ||
3480 | * the only way in is via ptrace() and this is possible only when the | ||
3481 | * process is stopped. Even in the case where the ctxsw out is not totally | ||
3482 | * completed by the time we come here, there is no way the 'stopped' process | ||
3483 | * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine. | ||
3484 | * So this is always safe. | ||
3485 | */ | ||
3486 | if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1; | ||
3487 | |||
3488 | LOCK_PFS(flags); | ||
3489 | |||
3490 | /* | ||
3491 | * We cannot allow setting breakpoints when system wide monitoring | ||
3492 | * sessions are using the debug registers. | ||
3493 | */ | ||
3494 | if (pfm_sessions.pfs_sys_use_dbregs > 0) | ||
3495 | ret = -1; | ||
3496 | else | ||
3497 | pfm_sessions.pfs_ptrace_use_dbregs++; | ||
3498 | |||
3499 | DPRINT(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n", | ||
3500 | pfm_sessions.pfs_ptrace_use_dbregs, | ||
3501 | pfm_sessions.pfs_sys_use_dbregs, | ||
3502 | task->pid, ret)); | ||
3503 | |||
3504 | UNLOCK_PFS(flags); | ||
3505 | |||
3506 | return ret; | ||
3507 | } | ||
3508 | |||
3509 | /* | ||
3510 | * This function is called for every task that exits with the | ||
3511 | * IA64_THREAD_DBG_VALID set. This indicates a task which was | ||
3512 | * able to use the debug registers for debugging purposes via | ||
3513 | * ptrace(). Therefore we know it was not using them for | ||
3514 | * performance monitoring, so we only decrement the number | ||
3515 | * of "ptraced" debug register users to keep the count up to date. | ||
3516 | */ | ||
3517 | int | ||
3518 | pfm_release_debug_registers(struct task_struct *task) | ||
3519 | { | ||
3520 | unsigned long flags; | ||
3521 | int ret; | ||
3522 | |||
3523 | if (pmu_conf->use_rr_dbregs == 0) return 0; | ||
3524 | |||
3525 | LOCK_PFS(flags); | ||
3526 | if (pfm_sessions.pfs_ptrace_use_dbregs == 0) { | ||
3527 | printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid); | ||
3528 | ret = -1; | ||
3529 | } else { | ||
3530 | pfm_sessions.pfs_ptrace_use_dbregs--; | ||
3531 | ret = 0; | ||
3532 | } | ||
3533 | UNLOCK_PFS(flags); | ||
3534 | |||
3535 | return ret; | ||
3536 | } | ||
3537 | |||
3538 | static int | ||
3539 | pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
3540 | { | ||
3541 | struct task_struct *task; | ||
3542 | pfm_buffer_fmt_t *fmt; | ||
3543 | pfm_ovfl_ctrl_t rst_ctrl; | ||
3544 | int state, is_system; | ||
3545 | int ret = 0; | ||
3546 | |||
3547 | state = ctx->ctx_state; | ||
3548 | fmt = ctx->ctx_buf_fmt; | ||
3549 | is_system = ctx->ctx_fl_system; | ||
3550 | task = PFM_CTX_TASK(ctx); | ||
3551 | |||
3552 | switch(state) { | ||
3553 | case PFM_CTX_MASKED: | ||
3554 | break; | ||
3555 | case PFM_CTX_LOADED: | ||
3556 | if (CTX_HAS_SMPL(ctx) && fmt->fmt_restart_active) break; | ||
3557 | /* fall through */ | ||
3558 | case PFM_CTX_UNLOADED: | ||
3559 | case PFM_CTX_ZOMBIE: | ||
3560 | DPRINT(("invalid state=%d\n", state)); | ||
3561 | return -EBUSY; | ||
3562 | default: | ||
3563 | DPRINT(("state=%d, cannot operate (no active_restart handler)\n", state)); | ||
3564 | return -EINVAL; | ||
3565 | } | ||
3566 | |||
3567 | /* | ||
3568 | * In system wide and when the context is loaded, access can only happen | ||
3569 | * when the caller is running on the CPU being monitored by the session. | ||
3570 | * It does not have to be the owner (ctx_task) of the context per se. | ||
3571 | */ | ||
3572 | if (is_system && ctx->ctx_cpu != smp_processor_id()) { | ||
3573 | DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); | ||
3574 | return -EBUSY; | ||
3575 | } | ||
3576 | |||
3577 | /* sanity check */ | ||
3578 | if (unlikely(task == NULL)) { | ||
3579 | printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid); | ||
3580 | return -EINVAL; | ||
3581 | } | ||
3582 | |||
3583 | if (task == current || is_system) { | ||
3584 | |||
3585 | fmt = ctx->ctx_buf_fmt; | ||
3586 | |||
3587 | DPRINT(("restarting self %d ovfl=0x%lx\n", | ||
3588 | task->pid, | ||
3589 | ctx->ctx_ovfl_regs[0])); | ||
3590 | |||
3591 | if (CTX_HAS_SMPL(ctx)) { | ||
3592 | |||
3593 | prefetch(ctx->ctx_smpl_hdr); | ||
3594 | |||
3595 | rst_ctrl.bits.mask_monitoring = 0; | ||
3596 | rst_ctrl.bits.reset_ovfl_pmds = 0; | ||
3597 | |||
3598 | if (state == PFM_CTX_LOADED) | ||
3599 | ret = pfm_buf_fmt_restart_active(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs); | ||
3600 | else | ||
3601 | ret = pfm_buf_fmt_restart(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs); | ||
3602 | } else { | ||
3603 | rst_ctrl.bits.mask_monitoring = 0; | ||
3604 | rst_ctrl.bits.reset_ovfl_pmds = 1; | ||
3605 | } | ||
3606 | |||
3607 | if (ret == 0) { | ||
3608 | if (rst_ctrl.bits.reset_ovfl_pmds) | ||
3609 | pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET); | ||
3610 | |||
3611 | if (rst_ctrl.bits.mask_monitoring == 0) { | ||
3612 | DPRINT(("resuming monitoring for [%d]\n", task->pid)); | ||
3613 | |||
3614 | if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task); | ||
3615 | } else { | ||
3616 | DPRINT(("keeping monitoring stopped for [%d]\n", task->pid)); | ||
3617 | |||
3618 | // cannot use pfm_stop_monitoring(task, regs); | ||
3619 | } | ||
3620 | } | ||
3621 | /* | ||
3622 | * clear overflowed PMD mask to remove any stale information | ||
3623 | */ | ||
3624 | ctx->ctx_ovfl_regs[0] = 0UL; | ||
3625 | |||
3626 | /* | ||
3627 | * back to LOADED state | ||
3628 | */ | ||
3629 | ctx->ctx_state = PFM_CTX_LOADED; | ||
3630 | |||
3631 | /* | ||
3632 | * XXX: not really useful for self monitoring | ||
3633 | */ | ||
3634 | ctx->ctx_fl_can_restart = 0; | ||
3635 | |||
3636 | return 0; | ||
3637 | } | ||
3638 | |||
3639 | /* | ||
3640 | * restart another task | ||
3641 | */ | ||
3642 | |||
3643 | /* | ||
3644 | * When PFM_CTX_MASKED, we cannot issue a restart before the previous | ||
3645 | * one is seen by the task. | ||
3646 | */ | ||
3647 | if (state == PFM_CTX_MASKED) { | ||
3648 | if (ctx->ctx_fl_can_restart == 0) return -EINVAL; | ||
3649 | /* | ||
3650 | * will prevent subsequent restart before this one is | ||
3651 | * seen by other task | ||
3652 | */ | ||
3653 | ctx->ctx_fl_can_restart = 0; | ||
3654 | } | ||
3655 | |||
3656 | /* | ||
3657 | * if blocking, then post the semaphore if PFM_CTX_MASKED, i.e. | ||
3658 | * the task is blocked or on its way to block. That's the normal | ||
3659 | * restart path. If the monitoring is not masked, then the task | ||
3660 | * can be actively monitoring and we cannot directly intervene. | ||
3661 | * Therefore we use the trap mechanism to catch the task and | ||
3662 | * force it to reset the buffer/reset PMDs. | ||
3663 | * | ||
3664 | * if non-blocking, then we ensure that the task will go into | ||
3665 | * pfm_handle_work() before returning to user mode. | ||
3666 | * | ||
3667 | * We cannot explicitly reset another task; it MUST always | ||
3668 | * be done by the task itself. This works for system wide because | ||
3669 | * the tool that is controlling the session is logically doing | ||
3670 | * "self-monitoring". | ||
3671 | */ | ||
3672 | if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) { | ||
3673 | DPRINT(("unblocking [%d] \n", task->pid)); | ||
3674 | up(&ctx->ctx_restart_sem); | ||
3675 | } else { | ||
3676 | DPRINT(("[%d] armed exit trap\n", task->pid)); | ||
3677 | |||
3678 | ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET; | ||
3679 | |||
3680 | PFM_SET_WORK_PENDING(task, 1); | ||
3681 | |||
3682 | pfm_set_task_notify(task); | ||
3683 | |||
3684 | /* | ||
3685 | * XXX: send reschedule if task runs on another CPU | ||
3686 | */ | ||
3687 | } | ||
3688 | return 0; | ||
3689 | } | ||
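A hypothetical notification loop built around pfm_restart(): a monitoring tool reads an overflow message from the context file descriptor, drains the sampling buffer, then issues PFM_RESTART to unmask monitoring and apply the long reset, which lands in the code above. perfmonctl() is the assumed shim from the earlier sketches; message handling and buffer parsing are format specific and elided.

	/* sketch only: consume one overflow notification, then restart */
	#include <unistd.h>		/* read() */
	#include <asm/perfmon.h>	/* pfm_msg_t, PFM_RESTART */

	extern long perfmonctl(int fd, int cmd, void *arg, int narg); /* shim */

	static void handle_one_overflow(int ctx_fd)
	{
		pfm_msg_t msg;

		if (read(ctx_fd, &msg, sizeof(msg)) != sizeof(msg))
			return;

		/* ... walk samples in the read-only buffer mapping ... */

		perfmonctl(ctx_fd, PFM_RESTART, NULL, 0);
	}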
3690 | |||
3691 | static int | ||
3692 | pfm_debug(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
3693 | { | ||
3694 | unsigned int m = *(unsigned int *)arg; | ||
3695 | |||
3696 | pfm_sysctl.debug = m == 0 ? 0 : 1; | ||
3697 | |||
3698 | pfm_debug_var = pfm_sysctl.debug; | ||
3699 | |||
3700 | printk(KERN_INFO "perfmon debugging %s (timing reset)\n", pfm_sysctl.debug ? "on" : "off"); | ||
3701 | |||
3702 | if (m == 0) { | ||
3703 | memset(pfm_stats, 0, sizeof(pfm_stats)); | ||
3704 | for(m=0; m < NR_CPUS; m++) pfm_stats[m].pfm_ovfl_intr_cycles_min = ~0UL; | ||
3705 | } | ||
3706 | return 0; | ||
3707 | } | ||
3708 | |||
3709 | /* | ||
3710 | * arg can be NULL and count can be zero for this function | ||
3711 | */ | ||
3712 | static int | ||
3713 | pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
3714 | { | ||
3715 | struct thread_struct *thread = NULL; | ||
3716 | struct task_struct *task; | ||
3717 | pfarg_dbreg_t *req = (pfarg_dbreg_t *)arg; | ||
3718 | unsigned long flags; | ||
3719 | dbreg_t dbreg; | ||
3720 | unsigned int rnum; | ||
3721 | int first_time; | ||
3722 | int ret = 0, state; | ||
3723 | int i, can_access_pmu = 0; | ||
3724 | int is_system, is_loaded; | ||
3725 | |||
3726 | if (pmu_conf->use_rr_dbregs == 0) return -EINVAL; | ||
3727 | |||
3728 | state = ctx->ctx_state; | ||
3729 | is_loaded = state == PFM_CTX_LOADED ? 1 : 0; | ||
3730 | is_system = ctx->ctx_fl_system; | ||
3731 | task = ctx->ctx_task; | ||
3732 | |||
3733 | if (state == PFM_CTX_ZOMBIE) return -EINVAL; | ||
3734 | |||
3735 | /* | ||
3736 | * on both UP and SMP, we can only write to the PMC when the task is | ||
3737 | * the owner of the local PMU. | ||
3738 | */ | ||
3739 | if (is_loaded) { | ||
3740 | thread = &task->thread; | ||
3741 | /* | ||
3742 | * In system wide and when the context is loaded, access can only happen | ||
3743 | * when the caller is running on the CPU being monitored by the session. | ||
3744 | * It does not have to be the owner (ctx_task) of the context per se. | ||
3745 | */ | ||
3746 | if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { | ||
3747 | DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); | ||
3748 | return -EBUSY; | ||
3749 | } | ||
3750 | can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; | ||
3751 | } | ||
3752 | |||
3753 | /* | ||
3754 | * we do not need to check for ipsr.db because we do clear ibr.x, dbr.r, and dbr.w | ||
3755 | * ensuring that no real breakpoint can be installed via this call. | ||
3756 | * | ||
3757 | * IMPORTANT: regs can be NULL in this function | ||
3758 | */ | ||
3759 | |||
3760 | first_time = ctx->ctx_fl_using_dbreg == 0; | ||
3761 | |||
3762 | /* | ||
3763 | * don't bother if we are loaded and task is being debugged | ||
3764 | */ | ||
3765 | if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) { | ||
3766 | DPRINT(("debug registers already in use for [%d]\n", task->pid)); | ||
3767 | return -EBUSY; | ||
3768 | } | ||
3769 | |||
3770 | /* | ||
3771 | * check for debug registers in system wide mode | ||
3772 | * | ||
3773 | * Even though a check is done in pfm_context_load(), | ||
3774 | * we must repeat it here, in case the registers are | ||
3775 | * written after the context is loaded | ||
3776 | */ | ||
3777 | if (is_loaded) { | ||
3778 | LOCK_PFS(flags); | ||
3779 | |||
3780 | if (first_time && is_system) { | ||
3781 | if (pfm_sessions.pfs_ptrace_use_dbregs) | ||
3782 | ret = -EBUSY; | ||
3783 | else | ||
3784 | pfm_sessions.pfs_sys_use_dbregs++; | ||
3785 | } | ||
3786 | UNLOCK_PFS(flags); | ||
3787 | } | ||
3788 | |||
3789 | if (ret != 0) return ret; | ||
3790 | |||
3791 | /* | ||
3792 | * mark ourself as user of the debug registers for | ||
3793 | * perfmon purposes. | ||
3794 | */ | ||
3795 | ctx->ctx_fl_using_dbreg = 1; | ||
3796 | |||
3797 | /* | ||
3798 | * clear hardware registers to make sure we don't | ||
3799 | * pick up stale state. | ||
3800 | * | ||
3801 | * for a system wide session, we do not use | ||
3802 | * thread.dbr, thread.ibr because this process | ||
3803 | * never leaves the current CPU and the state | ||
3804 | * is shared by all processes running on it | ||
3805 | */ | ||
3806 | if (first_time && can_access_pmu) { | ||
3807 | DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid)); | ||
3808 | for (i=0; i < pmu_conf->num_ibrs; i++) { | ||
3809 | ia64_set_ibr(i, 0UL); | ||
3810 | ia64_dv_serialize_instruction(); | ||
3811 | } | ||
3812 | ia64_srlz_i(); | ||
3813 | for (i=0; i < pmu_conf->num_dbrs; i++) { | ||
3814 | ia64_set_dbr(i, 0UL); | ||
3815 | ia64_dv_serialize_data(); | ||
3816 | } | ||
3817 | ia64_srlz_d(); | ||
3818 | } | ||
3819 | |||
3820 | /* | ||
3821 | * Now install the values into the registers | ||
3822 | */ | ||
3823 | for (i = 0; i < count; i++, req++) { | ||
3824 | |||
3825 | rnum = req->dbreg_num; | ||
3826 | dbreg.val = req->dbreg_value; | ||
3827 | |||
3828 | ret = -EINVAL; | ||
3829 | |||
3830 | if ((mode == PFM_CODE_RR && rnum >= PFM_NUM_IBRS) || ((mode == PFM_DATA_RR) && rnum >= PFM_NUM_DBRS)) { | ||
3831 | DPRINT(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n", | ||
3832 | rnum, dbreg.val, mode, i, count)); | ||
3833 | |||
3834 | goto abort_mission; | ||
3835 | } | ||
3836 | |||
3837 | /* | ||
3838 | * make sure we do not install an enabled breakpoint | ||
3839 | */ | ||
3840 | if (rnum & 0x1) { | ||
3841 | if (mode == PFM_CODE_RR) | ||
3842 | dbreg.ibr.ibr_x = 0; | ||
3843 | else | ||
3844 | dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0; | ||
3845 | } | ||
3846 | |||
3847 | PFM_REG_RETFLAG_SET(req->dbreg_flags, 0); | ||
3848 | |||
3849 | /* | ||
3850 | * Debug registers, just like PMC, can only be modified | ||
3851 | * by a kernel call. Moreover, perfmon() accesses to those | ||
3852 | * registers are centralized in this routine. The hardware | ||
3853 | * does not modify the value of these registers, therefore, | ||
3854 | * if we save them as they are written, we can avoid having | ||
3855 | * to save them on context switch out. This is made possible | ||
3856 | * by the fact that when perfmon uses debug registers, ptrace() | ||
3857 | * won't be able to modify them concurrently. | ||
3858 | */ | ||
3859 | if (mode == PFM_CODE_RR) { | ||
3860 | CTX_USED_IBR(ctx, rnum); | ||
3861 | |||
3862 | if (can_access_pmu) { | ||
3863 | ia64_set_ibr(rnum, dbreg.val); | ||
3864 | ia64_dv_serialize_instruction(); | ||
3865 | } | ||
3866 | |||
3867 | ctx->ctx_ibrs[rnum] = dbreg.val; | ||
3868 | |||
3869 | DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n", | ||
3870 | rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu)); | ||
3871 | } else { | ||
3872 | CTX_USED_DBR(ctx, rnum); | ||
3873 | |||
3874 | if (can_access_pmu) { | ||
3875 | ia64_set_dbr(rnum, dbreg.val); | ||
3876 | ia64_dv_serialize_data(); | ||
3877 | } | ||
3878 | ctx->ctx_dbrs[rnum] = dbreg.val; | ||
3879 | |||
3880 | DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n", | ||
3881 | rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu)); | ||
3882 | } | ||
3883 | } | ||
3884 | |||
3885 | return 0; | ||
3886 | |||
3887 | abort_mission: | ||
3888 | /* | ||
3889 | * in case it was our first attempt, we undo the global modifications | ||
3890 | */ | ||
3891 | if (first_time) { | ||
3892 | LOCK_PFS(flags); | ||
3893 | if (ctx->ctx_fl_system) { | ||
3894 | pfm_sessions.pfs_sys_use_dbregs--; | ||
3895 | } | ||
3896 | UNLOCK_PFS(flags); | ||
3897 | ctx->ctx_fl_using_dbreg = 0; | ||
3898 | } | ||
3899 | /* | ||
3900 | * install error return flag | ||
3901 | */ | ||
3902 | PFM_REG_RETFLAG_SET(req->dbreg_flags, PFM_REG_RETFL_EINVAL); | ||
3903 | |||
3904 | return ret; | ||
3905 | } | ||
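/*
 * A minimal user-space sketch of the enable-bit masking done for
 * odd-numbered registers above.  On ia64 the odd register of each
 * dbr/ibr pair carries the mask, privilege and r/w/x enable bits,
 * hence the "rnum & 0x1" test.  The field layout below is invented
 * for the demo; only the clearing of the enables mirrors the kernel.
 */
#include <stdio.h>

typedef union {
	unsigned long val;
	struct {
		unsigned long mask:56;	/* illustrative split */
		unsigned long plm:4;
		unsigned long ig:2;
		unsigned long w:1;	/* write enable  (dbr_w) */
		unsigned long r:1;	/* read enable   (dbr_r) */
	} dbr;
} dbreg_demo_t;

int main(void)
{
	dbreg_demo_t d;

	d.val = ~0UL;			/* user supplied an armed breakpoint */
	d.dbr.r = d.dbr.w = 0;		/* install it disarmed, as above */
	printf("sanitized dbr value: 0x%lx\n", d.val);
	return 0;
}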
3906 | |||
3907 | static int | ||
3908 | pfm_write_ibrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
3909 | { | ||
3910 | return pfm_write_ibr_dbr(PFM_CODE_RR, ctx, arg, count, regs); | ||
3911 | } | ||
3912 | |||
3913 | static int | ||
3914 | pfm_write_dbrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
3915 | { | ||
3916 | return pfm_write_ibr_dbr(PFM_DATA_RR, ctx, arg, count, regs); | ||
3917 | } | ||
3918 | |||
3919 | int | ||
3920 | pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) | ||
3921 | { | ||
3922 | pfm_context_t *ctx; | ||
3923 | |||
3924 | if (req == NULL) return -EINVAL; | ||
3925 | |||
3926 | ctx = GET_PMU_CTX(); | ||
3927 | |||
3928 | if (ctx == NULL) return -EINVAL; | ||
3929 | |||
3930 | /* | ||
3931 | * for now limit to current task, which is enough when calling | ||
3932 | * from the overflow handler | ||
3933 | */ | ||
3934 | if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; | ||
3935 | |||
3936 | return pfm_write_ibrs(ctx, req, nreq, regs); | ||
3937 | } | ||
3938 | EXPORT_SYMBOL(pfm_mod_write_ibrs); | ||
3939 | |||
3940 | int | ||
3941 | pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) | ||
3942 | { | ||
3943 | pfm_context_t *ctx; | ||
3944 | |||
3945 | if (req == NULL) return -EINVAL; | ||
3946 | |||
3947 | ctx = GET_PMU_CTX(); | ||
3948 | |||
3949 | if (ctx == NULL) return -EINVAL; | ||
3950 | |||
3951 | /* | ||
3952 | * for now limit to current task, which is enough when calling | ||
3953 | * from the overflow handler | ||
3954 | */ | ||
3955 | if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; | ||
3956 | |||
3957 | return pfm_write_dbrs(ctx, req, nreq, regs); | ||
3958 | } | ||
3959 | EXPORT_SYMBOL(pfm_mod_write_dbrs); | ||
3960 | |||
3961 | |||
3962 | static int | ||
3963 | pfm_get_features(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
3964 | { | ||
3965 | pfarg_features_t *req = (pfarg_features_t *)arg; | ||
3966 | |||
3967 | req->ft_version = PFM_VERSION; | ||
3968 | return 0; | ||
3969 | } | ||
3970 | |||
3971 | static int | ||
3972 | pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
3973 | { | ||
3974 | struct pt_regs *tregs; | ||
3975 | struct task_struct *task = PFM_CTX_TASK(ctx); | ||
3976 | int state, is_system; | ||
3977 | |||
3978 | state = ctx->ctx_state; | ||
3979 | is_system = ctx->ctx_fl_system; | ||
3980 | |||
3981 | /* | ||
3982 | * context must be attached to issue the stop command (includes LOADED, MASKED, ZOMBIE) | ||
3983 | */ | ||
3984 | if (state == PFM_CTX_UNLOADED) return -EINVAL; | ||
3985 | |||
3986 | /* | ||
3987 | * In system wide and when the context is loaded, access can only happen | ||
3988 | * when the caller is running on the CPU being monitored by the session. | ||
3989 | * It does not have to be the owner (ctx_task) of the context per se. | ||
3990 | */ | ||
3991 | if (is_system && ctx->ctx_cpu != smp_processor_id()) { | ||
3992 | DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); | ||
3993 | return -EBUSY; | ||
3994 | } | ||
3995 | DPRINT(("task [%d] ctx_state=%d is_system=%d\n", | ||
3996 | PFM_CTX_TASK(ctx)->pid, | ||
3997 | state, | ||
3998 | is_system)); | ||
3999 | /* | ||
4000 | * in system mode, we need to update the PMU directly | ||
4001 | * and the user level state of the caller, which may not | ||
4002 | * necessarily be the creator of the context. | ||
4003 | */ | ||
4004 | if (is_system) { | ||
4005 | /* | ||
4006 | * Update local PMU first | ||
4007 | * | ||
4008 | * disable dcr pp | ||
4009 | */ | ||
4010 | ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP); | ||
4011 | ia64_srlz_i(); | ||
4012 | |||
4013 | /* | ||
4014 | * update local cpuinfo | ||
4015 | */ | ||
4016 | PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP); | ||
4017 | |||
4018 | /* | ||
4019 | * stop monitoring, does srlz.i | ||
4020 | */ | ||
4021 | pfm_clear_psr_pp(); | ||
4022 | |||
4023 | /* | ||
4024 | * stop monitoring in the caller | ||
4025 | */ | ||
4026 | ia64_psr(regs)->pp = 0; | ||
4027 | |||
4028 | return 0; | ||
4029 | } | ||
4030 | /* | ||
4031 | * per-task mode | ||
4032 | */ | ||
4033 | |||
4034 | if (task == current) { | ||
4035 | /* stop monitoring at kernel level */ | ||
4036 | pfm_clear_psr_up(); | ||
4037 | |||
4038 | /* | ||
4039 | * stop monitoring at the user level | ||
4040 | */ | ||
4041 | ia64_psr(regs)->up = 0; | ||
4042 | } else { | ||
4043 | tregs = ia64_task_regs(task); | ||
4044 | |||
4045 | /* | ||
4046 | * stop monitoring at the user level | ||
4047 | */ | ||
4048 | ia64_psr(tregs)->up = 0; | ||
4049 | |||
4050 | /* | ||
4051 | * monitoring disabled in kernel at next reschedule | ||
4052 | */ | ||
4053 | ctx->ctx_saved_psr_up = 0; | ||
4054 | DPRINT(("task=[%d]\n", task->pid)); | ||
4055 | } | ||
4056 | return 0; | ||
4057 | } | ||
4058 | |||
4059 | |||
4060 | static int | ||
4061 | pfm_start(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
4062 | { | ||
4063 | struct pt_regs *tregs; | ||
4064 | int state, is_system; | ||
4065 | |||
4066 | state = ctx->ctx_state; | ||
4067 | is_system = ctx->ctx_fl_system; | ||
4068 | |||
4069 | if (state != PFM_CTX_LOADED) return -EINVAL; | ||
4070 | |||
4071 | /* | ||
4072 | * In system wide and when the context is loaded, access can only happen | ||
4073 | * when the caller is running on the CPU being monitored by the session. | ||
4074 | * It does not have to be the owner (ctx_task) of the context per se. | ||
4075 | */ | ||
4076 | if (is_system && ctx->ctx_cpu != smp_processor_id()) { | ||
4077 | DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); | ||
4078 | return -EBUSY; | ||
4079 | } | ||
4080 | |||
4081 | /* | ||
4082 | * in system mode, we need to update the PMU directly | ||
4083 | * and the user level state of the caller, which may not | ||
4084 | * necessarily be the creator of the context. | ||
4085 | */ | ||
4086 | if (is_system) { | ||
4087 | |||
4088 | /* | ||
4089 | * set user level psr.pp for the caller | ||
4090 | */ | ||
4091 | ia64_psr(regs)->pp = 1; | ||
4092 | |||
4093 | /* | ||
4094 | * now update the local PMU and cpuinfo | ||
4095 | */ | ||
4096 | PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP); | ||
4097 | |||
4098 | /* | ||
4099 | * start monitoring at kernel level | ||
4100 | */ | ||
4101 | pfm_set_psr_pp(); | ||
4102 | |||
4103 | /* enable dcr pp */ | ||
4104 | ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP); | ||
4105 | ia64_srlz_i(); | ||
4106 | |||
4107 | return 0; | ||
4108 | } | ||
4109 | |||
4110 | /* | ||
4111 | * per-process mode | ||
4112 | */ | ||
4113 | |||
4114 | if (ctx->ctx_task == current) { | ||
4115 | |||
4116 | /* start monitoring at kernel level */ | ||
4117 | pfm_set_psr_up(); | ||
4118 | |||
4119 | /* | ||
4120 | * activate monitoring at user level | ||
4121 | */ | ||
4122 | ia64_psr(regs)->up = 1; | ||
4123 | |||
4124 | } else { | ||
4125 | tregs = ia64_task_regs(ctx->ctx_task); | ||
4126 | |||
4127 | /* | ||
4128 | * start monitoring at the kernel level the next | ||
4129 | * time the task is scheduled | ||
4130 | */ | ||
4131 | ctx->ctx_saved_psr_up = IA64_PSR_UP; | ||
4132 | |||
4133 | /* | ||
4134 | * activate monitoring at user level | ||
4135 | */ | ||
4136 | ia64_psr(tregs)->up = 1; | ||
4137 | } | ||
4138 | return 0; | ||
4139 | } | ||
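/*
 * A toy model of the pp/up split that pfm_stop()/pfm_start() above
 * operate on: psr.pp gates system-wide monitoring (paired with dcr.pp
 * for kernel execution), psr.up gates per-task user monitoring.
 * Everything except the two bit names is invented for the demo.
 */
#include <stdio.h>

struct toy_psr { unsigned pp:1, up:1; };

static void toy_start(struct toy_psr *psr, int is_system)
{
	if (is_system)
		psr->pp = 1;	/* all tasks on this CPU are monitored */
	else
		psr->up = 1;	/* only the context's task is monitored */
}

static void toy_stop(struct toy_psr *psr, int is_system)
{
	if (is_system)
		psr->pp = 0;
	else
		psr->up = 0;
}

int main(void)
{
	struct toy_psr psr = { 0, 0 };

	toy_start(&psr, 1);
	printf("system-wide start: pp=%u up=%u\n", psr.pp, psr.up);
	toy_stop(&psr, 1);
	toy_start(&psr, 0);
	printf("per-task start:    pp=%u up=%u\n", psr.pp, psr.up);
	return 0;
}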
4140 | |||
4141 | static int | ||
4142 | pfm_get_pmc_reset(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
4143 | { | ||
4144 | pfarg_reg_t *req = (pfarg_reg_t *)arg; | ||
4145 | unsigned int cnum; | ||
4146 | int i; | ||
4147 | int ret = -EINVAL; | ||
4148 | |||
4149 | for (i = 0; i < count; i++, req++) { | ||
4150 | |||
4151 | cnum = req->reg_num; | ||
4152 | |||
4153 | if (!PMC_IS_IMPL(cnum)) goto abort_mission; | ||
4154 | |||
4155 | req->reg_value = PMC_DFL_VAL(cnum); | ||
4156 | |||
4157 | PFM_REG_RETFLAG_SET(req->reg_flags, 0); | ||
4158 | |||
4159 | DPRINT(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, req->reg_value)); | ||
4160 | } | ||
4161 | return 0; | ||
4162 | |||
4163 | abort_mission: | ||
4164 | PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); | ||
4165 | return ret; | ||
4166 | } | ||
4167 | |||
4168 | static int | ||
4169 | pfm_check_task_exist(pfm_context_t *ctx) | ||
4170 | { | ||
4171 | struct task_struct *g, *t; | ||
4172 | int ret = -ESRCH; | ||
4173 | |||
4174 | read_lock(&tasklist_lock); | ||
4175 | |||
4176 | do_each_thread (g, t) { | ||
4177 | if (t->thread.pfm_context == ctx) { | ||
4178 | ret = 0; | ||
4179 | break; | ||
4180 | } | ||
4181 | } while_each_thread (g, t); | ||
4182 | |||
4183 | read_unlock(&tasklist_lock); | ||
4184 | |||
4185 | DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx)); | ||
4186 | |||
4187 | return ret; | ||
4188 | } | ||
4189 | |||
4190 | static int | ||
4191 | pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
4192 | { | ||
4193 | struct task_struct *task; | ||
4194 | struct thread_struct *thread; | ||
4195 | struct pfm_context_t *old; | ||
4196 | unsigned long flags; | ||
4197 | #ifndef CONFIG_SMP | ||
4198 | struct task_struct *owner_task = NULL; | ||
4199 | #endif | ||
4200 | pfarg_load_t *req = (pfarg_load_t *)arg; | ||
4201 | unsigned long *pmcs_source, *pmds_source; | ||
4202 | int the_cpu; | ||
4203 | int ret = 0; | ||
4204 | int state, is_system, set_dbregs = 0; | ||
4205 | |||
4206 | state = ctx->ctx_state; | ||
4207 | is_system = ctx->ctx_fl_system; | ||
4208 | /* | ||
4209 | * can only load from unloaded or terminated state | ||
4210 | */ | ||
4211 | if (state != PFM_CTX_UNLOADED) { | ||
4212 | DPRINT(("cannot load to [%d], invalid ctx_state=%d\n", | ||
4213 | req->load_pid, | ||
4214 | ctx->ctx_state)); | ||
4215 | return -EINVAL; | ||
4216 | } | ||
4217 | |||
4218 | DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg)); | ||
4219 | |||
4220 | if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) { | ||
4221 | DPRINT(("cannot use blocking mode on self\n")); | ||
4222 | return -EINVAL; | ||
4223 | } | ||
4224 | |||
4225 | ret = pfm_get_task(ctx, req->load_pid, &task); | ||
4226 | if (ret) { | ||
4227 | DPRINT(("load_pid [%d] get_task=%d\n", req->load_pid, ret)); | ||
4228 | return ret; | ||
4229 | } | ||
4230 | |||
4231 | ret = -EINVAL; | ||
4232 | |||
4233 | /* | ||
4234 | * system wide is self monitoring only | ||
4235 | */ | ||
4236 | if (is_system && task != current) { | ||
4237 | DPRINT(("system wide is self monitoring only load_pid=%d\n", | ||
4238 | req->load_pid)); | ||
4239 | goto error; | ||
4240 | } | ||
4241 | |||
4242 | thread = &task->thread; | ||
4243 | |||
4244 | ret = 0; | ||
4245 | /* | ||
4246 | * cannot load a context which is using range restrictions, | ||
4247 | * into a task that is being debugged. | ||
4248 | */ | ||
4249 | if (ctx->ctx_fl_using_dbreg) { | ||
4250 | if (thread->flags & IA64_THREAD_DBG_VALID) { | ||
4251 | ret = -EBUSY; | ||
4252 | DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid)); | ||
4253 | goto error; | ||
4254 | } | ||
4255 | LOCK_PFS(flags); | ||
4256 | |||
4257 | if (is_system) { | ||
4258 | if (pfm_sessions.pfs_ptrace_use_dbregs) { | ||
4259 | DPRINT(("cannot load [%d] dbregs in use\n", task->pid)); | ||
4260 | ret = -EBUSY; | ||
4261 | } else { | ||
4262 | pfm_sessions.pfs_sys_use_dbregs++; | ||
4263 | DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs)); | ||
4264 | set_dbregs = 1; | ||
4265 | } | ||
4266 | } | ||
4267 | |||
4268 | UNLOCK_PFS(flags); | ||
4269 | |||
4270 | if (ret) goto error; | ||
4271 | } | ||
4272 | |||
4273 | /* | ||
4274 | * SMP system-wide monitoring implies self-monitoring. | ||
4275 | * | ||
4276 | * The programming model expects the task to | ||
4277 | * be pinned on a CPU throughout the session. | ||
4278 | * Here we take note of the current CPU at the | ||
4279 | * time the context is loaded. No call from | ||
4280 | * another CPU will be allowed. | ||
4281 | * | ||
4282 | * The pinning via sched_setaffinity() | ||
4283 | * must be done by the calling task prior | ||
4284 | * to this call. | ||
4285 | * | ||
4286 | * systemwide: keep track of CPU this session is supposed to run on | ||
4287 | */ | ||
4288 | the_cpu = ctx->ctx_cpu = smp_processor_id(); | ||
4289 | |||
4290 | ret = -EBUSY; | ||
4291 | /* | ||
4292 | * now reserve the session | ||
4293 | */ | ||
4294 | ret = pfm_reserve_session(current, is_system, the_cpu); | ||
4295 | if (ret) goto error; | ||
4296 | |||
4297 | /* | ||
4298 | * task is necessarily stopped at this point. | ||
4299 | * | ||
4300 | * If the previous context was zombie, then it got removed in | ||
4301 | * pfm_save_regs(). Therefore we should not see it here. | ||
4302 | * If we see a context, then this is an active context | ||
4303 | * | ||
4304 | * XXX: needs to be atomic | ||
4305 | */ | ||
4306 | DPRINT(("before cmpxchg() old_ctx=%p new_ctx=%p\n", | ||
4307 | thread->pfm_context, ctx)); | ||
4308 | |||
4309 | old = ia64_cmpxchg(acq, &thread->pfm_context, NULL, ctx, sizeof(pfm_context_t *)); | ||
4310 | if (old != NULL) { | ||
4311 | DPRINT(("load_pid [%d] already has a context\n", req->load_pid)); | ||
4312 | goto error_unres; | ||
4313 | } | ||
4314 | |||
4315 | pfm_reset_msgq(ctx); | ||
4316 | |||
4317 | ctx->ctx_state = PFM_CTX_LOADED; | ||
4318 | |||
4319 | /* | ||
4320 | * link context to task | ||
4321 | */ | ||
4322 | ctx->ctx_task = task; | ||
4323 | |||
4324 | if (is_system) { | ||
4325 | /* | ||
4326 | * we load as stopped | ||
4327 | */ | ||
4328 | PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE); | ||
4329 | PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP); | ||
4330 | |||
4331 | if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE); | ||
4332 | } else { | ||
4333 | thread->flags |= IA64_THREAD_PM_VALID; | ||
4334 | } | ||
4335 | |||
4336 | /* | ||
4337 | * propagate into thread-state | ||
4338 | */ | ||
4339 | pfm_copy_pmds(task, ctx); | ||
4340 | pfm_copy_pmcs(task, ctx); | ||
4341 | |||
4342 | pmcs_source = thread->pmcs; | ||
4343 | pmds_source = thread->pmds; | ||
4344 | |||
4345 | /* | ||
4346 | * always the case for system-wide | ||
4347 | */ | ||
4348 | if (task == current) { | ||
4349 | |||
4350 | if (is_system == 0) { | ||
4351 | |||
4352 | /* allow user level control */ | ||
4353 | ia64_psr(regs)->sp = 0; | ||
4354 | DPRINT(("clearing psr.sp for [%d]\n", task->pid)); | ||
4355 | |||
4356 | SET_LAST_CPU(ctx, smp_processor_id()); | ||
4357 | INC_ACTIVATION(); | ||
4358 | SET_ACTIVATION(ctx); | ||
4359 | #ifndef CONFIG_SMP | ||
4360 | /* | ||
4361 | * push the other task out, if any | ||
4362 | */ | ||
4363 | owner_task = GET_PMU_OWNER(); | ||
4364 | if (owner_task) pfm_lazy_save_regs(owner_task); | ||
4365 | #endif | ||
4366 | } | ||
4367 | /* | ||
4368 | * load all PMD from ctx to PMU (as opposed to thread state) | ||
4369 | * restore all PMC from ctx to PMU | ||
4370 | */ | ||
4371 | pfm_restore_pmds(pmds_source, ctx->ctx_all_pmds[0]); | ||
4372 | pfm_restore_pmcs(pmcs_source, ctx->ctx_all_pmcs[0]); | ||
4373 | |||
4374 | ctx->ctx_reload_pmcs[0] = 0UL; | ||
4375 | ctx->ctx_reload_pmds[0] = 0UL; | ||
4376 | |||
4377 | /* | ||
4378 | * guaranteed safe by earlier check against DBG_VALID | ||
4379 | */ | ||
4380 | if (ctx->ctx_fl_using_dbreg) { | ||
4381 | pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); | ||
4382 | pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); | ||
4383 | } | ||
4384 | /* | ||
4385 | * set new ownership | ||
4386 | */ | ||
4387 | SET_PMU_OWNER(task, ctx); | ||
4388 | |||
4389 | DPRINT(("context loaded on PMU for [%d]\n", task->pid)); | ||
4390 | } else { | ||
4391 | /* | ||
4392 | * when not current, task MUST be stopped, so this is safe | ||
4393 | */ | ||
4394 | regs = ia64_task_regs(task); | ||
4395 | |||
4396 | /* force a full reload */ | ||
4397 | ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; | ||
4398 | SET_LAST_CPU(ctx, -1); | ||
4399 | |||
4400 | /* initial saved psr (stopped) */ | ||
4401 | ctx->ctx_saved_psr_up = 0UL; | ||
4402 | ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; | ||
4403 | } | ||
4404 | |||
4405 | ret = 0; | ||
4406 | |||
4407 | error_unres: | ||
4408 | if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu); | ||
4409 | error: | ||
4410 | /* | ||
4411 | * we must undo the dbregs setting (for system-wide) | ||
4412 | */ | ||
4413 | if (ret && set_dbregs) { | ||
4414 | LOCK_PFS(flags); | ||
4415 | pfm_sessions.pfs_sys_use_dbregs--; | ||
4416 | UNLOCK_PFS(flags); | ||
4417 | } | ||
4418 | /* | ||
4419 | * release task, there is now a link with the context | ||
4420 | */ | ||
4421 | if (is_system == 0 && task != current) { | ||
4422 | pfm_put_task(task); | ||
4423 | |||
4424 | if (ret == 0) { | ||
4425 | ret = pfm_check_task_exist(ctx); | ||
4426 | if (ret) { | ||
4427 | ctx->ctx_state = PFM_CTX_UNLOADED; | ||
4428 | ctx->ctx_task = NULL; | ||
4429 | } | ||
4430 | } | ||
4431 | } | ||
4432 | return ret; | ||
4433 | } | ||
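/*
 * The NULL -> ctx installation in pfm_context_load() above relies on
 * an atomic compare-and-exchange so that two racing loaders cannot
 * both attach to the same thread.  A user-space sketch of the same
 * idea, with the GCC __sync builtin standing in for
 * ia64_cmpxchg(acq, ...):
 */
#include <stdio.h>
#include <stddef.h>

struct demo_ctx { int id; };

static struct demo_ctx *slot;		/* plays thread->pfm_context */

static int try_load(struct demo_ctx *ctx)
{
	struct demo_ctx *old;

	old = __sync_val_compare_and_swap(&slot, NULL, ctx);
	return old != NULL ? -1 : 0;	/* -1: task already has a context */
}

int main(void)
{
	struct demo_ctx a = { 1 }, b = { 2 };

	printf("load a: %d\n", try_load(&a));	/* 0: installed      */
	printf("load b: %d\n", try_load(&b));	/* -1: lost the race */
	return 0;
}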
4434 | |||
4435 | /* | ||
4436 | * in this function, we do not need to increase the use count | ||
4437 | * for the task via get_task_struct(), because we hold the | ||
4438 | * context lock. If the task were to disappear while having | ||
4439 | * a context attached, it would go through pfm_exit_thread() | ||
4440 | * which also grabs the context lock and would therefore be blocked | ||
4441 | * until we are here. | ||
4442 | */ | ||
4443 | static void pfm_flush_pmds(struct task_struct *, pfm_context_t *ctx); | ||
4444 | |||
4445 | static int | ||
4446 | pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) | ||
4447 | { | ||
4448 | struct task_struct *task = PFM_CTX_TASK(ctx); | ||
4449 | struct pt_regs *tregs; | ||
4450 | int prev_state, is_system; | ||
4451 | int ret; | ||
4452 | |||
4453 | DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1)); | ||
4454 | |||
4455 | prev_state = ctx->ctx_state; | ||
4456 | is_system = ctx->ctx_fl_system; | ||
4457 | |||
4458 | /* | ||
4459 | * unload only when necessary | ||
4460 | */ | ||
4461 | if (prev_state == PFM_CTX_UNLOADED) { | ||
4462 | DPRINT(("ctx_state=%d, nothing to do\n", prev_state)); | ||
4463 | return 0; | ||
4464 | } | ||
4465 | |||
4466 | /* | ||
4467 | * clear psr and dcr bits | ||
4468 | */ | ||
4469 | ret = pfm_stop(ctx, NULL, 0, regs); | ||
4470 | if (ret) return ret; | ||
4471 | |||
4472 | ctx->ctx_state = PFM_CTX_UNLOADED; | ||
4473 | |||
4474 | /* | ||
4475 | * in system mode, we need to update the PMU directly | ||
4476 | * and the user level state of the caller, which may not | ||
4477 | * necessarily be the creator of the context. | ||
4478 | */ | ||
4479 | if (is_system) { | ||
4480 | |||
4481 | /* | ||
4482 | * Update cpuinfo | ||
4483 | * | ||
4484 | * local PMU is taken care of in pfm_stop() | ||
4485 | */ | ||
4486 | PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE); | ||
4487 | PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE); | ||
4488 | |||
4489 | /* | ||
4490 | * save PMDs in context | ||
4491 | * release ownership | ||
4492 | */ | ||
4493 | pfm_flush_pmds(current, ctx); | ||
4494 | |||
4495 | /* | ||
4496 | * at this point we are done with the PMU | ||
4497 | * so we can unreserve the resource. | ||
4498 | */ | ||
4499 | if (prev_state != PFM_CTX_ZOMBIE) | ||
4500 | pfm_unreserve_session(ctx, 1 , ctx->ctx_cpu); | ||
4501 | |||
4502 | /* | ||
4503 | * disconnect context from task | ||
4504 | */ | ||
4505 | task->thread.pfm_context = NULL; | ||
4506 | /* | ||
4507 | * disconnect task from context | ||
4508 | */ | ||
4509 | ctx->ctx_task = NULL; | ||
4510 | |||
4511 | /* | ||
4512 | * There is nothing more to cleanup here. | ||
4513 | */ | ||
4514 | return 0; | ||
4515 | } | ||
4516 | |||
4517 | /* | ||
4518 | * per-task mode | ||
4519 | */ | ||
4520 | tregs = task == current ? regs : ia64_task_regs(task); | ||
4521 | |||
4522 | if (task == current) { | ||
4523 | /* | ||
4524 | * cancel user level control | ||
4525 | */ | ||
4526 | ia64_psr(regs)->sp = 1; | ||
4527 | |||
4528 | DPRINT(("setting psr.sp for [%d]\n", task->pid)); | ||
4529 | } | ||
4530 | /* | ||
4531 | * save PMDs to context | ||
4532 | * release ownership | ||
4533 | */ | ||
4534 | pfm_flush_pmds(task, ctx); | ||
4535 | |||
4536 | /* | ||
4537 | * at this point we are done with the PMU | ||
4538 | * so we can unreserve the resource. | ||
4539 | * | ||
4540 | * when state was ZOMBIE, we have already unreserved. | ||
4541 | */ | ||
4542 | if (prev_state != PFM_CTX_ZOMBIE) | ||
4543 | pfm_unreserve_session(ctx, 0 , ctx->ctx_cpu); | ||
4544 | |||
4545 | /* | ||
4546 | * reset activation counter and psr | ||
4547 | */ | ||
4548 | ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; | ||
4549 | SET_LAST_CPU(ctx, -1); | ||
4550 | |||
4551 | /* | ||
4552 | * PMU state will not be restored | ||
4553 | */ | ||
4554 | task->thread.flags &= ~IA64_THREAD_PM_VALID; | ||
4555 | |||
4556 | /* | ||
4557 | * break links between context and task | ||
4558 | */ | ||
4559 | task->thread.pfm_context = NULL; | ||
4560 | ctx->ctx_task = NULL; | ||
4561 | |||
4562 | PFM_SET_WORK_PENDING(task, 0); | ||
4563 | |||
4564 | ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; | ||
4565 | ctx->ctx_fl_can_restart = 0; | ||
4566 | ctx->ctx_fl_going_zombie = 0; | ||
4567 | |||
4568 | DPRINT(("disconnected [%d] from context\n", task->pid)); | ||
4569 | |||
4570 | return 0; | ||
4571 | } | ||
4572 | |||
4573 | |||
4574 | /* | ||
4575 | * called only from exit_thread(): task == current | ||
4576 | * we come here only if current has a context attached (loaded or masked) | ||
4577 | */ | ||
4578 | void | ||
4579 | pfm_exit_thread(struct task_struct *task) | ||
4580 | { | ||
4581 | pfm_context_t *ctx; | ||
4582 | unsigned long flags; | ||
4583 | struct pt_regs *regs = ia64_task_regs(task); | ||
4584 | int ret, state; | ||
4585 | int free_ok = 0; | ||
4586 | |||
4587 | ctx = PFM_GET_CTX(task); | ||
4588 | |||
4589 | PROTECT_CTX(ctx, flags); | ||
4590 | |||
4591 | DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid)); | ||
4592 | |||
4593 | state = ctx->ctx_state; | ||
4594 | switch(state) { | ||
4595 | case PFM_CTX_UNLOADED: | ||
4596 | /* | ||
4597 | * only comes to this function if pfm_context is not NULL, i.e., cannot | ||
4598 | * be in unloaded state | ||
4599 | */ | ||
4600 | printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid); | ||
4601 | break; | ||
4602 | case PFM_CTX_LOADED: | ||
4603 | case PFM_CTX_MASKED: | ||
4604 | ret = pfm_context_unload(ctx, NULL, 0, regs); | ||
4605 | if (ret) { | ||
4606 | printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); | ||
4607 | } | ||
4608 | DPRINT(("ctx unloaded for current state was %d\n", state)); | ||
4609 | |||
4610 | pfm_end_notify_user(ctx); | ||
4611 | break; | ||
4612 | case PFM_CTX_ZOMBIE: | ||
4613 | ret = pfm_context_unload(ctx, NULL, 0, regs); | ||
4614 | if (ret) { | ||
4615 | printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); | ||
4616 | } | ||
4617 | free_ok = 1; | ||
4618 | break; | ||
4619 | default: | ||
4620 | printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state); | ||
4621 | break; | ||
4622 | } | ||
4623 | UNPROTECT_CTX(ctx, flags); | ||
4624 | |||
4625 | { u64 psr = pfm_get_psr(); | ||
4626 | BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); | ||
4627 | BUG_ON(GET_PMU_OWNER()); | ||
4628 | BUG_ON(ia64_psr(regs)->up); | ||
4629 | BUG_ON(ia64_psr(regs)->pp); | ||
4630 | } | ||
4631 | |||
4632 | /* | ||
4633 | * All memory free operations (especially for vmalloc'ed memory) | ||
4634 | * MUST be done with interrupts ENABLED. | ||
4635 | */ | ||
4636 | if (free_ok) pfm_context_free(ctx); | ||
4637 | } | ||
4638 | |||
4639 | /* | ||
4640 | * functions MUST be listed in the increasing order of their index (see perfmon.h) | ||
4641 | */ | ||
4642 | #define PFM_CMD(name, flags, arg_count, arg_type, getsz) { name, #name, flags, arg_count, sizeof(arg_type), getsz } | ||
4643 | #define PFM_CMD_S(name, flags) { name, #name, flags, 0, 0, NULL } | ||
4644 | #define PFM_CMD_PCLRWS (PFM_CMD_FD|PFM_CMD_ARG_RW|PFM_CMD_STOP) | ||
4645 | #define PFM_CMD_PCLRW (PFM_CMD_FD|PFM_CMD_ARG_RW) | ||
4646 | #define PFM_CMD_NONE { NULL, "no-cmd", 0, 0, 0, NULL} | ||
4647 | |||
4648 | static pfm_cmd_desc_t pfm_cmd_tab[]={ | ||
4649 | /* 0 */PFM_CMD_NONE, | ||
4650 | /* 1 */PFM_CMD(pfm_write_pmcs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), | ||
4651 | /* 2 */PFM_CMD(pfm_write_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), | ||
4652 | /* 3 */PFM_CMD(pfm_read_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), | ||
4653 | /* 4 */PFM_CMD_S(pfm_stop, PFM_CMD_PCLRWS), | ||
4654 | /* 5 */PFM_CMD_S(pfm_start, PFM_CMD_PCLRWS), | ||
4655 | /* 6 */PFM_CMD_NONE, | ||
4656 | /* 7 */PFM_CMD_NONE, | ||
4657 | /* 8 */PFM_CMD(pfm_context_create, PFM_CMD_ARG_RW, 1, pfarg_context_t, pfm_ctx_getsize), | ||
4658 | /* 9 */PFM_CMD_NONE, | ||
4659 | /* 10 */PFM_CMD_S(pfm_restart, PFM_CMD_PCLRW), | ||
4660 | /* 11 */PFM_CMD_NONE, | ||
4661 | /* 12 */PFM_CMD(pfm_get_features, PFM_CMD_ARG_RW, 1, pfarg_features_t, NULL), | ||
4662 | /* 13 */PFM_CMD(pfm_debug, 0, 1, unsigned int, NULL), | ||
4663 | /* 14 */PFM_CMD_NONE, | ||
4664 | /* 15 */PFM_CMD(pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), | ||
4665 | /* 16 */PFM_CMD(pfm_context_load, PFM_CMD_PCLRWS, 1, pfarg_load_t, NULL), | ||
4666 | /* 17 */PFM_CMD_S(pfm_context_unload, PFM_CMD_PCLRWS), | ||
4667 | /* 18 */PFM_CMD_NONE, | ||
4668 | /* 19 */PFM_CMD_NONE, | ||
4669 | /* 20 */PFM_CMD_NONE, | ||
4670 | /* 21 */PFM_CMD_NONE, | ||
4671 | /* 22 */PFM_CMD_NONE, | ||
4672 | /* 23 */PFM_CMD_NONE, | ||
4673 | /* 24 */PFM_CMD_NONE, | ||
4674 | /* 25 */PFM_CMD_NONE, | ||
4675 | /* 26 */PFM_CMD_NONE, | ||
4676 | /* 27 */PFM_CMD_NONE, | ||
4677 | /* 28 */PFM_CMD_NONE, | ||
4678 | /* 29 */PFM_CMD_NONE, | ||
4679 | /* 30 */PFM_CMD_NONE, | ||
4680 | /* 31 */PFM_CMD_NONE, | ||
4681 | /* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL), | ||
4682 | /* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL) | ||
4683 | }; | ||
4684 | #define PFM_CMD_COUNT (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t)) | ||
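/*
 * A compact sketch of the dispatch pattern pfm_cmd_tab[] sets up: a
 * table indexed by command number, holes kept as NULL entries and
 * rejected by the same bounds/NULL checks sys_perfmonctl() performs
 * below.  The two commands here are made up for the demo.
 */
#include <stdio.h>
#include <stddef.h>

typedef int (*cmd_func_t)(void *arg, int count);

static int do_start(void *arg, int count) { (void)arg; (void)count; return 0; }
static int do_stop(void *arg, int count)  { (void)arg; (void)count; return 0; }

static struct { cmd_func_t func; int narg; } cmd_tab[] = {
	/* 0 */ { NULL,     0 },	/* no-cmd */
	/* 1 */ { do_start, 0 },
	/* 2 */ { do_stop,  0 },
};
#define CMD_COUNT (sizeof(cmd_tab)/sizeof(cmd_tab[0]))

static int dispatch(int cmd, void *arg, int count)
{
	if (cmd < 0 || cmd >= (int)CMD_COUNT)
		return -1;			/* invalid cmd */
	if (cmd_tab[cmd].func == NULL)
		return -1;			/* hole in the table */
	return cmd_tab[cmd].func(arg, count);
}

int main(void)
{
	printf("%d %d %d\n", dispatch(1, NULL, 0), dispatch(0, NULL, 0),
	       dispatch(9, NULL, 0));
	return 0;
}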
4685 | |||
4686 | static int | ||
4687 | pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags) | ||
4688 | { | ||
4689 | struct task_struct *task; | ||
4690 | int state, old_state; | ||
4691 | |||
4692 | recheck: | ||
4693 | state = ctx->ctx_state; | ||
4694 | task = ctx->ctx_task; | ||
4695 | |||
4696 | if (task == NULL) { | ||
4697 | DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state)); | ||
4698 | return 0; | ||
4699 | } | ||
4700 | |||
4701 | DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n", | ||
4702 | ctx->ctx_fd, | ||
4703 | state, | ||
4704 | task->pid, | ||
4705 | task->state, PFM_CMD_STOPPED(cmd))); | ||
4706 | |||
4707 | /* | ||
4708 | * self-monitoring always ok. | ||
4709 | * | ||
4710 | * for system-wide the caller can either be the creator of the | ||
4711 | * context (the one to which the context is attached) OR | ||
4712 | * a task running on the same CPU as the session. | ||
4713 | */ | ||
4714 | if (task == current || ctx->ctx_fl_system) return 0; | ||
4715 | |||
4716 | /* | ||
4717 | * if context is UNLOADED we are safe to go | ||
4718 | */ | ||
4719 | if (state == PFM_CTX_UNLOADED) return 0; | ||
4720 | |||
4721 | /* | ||
4722 | * no command can operate on a zombie context | ||
4723 | */ | ||
4724 | if (state == PFM_CTX_ZOMBIE) { | ||
4725 | DPRINT(("cmd %d state zombie cannot operate on context\n", cmd)); | ||
4726 | return -EINVAL; | ||
4727 | } | ||
4728 | |||
4729 | /* | ||
4730 | * context is LOADED or MASKED. Some commands may need to have | ||
4731 | * the task stopped. | ||
4732 | * | ||
4733 | * We could lift this restriction for UP but it would mean that | ||
4734 | * the user has no guarantee the task would not run between | ||
4735 | * two successive calls to perfmonctl(). That's probably OK. | ||
4736 | * If the user wants to ensure the task does not run, then | ||
4737 | * the task must be stopped. | ||
4738 | */ | ||
4739 | if (PFM_CMD_STOPPED(cmd)) { | ||
4740 | if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { | ||
4741 | DPRINT(("[%d] task not in stopped state\n", task->pid)); | ||
4742 | return -EBUSY; | ||
4743 | } | ||
4744 | /* | ||
4745 | * task is now stopped, wait for ctxsw out | ||
4746 | * | ||
4747 | * This is an interesting point in the code. | ||
4748 | * We need to unprotect the context because | ||
4749 | * the pfm_save_regs() routine needs to grab | ||
4750 | * the same lock. There is danger in doing | ||
4751 | * this because it leaves a window open for | ||
4752 | * another task to get access to the context | ||
4753 | * and possibly change its state. The one thing | ||
4754 | * that is not possible is for the context to disappear | ||
4755 | * because we are protected by the VFS layer, i.e., | ||
4756 | * get_fd()/put_fd(). | ||
4757 | */ | ||
4758 | old_state = state; | ||
4759 | |||
4760 | UNPROTECT_CTX(ctx, flags); | ||
4761 | |||
4762 | wait_task_inactive(task); | ||
4763 | |||
4764 | PROTECT_CTX(ctx, flags); | ||
4765 | |||
4766 | /* | ||
4767 | * we must recheck to verify if state has changed | ||
4768 | */ | ||
4769 | if (ctx->ctx_state != old_state) { | ||
4770 | DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state)); | ||
4771 | goto recheck; | ||
4772 | } | ||
4773 | } | ||
4774 | return 0; | ||
4775 | } | ||
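/*
 * The recheck loop above (drop the context lock, wait for the
 * monitored task to quiesce, retake the lock, verify the state did
 * not change in the window) is a general pattern.  A pthread sketch
 * under the same assumption -- state may change while unlocked;
 * wait_quiescent() stands in for wait_task_inactive():
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;
static int ctx_state;			/* plays ctx->ctx_state */

static void wait_quiescent(void) { }	/* would block without the lock */

static int check_state(void)
{
	int old_state;

	pthread_mutex_lock(&ctx_lock);
recheck:
	old_state = ctx_state;

	pthread_mutex_unlock(&ctx_lock);	/* open the race window */
	wait_quiescent();
	pthread_mutex_lock(&ctx_lock);

	if (ctx_state != old_state)
		goto recheck;			/* state moved: start over */

	pthread_mutex_unlock(&ctx_lock);
	return 0;
}

int main(void)
{
	printf("check_state: %d\n", check_state());
	return 0;
}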
4776 | |||
4777 | /* | ||
4778 | * system-call entry point (must return long) | ||
4779 | */ | ||
4780 | asmlinkage long | ||
4781 | sys_perfmonctl (int fd, int cmd, void __user *arg, int count) | ||
4782 | { | ||
4783 | struct file *file = NULL; | ||
4784 | pfm_context_t *ctx = NULL; | ||
4785 | unsigned long flags = 0UL; | ||
4786 | void *args_k = NULL; | ||
4787 | long ret; /* will expand int return types */ | ||
4788 | size_t base_sz, sz, xtra_sz = 0; | ||
4789 | int narg, completed_args = 0, call_made = 0, cmd_flags; | ||
4790 | int (*func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); | ||
4791 | int (*getsize)(void *arg, size_t *sz); | ||
4792 | #define PFM_MAX_ARGSIZE 4096 | ||
4793 | |||
4794 | /* | ||
4795 | * reject any call if perfmon was disabled at initialization | ||
4796 | */ | ||
4797 | if (unlikely(pmu_conf == NULL)) return -ENOSYS; | ||
4798 | |||
4799 | if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) { | ||
4800 | DPRINT(("invalid cmd=%d\n", cmd)); | ||
4801 | return -EINVAL; | ||
4802 | } | ||
4803 | |||
4804 | func = pfm_cmd_tab[cmd].cmd_func; | ||
4805 | narg = pfm_cmd_tab[cmd].cmd_narg; | ||
4806 | base_sz = pfm_cmd_tab[cmd].cmd_argsize; | ||
4807 | getsize = pfm_cmd_tab[cmd].cmd_getsize; | ||
4808 | cmd_flags = pfm_cmd_tab[cmd].cmd_flags; | ||
4809 | |||
4810 | if (unlikely(func == NULL)) { | ||
4811 | DPRINT(("invalid cmd=%d\n", cmd)); | ||
4812 | return -EINVAL; | ||
4813 | } | ||
4814 | |||
4815 | DPRINT(("cmd=%s idx=%d narg=0x%x argsz=%lu count=%d\n", | ||
4816 | PFM_CMD_NAME(cmd), | ||
4817 | cmd, | ||
4818 | narg, | ||
4819 | base_sz, | ||
4820 | count)); | ||
4821 | |||
4822 | /* | ||
4823 | * check if number of arguments matches what the command expects | ||
4824 | */ | ||
4825 | if (unlikely((narg == PFM_CMD_ARG_MANY && count <= 0) || (narg > 0 && narg != count))) | ||
4826 | return -EINVAL; | ||
4827 | |||
4828 | restart_args: | ||
4829 | sz = xtra_sz + base_sz*count; | ||
4830 | /* | ||
4831 | * limit abuse to min page size | ||
4832 | */ | ||
4833 | if (unlikely(sz > PFM_MAX_ARGSIZE)) { | ||
4834 | printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz); | ||
4835 | return -E2BIG; | ||
4836 | } | ||
4837 | |||
4838 | /* | ||
4839 | * allocate default-sized argument buffer | ||
4840 | */ | ||
4841 | if (likely(count && args_k == NULL)) { | ||
4842 | args_k = kmalloc(PFM_MAX_ARGSIZE, GFP_KERNEL); | ||
4843 | if (args_k == NULL) return -ENOMEM; | ||
4844 | } | ||
4845 | |||
4846 | ret = -EFAULT; | ||
4847 | |||
4848 | /* | ||
4849 | * copy arguments | ||
4850 | * | ||
4851 | * assume sz = 0 for command without parameters | ||
4852 | */ | ||
4853 | if (sz && copy_from_user(args_k, arg, sz)) { | ||
4854 | DPRINT(("cannot copy_from_user %lu bytes @%p\n", sz, arg)); | ||
4855 | goto error_args; | ||
4856 | } | ||
4857 | |||
4858 | /* | ||
4859 | * check if command supports extra parameters | ||
4860 | */ | ||
4861 | if (completed_args == 0 && getsize) { | ||
4862 | /* | ||
4863 | * get extra parameters size (based on main argument) | ||
4864 | */ | ||
4865 | ret = (*getsize)(args_k, &xtra_sz); | ||
4866 | if (ret) goto error_args; | ||
4867 | |||
4868 | completed_args = 1; | ||
4869 | |||
4870 | DPRINT(("restart_args sz=%lu xtra_sz=%lu\n", sz, xtra_sz)); | ||
4871 | |||
4872 | /* retry if necessary */ | ||
4873 | if (likely(xtra_sz)) goto restart_args; | ||
4874 | } | ||
4875 | |||
4876 | if (unlikely((cmd_flags & PFM_CMD_FD) == 0)) goto skip_fd; | ||
4877 | |||
4878 | ret = -EBADF; | ||
4879 | |||
4880 | file = fget(fd); | ||
4881 | if (unlikely(file == NULL)) { | ||
4882 | DPRINT(("invalid fd %d\n", fd)); | ||
4883 | goto error_args; | ||
4884 | } | ||
4885 | if (unlikely(PFM_IS_FILE(file) == 0)) { | ||
4886 | DPRINT(("fd %d not related to perfmon\n", fd)); | ||
4887 | goto error_args; | ||
4888 | } | ||
4889 | |||
4890 | ctx = (pfm_context_t *)file->private_data; | ||
4891 | if (unlikely(ctx == NULL)) { | ||
4892 | DPRINT(("no context for fd %d\n", fd)); | ||
4893 | goto error_args; | ||
4894 | } | ||
4895 | prefetch(&ctx->ctx_state); | ||
4896 | |||
4897 | PROTECT_CTX(ctx, flags); | ||
4898 | |||
4899 | /* | ||
4900 | * check task is stopped | ||
4901 | */ | ||
4902 | ret = pfm_check_task_state(ctx, cmd, flags); | ||
4903 | if (unlikely(ret)) goto abort_locked; | ||
4904 | |||
4905 | skip_fd: | ||
4906 | ret = (*func)(ctx, args_k, count, ia64_task_regs(current)); | ||
4907 | |||
4908 | call_made = 1; | ||
4909 | |||
4910 | abort_locked: | ||
4911 | if (likely(ctx)) { | ||
4912 | DPRINT(("context unlocked\n")); | ||
4913 | UNPROTECT_CTX(ctx, flags); | ||
4914 | fput(file); | ||
4915 | } | ||
4916 | |||
4917 | /* copy argument back to user, if needed */ | ||
4918 | if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT; | ||
4919 | |||
4920 | error_args: | ||
4921 | if (args_k) kfree(args_k); | ||
4922 | |||
4923 | DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret)); | ||
4924 | |||
4925 | return ret; | ||
4926 | } | ||
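/*
 * The restart_args logic above is a two-pass copy: fetch the
 * fixed-size part, ask the command how much variable payload follows
 * (getsize), then re-copy base + extra in one shot.  A user-space
 * sketch with memcpy() standing in for copy_from_user(); the struct
 * and the "extra" rule are invented (pfm_ctx_getsize() derives it
 * from the sampling-format arguments in the real code):
 */
#include <stdio.h>
#include <string.h>

struct base_arg { size_t extra; char data[8]; };

static int getsize(const void *arg, size_t *xtra_sz)
{
	struct base_arg base;

	memcpy(&base, arg, sizeof(base));
	*xtra_sz = base.extra;		/* variable part announced inline */
	return 0;
}

int main(void)
{
	struct base_arg user_base = { .extra = 16, .data = "demo" };
	char user_buf[64] = { 0 }, args_k[64];
	size_t base_sz = sizeof(user_base), xtra_sz = 0, sz;
	int completed_args = 0;

	memcpy(user_buf, &user_base, sizeof(user_base));

restart_args:
	sz = base_sz + xtra_sz;
	memcpy(args_k, user_buf, sz);	/* copy_from_user() stand-in */

	if (completed_args == 0) {
		if (getsize(args_k, &xtra_sz))
			return -1;
		completed_args = 1;
		if (xtra_sz)
			goto restart_args;	/* retry with the full size */
	}
	printf("copied %zu bytes (base=%zu extra=%zu)\n", sz, base_sz, xtra_sz);
	return 0;
}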
4927 | |||
4928 | static void | ||
4929 | pfm_resume_after_ovfl(pfm_context_t *ctx, unsigned long ovfl_regs, struct pt_regs *regs) | ||
4930 | { | ||
4931 | pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt; | ||
4932 | pfm_ovfl_ctrl_t rst_ctrl; | ||
4933 | int state; | ||
4934 | int ret = 0; | ||
4935 | |||
4936 | state = ctx->ctx_state; | ||
4937 | /* | ||
4938 | * Unlock sampling buffer and reset index atomically | ||
4939 | * XXX: not really needed when blocking | ||
4940 | */ | ||
4941 | if (CTX_HAS_SMPL(ctx)) { | ||
4942 | |||
4943 | rst_ctrl.bits.mask_monitoring = 0; | ||
4944 | rst_ctrl.bits.reset_ovfl_pmds = 0; | ||
4945 | |||
4946 | if (state == PFM_CTX_LOADED) | ||
4947 | ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); | ||
4948 | else | ||
4949 | ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); | ||
4950 | } else { | ||
4951 | rst_ctrl.bits.mask_monitoring = 0; | ||
4952 | rst_ctrl.bits.reset_ovfl_pmds = 1; | ||
4953 | } | ||
4954 | |||
4955 | if (ret == 0) { | ||
4956 | if (rst_ctrl.bits.reset_ovfl_pmds) { | ||
4957 | pfm_reset_regs(ctx, &ovfl_regs, PFM_PMD_LONG_RESET); | ||
4958 | } | ||
4959 | if (rst_ctrl.bits.mask_monitoring == 0) { | ||
4960 | DPRINT(("resuming monitoring\n")); | ||
4961 | if (ctx->ctx_state == PFM_CTX_MASKED) pfm_restore_monitoring(current); | ||
4962 | } else { | ||
4963 | DPRINT(("stopping monitoring\n")); | ||
4964 | //pfm_stop_monitoring(current, regs); | ||
4965 | } | ||
4966 | ctx->ctx_state = PFM_CTX_LOADED; | ||
4967 | } | ||
4968 | } | ||
4969 | |||
4970 | /* | ||
4971 | * context MUST BE LOCKED when calling | ||
4972 | * can only be called for current | ||
4973 | */ | ||
4974 | static void | ||
4975 | pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs) | ||
4976 | { | ||
4977 | int ret; | ||
4978 | |||
4979 | DPRINT(("entering for [%d]\n", current->pid)); | ||
4980 | |||
4981 | ret = pfm_context_unload(ctx, NULL, 0, regs); | ||
4982 | if (ret) { | ||
4983 | printk(KERN_ERR "pfm_context_force_terminate: [%d] unload failed with %d\n", current->pid, ret); | ||
4984 | } | ||
4985 | |||
4986 | /* | ||
4987 | * and wakeup controlling task, indicating we are now disconnected | ||
4988 | */ | ||
4989 | wake_up_interruptible(&ctx->ctx_zombieq); | ||
4990 | |||
4991 | /* | ||
4992 | * given that context is still locked, the controlling | ||
4993 | * task will only get access when we return from | ||
4994 | * pfm_handle_work(). | ||
4995 | */ | ||
4996 | } | ||
4997 | |||
4998 | static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds); | ||
4999 | |||
5000 | void | ||
5001 | pfm_handle_work(void) | ||
5002 | { | ||
5003 | pfm_context_t *ctx; | ||
5004 | struct pt_regs *regs; | ||
5005 | unsigned long flags; | ||
5006 | unsigned long ovfl_regs; | ||
5007 | unsigned int reason; | ||
5008 | int ret; | ||
5009 | |||
5010 | ctx = PFM_GET_CTX(current); | ||
5011 | if (ctx == NULL) { | ||
5012 | printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid); | ||
5013 | return; | ||
5014 | } | ||
5015 | |||
5016 | PROTECT_CTX(ctx, flags); | ||
5017 | |||
5018 | PFM_SET_WORK_PENDING(current, 0); | ||
5019 | |||
5020 | pfm_clear_task_notify(); | ||
5021 | |||
5022 | regs = ia64_task_regs(current); | ||
5023 | |||
5024 | /* | ||
5025 | * extract reason for being here and clear | ||
5026 | */ | ||
5027 | reason = ctx->ctx_fl_trap_reason; | ||
5028 | ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; | ||
5029 | ovfl_regs = ctx->ctx_ovfl_regs[0]; | ||
5030 | |||
5031 | DPRINT(("reason=%d state=%d\n", reason, ctx->ctx_state)); | ||
5032 | |||
5033 | /* | ||
5034 | * must be done before we check for simple-reset mode | ||
5035 | */ | ||
5036 | if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie; | ||
5037 | |||
5038 | |||
5039 | //if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking; | ||
5040 | if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking; | ||
5041 | |||
5042 | UNPROTECT_CTX(ctx, flags); | ||
5043 | |||
5044 | /* | ||
5045 | * pfm_handle_work() is currently called with interrupts disabled. | ||
5046 | * The down_interruptible call may sleep, therefore we | ||
5047 | * must re-enable interrupts to avoid deadlocks. It is | ||
5048 | * safe to do so because this function is called ONLY | ||
5049 | * when returning to user level (PUStk=1), in which case | ||
5050 | * there is no risk of kernel stack overflow due to deep | ||
5051 | * interrupt nesting. | ||
5052 | */ | ||
5053 | BUG_ON(flags & IA64_PSR_I); | ||
5054 | local_irq_enable(); | ||
5055 | |||
5056 | DPRINT(("before block sleeping\n")); | ||
5057 | |||
5058 | /* | ||
5059 | * may go through without blocking on SMP systems | ||
5060 | * if restart has been received already by the time we call down() | ||
5061 | */ | ||
5062 | ret = down_interruptible(&ctx->ctx_restart_sem); | ||
5063 | |||
5064 | DPRINT(("after block sleeping ret=%d\n", ret)); | ||
5065 | |||
5066 | /* | ||
5067 | * disable interrupts to restore state we had upon entering | ||
5068 | * this function | ||
5069 | */ | ||
5070 | local_irq_disable(); | ||
5071 | |||
5072 | PROTECT_CTX(ctx, flags); | ||
5073 | |||
5074 | /* | ||
5075 | * we need to read the ovfl_regs only after wake-up | ||
5076 | * because we may have had pfm_write_pmds() in between | ||
5077 | * and that can have changed PMD values and therefore | ||
5078 | * ovfl_regs is reset for these new PMD values. | ||
5079 | */ | ||
5080 | ovfl_regs = ctx->ctx_ovfl_regs[0]; | ||
5081 | |||
5082 | if (ctx->ctx_fl_going_zombie) { | ||
5083 | do_zombie: | ||
5084 | DPRINT(("context is zombie, bailing out\n")); | ||
5085 | pfm_context_force_terminate(ctx, regs); | ||
5086 | goto nothing_to_do; | ||
5087 | } | ||
5088 | /* | ||
5089 | * in case of interruption of down() we don't restart anything | ||
5090 | */ | ||
5091 | if (ret < 0) goto nothing_to_do; | ||
5092 | |||
5093 | skip_blocking: | ||
5094 | pfm_resume_after_ovfl(ctx, ovfl_regs, regs); | ||
5095 | ctx->ctx_ovfl_regs[0] = 0UL; | ||
5096 | |||
5097 | nothing_to_do: | ||
5098 | |||
5099 | UNPROTECT_CTX(ctx, flags); | ||
5100 | } | ||
5101 | |||
5102 | static int | ||
5103 | pfm_notify_user(pfm_context_t *ctx, pfm_msg_t *msg) | ||
5104 | { | ||
5105 | if (ctx->ctx_state == PFM_CTX_ZOMBIE) { | ||
5106 | DPRINT(("ignoring overflow notification, owner is zombie\n")); | ||
5107 | return 0; | ||
5108 | } | ||
5109 | |||
5110 | DPRINT(("waking up somebody\n")); | ||
5111 | |||
5112 | if (msg) wake_up_interruptible(&ctx->ctx_msgq_wait); | ||
5113 | |||
5114 | /* | ||
5115 | * safe, we are not in intr handler, nor in ctxsw when | ||
5116 | * we come here | ||
5117 | */ | ||
5118 | kill_fasync (&ctx->ctx_async_queue, SIGIO, POLL_IN); | ||
5119 | |||
5120 | return 0; | ||
5121 | } | ||
5122 | |||
5123 | static int | ||
5124 | pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds) | ||
5125 | { | ||
5126 | pfm_msg_t *msg = NULL; | ||
5127 | |||
5128 | if (ctx->ctx_fl_no_msg == 0) { | ||
5129 | msg = pfm_get_new_msg(ctx); | ||
5130 | if (msg == NULL) { | ||
5131 | printk(KERN_ERR "perfmon: pfm_ovfl_notify_user no more notification msgs\n"); | ||
5132 | return -1; | ||
5133 | } | ||
5134 | |||
5135 | msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL; | ||
5136 | msg->pfm_ovfl_msg.msg_ctx_fd = ctx->ctx_fd; | ||
5137 | msg->pfm_ovfl_msg.msg_active_set = 0; | ||
5138 | msg->pfm_ovfl_msg.msg_ovfl_pmds[0] = ovfl_pmds; | ||
5139 | msg->pfm_ovfl_msg.msg_ovfl_pmds[1] = 0UL; | ||
5140 | msg->pfm_ovfl_msg.msg_ovfl_pmds[2] = 0UL; | ||
5141 | msg->pfm_ovfl_msg.msg_ovfl_pmds[3] = 0UL; | ||
5142 | msg->pfm_ovfl_msg.msg_tstamp = 0UL; | ||
5143 | } | ||
5144 | |||
5145 | DPRINT(("ovfl msg: msg=%p no_msg=%d fd=%d ovfl_pmds=0x%lx\n", | ||
5146 | msg, | ||
5147 | ctx->ctx_fl_no_msg, | ||
5148 | ctx->ctx_fd, | ||
5149 | ovfl_pmds)); | ||
5150 | |||
5151 | return pfm_notify_user(ctx, msg); | ||
5152 | } | ||
5153 | |||
5154 | static int | ||
5155 | pfm_end_notify_user(pfm_context_t *ctx) | ||
5156 | { | ||
5157 | pfm_msg_t *msg; | ||
5158 | |||
5159 | msg = pfm_get_new_msg(ctx); | ||
5160 | if (msg == NULL) { | ||
5161 | printk(KERN_ERR "perfmon: pfm_end_notify_user no more notification msgs\n"); | ||
5162 | return -1; | ||
5163 | } | ||
5164 | /* no leak */ | ||
5165 | memset(msg, 0, sizeof(*msg)); | ||
5166 | |||
5167 | msg->pfm_end_msg.msg_type = PFM_MSG_END; | ||
5168 | msg->pfm_end_msg.msg_ctx_fd = ctx->ctx_fd; | ||
5169 | msg->pfm_ovfl_msg.msg_tstamp = 0UL; | ||
5170 | |||
5171 | DPRINT(("end msg: msg=%p no_msg=%d ctx_fd=%d\n", | ||
5172 | msg, | ||
5173 | ctx->ctx_fl_no_msg, | ||
5174 | ctx->ctx_fd)); | ||
5175 | |||
5176 | return pfm_notify_user(ctx, msg); | ||
5177 | } | ||
5178 | |||
5179 | /* | ||
5180 | * main overflow processing routine. | ||
5181 | * it can be called from the interrupt path or explicitly during the context switch code | ||
5182 | */ | ||
5183 | static void | ||
5184 | pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs) | ||
5185 | { | ||
5186 | pfm_ovfl_arg_t *ovfl_arg; | ||
5187 | unsigned long mask; | ||
5188 | unsigned long old_val, ovfl_val, new_val; | ||
5189 | unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds; | ||
5190 | unsigned long tstamp; | ||
5191 | pfm_ovfl_ctrl_t ovfl_ctrl; | ||
5192 | unsigned int i, has_smpl; | ||
5193 | int must_notify = 0; | ||
5194 | |||
5195 | if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) goto stop_monitoring; | ||
5196 | |||
5197 | /* | ||
5198 | * sanity test. Should never happen | ||
5199 | */ | ||
5200 | if (unlikely((pmc0 & 0x1) == 0)) goto sanity_check; | ||
5201 | |||
5202 | tstamp = ia64_get_itc(); | ||
5203 | mask = pmc0 >> PMU_FIRST_COUNTER; | ||
5204 | ovfl_val = pmu_conf->ovfl_val; | ||
5205 | has_smpl = CTX_HAS_SMPL(ctx); | ||
5206 | |||
5207 | DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " | ||
5208 | "used_pmds=0x%lx\n", | ||
5209 | pmc0, | ||
5210 | task ? task->pid: -1, | ||
5211 | (regs ? regs->cr_iip : 0), | ||
5212 | CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking", | ||
5213 | ctx->ctx_used_pmds[0])); | ||
5214 | |||
5215 | |||
5216 | /* | ||
5217 | * first we update the virtual counters | ||
5218 | * assume there was a prior ia64_srlz_d() issued | ||
5219 | */ | ||
5220 | for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) { | ||
5221 | |||
5222 | /* skip pmd which did not overflow */ | ||
5223 | if ((mask & 0x1) == 0) continue; | ||
5224 | |||
5225 | /* | ||
5226 | * Note that the pmd is not necessarily 0 at this point as qualified events | ||
5227 | * may have happened before the PMU was frozen. The residual count is not | ||
5228 | * taken into consideration here but will be with any read of the pmd via | ||
5229 | * pfm_read_pmds(). | ||
5230 | */ | ||
5231 | old_val = new_val = ctx->ctx_pmds[i].val; | ||
5232 | new_val += 1 + ovfl_val; | ||
5233 | ctx->ctx_pmds[i].val = new_val; | ||
5234 | |||
5235 | /* | ||
5236 | * check for overflow condition | ||
5237 | */ | ||
5238 | if (likely(old_val > new_val)) { | ||
5239 | ovfl_pmds |= 1UL << i; | ||
5240 | if (PMC_OVFL_NOTIFY(ctx, i)) ovfl_notify |= 1UL << i; | ||
5241 | } | ||
5242 | |||
5243 | DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n", | ||
5244 | i, | ||
5245 | new_val, | ||
5246 | old_val, | ||
5247 | ia64_get_pmd(i) & ovfl_val, | ||
5248 | ovfl_pmds, | ||
5249 | ovfl_notify)); | ||
5250 | } | ||
5251 | |||
5252 | /* | ||
5253 | * there was no 64-bit overflow, nothing else to do | ||
5254 | */ | ||
5255 | if (ovfl_pmds == 0UL) return; | ||
5256 | |||
5257 | /* | ||
5258 | * reset all control bits | ||
5259 | */ | ||
5260 | ovfl_ctrl.val = 0; | ||
5261 | reset_pmds = 0UL; | ||
5262 | |||
5263 | /* | ||
5264 | * if a sampling format module exists, then we "cache" the overflow by | ||
5265 | * calling the module's handler() routine. | ||
5266 | */ | ||
5267 | if (has_smpl) { | ||
5268 | unsigned long start_cycles, end_cycles; | ||
5269 | unsigned long pmd_mask; | ||
5270 | int j, k, ret = 0; | ||
5271 | int this_cpu = smp_processor_id(); | ||
5272 | |||
5273 | pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER; | ||
5274 | ovfl_arg = &ctx->ctx_ovfl_arg; | ||
5275 | |||
5276 | prefetch(ctx->ctx_smpl_hdr); | ||
5277 | |||
5278 | for(i=PMU_FIRST_COUNTER; pmd_mask && ret == 0; i++, pmd_mask >>=1) { | ||
5279 | |||
5280 | mask = 1UL << i; | ||
5281 | |||
5282 | if ((pmd_mask & 0x1) == 0) continue; | ||
5283 | |||
5284 | ovfl_arg->ovfl_pmd = (unsigned char )i; | ||
5285 | ovfl_arg->ovfl_notify = ovfl_notify & mask ? 1 : 0; | ||
5286 | ovfl_arg->active_set = 0; | ||
5287 | ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */ | ||
5288 | ovfl_arg->smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0]; | ||
5289 | |||
5290 | ovfl_arg->pmd_value = ctx->ctx_pmds[i].val; | ||
5291 | ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval; | ||
5292 | ovfl_arg->pmd_eventid = ctx->ctx_pmds[i].eventid; | ||
5293 | |||
5294 | /* | ||
5295 | * copy values of pmds of interest. Sampling format may copy them | ||
5296 | * into sampling buffer. | ||
5297 | */ | ||
5298 | if (smpl_pmds) { | ||
5299 | for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) { | ||
5300 | if ((smpl_pmds & 0x1) == 0) continue; | ||
5301 | ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j); | ||
5302 | DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1])); | ||
5303 | } | ||
5304 | } | ||
5305 | |||
5306 | pfm_stats[this_cpu].pfm_smpl_handler_calls++; | ||
5307 | |||
5308 | start_cycles = ia64_get_itc(); | ||
5309 | |||
5310 | /* | ||
5311 | * call custom buffer format record (handler) routine | ||
5312 | */ | ||
5313 | ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp); | ||
5314 | |||
5315 | end_cycles = ia64_get_itc(); | ||
5316 | |||
5317 | /* | ||
5318 | * For those controls, we take the union because they have | ||
5319 | * an all or nothing behavior. | ||
5320 | */ | ||
5321 | ovfl_ctrl.bits.notify_user |= ovfl_arg->ovfl_ctrl.bits.notify_user; | ||
5322 | ovfl_ctrl.bits.block_task |= ovfl_arg->ovfl_ctrl.bits.block_task; | ||
5323 | ovfl_ctrl.bits.mask_monitoring |= ovfl_arg->ovfl_ctrl.bits.mask_monitoring; | ||
5324 | /* | ||
5325 | * build the bitmask of pmds to reset now | ||
5326 | */ | ||
5327 | if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask; | ||
5328 | |||
5329 | pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles; | ||
5330 | } | ||
5331 | /* | ||
5332 | * when the module cannot handle the rest of the overflows, we abort right here | ||
5333 | */ | ||
5334 | if (ret && pmd_mask) { | ||
5335 | DPRINT(("handler aborts leftover ovfl_pmds=0x%lx\n", | ||
5336 | pmd_mask<<PMU_FIRST_COUNTER)); | ||
5337 | } | ||
5338 | /* | ||
5339 | * remove the pmds we reset now from the set of pmds to reset in pfm_restart() | ||
5340 | */ | ||
5341 | ovfl_pmds &= ~reset_pmds; | ||
5342 | } else { | ||
5343 | /* | ||
5344 | * when no sampling module is used, then the default | ||
5345 | * is to notify on overflow if requested by user | ||
5346 | */ | ||
5347 | ovfl_ctrl.bits.notify_user = ovfl_notify ? 1 : 0; | ||
5348 | ovfl_ctrl.bits.block_task = ovfl_notify ? 1 : 0; | ||
5349 | ovfl_ctrl.bits.mask_monitoring = ovfl_notify ? 1 : 0; /* XXX: change for saturation */ | ||
5350 | ovfl_ctrl.bits.reset_ovfl_pmds = ovfl_notify ? 0 : 1; | ||
5351 | /* | ||
5352 | * if needed, we reset all overflowed pmds | ||
5353 | */ | ||
5354 | if (ovfl_notify == 0) reset_pmds = ovfl_pmds; | ||
5355 | } | ||
5356 | |||
5357 | DPRINT_ovfl(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", ovfl_pmds, reset_pmds)); | ||
5358 | |||
5359 | /* | ||
5360 | * reset the requested PMD registers using the short reset values | ||
5361 | */ | ||
5362 | if (reset_pmds) { | ||
5363 | unsigned long bm = reset_pmds; | ||
5364 | pfm_reset_regs(ctx, &bm, PFM_PMD_SHORT_RESET); | ||
5365 | } | ||
5366 | |||
5367 | if (ovfl_notify && ovfl_ctrl.bits.notify_user) { | ||
5368 | /* | ||
5369 | * keep track of what to reset when unblocking | ||
5370 | */ | ||
5371 | ctx->ctx_ovfl_regs[0] = ovfl_pmds; | ||
5372 | |||
5373 | /* | ||
5374 | * check for blocking context | ||
5375 | */ | ||
5376 | if (CTX_OVFL_NOBLOCK(ctx) == 0 && ovfl_ctrl.bits.block_task) { | ||
5377 | |||
5378 | ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCK; | ||
5379 | |||
5380 | /* | ||
5381 | * set the perfmon specific checking pending work for the task | ||
5382 | */ | ||
5383 | PFM_SET_WORK_PENDING(task, 1); | ||
5384 | |||
5385 | /* | ||
5386 | * when coming from ctxsw, current still points to the | ||
5387 | * previous task, therefore we must work with task and not current. | ||
5388 | */ | ||
5389 | pfm_set_task_notify(task); | ||
5390 | } | ||
5391 | /* | ||
5392 | * defer until the state is changed (shortens the spin window). The context is locked | ||
5393 | * anyway, so the signal receiver would just spin for nothing. | ||
5394 | */ | ||
5395 | must_notify = 1; | ||
5396 | } | ||
5397 | |||
5398 | DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n", | ||
5399 | GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1, | ||
5400 | PFM_GET_WORK_PENDING(task), | ||
5401 | ctx->ctx_fl_trap_reason, | ||
5402 | ovfl_pmds, | ||
5403 | ovfl_notify, | ||
5404 | ovfl_ctrl.bits.mask_monitoring ? 1 : 0)); | ||
5405 | /* | ||
5406 | * in case monitoring must be stopped, we toggle the psr bits | ||
5407 | */ | ||
5408 | if (ovfl_ctrl.bits.mask_monitoring) { | ||
5409 | pfm_mask_monitoring(task); | ||
5410 | ctx->ctx_state = PFM_CTX_MASKED; | ||
5411 | ctx->ctx_fl_can_restart = 1; | ||
5412 | } | ||
5413 | |||
5414 | /* | ||
5415 | * send notification now | ||
5416 | */ | ||
5417 | if (must_notify) pfm_ovfl_notify_user(ctx, ovfl_notify); | ||
5418 | |||
5419 | return; | ||
5420 | |||
5421 | sanity_check: | ||
5422 | printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n", | ||
5423 | smp_processor_id(), | ||
5424 | task ? task->pid : -1, | ||
5425 | pmc0); | ||
5426 | return; | ||
5427 | |||
5428 | stop_monitoring: | ||
5429 | /* | ||
5430 | * in SMP, zombie context is never restored but reclaimed in pfm_load_regs(). | ||
5431 | * Moreover, zombies are also reclaimed in pfm_save_regs(). Therefore we can | ||
5432 | * come here as a zombie only if the task is the current task, in which case we | ||
5433 | * can access the PMU hardware directly. | ||
5434 | * | ||
5435 | * Note that zombies do have PM_VALID set. So here we do the minimum. | ||
5436 | * | ||
5437 | * If the context was zombified, it could not be reclaimed at the time | ||
5438 | * the monitoring program exited. At this point, the PMU reservation has been | ||
5439 | * returned and the sampling buffer has been freed. We must convert this call | ||
5440 | * into a spurious interrupt. However, we must also avoid infinite overflows | ||
5441 | * by stopping monitoring for this task. We can only come here for a per-task | ||
5442 | * context. All we need to do is stop monitoring using the psr bits, which | ||
5443 | * are always task private. By re-enabling secure monitoring, we ensure that | ||
5444 | * the monitored task will not be able to re-activate monitoring. | ||
5445 | * The task will eventually be context switched out, at which point the context | ||
5446 | * will be reclaimed (that includes releasing ownership of the PMU). | ||
5447 | * | ||
5448 | * So there might be a window of time where the number of per-task sessions is zero | ||
5449 | * yet a PMU might have an owner and get at most one overflow interrupt for a zombie | ||
5450 | * context. This is safe because if a per-task session comes in, it will push this one | ||
5451 | * out and, by virtue of pfm_save_regs(), this one will disappear. If a system-wide | ||
5452 | * session is forced on that CPU, given that we use task pinning, pfm_save_regs() will | ||
5453 | * also push our zombie context out. | ||
5454 | * | ||
5455 | * Overall pretty hairy stuff.... | ||
5456 | */ | ||
5457 | DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1)); | ||
5458 | pfm_clear_psr_up(); | ||
5459 | ia64_psr(regs)->up = 0; | ||
5460 | ia64_psr(regs)->sp = 1; | ||
5461 | return; | ||
5462 | } | ||
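/*
 * Worked example of the soft-counter arithmetic above: a hardware PMD
 * implements only the low N bits (pmu_conf->ovfl_val is that mask),
 * so each hardware overflow adds 1 + ovfl_val, i.e. 2^N, to the
 * 64-bit software value, and a wrap of the software value itself
 * shows up as old_val > new_val.  The 47-bit width and 64-bit
 * unsigned long are assumptions for the demo:
 */
#include <stdio.h>

int main(void)
{
	const unsigned long ovfl_val = (1UL << 47) - 1;	/* assumed width */
	unsigned long val = 0, old_val, new_val;
	int i;

	for (i = 0; i < 2; i++) {		/* two hardware overflows */
		old_val = val;
		val += 1 + ovfl_val;		/* add 2^47 */
		if (old_val > val)
			printf("64-bit overflow on update %d\n", i);
	}
	printf("soft counter after 2 hw overflows: 0x%lx\n", val);

	old_val = ~0UL - 10;			/* force the 64-bit wrap */
	new_val = old_val + 1 + ovfl_val;
	printf("old=0x%lx new=0x%lx -> %s\n", old_val, new_val,
	       old_val > new_val ? "overflow recorded" : "no overflow");
	return 0;
}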
5463 | |||
5464 | static int | ||
5465 | pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs) | ||
5466 | { | ||
5467 | struct task_struct *task; | ||
5468 | pfm_context_t *ctx; | ||
5469 | unsigned long flags; | ||
5470 | u64 pmc0; | ||
5471 | int this_cpu = smp_processor_id(); | ||
5472 | int retval = 0; | ||
5473 | |||
5474 | pfm_stats[this_cpu].pfm_ovfl_intr_count++; | ||
5475 | |||
5476 | /* | ||
5477 | * srlz.d done before arriving here | ||
5478 | */ | ||
5479 | pmc0 = ia64_get_pmc(0); | ||
5480 | |||
5481 | task = GET_PMU_OWNER(); | ||
5482 | ctx = GET_PMU_CTX(); | ||
5483 | |||
5484 | /* | ||
5485 | * if we have some pending bits set | ||
5486 | * assumes : if any PMC0.bit[63-1] is set, then PMC0.fr = 1 | ||
5487 | */ | ||
5488 | if (PMC0_HAS_OVFL(pmc0) && task) { | ||
5489 | /* | ||
5490 | * we assume that pmc0.fr is always set here | ||
5491 | */ | ||
5492 | |||
5493 | /* sanity check */ | ||
5494 | if (!ctx) goto report_spurious1; | ||
5495 | |||
5496 | if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0) | ||
5497 | goto report_spurious2; | ||
5498 | |||
5499 | PROTECT_CTX_NOPRINT(ctx, flags); | ||
5500 | |||
5501 | pfm_overflow_handler(task, ctx, pmc0, regs); | ||
5502 | |||
5503 | UNPROTECT_CTX_NOPRINT(ctx, flags); | ||
5504 | |||
5505 | } else { | ||
5506 | pfm_stats[this_cpu].pfm_spurious_ovfl_intr_count++; | ||
5507 | retval = -1; | ||
5508 | } | ||
5509 | /* | ||
5510 | * keep it unfrozen at all times | ||
5511 | */ | ||
5512 | pfm_unfreeze_pmu(); | ||
5513 | |||
5514 | return retval; | ||
5515 | |||
5516 | report_spurious1: | ||
5517 | printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n", | ||
5518 | this_cpu, task->pid); | ||
5519 | pfm_unfreeze_pmu(); | ||
5520 | return -1; | ||
5521 | report_spurious2: | ||
5522 | printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n", | ||
5523 | this_cpu, | ||
5524 | task->pid); | ||
5525 | pfm_unfreeze_pmu(); | ||
5526 | return -1; | ||
5527 | } | ||
5528 | |||
5529 | static irqreturn_t | ||
5530 | pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs) | ||
5531 | { | ||
5532 | unsigned long start_cycles, total_cycles; | ||
5533 | unsigned long min, max; | ||
5534 | int this_cpu; | ||
5535 | int ret; | ||
5536 | |||
5537 | this_cpu = get_cpu(); | ||
5538 | min = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min; | ||
5539 | max = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max; | ||
5540 | |||
5541 | start_cycles = ia64_get_itc(); | ||
5542 | |||
5543 | ret = pfm_do_interrupt_handler(irq, arg, regs); | ||
5544 | |||
5545 | total_cycles = ia64_get_itc(); | ||
5546 | |||
5547 | /* | ||
5548 | * don't measure spurious interrupts | ||
5549 | */ | ||
5550 | if (likely(ret == 0)) { | ||
5551 | total_cycles -= start_cycles; | ||
5552 | |||
5553 | if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles; | ||
5554 | if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles; | ||
5555 | |||
5556 | pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles; | ||
5557 | } | ||
5558 | put_cpu_no_resched(); | ||
5559 | return IRQ_HANDLED; | ||
5560 | } | ||
5561 | |||
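The wrapper above brackets pfm_do_interrupt_handler() with two ia64_get_itc() reads and folds the cost of every non-spurious overflow interrupt into per-CPU min/max/total counters (the min is seeded with ~0UL in pfm_init() further down). A minimal standalone sketch of the same accounting pattern, with a hypothetical ovfl_stats struct standing in for the pfm_stats fields:

#include <stdio.h>

/* hypothetical mirror of the per-CPU counters updated above */
struct ovfl_stats {
	unsigned long count;      /* pfm_ovfl_intr_count      */
	unsigned long cycles;     /* pfm_ovfl_intr_cycles     */
	unsigned long cycles_min; /* pfm_ovfl_intr_cycles_min */
	unsigned long cycles_max; /* pfm_ovfl_intr_cycles_max */
};

/* fold one measured interrupt duration into the stats, as the handler does */
static void account(struct ovfl_stats *s, unsigned long total_cycles)
{
	if (total_cycles < s->cycles_min) s->cycles_min = total_cycles;
	if (total_cycles > s->cycles_max) s->cycles_max = total_cycles;
	s->cycles += total_cycles;
	s->count++;
}

int main(void)
{
	struct ovfl_stats s = { 0, 0, ~0UL, 0 };   /* min seeded as in pfm_init() */
	unsigned long samples[] = { 1200UL, 900UL, 1500UL };
	unsigned i;

	for (i = 0; i < 3; i++)
		account(&s, samples[i]);
	printf("avg=%lu min=%lu max=%lu\n", s.cycles / s.count, s.cycles_min, s.cycles_max);
	return 0;
}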
5562 | /* | ||
5563 | * /proc/perfmon interface, for debug only | ||
5564 | */ | ||
5565 | |||
5566 | #define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) | ||
5567 | |||
5568 | static void * | ||
5569 | pfm_proc_start(struct seq_file *m, loff_t *pos) | ||
5570 | { | ||
5571 | if (*pos == 0) { | ||
5572 | return PFM_PROC_SHOW_HEADER; | ||
5573 | } | ||
5574 | |||
5575 | while (*pos <= NR_CPUS) { | ||
5576 | if (cpu_online(*pos - 1)) { | ||
5577 | return (void *)*pos; | ||
5578 | } | ||
5579 | ++*pos; | ||
5580 | } | ||
5581 | return NULL; | ||
5582 | } | ||
5583 | |||
5584 | static void * | ||
5585 | pfm_proc_next(struct seq_file *m, void *v, loff_t *pos) | ||
5586 | { | ||
5587 | ++*pos; | ||
5588 | return pfm_proc_start(m, pos); | ||
5589 | } | ||
5590 | |||
5591 | static void | ||
5592 | pfm_proc_stop(struct seq_file *m, void *v) | ||
5593 | { | ||
5594 | } | ||
5595 | |||
5596 | static void | ||
5597 | pfm_proc_show_header(struct seq_file *m) | ||
5598 | { | ||
5599 | struct list_head * pos; | ||
5600 | pfm_buffer_fmt_t * entry; | ||
5601 | unsigned long flags; | ||
5602 | |||
5603 | seq_printf(m, | ||
5604 | "perfmon version : %u.%u\n" | ||
5605 | "model : %s\n" | ||
5606 | "fastctxsw : %s\n" | ||
5607 | "expert mode : %s\n" | ||
5608 | "ovfl_mask : 0x%lx\n" | ||
5609 | "PMU flags : 0x%x\n", | ||
5610 | PFM_VERSION_MAJ, PFM_VERSION_MIN, | ||
5611 | pmu_conf->pmu_name, | ||
5612 | pfm_sysctl.fastctxsw > 0 ? "Yes": "No", | ||
5613 | pfm_sysctl.expert_mode > 0 ? "Yes": "No", | ||
5614 | pmu_conf->ovfl_val, | ||
5615 | pmu_conf->flags); | ||
5616 | |||
5617 | LOCK_PFS(flags); | ||
5618 | |||
5619 | seq_printf(m, | ||
5620 | "proc_sessions : %u\n" | ||
5621 | "sys_sessions : %u\n" | ||
5622 | "sys_use_dbregs : %u\n" | ||
5623 | "ptrace_use_dbregs : %u\n", | ||
5624 | pfm_sessions.pfs_task_sessions, | ||
5625 | pfm_sessions.pfs_sys_sessions, | ||
5626 | pfm_sessions.pfs_sys_use_dbregs, | ||
5627 | pfm_sessions.pfs_ptrace_use_dbregs); | ||
5628 | |||
5629 | UNLOCK_PFS(flags); | ||
5630 | |||
5631 | spin_lock(&pfm_buffer_fmt_lock); | ||
5632 | |||
5633 | list_for_each(pos, &pfm_buffer_fmt_list) { | ||
5634 | entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); | ||
5635 | seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n", | ||
5636 | entry->fmt_uuid[0], | ||
5637 | entry->fmt_uuid[1], | ||
5638 | entry->fmt_uuid[2], | ||
5639 | entry->fmt_uuid[3], | ||
5640 | entry->fmt_uuid[4], | ||
5641 | entry->fmt_uuid[5], | ||
5642 | entry->fmt_uuid[6], | ||
5643 | entry->fmt_uuid[7], | ||
5644 | entry->fmt_uuid[8], | ||
5645 | entry->fmt_uuid[9], | ||
5646 | entry->fmt_uuid[10], | ||
5647 | entry->fmt_uuid[11], | ||
5648 | entry->fmt_uuid[12], | ||
5649 | entry->fmt_uuid[13], | ||
5650 | entry->fmt_uuid[14], | ||
5651 | entry->fmt_uuid[15], | ||
5652 | entry->fmt_name); | ||
5653 | } | ||
5654 | spin_unlock(&pfm_buffer_fmt_lock); | ||
5655 | |||
5656 | } | ||
5657 | |||
5658 | static int | ||
5659 | pfm_proc_show(struct seq_file *m, void *v) | ||
5660 | { | ||
5661 | unsigned long psr; | ||
5662 | unsigned int i; | ||
5663 | int cpu; | ||
5664 | |||
5665 | if (v == PFM_PROC_SHOW_HEADER) { | ||
5666 | pfm_proc_show_header(m); | ||
5667 | return 0; | ||
5668 | } | ||
5669 | |||
5670 | /* show info for CPU (v - 1) */ | ||
5671 | |||
5672 | cpu = (long)v - 1; | ||
5673 | seq_printf(m, | ||
5674 | "CPU%-2d overflow intrs : %lu\n" | ||
5675 | "CPU%-2d overflow cycles : %lu\n" | ||
5676 | "CPU%-2d overflow min : %lu\n" | ||
5677 | "CPU%-2d overflow max : %lu\n" | ||
5678 | "CPU%-2d smpl handler calls : %lu\n" | ||
5679 | "CPU%-2d smpl handler cycles : %lu\n" | ||
5680 | "CPU%-2d spurious intrs : %lu\n" | ||
5681 | "CPU%-2d replay intrs : %lu\n" | ||
5682 | "CPU%-2d syst_wide : %d\n" | ||
5683 | "CPU%-2d dcr_pp : %d\n" | ||
5684 | "CPU%-2d exclude idle : %d\n" | ||
5685 | "CPU%-2d owner : %d\n" | ||
5686 | "CPU%-2d context : %p\n" | ||
5687 | "CPU%-2d activations : %lu\n", | ||
5688 | cpu, pfm_stats[cpu].pfm_ovfl_intr_count, | ||
5689 | cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles, | ||
5690 | cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min, | ||
5691 | cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max, | ||
5692 | cpu, pfm_stats[cpu].pfm_smpl_handler_calls, | ||
5693 | cpu, pfm_stats[cpu].pfm_smpl_handler_cycles, | ||
5694 | cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count, | ||
5695 | cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count, | ||
5696 | cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0, | ||
5697 | cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0, | ||
5698 | cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0, | ||
5699 | cpu, pfm_get_cpu_data(pmu_owner, cpu) ? pfm_get_cpu_data(pmu_owner, cpu)->pid: -1, | ||
5700 | cpu, pfm_get_cpu_data(pmu_ctx, cpu), | ||
5701 | cpu, pfm_get_cpu_data(pmu_activation_number, cpu)); | ||
5702 | |||
5703 | if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) { | ||
5704 | |||
5705 | psr = pfm_get_psr(); | ||
5706 | |||
5707 | ia64_srlz_d(); | ||
5708 | |||
5709 | seq_printf(m, | ||
5710 | "CPU%-2d psr : 0x%lx\n" | ||
5711 | "CPU%-2d pmc0 : 0x%lx\n", | ||
5712 | cpu, psr, | ||
5713 | cpu, ia64_get_pmc(0)); | ||
5714 | |||
5715 | for (i=0; PMC_IS_LAST(i) == 0; i++) { | ||
5716 | if (PMC_IS_COUNTING(i) == 0) continue; | ||
5717 | seq_printf(m, | ||
5718 | "CPU%-2d pmc%u : 0x%lx\n" | ||
5719 | "CPU%-2d pmd%u : 0x%lx\n", | ||
5720 | cpu, i, ia64_get_pmc(i), | ||
5721 | cpu, i, ia64_get_pmd(i)); | ||
5722 | } | ||
5723 | } | ||
5724 | return 0; | ||
5725 | } | ||
5726 | |||
5727 | struct seq_operations pfm_seq_ops = { | ||
5728 | .start = pfm_proc_start, | ||
5729 | .next = pfm_proc_next, | ||
5730 | .stop = pfm_proc_stop, | ||
5731 | .show = pfm_proc_show | ||
5732 | }; | ||
5733 | |||
5734 | static int | ||
5735 | pfm_proc_open(struct inode *inode, struct file *file) | ||
5736 | { | ||
5737 | return seq_open(file, &pfm_seq_ops); | ||
5738 | } | ||
5739 | |||
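Since /proc/perfmon is exposed through the stock seq_file operations above, it can be read like any other text file from user level. A hedged userspace sketch (the output layout is whatever pfm_proc_show_header() and pfm_proc_show() above emit):

#include <stdio.h>

/* dump the perfmon debug interface; assumes a kernel built with CONFIG_PERFMON */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/perfmon", "r");

	if (f == NULL) {
		perror("/proc/perfmon");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}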
5740 | |||
5741 | /* | ||
5742 | * we come here as soon as local_cpu_data->pfm_syst_wide is set. This happens | ||
5743 | * during pfm_enable() hence before pfm_start(). We cannot assume monitoring | ||
5744 | * is active or inactive based on mode. We must rely on the value in | ||
5745 | * local_cpu_data->pfm_syst_info | ||
5746 | */ | ||
5747 | void | ||
5748 | pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) | ||
5749 | { | ||
5750 | struct pt_regs *regs; | ||
5751 | unsigned long dcr; | ||
5752 | unsigned long dcr_pp; | ||
5753 | |||
5754 | dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0; | ||
5755 | |||
5756 | /* | ||
5757 | * pid 0 is guaranteed to be the idle task. There is one such task with pid 0 | ||
5758 | * on every CPU, so we can rely on the pid to identify the idle task. | ||
5759 | */ | ||
5760 | if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) { | ||
5761 | regs = ia64_task_regs(task); | ||
5762 | ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0; | ||
5763 | return; | ||
5764 | } | ||
5765 | /* | ||
5766 | * if monitoring has started | ||
5767 | */ | ||
5768 | if (dcr_pp) { | ||
5769 | dcr = ia64_getreg(_IA64_REG_CR_DCR); | ||
5770 | /* | ||
5771 | * context switching in? | ||
5772 | */ | ||
5773 | if (is_ctxswin) { | ||
5774 | /* mask monitoring for the idle task */ | ||
5775 | ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); | ||
5776 | pfm_clear_psr_pp(); | ||
5777 | ia64_srlz_i(); | ||
5778 | return; | ||
5779 | } | ||
5780 | /* | ||
5781 | * context switching out | ||
5782 | * restore monitoring for next task | ||
5783 | * | ||
5784 | * Due to inlining, this odd if-then-else construction generates | ||
5785 | * better code. | ||
5786 | */ | ||
5787 | ia64_setreg(_IA64_REG_CR_DCR, dcr | IA64_DCR_PP); | ||
5788 | pfm_set_psr_pp(); | ||
5789 | ia64_srlz_i(); | ||
5790 | } | ||
5791 | } | ||
5792 | |||
5793 | #ifdef CONFIG_SMP | ||
5794 | |||
5795 | static void | ||
5796 | pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) | ||
5797 | { | ||
5798 | struct task_struct *task = ctx->ctx_task; | ||
5799 | |||
5800 | ia64_psr(regs)->up = 0; | ||
5801 | ia64_psr(regs)->sp = 1; | ||
5802 | |||
5803 | if (GET_PMU_OWNER() == task) { | ||
5804 | DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid)); | ||
5805 | SET_PMU_OWNER(NULL, NULL); | ||
5806 | } | ||
5807 | |||
5808 | /* | ||
5809 | * disconnect the task from the context and vice-versa | ||
5810 | */ | ||
5811 | PFM_SET_WORK_PENDING(task, 0); | ||
5812 | |||
5813 | task->thread.pfm_context = NULL; | ||
5814 | task->thread.flags &= ~IA64_THREAD_PM_VALID; | ||
5815 | |||
5816 | DPRINT(("force cleanup for [%d]\n", task->pid)); | ||
5817 | } | ||
5818 | |||
5819 | |||
5820 | /* | ||
5821 | * in 2.6, interrupts are masked when we come here and the runqueue lock is held | ||
5822 | */ | ||
5823 | void | ||
5824 | pfm_save_regs(struct task_struct *task) | ||
5825 | { | ||
5826 | pfm_context_t *ctx; | ||
5827 | struct thread_struct *t; | ||
5828 | unsigned long flags; | ||
5829 | u64 psr; | ||
5830 | |||
5831 | |||
5832 | ctx = PFM_GET_CTX(task); | ||
5833 | if (ctx == NULL) return; | ||
5834 | t = &task->thread; | ||
5835 | |||
5836 | /* | ||
5837 | * we always come here with interrupts ALREADY disabled by | ||
5838 | * the scheduler. So we simply need to protect against concurrent | ||
5839 | * access, not CPU concurrency. | ||
5840 | */ | ||
5841 | flags = pfm_protect_ctx_ctxsw(ctx); | ||
5842 | |||
5843 | if (ctx->ctx_state == PFM_CTX_ZOMBIE) { | ||
5844 | struct pt_regs *regs = ia64_task_regs(task); | ||
5845 | |||
5846 | pfm_clear_psr_up(); | ||
5847 | |||
5848 | pfm_force_cleanup(ctx, regs); | ||
5849 | |||
5850 | BUG_ON(ctx->ctx_smpl_hdr); | ||
5851 | |||
5852 | pfm_unprotect_ctx_ctxsw(ctx, flags); | ||
5853 | |||
5854 | pfm_context_free(ctx); | ||
5855 | return; | ||
5856 | } | ||
5857 | |||
5858 | /* | ||
5859 | * save current PSR: needed because we modify it | ||
5860 | */ | ||
5861 | ia64_srlz_d(); | ||
5862 | psr = pfm_get_psr(); | ||
5863 | |||
5864 | BUG_ON(psr & (IA64_PSR_I)); | ||
5865 | |||
5866 | /* | ||
5867 | * stop monitoring: | ||
5868 | * This is the last instruction which may generate an overflow | ||
5869 | * | ||
5870 | * We do not need to set psr.sp because it is irrelevant in the kernel. | ||
5871 | * It will be restored from ipsr when going back to user level | ||
5872 | */ | ||
5873 | pfm_clear_psr_up(); | ||
5874 | |||
5875 | /* | ||
5876 | * keep a copy of psr.up (for reload) | ||
5877 | */ | ||
5878 | ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; | ||
5879 | |||
5880 | /* | ||
5881 | * release ownership of this PMU. | ||
5882 | * PM interrupts are masked, so nothing | ||
5883 | * can happen. | ||
5884 | */ | ||
5885 | SET_PMU_OWNER(NULL, NULL); | ||
5886 | |||
5887 | /* | ||
5888 | * we systematically save the PMDs as we have no | ||
5889 | * guarantee we will be scheduled on that same | ||
5890 | * CPU again. | ||
5891 | */ | ||
5892 | pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]); | ||
5893 | |||
5894 | /* | ||
5895 | * save pmc0; ia64_srlz_d() was done in pfm_save_pmds(). | ||
5896 | * we will need it on the restore path to check | ||
5897 | * for pending overflow. | ||
5898 | */ | ||
5899 | t->pmcs[0] = ia64_get_pmc(0); | ||
5900 | |||
5901 | /* | ||
5902 | * unfreeze the PMU if it had pending overflows | ||
5903 | */ | ||
5904 | if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); | ||
5905 | |||
5906 | /* | ||
5907 | * finally, allow context access. | ||
5908 | * interrupts will still be masked after this call. | ||
5909 | */ | ||
5910 | pfm_unprotect_ctx_ctxsw(ctx, flags); | ||
5911 | } | ||
5912 | |||
5913 | #else /* !CONFIG_SMP */ | ||
5914 | void | ||
5915 | pfm_save_regs(struct task_struct *task) | ||
5916 | { | ||
5917 | pfm_context_t *ctx; | ||
5918 | u64 psr; | ||
5919 | |||
5920 | ctx = PFM_GET_CTX(task); | ||
5921 | if (ctx == NULL) return; | ||
5922 | |||
5923 | /* | ||
5924 | * save current PSR: needed because we modify it | ||
5925 | */ | ||
5926 | psr = pfm_get_psr(); | ||
5927 | |||
5928 | BUG_ON(psr & (IA64_PSR_I)); | ||
5929 | |||
5930 | /* | ||
5931 | * stop monitoring: | ||
5932 | * This is the last instruction which may generate an overflow | ||
5933 | * | ||
5934 | * We do not need to set psr.sp because it is irrelevant in the kernel. | ||
5935 | * It will be restored from ipsr when going back to user level | ||
5936 | */ | ||
5937 | pfm_clear_psr_up(); | ||
5938 | |||
5939 | /* | ||
5940 | * keep a copy of psr.up (for reload) | ||
5941 | */ | ||
5942 | ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; | ||
5943 | } | ||
5944 | |||
5945 | static void | ||
5946 | pfm_lazy_save_regs (struct task_struct *task) | ||
5947 | { | ||
5948 | pfm_context_t *ctx; | ||
5949 | struct thread_struct *t; | ||
5950 | unsigned long flags; | ||
5951 | |||
5952 | { u64 psr = pfm_get_psr(); | ||
5953 | BUG_ON(psr & IA64_PSR_UP); | ||
5954 | } | ||
5955 | |||
5956 | ctx = PFM_GET_CTX(task); | ||
5957 | t = &task->thread; | ||
5958 | |||
5959 | /* | ||
5960 | * we need to mask PMU overflow here to | ||
5961 | * make sure that we maintain pmc0 until | ||
5962 | * we save it. overflow interrupts are | ||
5963 | * treated as spurious if there is no | ||
5964 | * owner. | ||
5965 | * | ||
5966 | * XXX: I don't think this is necessary | ||
5967 | */ | ||
5968 | PROTECT_CTX(ctx,flags); | ||
5969 | |||
5970 | /* | ||
5971 | * release ownership of this PMU. | ||
5972 | * must be done before we save the registers. | ||
5973 | * | ||
5974 | * after this call any PMU interrupt is treated | ||
5975 | * as spurious. | ||
5976 | */ | ||
5977 | SET_PMU_OWNER(NULL, NULL); | ||
5978 | |||
5979 | /* | ||
5980 | * save all the pmds we use | ||
5981 | */ | ||
5982 | pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]); | ||
5983 | |||
5984 | /* | ||
5985 | * save pmc0; ia64_srlz_d() was done in pfm_save_pmds(). | ||
5986 | * it is needed to check for pending overflow | ||
5987 | * on the restore path | ||
5988 | */ | ||
5989 | t->pmcs[0] = ia64_get_pmc(0); | ||
5990 | |||
5991 | /* | ||
5992 | * unfreeze the PMU if it had pending overflows | ||
5993 | */ | ||
5994 | if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); | ||
5995 | |||
5996 | /* | ||
5997 | * now we can unmask PMU interrupts; they will | ||
5998 | * be treated as purely spurious and we will not | ||
5999 | * lose any information | ||
6000 | */ | ||
6001 | UNPROTECT_CTX(ctx,flags); | ||
6002 | } | ||
6003 | #endif /* CONFIG_SMP */ | ||
6004 | |||
6005 | #ifdef CONFIG_SMP | ||
6006 | /* | ||
6007 | * in 2.6, interrupts are masked when we come here and the runqueue lock is held | ||
6008 | */ | ||
6009 | void | ||
6010 | pfm_load_regs (struct task_struct *task) | ||
6011 | { | ||
6012 | pfm_context_t *ctx; | ||
6013 | struct thread_struct *t; | ||
6014 | unsigned long pmc_mask = 0UL, pmd_mask = 0UL; | ||
6015 | unsigned long flags; | ||
6016 | u64 psr, psr_up; | ||
6017 | int need_irq_resend; | ||
6018 | |||
6019 | ctx = PFM_GET_CTX(task); | ||
6020 | if (unlikely(ctx == NULL)) return; | ||
6021 | |||
6022 | BUG_ON(GET_PMU_OWNER()); | ||
6023 | |||
6024 | t = &task->thread; | ||
6025 | /* | ||
6026 | * possible on unload | ||
6027 | */ | ||
6028 | if (unlikely((t->flags & IA64_THREAD_PM_VALID) == 0)) return; | ||
6029 | |||
6030 | /* | ||
6031 | * we always come here with interrupts ALREADY disabled by | ||
6032 | * the scheduler. So we simply need to protect against concurrent | ||
6033 | * access, not CPU concurrency. | ||
6034 | */ | ||
6035 | flags = pfm_protect_ctx_ctxsw(ctx); | ||
6036 | psr = pfm_get_psr(); | ||
6037 | |||
6038 | need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; | ||
6039 | |||
6040 | BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); | ||
6041 | BUG_ON(psr & IA64_PSR_I); | ||
6042 | |||
6043 | if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) { | ||
6044 | struct pt_regs *regs = ia64_task_regs(task); | ||
6045 | |||
6046 | BUG_ON(ctx->ctx_smpl_hdr); | ||
6047 | |||
6048 | pfm_force_cleanup(ctx, regs); | ||
6049 | |||
6050 | pfm_unprotect_ctx_ctxsw(ctx, flags); | ||
6051 | |||
6052 | /* | ||
6053 | * this one (kmalloc'ed) is fine with interrupts disabled | ||
6054 | */ | ||
6055 | pfm_context_free(ctx); | ||
6056 | |||
6057 | return; | ||
6058 | } | ||
6059 | |||
6060 | /* | ||
6061 | * we restore ALL the debug registers to avoid picking up | ||
6062 | * stale state. | ||
6063 | */ | ||
6064 | if (ctx->ctx_fl_using_dbreg) { | ||
6065 | pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); | ||
6066 | pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); | ||
6067 | } | ||
6068 | /* | ||
6069 | * retrieve saved psr.up | ||
6070 | */ | ||
6071 | psr_up = ctx->ctx_saved_psr_up; | ||
6072 | |||
6073 | /* | ||
6074 | * if we were the last user of the PMU on that CPU, | ||
6075 | * then nothing to do except restore psr | ||
6076 | */ | ||
6077 | if (GET_LAST_CPU(ctx) == smp_processor_id() && ctx->ctx_last_activation == GET_ACTIVATION()) { | ||
6078 | |||
6079 | /* | ||
6080 | * retrieve partial reload masks (due to user modifications) | ||
6081 | */ | ||
6082 | pmc_mask = ctx->ctx_reload_pmcs[0]; | ||
6083 | pmd_mask = ctx->ctx_reload_pmds[0]; | ||
6084 | |||
6085 | } else { | ||
6086 | /* | ||
6087 | * To avoid leaking information to the user level when psr.sp=0, | ||
6088 | * we must reload ALL implemented pmds (even the ones we don't use). | ||
6089 | * In the kernel we only allow PFM_READ_PMDS on registers which | ||
6090 | * we initialized or requested (sampling) so there is no risk there. | ||
6091 | */ | ||
6092 | pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0]; | ||
6093 | |||
6094 | /* | ||
6095 | * ALL accessible PMCs are systematically reloaded, unused registers | ||
6096 | * get their default (from pfm_reset_pmu_state()) values to avoid picking | ||
6097 | * up stale configuration. | ||
6098 | * | ||
6099 | * PMC0 is never in the mask. It is always restored separately. | ||
6100 | */ | ||
6101 | pmc_mask = ctx->ctx_all_pmcs[0]; | ||
6102 | } | ||
6103 | /* | ||
6104 | * when context is MASKED, we will restore PMC with plm=0 | ||
6105 | * and PMD with stale information, but that's ok, nothing | ||
6106 | * will be captured. | ||
6107 | * | ||
6108 | * XXX: optimize here | ||
6109 | */ | ||
6110 | if (pmd_mask) pfm_restore_pmds(t->pmds, pmd_mask); | ||
6111 | if (pmc_mask) pfm_restore_pmcs(t->pmcs, pmc_mask); | ||
6112 | |||
6113 | /* | ||
6114 | * check for pending overflow at the time the state | ||
6115 | * was saved. | ||
6116 | */ | ||
6117 | if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { | ||
6118 | /* | ||
6119 | * reload pmc0 with the overflow information | ||
6120 | * On McKinley PMU, this will trigger a PMU interrupt | ||
6121 | */ | ||
6122 | ia64_set_pmc(0, t->pmcs[0]); | ||
6123 | ia64_srlz_d(); | ||
6124 | t->pmcs[0] = 0UL; | ||
6125 | |||
6126 | /* | ||
6127 | * will replay the PMU interrupt | ||
6128 | */ | ||
6129 | if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR); | ||
6130 | |||
6131 | pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; | ||
6132 | } | ||
6133 | |||
6134 | /* | ||
6135 | * we just did a reload, so we reset the partial reload fields | ||
6136 | */ | ||
6137 | ctx->ctx_reload_pmcs[0] = 0UL; | ||
6138 | ctx->ctx_reload_pmds[0] = 0UL; | ||
6139 | |||
6140 | SET_LAST_CPU(ctx, smp_processor_id()); | ||
6141 | |||
6142 | /* | ||
6143 | * bump the activation value for this PMU | ||
6144 | */ | ||
6145 | INC_ACTIVATION(); | ||
6146 | /* | ||
6147 | * record current activation for this context | ||
6148 | */ | ||
6149 | SET_ACTIVATION(ctx); | ||
6150 | |||
6151 | /* | ||
6152 | * establish new ownership. | ||
6153 | */ | ||
6154 | SET_PMU_OWNER(task, ctx); | ||
6155 | |||
6156 | /* | ||
6157 | * restore the psr.up bit. measurement | ||
6158 | * is active again. | ||
6159 | * no PMU interrupt can happen at this point | ||
6160 | * because we still have interrupts disabled. | ||
6161 | */ | ||
6162 | if (likely(psr_up)) pfm_set_psr_up(); | ||
6163 | |||
6164 | /* | ||
6165 | * allow concurrent access to context | ||
6166 | */ | ||
6167 | pfm_unprotect_ctx_ctxsw(ctx, flags); | ||
6168 | } | ||
6169 | #else /* !CONFIG_SMP */ | ||
6170 | /* | ||
6171 | * reload PMU state for UP kernels | ||
6172 | * in 2.5 we come here with interrupts disabled | ||
6173 | */ | ||
6174 | void | ||
6175 | pfm_load_regs (struct task_struct *task) | ||
6176 | { | ||
6177 | struct thread_struct *t; | ||
6178 | pfm_context_t *ctx; | ||
6179 | struct task_struct *owner; | ||
6180 | unsigned long pmd_mask, pmc_mask; | ||
6181 | u64 psr, psr_up; | ||
6182 | int need_irq_resend; | ||
6183 | |||
6184 | owner = GET_PMU_OWNER(); | ||
6185 | ctx = PFM_GET_CTX(task); | ||
6186 | t = &task->thread; | ||
6187 | psr = pfm_get_psr(); | ||
6188 | |||
6189 | BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); | ||
6190 | BUG_ON(psr & IA64_PSR_I); | ||
6191 | |||
6192 | /* | ||
6193 | * we restore ALL the debug registers to avoid picking up | ||
6194 | * stale state. | ||
6195 | * | ||
6196 | * This must be done even when the task is still the owner | ||
6197 | * as the registers may have been modified via ptrace() | ||
6198 | * (not perfmon) by the previous task. | ||
6199 | */ | ||
6200 | if (ctx->ctx_fl_using_dbreg) { | ||
6201 | pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); | ||
6202 | pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); | ||
6203 | } | ||
6204 | |||
6205 | /* | ||
6206 | * retrieve saved psr.up | ||
6207 | */ | ||
6208 | psr_up = ctx->ctx_saved_psr_up; | ||
6209 | need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; | ||
6210 | |||
6211 | /* | ||
6212 | * short path, our state is still there, just | ||
6213 | * need to restore psr and we go | ||
6214 | * | ||
6215 | * we do not touch either PMC nor PMD. the psr is not touched | ||
6216 | * by the overflow_handler. So we are safe w.r.t. interrupt | ||
6217 | * concurrency even without interrupt masking. | ||
6218 | */ | ||
6219 | if (likely(owner == task)) { | ||
6220 | if (likely(psr_up)) pfm_set_psr_up(); | ||
6221 | return; | ||
6222 | } | ||
6223 | |||
6224 | /* | ||
6225 | * someone else is still using the PMU, first push it out and | ||
6226 | * then we'll be able to install our stuff ! | ||
6227 | * | ||
6228 | * Upon return, there will be no owner for the current PMU | ||
6229 | */ | ||
6230 | if (owner) pfm_lazy_save_regs(owner); | ||
6231 | |||
6232 | /* | ||
6233 | * To avoid leaking information to the user level when psr.sp=0, | ||
6234 | * we must reload ALL implemented pmds (even the ones we don't use). | ||
6235 | * In the kernel we only allow PFM_READ_PMDS on registers which | ||
6236 | * we initialized or requested (sampling) so there is no risk there. | ||
6237 | */ | ||
6238 | pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0]; | ||
6239 | |||
6240 | /* | ||
6241 | * ALL accessible PMCs are systematically reloaded, unused registers | ||
6242 | * get their default (from pfm_reset_pmu_state()) values to avoid picking | ||
6243 | * up stale configuration. | ||
6244 | * | ||
6245 | * PMC0 is never in the mask. It is always restored separately | ||
6246 | */ | ||
6247 | pmc_mask = ctx->ctx_all_pmcs[0]; | ||
6248 | |||
6249 | pfm_restore_pmds(t->pmds, pmd_mask); | ||
6250 | pfm_restore_pmcs(t->pmcs, pmc_mask); | ||
6251 | |||
6252 | /* | ||
6253 | * check for pending overflow at the time the state | ||
6254 | * was saved. | ||
6255 | */ | ||
6256 | if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { | ||
6257 | /* | ||
6258 | * reload pmc0 with the overflow information | ||
6259 | * On McKinley PMU, this will trigger a PMU interrupt | ||
6260 | */ | ||
6261 | ia64_set_pmc(0, t->pmcs[0]); | ||
6262 | ia64_srlz_d(); | ||
6263 | |||
6264 | t->pmcs[0] = 0UL; | ||
6265 | |||
6266 | /* | ||
6267 | * will replay the PMU interrupt | ||
6268 | */ | ||
6269 | if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR); | ||
6270 | |||
6271 | pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; | ||
6272 | } | ||
6273 | |||
6274 | /* | ||
6275 | * establish new ownership. | ||
6276 | */ | ||
6277 | SET_PMU_OWNER(task, ctx); | ||
6278 | |||
6279 | /* | ||
6280 | * restore the psr.up bit. measurement | ||
6281 | * is active again. | ||
6282 | * no PMU interrupt can happen at this point | ||
6283 | * because we still have interrupts disabled. | ||
6284 | */ | ||
6285 | if (likely(psr_up)) pfm_set_psr_up(); | ||
6286 | } | ||
6287 | #endif /* CONFIG_SMP */ | ||
6288 | |||
6289 | /* | ||
6290 | * this function assumes monitoring is stopped | ||
6291 | */ | ||
6292 | static void | ||
6293 | pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) | ||
6294 | { | ||
6295 | u64 pmc0; | ||
6296 | unsigned long mask2, val, pmd_val, ovfl_val; | ||
6297 | int i, can_access_pmu = 0; | ||
6298 | int is_self; | ||
6299 | |||
6300 | /* | ||
6301 | * is the caller the task being monitored (or which initiated the | ||
6302 | * session for system wide measurements)? | ||
6303 | */ | ||
6304 | is_self = ctx->ctx_task == task ? 1 : 0; | ||
6305 | |||
6306 | /* | ||
6307 | * can access PMU if task is the owner of the PMU state on the current CPU | ||
6308 | * or if we are running on the CPU bound to the context in system-wide mode | ||
6309 | * (that is not necessarily the task the context is attached to in this mode). | ||
6310 | * In system-wide we always have can_access_pmu true because a task running on an | ||
6311 | * invalid processor is flagged earlier in the call stack (see pfm_stop). | ||
6312 | */ | ||
6313 | can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id()); | ||
6314 | if (can_access_pmu) { | ||
6315 | /* | ||
6316 | * Mark the PMU as not owned | ||
6317 | * This will cause the interrupt handler to do nothing in case an overflow | ||
6318 | * interrupt was in-flight | ||
6319 | * This also guarantees that pmc0 will contain the final state | ||
6320 | * It virtually gives us full control on overflow processing from that point | ||
6321 | * on. | ||
6322 | */ | ||
6323 | SET_PMU_OWNER(NULL, NULL); | ||
6324 | DPRINT(("releasing ownership\n")); | ||
6325 | |||
6326 | /* | ||
6327 | * read current overflow status: | ||
6328 | * | ||
6329 | * we are guaranteed to read the final stable state | ||
6330 | */ | ||
6331 | ia64_srlz_d(); | ||
6332 | pmc0 = ia64_get_pmc(0); /* slow */ | ||
6333 | |||
6334 | /* | ||
6335 | * reset freeze bit, overflow status information destroyed | ||
6336 | */ | ||
6337 | pfm_unfreeze_pmu(); | ||
6338 | } else { | ||
6339 | pmc0 = task->thread.pmcs[0]; | ||
6340 | /* | ||
6341 | * clear whatever overflow status bits there were | ||
6342 | */ | ||
6343 | task->thread.pmcs[0] = 0; | ||
6344 | } | ||
6345 | ovfl_val = pmu_conf->ovfl_val; | ||
6346 | /* | ||
6347 | * we save all the used pmds | ||
6348 | * we take care of overflows for counting PMDs | ||
6349 | * | ||
6350 | * XXX: sampling situation is not taken into account here | ||
6351 | */ | ||
6352 | mask2 = ctx->ctx_used_pmds[0]; | ||
6353 | |||
6354 | DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2)); | ||
6355 | |||
6356 | for (i = 0; mask2; i++, mask2>>=1) { | ||
6357 | |||
6358 | /* skip non used pmds */ | ||
6359 | if ((mask2 & 0x1) == 0) continue; | ||
6360 | |||
6361 | /* | ||
6362 | * can access PMU always true in system wide mode | ||
6363 | */ | ||
6364 | val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : task->thread.pmds[i]; | ||
6365 | |||
6366 | if (PMD_IS_COUNTING(i)) { | ||
6367 | DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n", | ||
6368 | task->pid, | ||
6369 | i, | ||
6370 | ctx->ctx_pmds[i].val, | ||
6371 | val & ovfl_val)); | ||
6372 | |||
6373 | /* | ||
6374 | * we rebuild the full 64 bit value of the counter | ||
6375 | */ | ||
6376 | val = ctx->ctx_pmds[i].val + (val & ovfl_val); | ||
6377 | |||
6378 | /* | ||
6379 | * now everything is in ctx_pmds[] and we need | ||
6380 | * to clear the saved context from save_regs() such that | ||
6381 | * pfm_read_pmds() gets the correct value | ||
6382 | */ | ||
6383 | pmd_val = 0UL; | ||
6384 | |||
6385 | /* | ||
6386 | * take care of overflow inline | ||
6387 | */ | ||
6388 | if (pmc0 & (1UL << i)) { | ||
6389 | val += 1 + ovfl_val; | ||
6390 | DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i)); | ||
6391 | } | ||
6392 | } | ||
6393 | |||
6394 | DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val)); | ||
6395 | |||
6396 | if (is_self) task->thread.pmds[i] = pmd_val; | ||
6397 | |||
6398 | ctx->ctx_pmds[i].val = val; | ||
6399 | } | ||
6400 | } | ||
6401 | |||
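pfm_flush_pmds() virtualizes each counting PMD to 64 bits: the software part lives in ctx_pmds[i].val, only the low ovfl_val bits come from hardware, and a set bit in pmc0 means one additional, not-yet-accounted wrap. A standalone sketch of that arithmetic, assuming a 32-bit hardware counter (ovfl_val = 2^32 - 1, as in the generic and Itanium configurations further down):

#include <stdio.h>

/* rebuild the full 64-bit counter value, as pfm_flush_pmds() does */
static unsigned long long
rebuild_pmd(unsigned long long soft_val,   /* ctx_pmds[i].val            */
            unsigned long long hw_val,     /* raw PMD read from hardware */
            unsigned long long ovfl_val,   /* mask of implemented bits   */
            int overflowed)                /* pmc0 bit for this counter  */
{
	unsigned long long val = soft_val + (hw_val & ovfl_val);

	if (overflowed)
		val += 1 + ovfl_val;   /* account for one in-flight wrap */
	return val;
}

int main(void)
{
	unsigned long long ovfl_val = (1ULL << 32) - 1;

	/* a wrap already folded into the software part: prints 0x100000010 */
	printf("0x%llx\n", rebuild_pmd(1ULL << 32, 0x10, ovfl_val, 0));
	/* pending overflow flagged in pmc0: same final value */
	printf("0x%llx\n", rebuild_pmd(0, 0x10, ovfl_val, 1));
	return 0;
}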
6402 | static struct irqaction perfmon_irqaction = { | ||
6403 | .handler = pfm_interrupt_handler, | ||
6404 | .flags = SA_INTERRUPT, | ||
6405 | .name = "perfmon" | ||
6406 | }; | ||
6407 | |||
6408 | /* | ||
6409 | * perfmon initialization routine, called from the initcall() table | ||
6410 | */ | ||
6411 | static int init_pfm_fs(void); | ||
6412 | |||
6413 | static int __init | ||
6414 | pfm_probe_pmu(void) | ||
6415 | { | ||
6416 | pmu_config_t **p; | ||
6417 | int family; | ||
6418 | |||
6419 | family = local_cpu_data->family; | ||
6420 | p = pmu_confs; | ||
6421 | |||
6422 | while(*p) { | ||
6423 | if ((*p)->probe) { | ||
6424 | if ((*p)->probe() == 0) goto found; | ||
6425 | } else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff) { | ||
6426 | goto found; | ||
6427 | } | ||
6428 | p++; | ||
6429 | } | ||
6430 | return -1; | ||
6431 | found: | ||
6432 | pmu_conf = *p; | ||
6433 | return 0; | ||
6434 | } | ||
6435 | |||
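pfm_probe_pmu() prefers an explicit probe() callback and otherwise falls back to a CPUID family match, with 0xff acting as a wildcard (the generic table below relies on that). A hedged, standalone mock of the selection loop; the table contents and the mock_probe() callback are hypothetical:

#include <stdio.h>

typedef struct {
	const char *pmu_name;
	int         pmu_family;    /* 0xff matches any family   */
	int       (*probe)(void);  /* optional; 0 means "found" */
} conf_t;

static int mock_probe(void) { return -1; }   /* hypothetical: never claims */

static conf_t mck = { "McKinley", 0x1f, mock_probe };
static conf_t ita = { "Itanium",  0x07, NULL };
static conf_t gen = { "Generic",  0xff, NULL };
static conf_t *confs[] = { &mck, &ita, &gen, NULL };

/* mirrors the while (*p) loop above: probe() wins, else family match */
static conf_t *probe_pmu(int family)
{
	conf_t **p;

	for (p = confs; *p; p++) {
		if ((*p)->probe) {
			if ((*p)->probe() == 0) return *p;
		} else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff)
			return *p;
	}
	return NULL;
}

int main(void)
{
	printf("%s\n", probe_pmu(0x07)->pmu_name);  /* Itanium            */
	printf("%s\n", probe_pmu(0x42)->pmu_name);  /* Generic (wildcard) */
	return 0;
}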
6436 | static struct file_operations pfm_proc_fops = { | ||
6437 | .open = pfm_proc_open, | ||
6438 | .read = seq_read, | ||
6439 | .llseek = seq_lseek, | ||
6440 | .release = seq_release, | ||
6441 | }; | ||
6442 | |||
6443 | int __init | ||
6444 | pfm_init(void) | ||
6445 | { | ||
6446 | unsigned int n, n_counters, i; | ||
6447 | |||
6448 | printk("perfmon: version %u.%u IRQ %u\n", | ||
6449 | PFM_VERSION_MAJ, | ||
6450 | PFM_VERSION_MIN, | ||
6451 | IA64_PERFMON_VECTOR); | ||
6452 | |||
6453 | if (pfm_probe_pmu()) { | ||
6454 | printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n", | ||
6455 | local_cpu_data->family); | ||
6456 | return -ENODEV; | ||
6457 | } | ||
6458 | |||
6459 | /* | ||
6460 | * compute the number of implemented PMD/PMC from the | ||
6461 | * description tables | ||
6462 | */ | ||
6463 | n = 0; | ||
6464 | for (i=0; PMC_IS_LAST(i) == 0; i++) { | ||
6465 | if (PMC_IS_IMPL(i) == 0) continue; | ||
6466 | pmu_conf->impl_pmcs[i>>6] |= 1UL << (i&63); | ||
6467 | n++; | ||
6468 | } | ||
6469 | pmu_conf->num_pmcs = n; | ||
6470 | |||
6471 | n = 0; n_counters = 0; | ||
6472 | for (i=0; PMD_IS_LAST(i) == 0; i++) { | ||
6473 | if (PMD_IS_IMPL(i) == 0) continue; | ||
6474 | pmu_conf->impl_pmds[i>>6] |= 1UL << (i&63); | ||
6475 | n++; | ||
6476 | if (PMD_IS_COUNTING(i)) n_counters++; | ||
6477 | } | ||
6478 | pmu_conf->num_pmds = n; | ||
6479 | pmu_conf->num_counters = n_counters; | ||
6480 | |||
6481 | /* | ||
6482 | * sanity checks on the number of debug registers | ||
6483 | */ | ||
6484 | if (pmu_conf->use_rr_dbregs) { | ||
6485 | if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) { | ||
6486 | printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs); | ||
6487 | pmu_conf = NULL; | ||
6488 | return -1; | ||
6489 | } | ||
6490 | if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) { | ||
6491 | printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_dbrs); | ||
6492 | pmu_conf = NULL; | ||
6493 | return -1; | ||
6494 | } | ||
6495 | } | ||
6496 | |||
6497 | printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n", | ||
6498 | pmu_conf->pmu_name, | ||
6499 | pmu_conf->num_pmcs, | ||
6500 | pmu_conf->num_pmds, | ||
6501 | pmu_conf->num_counters, | ||
6502 | ffz(pmu_conf->ovfl_val)); | ||
6503 | |||
6504 | /* sanity check */ | ||
6505 | if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS || pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) { | ||
6506 | printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n"); | ||
6507 | pmu_conf = NULL; | ||
6508 | return -1; | ||
6509 | } | ||
6510 | |||
6511 | /* | ||
6512 | * create /proc/perfmon (mostly for debugging purposes) | ||
6513 | */ | ||
6514 | perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL); | ||
6515 | if (perfmon_dir == NULL) { | ||
6516 | printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n"); | ||
6517 | pmu_conf = NULL; | ||
6518 | return -1; | ||
6519 | } | ||
6520 | /* | ||
6521 | * install customized file operations for /proc/perfmon entry | ||
6522 | */ | ||
6523 | perfmon_dir->proc_fops = &pfm_proc_fops; | ||
6524 | |||
6525 | /* | ||
6526 | * create /proc/sys/kernel/perfmon (for debugging purposes) | ||
6527 | */ | ||
6528 | pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0); | ||
6529 | |||
6530 | /* | ||
6531 | * initialize all our spinlocks | ||
6532 | */ | ||
6533 | spin_lock_init(&pfm_sessions.pfs_lock); | ||
6534 | spin_lock_init(&pfm_buffer_fmt_lock); | ||
6535 | |||
6536 | init_pfm_fs(); | ||
6537 | |||
6538 | for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL; | ||
6539 | |||
6540 | return 0; | ||
6541 | } | ||
6542 | |||
6543 | __initcall(pfm_init); | ||
6544 | |||
6545 | /* | ||
6546 | * this function is called before pfm_init() | ||
6547 | */ | ||
6548 | void | ||
6549 | pfm_init_percpu (void) | ||
6550 | { | ||
6551 | /* | ||
6552 | * make sure no measurement is active | ||
6553 | * (may inherit programmed PMCs from EFI). | ||
6554 | */ | ||
6555 | pfm_clear_psr_pp(); | ||
6556 | pfm_clear_psr_up(); | ||
6557 | |||
6558 | /* | ||
6559 | * we run with the PMU not frozen at all times | ||
6560 | */ | ||
6561 | pfm_unfreeze_pmu(); | ||
6562 | |||
6563 | if (smp_processor_id() == 0) | ||
6564 | register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); | ||
6565 | |||
6566 | ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); | ||
6567 | ia64_srlz_d(); | ||
6568 | } | ||
6569 | |||
6570 | /* | ||
6571 | * used for debug purposes only | ||
6572 | */ | ||
6573 | void | ||
6574 | dump_pmu_state(const char *from) | ||
6575 | { | ||
6576 | struct task_struct *task; | ||
6577 | struct thread_struct *t; | ||
6578 | struct pt_regs *regs; | ||
6579 | pfm_context_t *ctx; | ||
6580 | unsigned long psr, dcr, info, flags; | ||
6581 | int i, this_cpu; | ||
6582 | |||
6583 | local_irq_save(flags); | ||
6584 | |||
6585 | this_cpu = smp_processor_id(); | ||
6586 | regs = ia64_task_regs(current); | ||
6587 | info = PFM_CPUINFO_GET(); | ||
6588 | dcr = ia64_getreg(_IA64_REG_CR_DCR); | ||
6589 | |||
6590 | if (info == 0 && ia64_psr(regs)->pp == 0 && (dcr & IA64_DCR_PP) == 0) { | ||
6591 | local_irq_restore(flags); | ||
6592 | return; | ||
6593 | } | ||
6594 | |||
6595 | printk("CPU%d from %s() current [%d] iip=0x%lx %s\n", | ||
6596 | this_cpu, | ||
6597 | from, | ||
6598 | current->pid, | ||
6599 | regs->cr_iip, | ||
6600 | current->comm); | ||
6601 | |||
6602 | task = GET_PMU_OWNER(); | ||
6603 | ctx = GET_PMU_CTX(); | ||
6604 | |||
6605 | printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx); | ||
6606 | |||
6607 | psr = pfm_get_psr(); | ||
6608 | |||
6609 | printk("->CPU%d pmc0=0x%lx psr.pp=%d psr.up=%d dcr.pp=%d syst_info=0x%lx user_psr.up=%d user_psr.pp=%d\n", | ||
6610 | this_cpu, | ||
6611 | ia64_get_pmc(0), | ||
6612 | psr & IA64_PSR_PP ? 1 : 0, | ||
6613 | psr & IA64_PSR_UP ? 1 : 0, | ||
6614 | dcr & IA64_DCR_PP ? 1 : 0, | ||
6615 | info, | ||
6616 | ia64_psr(regs)->up, | ||
6617 | ia64_psr(regs)->pp); | ||
6618 | |||
6619 | ia64_psr(regs)->up = 0; | ||
6620 | ia64_psr(regs)->pp = 0; | ||
6621 | |||
6622 | t = ¤t->thread; | ||
6623 | |||
6624 | for (i=1; PMC_IS_LAST(i) == 0; i++) { | ||
6625 | if (PMC_IS_IMPL(i) == 0) continue; | ||
6626 | printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, t->pmcs[i]); | ||
6627 | } | ||
6628 | |||
6629 | for (i=1; PMD_IS_LAST(i) == 0; i++) { | ||
6630 | if (PMD_IS_IMPL(i) == 0) continue; | ||
6631 | printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, t->pmds[i]); | ||
6632 | } | ||
6633 | |||
6634 | if (ctx) { | ||
6635 | printk("->CPU%d ctx_state=%d vaddr=%p addr=%p msgq_head=%d msgq_tail=%d saved_psr_up=0x%lx\n", | ||
6636 | this_cpu, | ||
6637 | ctx->ctx_state, | ||
6638 | ctx->ctx_smpl_vaddr, | ||
6639 | ctx->ctx_smpl_hdr, | ||
6640 | ctx->ctx_msgq_head, | ||
6641 | ctx->ctx_msgq_tail, | ||
6642 | ctx->ctx_saved_psr_up); | ||
6643 | } | ||
6644 | local_irq_restore(flags); | ||
6645 | } | ||
6646 | |||
6647 | /* | ||
6648 | * called from process.c:copy_thread(). task is new child. | ||
6649 | */ | ||
6650 | void | ||
6651 | pfm_inherit(struct task_struct *task, struct pt_regs *regs) | ||
6652 | { | ||
6653 | struct thread_struct *thread; | ||
6654 | |||
6655 | DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid)); | ||
6656 | |||
6657 | thread = &task->thread; | ||
6658 | |||
6659 | /* | ||
6660 | * cut links inherited from parent (current) | ||
6661 | */ | ||
6662 | thread->pfm_context = NULL; | ||
6663 | |||
6664 | PFM_SET_WORK_PENDING(task, 0); | ||
6665 | |||
6666 | /* | ||
6667 | * the psr bits are already set properly in copy_threads() | ||
6668 | */ | ||
6669 | } | ||
6670 | #else /* !CONFIG_PERFMON */ | ||
6671 | asmlinkage long | ||
6672 | sys_perfmonctl (int fd, int cmd, void *arg, int count) | ||
6673 | { | ||
6674 | return -ENOSYS; | ||
6675 | } | ||
6676 | #endif /* CONFIG_PERFMON */ | ||
diff --git a/arch/ia64/kernel/perfmon_default_smpl.c b/arch/ia64/kernel/perfmon_default_smpl.c new file mode 100644 index 000000000000..965d29004555 --- /dev/null +++ b/arch/ia64/kernel/perfmon_default_smpl.c | |||
@@ -0,0 +1,306 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2002-2003 Hewlett-Packard Co | ||
3 | * Stephane Eranian <eranian@hpl.hp.com> | ||
4 | * | ||
5 | * This file implements the default sampling buffer format | ||
6 | * for the Linux/ia64 perfmon-2 subsystem. | ||
7 | */ | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/types.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/config.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <asm/delay.h> | ||
14 | #include <linux/smp.h> | ||
15 | |||
16 | #include <asm/perfmon.h> | ||
17 | #include <asm/perfmon_default_smpl.h> | ||
18 | |||
19 | MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); | ||
20 | MODULE_DESCRIPTION("perfmon default sampling format"); | ||
21 | MODULE_LICENSE("GPL"); | ||
22 | |||
23 | MODULE_PARM(debug, "i"); | ||
24 | MODULE_PARM_DESC(debug, "debug"); | ||
25 | |||
26 | MODULE_PARM(debug_ovfl, "i"); | ||
27 | MODULE_PARM_DESC(debug_ovfl, "debug ovfl"); | ||
28 | |||
29 | |||
30 | #define DEFAULT_DEBUG 1 | ||
31 | |||
32 | #ifdef DEFAULT_DEBUG | ||
33 | #define DPRINT(a) \ | ||
34 | do { \ | ||
35 | if (unlikely(debug >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ | ||
36 | } while (0) | ||
37 | |||
38 | #define DPRINT_ovfl(a) \ | ||
39 | do { \ | ||
40 | if (unlikely(debug_ovfl >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ | ||
41 | } while (0) | ||
42 | |||
43 | #else | ||
44 | #define DPRINT(a) | ||
45 | #define DPRINT_ovfl(a) | ||
46 | #endif | ||
47 | |||
48 | static int debug, debug_ovfl; | ||
49 | |||
50 | static int | ||
51 | default_validate(struct task_struct *task, unsigned int flags, int cpu, void *data) | ||
52 | { | ||
53 | pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t*)data; | ||
54 | int ret = 0; | ||
55 | |||
56 | if (data == NULL) { | ||
57 | DPRINT(("[%d] no argument passed\n", task->pid)); | ||
58 | return -EINVAL; | ||
59 | } | ||
60 | |||
61 | DPRINT(("[%d] validate flags=0x%x CPU%d\n", task->pid, flags, cpu)); | ||
62 | |||
63 | /* | ||
64 | * must hold at least the buffer header + one minimally sized entry | ||
65 | */ | ||
66 | if (arg->buf_size < PFM_DEFAULT_SMPL_MIN_BUF_SIZE) return -EINVAL; | ||
67 | |||
68 | DPRINT(("buf_size=%lu\n", arg->buf_size)); | ||
69 | |||
70 | return ret; | ||
71 | } | ||
72 | |||
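default_validate() only enforces the floor (header plus one maximally sized entry); the usable capacity follows from the entry layout used by default_handler() below, where each sample occupies the fixed entry record plus 8 bytes per recorded PMD. A hedged sizing helper; HDR_SIZE and ENTRY_HDR_SIZE are placeholders for the real sizeof() values from the perfmon headers:

#include <stdio.h>
#include <stddef.h>

#define HDR_SIZE       40   /* placeholder for sizeof(pfm_default_smpl_hdr_t)   */
#define ENTRY_HDR_SIZE 64   /* placeholder for sizeof(pfm_default_smpl_entry_t) */

/* buffer size needed for nsamples entries, each recording npmds 64-bit PMDs */
static size_t smpl_buf_size(size_t nsamples, size_t npmds)
{
	return HDR_SIZE + nsamples * (ENTRY_HDR_SIZE + (npmds << 3));
}

int main(void)
{
	/* 1000 samples, 4 PMDs each: header + 1000 * (64 + 32) bytes */
	printf("%zu bytes\n", smpl_buf_size(1000, 4));
	return 0;
}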
73 | static int | ||
74 | default_get_size(struct task_struct *task, unsigned int flags, int cpu, void *data, unsigned long *size) | ||
75 | { | ||
76 | pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; | ||
77 | |||
78 | /* | ||
79 | * size has been validated in default_validate | ||
80 | */ | ||
81 | *size = arg->buf_size; | ||
82 | |||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | static int | ||
87 | default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *data) | ||
88 | { | ||
89 | pfm_default_smpl_hdr_t *hdr; | ||
90 | pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; | ||
91 | |||
92 | hdr = (pfm_default_smpl_hdr_t *)buf; | ||
93 | |||
94 | hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; | ||
95 | hdr->hdr_buf_size = arg->buf_size; | ||
96 | hdr->hdr_cur_offs = sizeof(*hdr); | ||
97 | hdr->hdr_overflows = 0UL; | ||
98 | hdr->hdr_count = 0UL; | ||
99 | |||
100 | DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n", | ||
101 | task->pid, | ||
102 | buf, | ||
103 | hdr->hdr_buf_size, | ||
104 | sizeof(*hdr), | ||
105 | hdr->hdr_version, | ||
106 | hdr->hdr_cur_offs)); | ||
107 | |||
108 | return 0; | ||
109 | } | ||
110 | |||
111 | static int | ||
112 | default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp) | ||
113 | { | ||
114 | pfm_default_smpl_hdr_t *hdr; | ||
115 | pfm_default_smpl_entry_t *ent; | ||
116 | void *cur, *last; | ||
117 | unsigned long *e, entry_size; | ||
118 | unsigned int npmds, i; | ||
119 | unsigned char ovfl_pmd; | ||
120 | unsigned char ovfl_notify; | ||
121 | |||
122 | if (unlikely(buf == NULL || arg == NULL || regs == NULL || task == NULL)) { | ||
123 | DPRINT(("[%d] invalid arguments buf=%p arg=%p\n", task->pid, buf, arg)); | ||
124 | return -EINVAL; | ||
125 | } | ||
126 | |||
127 | hdr = (pfm_default_smpl_hdr_t *)buf; | ||
128 | cur = buf+hdr->hdr_cur_offs; | ||
129 | last = buf+hdr->hdr_buf_size; | ||
130 | ovfl_pmd = arg->ovfl_pmd; | ||
131 | ovfl_notify = arg->ovfl_notify; | ||
132 | |||
133 | /* | ||
134 | * precheck for sanity | ||
135 | */ | ||
136 | if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; | ||
137 | |||
138 | npmds = hweight64(arg->smpl_pmds[0]); | ||
139 | |||
140 | ent = (pfm_default_smpl_entry_t *)cur; | ||
141 | |||
142 | prefetch(arg->smpl_pmds_values); | ||
143 | |||
144 | entry_size = sizeof(*ent) + (npmds << 3); | ||
145 | |||
146 | /* position for first pmd */ | ||
147 | e = (unsigned long *)(ent+1); | ||
148 | |||
149 | hdr->hdr_count++; | ||
150 | |||
151 | DPRINT_ovfl(("[%d] count=%lu cur=%p last=%p free_bytes=%lu ovfl_pmd=%d ovfl_notify=%d npmds=%u\n", | ||
152 | task->pid, | ||
153 | hdr->hdr_count, | ||
154 | cur, last, | ||
155 | last-cur, | ||
156 | ovfl_pmd, | ||
157 | ovfl_notify, npmds)); | ||
158 | |||
159 | /* | ||
160 | * current = task running at the time of the overflow. | ||
161 | * | ||
162 | * per-task mode: | ||
163 | * - this is usually the task being monitored. | ||
164 | * Under certain conditions, it might be a different task | ||
165 | * | ||
166 | * system-wide: | ||
167 | * - this is not necessarily the task controlling the session | ||
168 | */ | ||
169 | ent->pid = current->pid; | ||
170 | ent->ovfl_pmd = ovfl_pmd; | ||
171 | ent->last_reset_val = arg->pmd_last_reset; /* pmd[0].reg_last_reset_val */ | ||
172 | |||
173 | /* | ||
174 | * where did the fault happen (includes slot number) | ||
175 | */ | ||
176 | ent->ip = regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3); | ||
177 | |||
178 | ent->tstamp = stamp; | ||
179 | ent->cpu = smp_processor_id(); | ||
180 | ent->set = arg->active_set; | ||
181 | ent->tgid = current->tgid; | ||
182 | |||
183 | /* | ||
184 | * selectively store PMDs in increasing index number | ||
185 | */ | ||
186 | if (npmds) { | ||
187 | unsigned long *val = arg->smpl_pmds_values; | ||
188 | for(i=0; i < npmds; i++) { | ||
189 | *e++ = *val++; | ||
190 | } | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * update position for next entry | ||
195 | */ | ||
196 | hdr->hdr_cur_offs += entry_size; | ||
197 | cur += entry_size; | ||
198 | |||
199 | /* | ||
200 | * post check to avoid losing the last sample | ||
201 | */ | ||
202 | if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; | ||
203 | |||
204 | /* | ||
205 | * keep same ovfl_pmds, ovfl_notify | ||
206 | */ | ||
207 | arg->ovfl_ctrl.bits.notify_user = 0; | ||
208 | arg->ovfl_ctrl.bits.block_task = 0; | ||
209 | arg->ovfl_ctrl.bits.mask_monitoring = 0; | ||
210 | arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; /* reset before returning from interrupt handler */ | ||
211 | |||
212 | return 0; | ||
213 | full: | ||
214 | DPRINT_ovfl(("sampling buffer full free=%lu, count=%lu, ovfl_notify=%d\n", last-cur, hdr->hdr_count, ovfl_notify)); | ||
215 | |||
216 | /* | ||
217 | * increment the number of buffer overflows; | ||
218 | * important to detect duplicate sets of samples. | ||
219 | */ | ||
220 | hdr->hdr_overflows++; | ||
221 | |||
222 | /* | ||
223 | * if no notification requested, then we saturate the buffer | ||
224 | */ | ||
225 | if (ovfl_notify == 0) { | ||
226 | arg->ovfl_ctrl.bits.notify_user = 0; | ||
227 | arg->ovfl_ctrl.bits.block_task = 0; | ||
228 | arg->ovfl_ctrl.bits.mask_monitoring = 1; | ||
229 | arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; | ||
230 | } else { | ||
231 | arg->ovfl_ctrl.bits.notify_user = 1; | ||
232 | arg->ovfl_ctrl.bits.block_task = 1; /* ignored for non-blocking context */ | ||
233 | arg->ovfl_ctrl.bits.mask_monitoring = 1; | ||
234 | arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; /* no reset now */ | ||
235 | } | ||
236 | return -1; /* we are full, sorry */ | ||
237 | } | ||
238 | |||
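A consumer walks the buffer the same way default_handler() fills it: hdr_count entries start right after the header, each one being the fixed entry record followed by the PMD values it requested, stored in increasing register index. A hedged userspace sketch; entry_t is a hypothetical stand-in for pfm_default_smpl_entry_t with only the fields assigned above, and the caller is assumed to know npmds from the smpl_pmds mask it programmed:

#include <stdio.h>

/* hypothetical mirror of the fields default_handler() fills in */
typedef struct {
	int            pid;
	unsigned char  ovfl_pmd;
	unsigned long  last_reset_val;
	unsigned long  ip;
	unsigned long  tstamp;
	unsigned int   cpu;
	unsigned short set;
	int            tgid;
} entry_t;

/* walk hdr_count entries, each followed by npmds 8-byte PMD values */
static void
walk_buffer(const char *pos, unsigned long hdr_count, unsigned int npmds)
{
	unsigned long i, j;

	for (i = 0; i < hdr_count; i++) {
		const entry_t *ent = (const entry_t *)pos;
		const unsigned long *pmds = (const unsigned long *)(ent + 1);

		printf("pid=%d ip=0x%lx", ent->pid, ent->ip);
		for (j = 0; j < npmds; j++)
			printf(" pmd%lu=0x%lx", j, pmds[j]);
		printf("\n");

		/* next entry: fixed record plus npmds 64-bit values */
		pos += sizeof(entry_t) + (npmds << 3);
	}
}

int main(void)
{
	static unsigned long buf[16];   /* 8-byte aligned scratch buffer */
	entry_t *ent = (entry_t *)buf;
	unsigned long *pmds = (unsigned long *)(ent + 1);

	ent->pid = 42; ent->ip = 0x4000;
	pmds[0] = 7; pmds[1] = 9;
	walk_buffer((const char *)buf, 1, 2);   /* one entry, two PMDs */
	return 0;
}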
239 | static int | ||
240 | default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) | ||
241 | { | ||
242 | pfm_default_smpl_hdr_t *hdr; | ||
243 | |||
244 | hdr = (pfm_default_smpl_hdr_t *)buf; | ||
245 | |||
246 | hdr->hdr_count = 0UL; | ||
247 | hdr->hdr_cur_offs = sizeof(*hdr); | ||
248 | |||
249 | ctrl->bits.mask_monitoring = 0; | ||
250 | ctrl->bits.reset_ovfl_pmds = 1; /* uses long-reset values */ | ||
251 | |||
252 | return 0; | ||
253 | } | ||
254 | |||
255 | static int | ||
256 | default_exit(struct task_struct *task, void *buf, struct pt_regs *regs) | ||
257 | { | ||
258 | DPRINT(("[%d] exit(%p)\n", task->pid, buf)); | ||
259 | return 0; | ||
260 | } | ||
261 | |||
262 | static pfm_buffer_fmt_t default_fmt={ | ||
263 | .fmt_name = "default_format", | ||
264 | .fmt_uuid = PFM_DEFAULT_SMPL_UUID, | ||
265 | .fmt_arg_size = sizeof(pfm_default_smpl_arg_t), | ||
266 | .fmt_validate = default_validate, | ||
267 | .fmt_getsize = default_get_size, | ||
268 | .fmt_init = default_init, | ||
269 | .fmt_handler = default_handler, | ||
270 | .fmt_restart = default_restart, | ||
271 | .fmt_restart_active = default_restart, | ||
272 | .fmt_exit = default_exit, | ||
273 | }; | ||
274 | |||
275 | static int __init | ||
276 | pfm_default_smpl_init_module(void) | ||
277 | { | ||
278 | int ret; | ||
279 | |||
280 | ret = pfm_register_buffer_fmt(&default_fmt); | ||
281 | if (ret == 0) { | ||
282 | printk("perfmon_default_smpl: %s v%u.%u registered\n", | ||
283 | default_fmt.fmt_name, | ||
284 | PFM_DEFAULT_SMPL_VERSION_MAJ, | ||
285 | PFM_DEFAULT_SMPL_VERSION_MIN); | ||
286 | } else { | ||
287 | printk("perfmon_default_smpl: %s cannot register ret=%d\n", | ||
288 | default_fmt.fmt_name, | ||
289 | ret); | ||
290 | } | ||
291 | |||
292 | return ret; | ||
293 | } | ||
294 | |||
295 | static void __exit | ||
296 | pfm_default_smpl_cleanup_module(void) | ||
297 | { | ||
298 | int ret; | ||
299 | ret = pfm_unregister_buffer_fmt(default_fmt.fmt_uuid); | ||
300 | |||
301 | printk("perfmon_default_smpl: unregister %s=%d\n", default_fmt.fmt_name, ret); | ||
302 | } | ||
303 | |||
304 | module_init(pfm_default_smpl_init_module); | ||
305 | module_exit(pfm_default_smpl_cleanup_module); | ||
306 | |||
diff --git a/arch/ia64/kernel/perfmon_generic.h b/arch/ia64/kernel/perfmon_generic.h new file mode 100644 index 000000000000..67489478041e --- /dev/null +++ b/arch/ia64/kernel/perfmon_generic.h | |||
@@ -0,0 +1,45 @@ | |||
1 | /* | ||
2 | * This file contains the generic PMU register description tables | ||
3 | * and pmc checker used by perfmon.c. | ||
4 | * | ||
5 | * Copyright (C) 2002-2003 Hewlett Packard Co | ||
6 | * Stephane Eranian <eranian@hpl.hp.com> | ||
7 | */ | ||
8 | |||
9 | static pfm_reg_desc_t pfm_gen_pmc_desc[PMU_MAX_PMCS]={ | ||
10 | /* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
11 | /* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
12 | /* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
13 | /* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
14 | /* pmc4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
15 | /* pmc5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
16 | /* pmc6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
17 | /* pmc7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
18 | { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ | ||
19 | }; | ||
20 | |||
21 | static pfm_reg_desc_t pfm_gen_pmd_desc[PMU_MAX_PMDS]={ | ||
22 | /* pmd0 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, | ||
23 | /* pmd1 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, | ||
24 | /* pmd2 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, | ||
25 | /* pmd3 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, | ||
26 | /* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, | ||
27 | /* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, | ||
28 | /* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, | ||
29 | /* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, | ||
30 | { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ | ||
31 | }; | ||
32 | |||
33 | /* | ||
34 | * impl_pmcs, impl_pmds are computed at runtime to minimize errors! | ||
35 | */ | ||
36 | static pmu_config_t pmu_conf_gen={ | ||
37 | .pmu_name = "Generic", | ||
38 | .pmu_family = 0xff, /* any */ | ||
39 | .ovfl_val = (1UL << 32) - 1, | ||
40 | .num_ibrs = 0, /* does not use */ | ||
41 | .num_dbrs = 0, /* does not use */ | ||
42 | .pmd_desc = pfm_gen_pmd_desc, | ||
43 | .pmc_desc = pfm_gen_pmc_desc | ||
44 | }; | ||
45 | |||
diff --git a/arch/ia64/kernel/perfmon_itanium.h b/arch/ia64/kernel/perfmon_itanium.h new file mode 100644 index 000000000000..d1d508a0fbd3 --- /dev/null +++ b/arch/ia64/kernel/perfmon_itanium.h | |||
@@ -0,0 +1,115 @@ | |||
1 | /* | ||
2 | * This file contains the Itanium PMU register description tables | ||
3 | * and pmc checker used by perfmon.c. | ||
4 | * | ||
5 | * Copyright (C) 2002-2003 Hewlett Packard Co | ||
6 | * Stephane Eranian <eranian@hpl.hp.com> | ||
7 | */ | ||
8 | static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); | ||
9 | |||
10 | static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={ | ||
11 | /* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
12 | /* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
13 | /* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
14 | /* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
15 | /* pmc4 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
16 | /* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
17 | /* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
18 | /* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
19 | /* pmc8 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
20 | /* pmc9 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
21 | /* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
22 | /* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
23 | /* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
24 | /* pmc13 */ { PFM_REG_CONFIG , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
25 | { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ | ||
26 | }; | ||
27 | |||
28 | static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={ | ||
29 | /* pmd0 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, | ||
30 | /* pmd1 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, | ||
31 | /* pmd2 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, | ||
32 | /* pmd3 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, | ||
33 | /* pmd4 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, | ||
34 | /* pmd5 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, | ||
35 | /* pmd6 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, | ||
36 | /* pmd7 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, | ||
37 | /* pmd8 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
38 | /* pmd9 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
39 | /* pmd10 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
40 | /* pmd11 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
41 | /* pmd12 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
42 | /* pmd13 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
43 | /* pmd14 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
44 | /* pmd15 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
45 | /* pmd16 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
46 | /* pmd17 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, | ||
47 | { PFM_REG_END , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ | ||
48 | }; | ||
49 | |||
50 | static int | ||
51 | pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) | ||
52 | { | ||
53 | int ret; | ||
54 | int is_loaded; | ||
55 | |||
56 | /* sanity check */ | ||
57 | if (ctx == NULL) return -EINVAL; | ||
58 | |||
59 | is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; | ||
60 | |||
61 | /* | ||
62 | * we must clear the (instruction) debug registers if the pmc13.ta bit is cleared | ||
63 | * and they have not yet been written (fl_using_dbreg==0), to avoid picking up stale information. | ||
64 | */ | ||
65 | if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) { | ||
66 | |||
67 | DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val)); | ||
68 | |||
69 | /* don't mix debug with perfmon */ | ||
70 | if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; | ||
71 | |||
72 | /* | ||
73 | * a count of 0 will mark the debug registers as in use and also | ||
74 | * ensure that they are properly cleared. | ||
75 | */ | ||
76 | ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs); | ||
77 | if (ret) return ret; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * we must clear the (data) debug registers if pmc11.pt bit is cleared | ||
82 | * before they are written (fl_using_dbreg==0) to avoid picking up stale information. | ||
83 | */ | ||
84 | if (cnum == 11 && is_loaded && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) { | ||
85 | |||
86 | DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val)); | ||
87 | |||
88 | /* don't mix debug with perfmon */ | ||
89 | if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; | ||
90 | |||
91 | /* | ||
92 | * a count of 0 will mark the debug registers as in use and also | ||
93 | * ensure that they are properly cleared. | ||
94 | */ | ||
95 | ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs); | ||
96 | if (ret) return ret; | ||
97 | } | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * impl_pmcs, impl_pmds are computed at runtime to minimize errors! | ||
103 | */ | ||
104 | static pmu_config_t pmu_conf_ita={ | ||
105 | .pmu_name = "Itanium", | ||
106 | .pmu_family = 0x7, | ||
107 | .ovfl_val = (1UL << 32) - 1, | ||
108 | .pmd_desc = pfm_ita_pmd_desc, | ||
109 | .pmc_desc = pfm_ita_pmc_desc, | ||
110 | .num_ibrs = 8, | ||
111 | .num_dbrs = 8, | ||
112 | .use_rr_dbregs = 1, /* debug registers are used for range restrictions */ | ||
113 | }; | ||
114 | |||
115 | |||
diff --git a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h new file mode 100644 index 000000000000..9becccda2897 --- /dev/null +++ b/arch/ia64/kernel/perfmon_mckinley.h | |||
@@ -0,0 +1,187 @@ | |||
1 | /* | ||
2 | * This file contains the McKinley PMU register description tables | ||
3 | * and pmc checker used by perfmon.c. | ||
4 | * | ||
5 | * Copyright (C) 2002-2003 Hewlett Packard Co | ||
6 | * Stephane Eranian <eranian@hpl.hp.com> | ||
7 | */ | ||
8 | static int pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); | ||
9 | |||
10 | static pfm_reg_desc_t pfm_mck_pmc_desc[PMU_MAX_PMCS]={ | ||
11 | /* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
12 | /* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
13 | /* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
14 | /* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
15 | /* pmc4 */ { PFM_REG_COUNTING, 6, 0x0000000000800000UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
16 | /* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
17 | /* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
18 | /* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
19 | /* pmc8 */ { PFM_REG_CONFIG , 0, 0xffffffff3fffffffUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
20 | /* pmc9 */ { PFM_REG_CONFIG , 0, 0xffffffff3ffffffcUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
21 | /* pmc10 */ { PFM_REG_MONITOR , 4, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
22 | /* pmc11 */ { PFM_REG_MONITOR , 6, 0x0UL, 0x30f01cf, NULL, pfm_mck_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
23 | /* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
24 | /* pmc13 */ { PFM_REG_CONFIG , 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
25 | /* pmc14 */ { PFM_REG_CONFIG , 0, 0x0db60db60db60db6UL, 0x2492UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
26 | /* pmc15 */ { PFM_REG_CONFIG , 0, 0x00000000fffffff0UL, 0xfUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, | ||
27 | { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ | ||
28 | }; | ||
29 | |||
30 | static pfm_reg_desc_t pfm_mck_pmd_desc[PMU_MAX_PMDS]={ | ||
31 | /* pmd0 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, | ||
32 | /* pmd1 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, | ||
33 | /* pmd2 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, | ||
34 | /* pmd3 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, | ||
35 | /* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, | ||
36 | /* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, | ||
37 | /* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, | ||
38 | /* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, | ||
39 | /* pmd8 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
40 | /* pmd9 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
41 | /* pmd10 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
42 | /* pmd11 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
43 | /* pmd12 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
44 | /* pmd13 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
45 | /* pmd14 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
46 | /* pmd15 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
47 | /* pmd16 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, | ||
48 | /* pmd17 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, | ||
49 | { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ | ||
50 | }; | ||
51 | |||
52 | /* | ||
53 | * PMC reserved fields must have their power-up values preserved | ||
54 | */ | ||
55 | static int | ||
56 | pfm_mck_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) | ||
57 | { | ||
58 | unsigned long tmp1, tmp2, ival = *val; | ||
59 | |||
60 | /* remove reserved areas from user value */ | ||
61 | tmp1 = ival & PMC_RSVD_MASK(cnum); | ||
62 | |||
63 | /* get reserved fields values */ | ||
64 | tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); | ||
65 | |||
66 | *val = tmp1 | tmp2; | ||
67 | |||
68 | DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", | ||
69 | cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); | ||
70 | return 0; | ||
71 | } | ||
72 | |||
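A minimal user-space sketch of the merge performed by pfm_mck_reserved() above; the mask and default value here are hypothetical stand-ins for the per-register PMC_RSVD_MASK()/PMC_DFL_VAL() lookups:

    #include <stdio.h>

    /* hypothetical values: 1-bits in the mask are user-writable */
    #define EX_RSVD_MASK 0xfUL                  /* e.g. pmc15 above: 4 writable bits */
    #define EX_DFL_VAL   0x00000000fffffff0UL   /* power-up value of the register */

    static unsigned long merge_reserved(unsigned long user_val)
    {
            /* keep the user's writable bits, restore the reserved bits */
            return (user_val & EX_RSVD_MASK) | (EX_DFL_VAL & ~EX_RSVD_MASK);
    }

    int main(void)
    {
            /* any user value comes back with the reserved bits forced to the default */
            printf("0x%lx\n", merge_reserved(~0UL));   /* prints 0xffffffff */
            return 0;
    }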
73 | /* | ||
74 | * task can be NULL if the context is unloaded | ||
75 | */ | ||
76 | static int | ||
77 | pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) | ||
78 | { | ||
79 | int ret = 0, check_case1 = 0; | ||
80 | unsigned long val8 = 0, val14 = 0, val13 = 0; | ||
81 | int is_loaded; | ||
82 | |||
83 | /* first preserve the reserved fields */ | ||
84 | pfm_mck_reserved(cnum, val, regs); | ||
85 | |||
86 | /* sanity check */ | ||
87 | if (ctx == NULL) return -EINVAL; | ||
88 | |||
89 | is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; | ||
90 | |||
91 | /* | ||
92 | * we must clear the debug registers if pmc13 has a value which enables | ||
93 | * memory pipeline event constraints. In this case we need to clear the | ||
94 | * debug registers if they have not yet been accessed. This is required | ||
95 | * to avoid picking up stale state. | ||
96 | * PMC13 is "active" if: | ||
97 | * one of the pmc13.cfg_dbrpXX fields is different from 0x3 | ||
98 | * AND | ||
99 | * the corresponding pmc13.ena_dbrpXX bit is set. | ||
100 | */ | ||
101 | DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, *val, ctx->ctx_fl_using_dbreg, is_loaded)); | ||
102 | |||
103 | if (cnum == 13 && is_loaded | ||
104 | && (*val & 0x1e00000000000UL) && (*val & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { | ||
105 | |||
106 | DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val)); | ||
107 | |||
108 | /* don't mix debug with perfmon */ | ||
109 | if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; | ||
110 | |||
111 | /* | ||
112 | * a count of 0 will mark the debug registers as in use and also | ||
113 | * ensure that they are properly cleared. | ||
114 | */ | ||
115 | ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); | ||
116 | if (ret) return ret; | ||
117 | } | ||
118 | /* | ||
119 | * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled | ||
120 | * and they have not yet been written (fl_using_dbreg==0), to avoid picking up stale information. | ||
121 | */ | ||
122 | if (cnum == 14 && is_loaded && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) { | ||
123 | |||
124 | DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val)); | ||
125 | |||
126 | /* don't mix debug with perfmon */ | ||
127 | if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; | ||
128 | |||
129 | /* | ||
130 | * a count of 0 will mark the debug registers as in use and also | ||
131 | * ensure that they are properly cleared. | ||
132 | */ | ||
133 | ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); | ||
134 | if (ret) return ret; | ||
135 | |||
136 | } | ||
137 | |||
138 | switch(cnum) { | ||
139 | case 4: *val |= 1UL << 23; /* force power enable bit */ | ||
140 | break; | ||
141 | case 8: val8 = *val; | ||
142 | val13 = ctx->ctx_pmcs[13]; | ||
143 | val14 = ctx->ctx_pmcs[14]; | ||
144 | check_case1 = 1; | ||
145 | break; | ||
146 | case 13: val8 = ctx->ctx_pmcs[8]; | ||
147 | val13 = *val; | ||
148 | val14 = ctx->ctx_pmcs[14]; | ||
149 | check_case1 = 1; | ||
150 | break; | ||
151 | case 14: val8 = ctx->ctx_pmcs[8]; | ||
152 | val13 = ctx->ctx_pmcs[13]; | ||
153 | val14 = *val; | ||
154 | check_case1 = 1; | ||
155 | break; | ||
156 | } | ||
157 | /* check for an illegal configuration which can produce inconsistencies when | ||
158 | * tagging i-side events in the L1D and L2 caches | ||
159 | */ | ||
160 | if (check_case1) { | ||
161 | ret = ((val13 >> 45) & 0xf) == 0 | ||
162 | && ((val8 & 0x1) == 0) | ||
163 | && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) | ||
164 | ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); | ||
165 | |||
166 | if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n")); | ||
167 | } | ||
168 | |||
169 | return ret ? -EINVAL : 0; | ||
170 | } | ||
171 | |||
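The check_case1 expression above packs several conditions into one statement; the following standalone restatement with named temporaries is one reading of it (the names are informal, inferred from the bit positions used, not taken from a PMU reference):

    /* returns nonzero for the illegal pmc8/pmc13/pmc14 combination rejected above */
    static int mck_case1_illegal(unsigned long pmc8, unsigned long pmc13,
                                 unsigned long pmc14)
    {
            int no_dbrp_enabled = ((pmc13 >> 45) & 0xf) == 0;   /* pmc13 bits 45-48 clear */
            int pmc8_bit0_clear = (pmc8 & 0x1) == 0;
            unsigned long f0 = (pmc14 >> 1) & 0x3;              /* 2-bit field at bits 1-2 */
            unsigned long f1 = (pmc14 >> 4) & 0x3;              /* 2-bit field at bits 4-5 */
            int ibrp_conflict = (f0 == 0x2 || f0 == 0x0) || (f1 == 0x2 || f1 == 0x0);

            return no_dbrp_enabled && pmc8_bit0_clear && ibrp_conflict;
    }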
172 | /* | ||
173 | * impl_pmcs, impl_pmds are computed at runtime to minimize errors! | ||
174 | */ | ||
175 | static pmu_config_t pmu_conf_mck={ | ||
176 | .pmu_name = "Itanium 2", | ||
177 | .pmu_family = 0x1f, | ||
178 | .flags = PFM_PMU_IRQ_RESEND, | ||
179 | .ovfl_val = (1UL << 47) - 1, | ||
180 | .pmd_desc = pfm_mck_pmd_desc, | ||
181 | .pmc_desc = pfm_mck_pmc_desc, | ||
182 | .num_ibrs = 8, | ||
183 | .num_dbrs = 8, | ||
184 | .use_rr_dbregs = 1 /* debug registers are used for range restrictions */ | ||
185 | }; | ||
186 | |||
187 | |||
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c new file mode 100644 index 000000000000..91293388dd29 --- /dev/null +++ b/arch/ia64/kernel/process.c | |||
@@ -0,0 +1,800 @@ | |||
1 | /* | ||
2 | * Architecture-specific setup. | ||
3 | * | ||
4 | * Copyright (C) 1998-2003 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | */ | ||
7 | #define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */ | ||
8 | #include <linux/config.h> | ||
9 | |||
10 | #include <linux/cpu.h> | ||
11 | #include <linux/pm.h> | ||
12 | #include <linux/elf.h> | ||
13 | #include <linux/errno.h> | ||
14 | #include <linux/kallsyms.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/notifier.h> | ||
19 | #include <linux/personality.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/smp_lock.h> | ||
23 | #include <linux/stddef.h> | ||
24 | #include <linux/thread_info.h> | ||
25 | #include <linux/unistd.h> | ||
26 | #include <linux/efi.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/delay.h> | ||
29 | |||
30 | #include <asm/cpu.h> | ||
31 | #include <asm/delay.h> | ||
32 | #include <asm/elf.h> | ||
33 | #include <asm/ia32.h> | ||
34 | #include <asm/irq.h> | ||
35 | #include <asm/pgalloc.h> | ||
36 | #include <asm/processor.h> | ||
37 | #include <asm/sal.h> | ||
38 | #include <asm/tlbflush.h> | ||
39 | #include <asm/uaccess.h> | ||
40 | #include <asm/unwind.h> | ||
41 | #include <asm/user.h> | ||
42 | |||
43 | #include "entry.h" | ||
44 | |||
45 | #ifdef CONFIG_PERFMON | ||
46 | # include <asm/perfmon.h> | ||
47 | #endif | ||
48 | |||
49 | #include "sigframe.h" | ||
50 | |||
51 | void (*ia64_mark_idle)(int); | ||
52 | static cpumask_t cpu_idle_map; | ||
53 | |||
54 | unsigned long boot_option_idle_override = 0; | ||
55 | EXPORT_SYMBOL(boot_option_idle_override); | ||
56 | |||
57 | void | ||
58 | ia64_do_show_stack (struct unw_frame_info *info, void *arg) | ||
59 | { | ||
60 | unsigned long ip, sp, bsp; | ||
61 | char buf[128]; /* don't make it so big that it overflows the stack! */ | ||
62 | |||
63 | printk("\nCall Trace:\n"); | ||
64 | do { | ||
65 | unw_get_ip(info, &ip); | ||
66 | if (ip == 0) | ||
67 | break; | ||
68 | |||
69 | unw_get_sp(info, &sp); | ||
70 | unw_get_bsp(info, &bsp); | ||
71 | snprintf(buf, sizeof(buf), | ||
72 | " [<%016lx>] %%s\n" | ||
73 | " sp=%016lx bsp=%016lx\n", | ||
74 | ip, sp, bsp); | ||
75 | print_symbol(buf, ip); | ||
76 | } while (unw_unwind(info) >= 0); | ||
77 | } | ||
78 | |||
79 | void | ||
80 | show_stack (struct task_struct *task, unsigned long *sp) | ||
81 | { | ||
82 | if (!task) | ||
83 | unw_init_running(ia64_do_show_stack, NULL); | ||
84 | else { | ||
85 | struct unw_frame_info info; | ||
86 | |||
87 | unw_init_from_blocked_task(&info, task); | ||
88 | ia64_do_show_stack(&info, NULL); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | void | ||
93 | dump_stack (void) | ||
94 | { | ||
95 | show_stack(NULL, NULL); | ||
96 | } | ||
97 | |||
98 | EXPORT_SYMBOL(dump_stack); | ||
99 | |||
100 | void | ||
101 | show_regs (struct pt_regs *regs) | ||
102 | { | ||
103 | unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; | ||
104 | |||
105 | print_modules(); | ||
106 | printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); | ||
107 | printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", | ||
108 | regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); | ||
109 | print_symbol("ip is at %s\n", ip); | ||
110 | printk("unat: %016lx pfs : %016lx rsc : %016lx\n", | ||
111 | regs->ar_unat, regs->ar_pfs, regs->ar_rsc); | ||
112 | printk("rnat: %016lx bsps: %016lx pr : %016lx\n", | ||
113 | regs->ar_rnat, regs->ar_bspstore, regs->pr); | ||
114 | printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n", | ||
115 | regs->loadrs, regs->ar_ccv, regs->ar_fpsr); | ||
116 | printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd); | ||
117 | printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7); | ||
118 | printk("f6 : %05lx%016lx f7 : %05lx%016lx\n", | ||
119 | regs->f6.u.bits[1], regs->f6.u.bits[0], | ||
120 | regs->f7.u.bits[1], regs->f7.u.bits[0]); | ||
121 | printk("f8 : %05lx%016lx f9 : %05lx%016lx\n", | ||
122 | regs->f8.u.bits[1], regs->f8.u.bits[0], | ||
123 | regs->f9.u.bits[1], regs->f9.u.bits[0]); | ||
124 | printk("f10 : %05lx%016lx f11 : %05lx%016lx\n", | ||
125 | regs->f10.u.bits[1], regs->f10.u.bits[0], | ||
126 | regs->f11.u.bits[1], regs->f11.u.bits[0]); | ||
127 | |||
128 | printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3); | ||
129 | printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10); | ||
130 | printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13); | ||
131 | printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16); | ||
132 | printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19); | ||
133 | printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22); | ||
134 | printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25); | ||
135 | printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28); | ||
136 | printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31); | ||
137 | |||
138 | if (user_mode(regs)) { | ||
139 | /* print the stacked registers */ | ||
140 | unsigned long val, *bsp, ndirty; | ||
141 | int i, sof, is_nat = 0; | ||
142 | |||
143 | sof = regs->cr_ifs & 0x7f; /* size of frame */ | ||
144 | ndirty = (regs->loadrs >> 19); | ||
145 | bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty); | ||
146 | for (i = 0; i < sof; ++i) { | ||
147 | get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i)); | ||
148 | printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val, | ||
149 | ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); | ||
150 | } | ||
151 | } else | ||
152 | show_stack(NULL, NULL); | ||
153 | } | ||
154 | |||
155 | void | ||
156 | do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall) | ||
157 | { | ||
158 | if (fsys_mode(current, &scr->pt)) { | ||
159 | /* defer signal-handling etc. until we return to privilege-level 0. */ | ||
160 | if (!ia64_psr(&scr->pt)->lp) | ||
161 | ia64_psr(&scr->pt)->lp = 1; | ||
162 | return; | ||
163 | } | ||
164 | |||
165 | #ifdef CONFIG_PERFMON | ||
166 | if (current->thread.pfm_needs_checking) | ||
167 | pfm_handle_work(); | ||
168 | #endif | ||
169 | |||
170 | /* deal with pending signal delivery */ | ||
171 | if (test_thread_flag(TIF_SIGPENDING)) | ||
172 | ia64_do_signal(oldset, scr, in_syscall); | ||
173 | } | ||
174 | |||
175 | static int pal_halt = 1; | ||
176 | static int __init nohalt_setup(char * str) | ||
177 | { | ||
178 | pal_halt = 0; | ||
179 | return 1; | ||
180 | } | ||
181 | __setup("nohalt", nohalt_setup); | ||
182 | |||
183 | /* | ||
184 | * We use this if we don't have any better idle routine.. | ||
185 | */ | ||
186 | void | ||
187 | default_idle (void) | ||
188 | { | ||
189 | unsigned long pmu_active = ia64_getreg(_IA64_REG_PSR) & (IA64_PSR_PP | IA64_PSR_UP); | ||
190 | |||
191 | while (!need_resched()) | ||
192 | if (pal_halt && !pmu_active) | ||
193 | safe_halt(); | ||
194 | else | ||
195 | cpu_relax(); | ||
196 | } | ||
197 | |||
198 | #ifdef CONFIG_HOTPLUG_CPU | ||
199 | /* We don't actually take the CPU down, just spin without interrupts. */ | ||
200 | static inline void play_dead(void) | ||
201 | { | ||
202 | extern void ia64_cpu_local_tick (void); | ||
203 | /* Ack it */ | ||
204 | __get_cpu_var(cpu_state) = CPU_DEAD; | ||
205 | |||
206 | /* We shouldn't have to disable interrupts while dead, but | ||
207 | * some interrupts just don't seem to go away, and this makes | ||
208 | * it "work" for testing purposes. */ | ||
209 | max_xtp(); | ||
210 | local_irq_disable(); | ||
211 | /* Death loop */ | ||
212 | while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) | ||
213 | cpu_relax(); | ||
214 | |||
215 | /* | ||
216 | * Enable timer interrupts from now on | ||
217 | * Not required if we put processor in SAL_BOOT_RENDEZ mode. | ||
218 | */ | ||
219 | local_flush_tlb_all(); | ||
220 | cpu_set(smp_processor_id(), cpu_online_map); | ||
221 | wmb(); | ||
222 | ia64_cpu_local_tick (); | ||
223 | local_irq_enable(); | ||
224 | } | ||
225 | #else | ||
226 | static inline void play_dead(void) | ||
227 | { | ||
228 | BUG(); | ||
229 | } | ||
230 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
231 | |||
232 | |||
233 | void cpu_idle_wait(void) | ||
234 | { | ||
235 | int cpu; | ||
236 | cpumask_t map; | ||
237 | |||
238 | for_each_online_cpu(cpu) | ||
239 | cpu_set(cpu, cpu_idle_map); | ||
240 | |||
241 | wmb(); | ||
242 | do { | ||
243 | ssleep(1); | ||
244 | cpus_and(map, cpu_idle_map, cpu_online_map); | ||
245 | } while (!cpus_empty(map)); | ||
246 | } | ||
247 | EXPORT_SYMBOL_GPL(cpu_idle_wait); | ||
248 | |||
249 | void __attribute__((noreturn)) | ||
250 | cpu_idle (void) | ||
251 | { | ||
252 | void (*mark_idle)(int) = ia64_mark_idle; | ||
253 | int cpu = smp_processor_id(); | ||
254 | |||
255 | /* endless idle loop with no priority at all */ | ||
256 | while (1) { | ||
257 | #ifdef CONFIG_SMP | ||
258 | if (!need_resched()) | ||
259 | min_xtp(); | ||
260 | #endif | ||
261 | while (!need_resched()) { | ||
262 | void (*idle)(void); | ||
263 | |||
264 | if (mark_idle) | ||
265 | (*mark_idle)(1); | ||
266 | |||
267 | if (cpu_isset(cpu, cpu_idle_map)) | ||
268 | cpu_clear(cpu, cpu_idle_map); | ||
269 | rmb(); | ||
270 | idle = pm_idle; | ||
271 | if (!idle) | ||
272 | idle = default_idle; | ||
273 | (*idle)(); | ||
274 | } | ||
275 | |||
276 | if (mark_idle) | ||
277 | (*mark_idle)(0); | ||
278 | |||
279 | #ifdef CONFIG_SMP | ||
280 | normal_xtp(); | ||
281 | #endif | ||
282 | schedule(); | ||
283 | check_pgt_cache(); | ||
284 | if (cpu_is_offline(smp_processor_id())) | ||
285 | play_dead(); | ||
286 | } | ||
287 | } | ||
288 | |||
289 | void | ||
290 | ia64_save_extra (struct task_struct *task) | ||
291 | { | ||
292 | #ifdef CONFIG_PERFMON | ||
293 | unsigned long info; | ||
294 | #endif | ||
295 | |||
296 | if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) | ||
297 | ia64_save_debug_regs(&task->thread.dbr[0]); | ||
298 | |||
299 | #ifdef CONFIG_PERFMON | ||
300 | if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) | ||
301 | pfm_save_regs(task); | ||
302 | |||
303 | info = __get_cpu_var(pfm_syst_info); | ||
304 | if (info & PFM_CPUINFO_SYST_WIDE) | ||
305 | pfm_syst_wide_update_task(task, info, 0); | ||
306 | #endif | ||
307 | |||
308 | #ifdef CONFIG_IA32_SUPPORT | ||
309 | if (IS_IA32_PROCESS(ia64_task_regs(task))) | ||
310 | ia32_save_state(task); | ||
311 | #endif | ||
312 | } | ||
313 | |||
314 | void | ||
315 | ia64_load_extra (struct task_struct *task) | ||
316 | { | ||
317 | #ifdef CONFIG_PERFMON | ||
318 | unsigned long info; | ||
319 | #endif | ||
320 | |||
321 | if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) | ||
322 | ia64_load_debug_regs(&task->thread.dbr[0]); | ||
323 | |||
324 | #ifdef CONFIG_PERFMON | ||
325 | if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) | ||
326 | pfm_load_regs(task); | ||
327 | |||
328 | info = __get_cpu_var(pfm_syst_info); | ||
329 | if (info & PFM_CPUINFO_SYST_WIDE) | ||
330 | pfm_syst_wide_update_task(task, info, 1); | ||
331 | #endif | ||
332 | |||
333 | #ifdef CONFIG_IA32_SUPPORT | ||
334 | if (IS_IA32_PROCESS(ia64_task_regs(task))) | ||
335 | ia32_load_state(task); | ||
336 | #endif | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * Copy the state of an ia-64 thread. | ||
341 | * | ||
342 | * We get here through the following call chain: | ||
343 | * | ||
344 | * from user-level: from kernel: | ||
345 | * | ||
346 | * <clone syscall> <some kernel call frames> | ||
347 | * sys_clone : | ||
348 | * do_fork do_fork | ||
349 | * copy_thread copy_thread | ||
350 | * | ||
351 | * This means that the stack layout is as follows: | ||
352 | * | ||
353 | * +---------------------+ (highest addr) | ||
354 | * | struct pt_regs | | ||
355 | * +---------------------+ | ||
356 | * | struct switch_stack | | ||
357 | * +---------------------+ | ||
358 | * | | | ||
359 | * | memory stack | | ||
360 | * | | <-- sp (lowest addr) | ||
361 | * +---------------------+ | ||
362 | * | ||
363 | * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an | ||
364 | * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register, | ||
365 | * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the | ||
366 | * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since | ||
367 | * the stack is page aligned and the page size is at least 4KB, this is always the case, | ||
368 | * so there is nothing to worry about. | ||
369 | */ | ||
370 | int | ||
371 | copy_thread (int nr, unsigned long clone_flags, | ||
372 | unsigned long user_stack_base, unsigned long user_stack_size, | ||
373 | struct task_struct *p, struct pt_regs *regs) | ||
374 | { | ||
375 | extern char ia64_ret_from_clone, ia32_ret_from_clone; | ||
376 | struct switch_stack *child_stack, *stack; | ||
377 | unsigned long rbs, child_rbs, rbs_size; | ||
378 | struct pt_regs *child_ptregs; | ||
379 | int retval = 0; | ||
380 | |||
381 | #ifdef CONFIG_SMP | ||
382 | /* | ||
383 | * For SMP idle threads, fork_by_hand() calls do_fork with | ||
384 | * NULL regs. | ||
385 | */ | ||
386 | if (!regs) | ||
387 | return 0; | ||
388 | #endif | ||
389 | |||
390 | stack = ((struct switch_stack *) regs) - 1; | ||
391 | |||
392 | child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; | ||
393 | child_stack = (struct switch_stack *) child_ptregs - 1; | ||
394 | |||
395 | /* copy parent's switch_stack & pt_regs to child: */ | ||
396 | memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); | ||
397 | |||
398 | rbs = (unsigned long) current + IA64_RBS_OFFSET; | ||
399 | child_rbs = (unsigned long) p + IA64_RBS_OFFSET; | ||
400 | rbs_size = stack->ar_bspstore - rbs; | ||
401 | |||
402 | /* copy the parent's register backing store to the child: */ | ||
403 | memcpy((void *) child_rbs, (void *) rbs, rbs_size); | ||
404 | |||
405 | if (likely(user_mode(child_ptregs))) { | ||
406 | if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs)) | ||
407 | child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ | ||
408 | if (user_stack_base) { | ||
409 | child_ptregs->r12 = user_stack_base + user_stack_size - 16; | ||
410 | child_ptregs->ar_bspstore = user_stack_base; | ||
411 | child_ptregs->ar_rnat = 0; | ||
412 | child_ptregs->loadrs = 0; | ||
413 | } | ||
414 | } else { | ||
415 | /* | ||
416 | * Note: we simply preserve the relative position of | ||
417 | * the stack pointer here. There is no need to | ||
418 | * allocate a scratch area here, since that will have | ||
419 | * been taken care of by the caller of sys_clone() | ||
420 | * already. | ||
421 | */ | ||
422 | child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */ | ||
423 | child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */ | ||
424 | } | ||
425 | child_stack->ar_bspstore = child_rbs + rbs_size; | ||
426 | if (IS_IA32_PROCESS(regs)) | ||
427 | child_stack->b0 = (unsigned long) &ia32_ret_from_clone; | ||
428 | else | ||
429 | child_stack->b0 = (unsigned long) &ia64_ret_from_clone; | ||
430 | |||
431 | /* copy parts of thread_struct: */ | ||
432 | p->thread.ksp = (unsigned long) child_stack - 16; | ||
433 | |||
434 | /* Stop some PSR bits from being inherited. | ||
435 | * The psr.up/psr.pp bits must be cleared on fork but inherited on execve(); | ||
436 | * therefore we must specify them explicitly here and not include them in | ||
437 | * IA64_PSR_BITS_TO_CLEAR. | ||
438 | */ | ||
439 | child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) | ||
440 | & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); | ||
441 | |||
442 | /* | ||
443 | * NOTE: The calling convention considers all floating point | ||
444 | * registers in the high partition (fph) to be scratch. Since | ||
445 | * the only way to get to this point is through a system call, | ||
446 | * we know that the values in fph are all dead. Hence, there | ||
447 | * is no need to inherit the fph state from the parent to the | ||
448 | * child and all we have to do is to make sure that | ||
449 | * IA64_THREAD_FPH_VALID is cleared in the child. | ||
450 | * | ||
451 | * XXX We could push this optimization a bit further by | ||
452 | * clearing IA64_THREAD_FPH_VALID on ANY system call. | ||
453 | * However, it's not clear this is worth doing. Also, it | ||
454 | * would be a slight deviation from the normal Linux system | ||
455 | * call behavior where scratch registers are preserved across | ||
456 | * system calls (unless used by the system call itself). | ||
457 | */ | ||
458 | # define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ | ||
459 | | IA64_THREAD_PM_VALID) | ||
460 | # define THREAD_FLAGS_TO_SET 0 | ||
461 | p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) | ||
462 | | THREAD_FLAGS_TO_SET); | ||
463 | ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ | ||
464 | #ifdef CONFIG_IA32_SUPPORT | ||
465 | /* | ||
466 | * If we're cloning an IA32 task then save the IA32 extra | ||
467 | * state from the current task to the new task | ||
468 | */ | ||
469 | if (IS_IA32_PROCESS(ia64_task_regs(current))) { | ||
470 | ia32_save_state(p); | ||
471 | if (clone_flags & CLONE_SETTLS) | ||
472 | retval = ia32_clone_tls(p, child_ptregs); | ||
473 | |||
474 | /* Copy partially mapped page list */ | ||
475 | if (!retval) | ||
476 | retval = ia32_copy_partial_page_list(p, clone_flags); | ||
477 | } | ||
478 | #endif | ||
479 | |||
480 | #ifdef CONFIG_PERFMON | ||
481 | if (current->thread.pfm_context) | ||
482 | pfm_inherit(p, child_ptregs); | ||
483 | #endif | ||
484 | return retval; | ||
485 | } | ||
486 | |||
487 | static void | ||
488 | do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg) | ||
489 | { | ||
490 | unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm; | ||
491 | elf_greg_t *dst = arg; | ||
492 | struct pt_regs *pt; | ||
493 | char nat; | ||
494 | int i; | ||
495 | |||
496 | memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */ | ||
497 | |||
498 | if (unw_unwind_to_user(info) < 0) | ||
499 | return; | ||
500 | |||
501 | unw_get_sp(info, &sp); | ||
502 | pt = (struct pt_regs *) (sp + 16); | ||
503 | |||
504 | urbs_end = ia64_get_user_rbs_end(task, pt, &cfm); | ||
505 | |||
506 | if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0) | ||
507 | return; | ||
508 | |||
509 | ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end), | ||
510 | &ar_rnat); | ||
511 | |||
512 | /* | ||
513 | * coredump format: | ||
514 | * r0-r31 | ||
515 | * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT) | ||
516 | * predicate registers (p0-p63) | ||
517 | * b0-b7 | ||
518 | * ip cfm user-mask | ||
519 | * ar.rsc ar.bsp ar.bspstore ar.rnat | ||
520 | * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec | ||
521 | */ | ||
522 | |||
523 | /* r0 is zero */ | ||
524 | for (i = 1, mask = (1UL << i); i < 32; ++i) { | ||
525 | unw_get_gr(info, i, &dst[i], &nat); | ||
526 | if (nat) | ||
527 | nat_bits |= mask; | ||
528 | mask <<= 1; | ||
529 | } | ||
530 | dst[32] = nat_bits; | ||
531 | unw_get_pr(info, &dst[33]); | ||
532 | |||
533 | for (i = 0; i < 8; ++i) | ||
534 | unw_get_br(info, i, &dst[34 + i]); | ||
535 | |||
536 | unw_get_rp(info, &ip); | ||
537 | dst[42] = ip + ia64_psr(pt)->ri; | ||
538 | dst[43] = cfm; | ||
539 | dst[44] = pt->cr_ipsr & IA64_PSR_UM; | ||
540 | |||
541 | unw_get_ar(info, UNW_AR_RSC, &dst[45]); | ||
542 | /* | ||
543 | * For bsp and bspstore, unw_get_ar() would return the kernel | ||
544 | * addresses, but we need the user-level addresses instead: | ||
545 | */ | ||
546 | dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! */ | ||
547 | dst[47] = pt->ar_bspstore; | ||
548 | dst[48] = ar_rnat; | ||
549 | unw_get_ar(info, UNW_AR_CCV, &dst[49]); | ||
550 | unw_get_ar(info, UNW_AR_UNAT, &dst[50]); | ||
551 | unw_get_ar(info, UNW_AR_FPSR, &dst[51]); | ||
552 | dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */ | ||
553 | unw_get_ar(info, UNW_AR_LC, &dst[53]); | ||
554 | unw_get_ar(info, UNW_AR_EC, &dst[54]); | ||
555 | unw_get_ar(info, UNW_AR_CSD, &dst[55]); | ||
556 | unw_get_ar(info, UNW_AR_SSD, &dst[56]); | ||
557 | } | ||
558 | |||
559 | void | ||
560 | do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg) | ||
561 | { | ||
562 | elf_fpreg_t *dst = arg; | ||
563 | int i; | ||
564 | |||
565 | memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */ | ||
566 | |||
567 | if (unw_unwind_to_user(info) < 0) | ||
568 | return; | ||
569 | |||
570 | /* f0 is 0.0, f1 is 1.0 */ | ||
571 | |||
572 | for (i = 2; i < 32; ++i) | ||
573 | unw_get_fr(info, i, dst + i); | ||
574 | |||
575 | ia64_flush_fph(task); | ||
576 | if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0) | ||
577 | memcpy(dst + 32, task->thread.fph, 96*16); | ||
578 | } | ||
579 | |||
580 | void | ||
581 | do_copy_regs (struct unw_frame_info *info, void *arg) | ||
582 | { | ||
583 | do_copy_task_regs(current, info, arg); | ||
584 | } | ||
585 | |||
586 | void | ||
587 | do_dump_fpu (struct unw_frame_info *info, void *arg) | ||
588 | { | ||
589 | do_dump_task_fpu(current, info, arg); | ||
590 | } | ||
591 | |||
592 | int | ||
593 | dump_task_regs(struct task_struct *task, elf_gregset_t *regs) | ||
594 | { | ||
595 | struct unw_frame_info tcore_info; | ||
596 | |||
597 | if (current == task) { | ||
598 | unw_init_running(do_copy_regs, regs); | ||
599 | } else { | ||
600 | memset(&tcore_info, 0, sizeof(tcore_info)); | ||
601 | unw_init_from_blocked_task(&tcore_info, task); | ||
602 | do_copy_task_regs(task, &tcore_info, regs); | ||
603 | } | ||
604 | return 1; | ||
605 | } | ||
606 | |||
607 | void | ||
608 | ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) | ||
609 | { | ||
610 | unw_init_running(do_copy_regs, dst); | ||
611 | } | ||
612 | |||
613 | int | ||
614 | dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst) | ||
615 | { | ||
616 | struct unw_frame_info tcore_info; | ||
617 | |||
618 | if (current == task) { | ||
619 | unw_init_running(do_dump_fpu, dst); | ||
620 | } else { | ||
621 | memset(&tcore_info, 0, sizeof(tcore_info)); | ||
622 | unw_init_from_blocked_task(&tcore_info, task); | ||
623 | do_dump_task_fpu(task, &tcore_info, dst); | ||
624 | } | ||
625 | return 1; | ||
626 | } | ||
627 | |||
628 | int | ||
629 | dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) | ||
630 | { | ||
631 | unw_init_running(do_dump_fpu, dst); | ||
632 | return 1; /* f0-f31 are always valid so we always return 1 */ | ||
633 | } | ||
634 | |||
635 | long | ||
636 | sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, | ||
637 | struct pt_regs *regs) | ||
638 | { | ||
639 | char *fname; | ||
640 | int error; | ||
641 | |||
642 | fname = getname(filename); | ||
643 | error = PTR_ERR(fname); | ||
644 | if (IS_ERR(fname)) | ||
645 | goto out; | ||
646 | error = do_execve(fname, argv, envp, regs); | ||
647 | putname(fname); | ||
648 | out: | ||
649 | return error; | ||
650 | } | ||
651 | |||
652 | pid_t | ||
653 | kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) | ||
654 | { | ||
655 | extern void start_kernel_thread (void); | ||
656 | unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; | ||
657 | struct { | ||
658 | struct switch_stack sw; | ||
659 | struct pt_regs pt; | ||
660 | } regs; | ||
661 | |||
662 | memset(®s, 0, sizeof(regs)); | ||
663 | regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ | ||
664 | regs.pt.r1 = helper_fptr[1]; /* set GP */ | ||
665 | regs.pt.r9 = (unsigned long) fn; /* 1st argument */ | ||
666 | regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ | ||
667 | /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ | ||
668 | regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; | ||
669 | regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ | ||
670 | regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); | ||
671 | regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; | ||
672 | regs.sw.pr = (1 << PRED_KERNEL_STACK); | ||
673 | return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL); | ||
674 | } | ||
675 | EXPORT_SYMBOL(kernel_thread); | ||
676 | |||
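kernel_thread() above relies on the ia64 software convention that a C function pointer refers to a two-word function descriptor rather than to the code itself; the two loads through helper_fptr correspond to a layout like this (an illustrative struct, not a kernel type):

    /* hypothetical descriptor layout matching helper_fptr[0]/helper_fptr[1] above */
    struct fdesc_example {
            unsigned long ip;   /* entry point    -> regs.pt.cr_iip */
            unsigned long gp;   /* global pointer -> regs.pt.r1     */
    };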
677 | /* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */ | ||
678 | int | ||
679 | kernel_thread_helper (int (*fn)(void *), void *arg) | ||
680 | { | ||
681 | #ifdef CONFIG_IA32_SUPPORT | ||
682 | if (IS_IA32_PROCESS(ia64_task_regs(current))) { | ||
683 | /* A kernel thread is always a 64-bit process. */ | ||
684 | current->thread.map_base = DEFAULT_MAP_BASE; | ||
685 | current->thread.task_size = DEFAULT_TASK_SIZE; | ||
686 | ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); | ||
687 | ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); | ||
688 | } | ||
689 | #endif | ||
690 | return (*fn)(arg); | ||
691 | } | ||
692 | |||
693 | /* | ||
694 | * Flush thread state. This is called when a thread does an execve(). | ||
695 | */ | ||
696 | void | ||
697 | flush_thread (void) | ||
698 | { | ||
699 | /* drop floating-point and debug-register state if it exists: */ | ||
700 | current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); | ||
701 | ia64_drop_fpu(current); | ||
702 | if (IS_IA32_PROCESS(ia64_task_regs(current))) | ||
703 | ia32_drop_partial_page_list(current); | ||
704 | } | ||
705 | |||
706 | /* | ||
707 | * Clean up state associated with current thread. This is called when | ||
708 | * the thread calls exit(). | ||
709 | */ | ||
710 | void | ||
711 | exit_thread (void) | ||
712 | { | ||
713 | ia64_drop_fpu(current); | ||
714 | #ifdef CONFIG_PERFMON | ||
715 | /* if needed, stop monitoring and flush state to perfmon context */ | ||
716 | if (current->thread.pfm_context) | ||
717 | pfm_exit_thread(current); | ||
718 | |||
719 | /* free debug register resources */ | ||
720 | if (current->thread.flags & IA64_THREAD_DBG_VALID) | ||
721 | pfm_release_debug_registers(current); | ||
722 | #endif | ||
723 | if (IS_IA32_PROCESS(ia64_task_regs(current))) | ||
724 | ia32_drop_partial_page_list(current); | ||
725 | } | ||
726 | |||
727 | unsigned long | ||
728 | get_wchan (struct task_struct *p) | ||
729 | { | ||
730 | struct unw_frame_info info; | ||
731 | unsigned long ip; | ||
732 | int count = 0; | ||
733 | |||
734 | /* | ||
735 | * Note: p may not be a blocked task (it could be current or | ||
736 | * another process running on some other CPU). Rather than | ||
737 | * trying to determine if p is really blocked, we just assume | ||
738 | * it's blocked and rely on the unwind routines to fail | ||
739 | * gracefully if the process wasn't really blocked after all. | ||
740 | * --davidm 99/12/15 | ||
741 | */ | ||
742 | unw_init_from_blocked_task(&info, p); | ||
743 | do { | ||
744 | if (unw_unwind(&info) < 0) | ||
745 | return 0; | ||
746 | unw_get_ip(&info, &ip); | ||
747 | if (!in_sched_functions(ip)) | ||
748 | return ip; | ||
749 | } while (count++ < 16); | ||
750 | return 0; | ||
751 | } | ||
752 | |||
753 | void | ||
754 | cpu_halt (void) | ||
755 | { | ||
756 | pal_power_mgmt_info_u_t power_info[8]; | ||
757 | unsigned long min_power; | ||
758 | int i, min_power_state; | ||
759 | |||
760 | if (ia64_pal_halt_info(power_info) != 0) | ||
761 | return; | ||
762 | |||
763 | min_power_state = 0; | ||
764 | min_power = power_info[0].pal_power_mgmt_info_s.power_consumption; | ||
765 | for (i = 1; i < 8; ++i) | ||
766 | if (power_info[i].pal_power_mgmt_info_s.im | ||
767 | && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) { | ||
768 | min_power = power_info[i].pal_power_mgmt_info_s.power_consumption; | ||
769 | min_power_state = i; | ||
770 | } | ||
771 | |||
772 | while (1) | ||
773 | ia64_pal_halt(min_power_state); | ||
774 | } | ||
775 | |||
776 | void | ||
777 | machine_restart (char *restart_cmd) | ||
778 | { | ||
779 | (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); | ||
780 | } | ||
781 | |||
782 | EXPORT_SYMBOL(machine_restart); | ||
783 | |||
784 | void | ||
785 | machine_halt (void) | ||
786 | { | ||
787 | cpu_halt(); | ||
788 | } | ||
789 | |||
790 | EXPORT_SYMBOL(machine_halt); | ||
791 | |||
792 | void | ||
793 | machine_power_off (void) | ||
794 | { | ||
795 | if (pm_power_off) | ||
796 | pm_power_off(); | ||
797 | machine_halt(); | ||
798 | } | ||
799 | |||
800 | EXPORT_SYMBOL(machine_power_off); | ||
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c new file mode 100644 index 000000000000..55789fcd7210 --- /dev/null +++ b/arch/ia64/kernel/ptrace.c | |||
@@ -0,0 +1,1627 @@ | |||
1 | /* | ||
2 | * Kernel support for the ptrace() and syscall tracing interfaces. | ||
3 | * | ||
4 | * Copyright (C) 1999-2005 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * Derived from the x86 and Alpha versions. | ||
8 | */ | ||
9 | #include <linux/config.h> | ||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/mm.h> | ||
14 | #include <linux/errno.h> | ||
15 | #include <linux/ptrace.h> | ||
16 | #include <linux/smp_lock.h> | ||
17 | #include <linux/user.h> | ||
18 | #include <linux/security.h> | ||
19 | #include <linux/audit.h> | ||
20 | |||
21 | #include <asm/pgtable.h> | ||
22 | #include <asm/processor.h> | ||
23 | #include <asm/ptrace_offsets.h> | ||
24 | #include <asm/rse.h> | ||
25 | #include <asm/system.h> | ||
26 | #include <asm/uaccess.h> | ||
27 | #include <asm/unwind.h> | ||
28 | #ifdef CONFIG_PERFMON | ||
29 | #include <asm/perfmon.h> | ||
30 | #endif | ||
31 | |||
32 | #include "entry.h" | ||
33 | |||
34 | /* | ||
35 | * Bits in the PSR that we allow ptrace() to change: | ||
36 | * be, up, ac, mfl, mfh (the user mask; five bits total) | ||
37 | * db (debug breakpoint fault; one bit) | ||
38 | * id (instruction debug fault disable; one bit) | ||
39 | * dd (data debug fault disable; one bit) | ||
40 | * ri (restart instruction; two bits) | ||
41 | * is (instruction set; one bit) | ||
42 | */ | ||
43 | #define IPSR_MASK (IA64_PSR_UM | IA64_PSR_DB | IA64_PSR_IS \ | ||
44 | | IA64_PSR_ID | IA64_PSR_DD | IA64_PSR_RI) | ||
45 | |||
46 | #define MASK(nbits) ((1UL << (nbits)) - 1) /* mask with NBITS bits set */ | ||
47 | #define PFM_MASK MASK(38) | ||
48 | |||
49 | #define PTRACE_DEBUG 0 | ||
50 | |||
51 | #if PTRACE_DEBUG | ||
52 | # define dprintk(format...) printk(format) | ||
53 | # define inline | ||
54 | #else | ||
55 | # define dprintk(format...) | ||
56 | #endif | ||
57 | |||
58 | /* Return TRUE if PT was created due to kernel-entry via a system-call. */ | ||
59 | |||
60 | static inline int | ||
61 | in_syscall (struct pt_regs *pt) | ||
62 | { | ||
63 | return (long) pt->cr_ifs >= 0; | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * Collect the NaT bits for r1-r31 from scratch_unat and return a NaT | ||
68 | * bitset where bit i is set iff the NaT bit of register i is set. | ||
69 | */ | ||
70 | unsigned long | ||
71 | ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat) | ||
72 | { | ||
73 | # define GET_BITS(first, last, unat) \ | ||
74 | ({ \ | ||
75 | unsigned long bit = ia64_unat_pos(&pt->r##first); \ | ||
76 | unsigned long nbits = (last - first + 1); \ | ||
77 | unsigned long mask = MASK(nbits) << first; \ | ||
78 | unsigned long dist; \ | ||
79 | if (bit < first) \ | ||
80 | dist = 64 + bit - first; \ | ||
81 | else \ | ||
82 | dist = bit - first; \ | ||
83 | ia64_rotr(unat, dist) & mask; \ | ||
84 | }) | ||
85 | unsigned long val; | ||
86 | |||
87 | /* | ||
88 | * Registers that are stored consecutively in struct pt_regs | ||
89 | * can be handled in parallel. If the register order in | ||
90 | * struct pt_regs changes, this code MUST be updated. | ||
91 | */ | ||
92 | val = GET_BITS( 1, 1, scratch_unat); | ||
93 | val |= GET_BITS( 2, 3, scratch_unat); | ||
94 | val |= GET_BITS(12, 13, scratch_unat); | ||
95 | val |= GET_BITS(14, 14, scratch_unat); | ||
96 | val |= GET_BITS(15, 15, scratch_unat); | ||
97 | val |= GET_BITS( 8, 11, scratch_unat); | ||
98 | val |= GET_BITS(16, 31, scratch_unat); | ||
99 | return val; | ||
100 | |||
101 | # undef GET_BITS | ||
102 | } | ||
103 | |||
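What a single GET_BITS() step does, isolated as a user-space sketch; the bit argument stands in for ia64_unat_pos(), i.e. the unat position derived from the register's offset inside struct pt_regs:

    /* rotate right by n bits, as ia64_rotr() does */
    static unsigned long rotr64(unsigned long v, unsigned int n)
    {
            return n ? (v >> n) | (v << (64 - n)) : v;
    }

    /*
     * Align the unat bits for registers first..last so that bit i of the
     * result is the NaT bit of register i; bit is the unat position of
     * r<first> (hypothetical here, derived from pt_regs offsets in the
     * kernel code above).
     */
    static unsigned long get_bits(unsigned long unat, unsigned int first,
                                  unsigned int last, unsigned int bit)
    {
            unsigned long mask = ((1UL << (last - first + 1)) - 1) << first;
            unsigned int dist = bit < first ? 64 + bit - first : bit - first;

            return rotr64(unat, dist) & mask;
    }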
104 | /* | ||
105 | * Set the NaT bits for the scratch registers according to NAT and | ||
106 | * return the resulting unat (assuming the scratch registers are | ||
107 | * stored in PT). | ||
108 | */ | ||
109 | unsigned long | ||
110 | ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat) | ||
111 | { | ||
112 | # define PUT_BITS(first, last, nat) \ | ||
113 | ({ \ | ||
114 | unsigned long bit = ia64_unat_pos(&pt->r##first); \ | ||
115 | unsigned long nbits = (last - first + 1); \ | ||
116 | unsigned long mask = MASK(nbits) << first; \ | ||
117 | long dist; \ | ||
118 | if (bit < first) \ | ||
119 | dist = 64 + bit - first; \ | ||
120 | else \ | ||
121 | dist = bit - first; \ | ||
122 | ia64_rotl(nat & mask, dist); \ | ||
123 | }) | ||
124 | unsigned long scratch_unat; | ||
125 | |||
126 | /* | ||
127 | * Registers that are stored consecutively in struct pt_regs | ||
128 | * can be handled in parallel. If the register order in | ||
129 | * struct pt_regs changes, this code MUST be updated. | ||
130 | */ | ||
131 | scratch_unat = PUT_BITS( 1, 1, nat); | ||
132 | scratch_unat |= PUT_BITS( 2, 3, nat); | ||
133 | scratch_unat |= PUT_BITS(12, 13, nat); | ||
134 | scratch_unat |= PUT_BITS(14, 14, nat); | ||
135 | scratch_unat |= PUT_BITS(15, 15, nat); | ||
136 | scratch_unat |= PUT_BITS( 8, 11, nat); | ||
137 | scratch_unat |= PUT_BITS(16, 31, nat); | ||
138 | |||
139 | return scratch_unat; | ||
140 | |||
141 | # undef PUT_BITS | ||
142 | } | ||
143 | |||
144 | #define IA64_MLX_TEMPLATE 0x2 | ||
145 | #define IA64_MOVL_OPCODE 6 | ||
146 | |||
147 | void | ||
148 | ia64_increment_ip (struct pt_regs *regs) | ||
149 | { | ||
150 | unsigned long w0, ri = ia64_psr(regs)->ri + 1; | ||
151 | |||
152 | if (ri > 2) { | ||
153 | ri = 0; | ||
154 | regs->cr_iip += 16; | ||
155 | } else if (ri == 2) { | ||
156 | get_user(w0, (char __user *) regs->cr_iip + 0); | ||
157 | if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) { | ||
158 | /* | ||
159 | * rfi'ing to slot 2 of an MLX bundle causes | ||
160 | * an illegal operation fault. We don't want | ||
161 | * that to happen... | ||
162 | */ | ||
163 | ri = 0; | ||
164 | regs->cr_iip += 16; | ||
165 | } | ||
166 | } | ||
167 | ia64_psr(regs)->ri = ri; | ||
168 | } | ||
169 | |||
170 | void | ||
171 | ia64_decrement_ip (struct pt_regs *regs) | ||
172 | { | ||
173 | unsigned long w0, ri = ia64_psr(regs)->ri - 1; | ||
174 | |||
175 | if (ia64_psr(regs)->ri == 0) { | ||
176 | regs->cr_iip -= 16; | ||
177 | ri = 2; | ||
178 | get_user(w0, (char __user *) regs->cr_iip + 0); | ||
179 | if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) { | ||
180 | /* | ||
181 | * rfi'ing to slot 2 of an MLX bundle causes | ||
182 | * an illegal operation fault. We don't want | ||
183 | * that to happen... | ||
184 | */ | ||
185 | ri = 1; | ||
186 | } | ||
187 | } | ||
188 | ia64_psr(regs)->ri = ri; | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * This routine is used to read the rnat bits that are stored on the | ||
193 | * kernel backing store. Since, in general, the alignments of the user | ||
194 | * and kernel backing stores are different, this is not completely trivial. In | ||
195 | * essence, we need to construct the user RNAT based on up to two | ||
196 | * kernel RNAT values and/or the RNAT value saved in the child's | ||
197 | * pt_regs. | ||
198 | * | ||
199 | * user rbs | ||
200 | * | ||
201 | * +--------+ <-- lowest address | ||
202 | * | slot62 | | ||
203 | * +--------+ | ||
204 | * | rnat | 0x....1f8 | ||
205 | * +--------+ | ||
206 | * | slot00 | \ | ||
207 | * +--------+ | | ||
208 | * | slot01 | > child_regs->ar_rnat | ||
209 | * +--------+ | | ||
210 | * | slot02 | / kernel rbs | ||
211 | * +--------+ +--------+ | ||
212 | * <- child_regs->ar_bspstore | slot61 | <-- krbs | ||
213 | * +- - - - + +--------+ | ||
214 | * | slot62 | | ||
215 | * +- - - - + +--------+ | ||
216 | * | rnat | | ||
217 | * +- - - - + +--------+ | ||
218 | * vrnat | slot00 | | ||
219 | * +- - - - + +--------+ | ||
220 | * = = | ||
221 | * +--------+ | ||
222 | * | slot00 | \ | ||
223 | * +--------+ | | ||
224 | * | slot01 | > child_stack->ar_rnat | ||
225 | * +--------+ | | ||
226 | * | slot02 | / | ||
227 | * +--------+ | ||
228 | * <--- child_stack->ar_bspstore | ||
229 | * | ||
230 | * The way to think of this code is as follows: bit 0 in the user rnat | ||
231 | * corresponds to some bit N (0 <= N <= 62) in one of the kernel rnat | ||
232 | * values. The kernel rnat value holding this bit is stored in | ||
233 | * variable rnat0. rnat1 is loaded with the kernel rnat value that | ||
234 | * forms the upper bits of the user rnat value. | ||
235 | * | ||
236 | * Boundary cases: | ||
237 | * | ||
238 | * o when reading the rnat "below" the first rnat slot on the kernel | ||
239 | * backing store, rnat0/rnat1 are set to 0 and the low order bits are | ||
240 | * merged in from pt->ar_rnat. | ||
241 | * | ||
242 | * o when reading the rnat "above" the last rnat slot on the kernel | ||
243 | * backing store, rnat0/rnat1 get their value from sw->ar_rnat. | ||
244 | */ | ||
245 | static unsigned long | ||
246 | get_rnat (struct task_struct *task, struct switch_stack *sw, | ||
247 | unsigned long *krbs, unsigned long *urnat_addr, | ||
248 | unsigned long *urbs_end) | ||
249 | { | ||
250 | unsigned long rnat0 = 0, rnat1 = 0, urnat = 0, *slot0_kaddr; | ||
251 | unsigned long umask = 0, mask, m; | ||
252 | unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift; | ||
253 | long num_regs, nbits; | ||
254 | struct pt_regs *pt; | ||
255 | |||
256 | pt = ia64_task_regs(task); | ||
257 | kbsp = (unsigned long *) sw->ar_bspstore; | ||
258 | ubspstore = (unsigned long *) pt->ar_bspstore; | ||
259 | |||
260 | if (urbs_end < urnat_addr) | ||
261 | nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_end); | ||
262 | else | ||
263 | nbits = 63; | ||
264 | mask = MASK(nbits); | ||
265 | /* | ||
266 | * First, figure out which bit number slot 0 in user-land maps | ||
267 | * to in the kernel rnat. Do this by figuring out how many | ||
268 | * register slots we're beyond the user's backingstore and | ||
269 | * then computing the equivalent address in kernel space. | ||
270 | */ | ||
271 | num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1); | ||
272 | slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs); | ||
273 | shift = ia64_rse_slot_num(slot0_kaddr); | ||
274 | rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr); | ||
275 | rnat0_kaddr = rnat1_kaddr - 64; | ||
276 | |||
277 | if (ubspstore + 63 > urnat_addr) { | ||
278 | /* some bits need to be merged in from pt->ar_rnat */ | ||
279 | umask = MASK(ia64_rse_slot_num(ubspstore)) & mask; | ||
280 | urnat = (pt->ar_rnat & umask); | ||
281 | mask &= ~umask; | ||
282 | if (!mask) | ||
283 | return urnat; | ||
284 | } | ||
285 | |||
286 | m = mask << shift; | ||
287 | if (rnat0_kaddr >= kbsp) | ||
288 | rnat0 = sw->ar_rnat; | ||
289 | else if (rnat0_kaddr > krbs) | ||
290 | rnat0 = *rnat0_kaddr; | ||
291 | urnat |= (rnat0 & m) >> shift; | ||
292 | |||
293 | m = mask >> (63 - shift); | ||
294 | if (rnat1_kaddr >= kbsp) | ||
295 | rnat1 = sw->ar_rnat; | ||
296 | else if (rnat1_kaddr > krbs) | ||
297 | rnat1 = *rnat1_kaddr; | ||
298 | urnat |= (rnat1 & m) << (63 - shift); | ||
299 | return urnat; | ||
300 | } | ||
301 | |||
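The central shift/merge in get_rnat(), isolated: given the two kernel collection words that straddle user slot 0 and the bit offset of that slot within the kernel word, assemble the user collection word (the boundary merges with pt->ar_rnat and sw->ar_rnat are omitted):

    /* mask covers the valid user bits; shift is the kernel bit number of user slot 0 */
    static unsigned long merge_user_rnat(unsigned long rnat0, unsigned long rnat1,
                                         unsigned int shift, unsigned long mask)
    {
            unsigned long urnat = 0;

            urnat |= (rnat0 & (mask << shift)) >> shift;               /* low bits  */
            urnat |= (rnat1 & (mask >> (63 - shift))) << (63 - shift); /* high bits */
            return urnat;
    }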
302 | /* | ||
303 | * The reverse of get_rnat. | ||
304 | */ | ||
305 | static void | ||
306 | put_rnat (struct task_struct *task, struct switch_stack *sw, | ||
307 | unsigned long *krbs, unsigned long *urnat_addr, unsigned long urnat, | ||
308 | unsigned long *urbs_end) | ||
309 | { | ||
310 | unsigned long rnat0 = 0, rnat1 = 0, *slot0_kaddr, umask = 0, mask, m; | ||
311 | unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift; | ||
312 | long num_regs, nbits; | ||
313 | struct pt_regs *pt; | ||
314 | unsigned long cfm, *urbs_kargs; | ||
315 | |||
316 | pt = ia64_task_regs(task); | ||
317 | kbsp = (unsigned long *) sw->ar_bspstore; | ||
318 | ubspstore = (unsigned long *) pt->ar_bspstore; | ||
319 | |||
320 | urbs_kargs = urbs_end; | ||
321 | if (in_syscall(pt)) { | ||
322 | /* | ||
323 | * If entered via syscall, don't allow user to set rnat bits | ||
324 | * for syscall args. | ||
325 | */ | ||
326 | cfm = pt->cr_ifs; | ||
327 | urbs_kargs = ia64_rse_skip_regs(urbs_end, -(cfm & 0x7f)); | ||
328 | } | ||
329 | |||
330 | if (urbs_kargs >= urnat_addr) | ||
331 | nbits = 63; | ||
332 | else { | ||
333 | if ((urnat_addr - 63) >= urbs_kargs) | ||
334 | return; | ||
335 | nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_kargs); | ||
336 | } | ||
337 | mask = MASK(nbits); | ||
338 | |||
339 | /* | ||
340 | * First, figure out which bit number slot 0 in user-land maps | ||
341 | * to in the kernel rnat. Do this by figuring out how many | ||
342 | * register slots we're beyond the user's backingstore and | ||
343 | * then computing the equivalent address in kernel space. | ||
344 | */ | ||
345 | num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1); | ||
346 | slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs); | ||
347 | shift = ia64_rse_slot_num(slot0_kaddr); | ||
348 | rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr); | ||
349 | rnat0_kaddr = rnat1_kaddr - 64; | ||
350 | |||
351 | if (ubspstore + 63 > urnat_addr) { | ||
352 | /* some bits need to be placed in pt->ar_rnat: */ | ||
353 | umask = MASK(ia64_rse_slot_num(ubspstore)) & mask; | ||
354 | pt->ar_rnat = (pt->ar_rnat & ~umask) | (urnat & umask); | ||
355 | mask &= ~umask; | ||
356 | if (!mask) | ||
357 | return; | ||
358 | } | ||
359 | /* | ||
360 | * Note: Section 11.1 of the EAS guarantees that bit 63 of an | ||
361 | * rnat slot is ignored, so we don't have to clear it here. | ||
362 | */ | ||
363 | rnat0 = (urnat << shift); | ||
364 | m = mask << shift; | ||
365 | if (rnat0_kaddr >= kbsp) | ||
366 | sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat0 & m); | ||
367 | else if (rnat0_kaddr > krbs) | ||
368 | *rnat0_kaddr = ((*rnat0_kaddr & ~m) | (rnat0 & m)); | ||
369 | |||
370 | rnat1 = (urnat >> (63 - shift)); | ||
371 | m = mask >> (63 - shift); | ||
372 | if (rnat1_kaddr >= kbsp) | ||
373 | sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat1 & m); | ||
374 | else if (rnat1_kaddr > krbs) | ||
375 | *rnat1_kaddr = ((*rnat1_kaddr & ~m) | (rnat1 & m)); | ||
376 | } | ||
377 | |||
378 | static inline int | ||
379 | on_kernel_rbs (unsigned long addr, unsigned long bspstore, | ||
380 | unsigned long urbs_end) | ||
381 | { | ||
382 | unsigned long *rnat_addr = ia64_rse_rnat_addr((unsigned long *) | ||
383 | urbs_end); | ||
384 | return (addr >= bspstore && addr <= (unsigned long) rnat_addr); | ||
385 | } | ||
386 | |||
387 | /* | ||
388 | * Read a word from the user-level backing store of task CHILD. ADDR | ||
389 | * is the user-level address to read the word from, VAL a pointer to | ||
390 | * the return value, and USER_RBS_END gives the end of the user-level | ||
391 | * backing store (i.e., it's the address that would be in ar.bsp after | ||
392 | * the user executed a "cover" instruction). | ||
393 | * | ||
394 | * This routine takes care of accessing the kernel register backing | ||
395 | * store for those registers that got spilled there. It also takes | ||
396 | * care of calculating the appropriate RNaT collection words. | ||
397 | */ | ||
398 | long | ||
399 | ia64_peek (struct task_struct *child, struct switch_stack *child_stack, | ||
400 | unsigned long user_rbs_end, unsigned long addr, long *val) | ||
401 | { | ||
402 | unsigned long *bspstore, *krbs, regnum, *laddr, *urbs_end, *rnat_addr; | ||
403 | struct pt_regs *child_regs; | ||
404 | size_t copied; | ||
405 | long ret; | ||
406 | |||
407 | urbs_end = (unsigned long *) user_rbs_end; | ||
408 | laddr = (unsigned long *) addr; | ||
409 | child_regs = ia64_task_regs(child); | ||
410 | bspstore = (unsigned long *) child_regs->ar_bspstore; | ||
411 | krbs = (unsigned long *) child + IA64_RBS_OFFSET/8; | ||
412 | if (on_kernel_rbs(addr, (unsigned long) bspstore, | ||
413 | (unsigned long) urbs_end)) | ||
414 | { | ||
415 | /* | ||
416 | * Attempt to read the RBS in an area that's actually | ||
417 | * on the kernel RBS => read the corresponding bits in | ||
418 | * the kernel RBS. | ||
419 | */ | ||
420 | rnat_addr = ia64_rse_rnat_addr(laddr); | ||
421 | ret = get_rnat(child, child_stack, krbs, rnat_addr, urbs_end); | ||
422 | |||
423 | if (laddr == rnat_addr) { | ||
424 | /* return NaT collection word itself */ | ||
425 | *val = ret; | ||
426 | return 0; | ||
427 | } | ||
428 | |||
429 | if (((1UL << ia64_rse_slot_num(laddr)) & ret) != 0) { | ||
430 | /* | ||
431 | * It is implementation dependent whether the | ||
432 | * data portion of a NaT value gets saved on a | ||
433 | * st8.spill or RSE spill (e.g., see EAS 2.6, | ||
434 | * 4.4.4.6 Register Spill and Fill). To get | ||
435 | * consistent behavior across all possible | ||
436 | * IA-64 implementations, we return zero in | ||
437 | * this case. | ||
438 | */ | ||
439 | *val = 0; | ||
440 | return 0; | ||
441 | } | ||
442 | |||
443 | if (laddr < urbs_end) { | ||
444 | /* | ||
445 | * The desired word is on the kernel RBS and | ||
446 | * is not a NaT. | ||
447 | */ | ||
448 | regnum = ia64_rse_num_regs(bspstore, laddr); | ||
449 | *val = *ia64_rse_skip_regs(krbs, regnum); | ||
450 | return 0; | ||
451 | } | ||
452 | } | ||
453 | copied = access_process_vm(child, addr, &ret, sizeof(ret), 0); | ||
454 | if (copied != sizeof(ret)) | ||
455 | return -EIO; | ||
456 | *val = ret; | ||
457 | return 0; | ||
458 | } | ||
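/*
 * Editor's usage sketch, not part of the original source.  It assumes
 * the caller set things up the way sys_ptrace() below does ("child"
 * stopped, "sw" and "pt" pointing at its switch_stack/pt_regs) and
 * peeks the very first word the child spilled, i.e. ar.bspstore:
 */
#if 0	/* illustration only */
	unsigned long urbs_end = ia64_get_user_rbs_end(child, pt, NULL);
	long word;

	if (ia64_peek(child, sw, urbs_end, pt->ar_bspstore, &word) == 0)
		printk(KERN_DEBUG "first spilled word: 0x%lx\n", word);
#endif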
459 | |||
460 | long | ||
461 | ia64_poke (struct task_struct *child, struct switch_stack *child_stack, | ||
462 | unsigned long user_rbs_end, unsigned long addr, long val) | ||
463 | { | ||
464 | unsigned long *bspstore, *krbs, regnum, *laddr; | ||
465 | unsigned long *urbs_end = (unsigned long *) user_rbs_end; | ||
466 | struct pt_regs *child_regs; | ||
467 | |||
468 | laddr = (unsigned long *) addr; | ||
469 | child_regs = ia64_task_regs(child); | ||
470 | bspstore = (unsigned long *) child_regs->ar_bspstore; | ||
471 | krbs = (unsigned long *) child + IA64_RBS_OFFSET/8; | ||
472 | if (on_kernel_rbs(addr, (unsigned long) bspstore, | ||
473 | (unsigned long) urbs_end)) | ||
474 | { | ||
475 | /* | ||
476 | * Attempt to write the RBS in an area that's actually | ||
477 | * on the kernel RBS => write the corresponding bits | ||
478 | * in the kernel RBS. | ||
479 | */ | ||
480 | if (ia64_rse_is_rnat_slot(laddr)) | ||
481 | put_rnat(child, child_stack, krbs, laddr, val, | ||
482 | urbs_end); | ||
483 | else { | ||
484 | if (laddr < urbs_end) { | ||
485 | regnum = ia64_rse_num_regs(bspstore, laddr); | ||
486 | *ia64_rse_skip_regs(krbs, regnum) = val; | ||
487 | } | ||
488 | } | ||
489 | } else if (access_process_vm(child, addr, &val, sizeof(val), 1) | ||
490 | != sizeof(val)) | ||
491 | return -EIO; | ||
492 | return 0; | ||
493 | } | ||
494 | |||
495 | /* | ||
496 | * Calculate the address of the end of the user-level register backing | ||
497 | * store. This is the address that would have been stored in ar.bsp | ||
498 | * if the user had executed a "cover" instruction right before | ||
499 | * entering the kernel. If CFMP is not NULL, it is used to return the | ||
500 | * "current frame mask" that was active at the time the kernel was | ||
501 | * entered. | ||
502 | */ | ||
503 | unsigned long | ||
504 | ia64_get_user_rbs_end (struct task_struct *child, struct pt_regs *pt, | ||
505 | unsigned long *cfmp) | ||
506 | { | ||
507 | unsigned long *krbs, *bspstore, cfm = pt->cr_ifs; | ||
508 | long ndirty; | ||
509 | |||
510 | krbs = (unsigned long *) child + IA64_RBS_OFFSET/8; | ||
511 | bspstore = (unsigned long *) pt->ar_bspstore; | ||
512 | ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19)); | ||
513 | |||
514 | if (in_syscall(pt)) | ||
515 | ndirty += (cfm & 0x7f); | ||
516 | else | ||
517 | cfm &= ~(1UL << 63); /* clear valid bit */ | ||
518 | |||
519 | if (cfmp) | ||
520 | *cfmp = cfm; | ||
521 | return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty); | ||
522 | } | ||
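/*
 * Editor's note on the arithmetic above (assumption: pt->loadrs keeps
 * the ar.rsc.loadrs field in its architected position, i.e. shifted
 * left by 16):
 *
 *	dirty bytes = pt->loadrs >> 16
 *	dirty slots = dirty bytes / 8 = pt->loadrs >> 19
 *
 * e.g. pt->loadrs == 0x180000 means 0x18 bytes, i.e. three 64-bit
 * slots.  ia64_rse_num_regs() then discounts any RNaT collection
 * slots in that range, and for syscalls the (cfm & 0x7f) term adds
 * the "sof" registers of the still-active user frame.
 */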
523 | |||
524 | /* | ||
525 | * Synchronize (i.e., write) the RSE backing store living in kernel | ||
526 | * space to the VM of the CHILD task. SW and PT are the pointers to | ||
527 | * the switch_stack and pt_regs structures, respectively. | ||
528 | * USER_RBS_END is the user-level address at which the backing store | ||
529 | * ends. | ||
530 | */ | ||
531 | long | ||
532 | ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw, | ||
533 | unsigned long user_rbs_start, unsigned long user_rbs_end) | ||
534 | { | ||
535 | unsigned long addr, val; | ||
536 | long ret; | ||
537 | |||
538 | /* now copy word for word from kernel rbs to user rbs: */ | ||
539 | for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) { | ||
540 | ret = ia64_peek(child, sw, user_rbs_end, addr, &val); | ||
541 | if (ret < 0) | ||
542 | return ret; | ||
543 | if (access_process_vm(child, addr, &val, sizeof(val), 1) | ||
544 | != sizeof(val)) | ||
545 | return -EIO; | ||
546 | } | ||
547 | return 0; | ||
548 | } | ||
549 | |||
550 | static inline int | ||
551 | thread_matches (struct task_struct *thread, unsigned long addr) | ||
552 | { | ||
553 | unsigned long thread_rbs_end; | ||
554 | struct pt_regs *thread_regs; | ||
555 | |||
556 | if (ptrace_check_attach(thread, 0) < 0) | ||
557 | /* | ||
558 | * If the thread is not in an attachable state, we'll | ||
559 | * ignore it. The net effect is that if ADDR happens | ||
560 | * to overlap with the portion of the thread's | ||
561 | * register backing store that is currently residing | ||
562 | * on the thread's kernel stack, then ptrace() may end | ||
563 | * up accessing a stale value. But if the thread | ||
564 | * isn't stopped, that's a problem anyhow, so we're | ||
565 | * doing as well as we can... | ||
566 | */ | ||
567 | return 0; | ||
568 | |||
569 | thread_regs = ia64_task_regs(thread); | ||
570 | thread_rbs_end = ia64_get_user_rbs_end(thread, thread_regs, NULL); | ||
571 | if (!on_kernel_rbs(addr, thread_regs->ar_bspstore, thread_rbs_end)) | ||
572 | return 0; | ||
573 | |||
574 | return 1; /* looks like we've got a winner */ | ||
575 | } | ||
576 | |||
577 | /* | ||
578 | * GDB apparently wants to be able to read the register-backing store | ||
579 | * of any thread when attached to a given process. If we are peeking | ||
580 | * or poking an address that happens to reside in the kernel-backing | ||
581 | * store of another thread, we need to attach to that thread, because | ||
582 | * otherwise we end up accessing stale data. | ||
583 | * | ||
584 | * task_list_lock must be read-locked before calling this routine! | ||
585 | */ | ||
586 | static struct task_struct * | ||
587 | find_thread_for_addr (struct task_struct *child, unsigned long addr) | ||
588 | { | ||
589 | struct task_struct *g, *p; | ||
590 | struct mm_struct *mm; | ||
591 | int mm_users; | ||
592 | |||
593 | if (!(mm = get_task_mm(child))) | ||
594 | return child; | ||
595 | |||
596 | /* -1 because of our get_task_mm(): */ | ||
597 | mm_users = atomic_read(&mm->mm_users) - 1; | ||
598 | if (mm_users <= 1) | ||
599 | goto out; /* not multi-threaded */ | ||
600 | |||
601 | /* | ||
602 | * First, traverse the child's thread-list. Good for scalability with | ||
603 | * NPTL-threads. | ||
604 | */ | ||
605 | p = child; | ||
606 | do { | ||
607 | if (thread_matches(p, addr)) { | ||
608 | child = p; | ||
609 | goto out; | ||
610 | } | ||
611 | if (mm_users-- <= 1) | ||
612 | goto out; | ||
613 | } while ((p = next_thread(p)) != child); | ||
614 | |||
615 | do_each_thread(g, p) { | ||
617 | if (p->mm != mm) | ||
617 | continue; | ||
618 | |||
619 | if (thread_matches(p, addr)) { | ||
620 | child = p; | ||
621 | goto out; | ||
622 | } | ||
623 | } while_each_thread(g, p); | ||
624 | out: | ||
625 | mmput(mm); | ||
626 | return child; | ||
627 | } | ||
628 | |||
629 | /* | ||
630 | * Write f32-f127 back to task->thread.fph if it has been modified. | ||
631 | */ | ||
632 | inline void | ||
633 | ia64_flush_fph (struct task_struct *task) | ||
634 | { | ||
635 | struct ia64_psr *psr = ia64_psr(ia64_task_regs(task)); | ||
636 | |||
637 | if (ia64_is_local_fpu_owner(task) && psr->mfh) { | ||
638 | psr->mfh = 0; | ||
639 | task->thread.flags |= IA64_THREAD_FPH_VALID; | ||
640 | ia64_save_fpu(&task->thread.fph[0]); | ||
641 | } | ||
642 | } | ||
643 | |||
644 | /* | ||
645 | * Sync the fph state of the task so that it can be manipulated | ||
646 | * through thread.fph. If necessary, f32-f127 are written back to | ||
647 | * thread.fph or, if the fph state hasn't been used before, thread.fph | ||
648 | * is cleared to zeroes. Also, access to f32-f127 is disabled to | ||
649 | * ensure that the task picks up the state from thread.fph when it | ||
650 | * executes again. | ||
651 | */ | ||
652 | void | ||
653 | ia64_sync_fph (struct task_struct *task) | ||
654 | { | ||
655 | struct ia64_psr *psr = ia64_psr(ia64_task_regs(task)); | ||
656 | |||
657 | ia64_flush_fph(task); | ||
658 | if (!(task->thread.flags & IA64_THREAD_FPH_VALID)) { | ||
659 | task->thread.flags |= IA64_THREAD_FPH_VALID; | ||
660 | memset(&task->thread.fph, 0, sizeof(task->thread.fph)); | ||
661 | } | ||
662 | ia64_drop_fpu(task); | ||
663 | psr->dfh = 1; | ||
664 | } | ||
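/*
 * Editor's usage sketch, not part of the original source; it mirrors
 * what access_uarea() does below: flush before reading fph so that
 * thread.fph is current, sync before writing so the task reloads the
 * modified values when it next touches f32-f127:
 */
#if 0	/* illustration only; thread.fph[0] corresponds to f32 */
	struct ia64_fpreg f32;

	ia64_flush_fph(task);
	if (task->thread.flags & IA64_THREAD_FPH_VALID)
		f32 = task->thread.fph[0];	/* read */

	ia64_sync_fph(task);
	task->thread.fph[0] = f32;		/* write */
#endif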
665 | |||
666 | static int | ||
667 | access_fr (struct unw_frame_info *info, int regnum, int hi, | ||
668 | unsigned long *data, int write_access) | ||
669 | { | ||
670 | struct ia64_fpreg fpval; | ||
671 | int ret; | ||
672 | |||
673 | ret = unw_get_fr(info, regnum, &fpval); | ||
674 | if (ret < 0) | ||
675 | return ret; | ||
676 | |||
677 | if (write_access) { | ||
678 | fpval.u.bits[hi] = *data; | ||
679 | ret = unw_set_fr(info, regnum, fpval); | ||
680 | } else | ||
681 | *data = fpval.u.bits[hi]; | ||
682 | return ret; | ||
683 | } | ||
684 | |||
685 | /* | ||
686 | * Change the machine-state of CHILD such that it will return via the normal | ||
687 | * kernel exit-path, rather than the syscall-exit path. | ||
688 | */ | ||
689 | static void | ||
690 | convert_to_non_syscall (struct task_struct *child, struct pt_regs *pt, | ||
691 | unsigned long cfm) | ||
692 | { | ||
693 | struct unw_frame_info info, prev_info; | ||
694 | unsigned long ip, pr; | ||
695 | |||
696 | unw_init_from_blocked_task(&info, child); | ||
697 | while (1) { | ||
698 | prev_info = info; | ||
699 | if (unw_unwind(&info) < 0) | ||
700 | return; | ||
701 | if (unw_get_rp(&info, &ip) < 0) | ||
702 | return; | ||
703 | if (ip < FIXADDR_USER_END) | ||
704 | break; | ||
705 | } | ||
706 | |||
707 | unw_get_pr(&prev_info, &pr); | ||
708 | pr &= ~(1UL << PRED_SYSCALL); | ||
709 | pr |= (1UL << PRED_NON_SYSCALL); | ||
710 | unw_set_pr(&prev_info, pr); | ||
711 | |||
712 | pt->cr_ifs = (1UL << 63) | cfm; | ||
713 | } | ||
714 | |||
715 | static int | ||
716 | access_nat_bits (struct task_struct *child, struct pt_regs *pt, | ||
717 | struct unw_frame_info *info, | ||
718 | unsigned long *data, int write_access) | ||
719 | { | ||
720 | unsigned long regnum, nat_bits, scratch_unat, dummy = 0; | ||
721 | char nat = 0; | ||
722 | |||
723 | if (write_access) { | ||
724 | nat_bits = *data; | ||
725 | scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits); | ||
726 | if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) { | ||
727 | dprintk("ptrace: failed to set ar.unat\n"); | ||
728 | return -1; | ||
729 | } | ||
730 | for (regnum = 4; regnum <= 7; ++regnum) { | ||
731 | unw_get_gr(info, regnum, &dummy, &nat); | ||
732 | unw_set_gr(info, regnum, dummy, | ||
733 | (nat_bits >> regnum) & 1); | ||
734 | } | ||
735 | } else { | ||
736 | if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) { | ||
737 | dprintk("ptrace: failed to read ar.unat\n"); | ||
738 | return -1; | ||
739 | } | ||
740 | nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat); | ||
741 | for (regnum = 4; regnum <= 7; ++regnum) { | ||
742 | unw_get_gr(info, regnum, &dummy, &nat); | ||
743 | nat_bits |= (nat != 0) << regnum; | ||
744 | } | ||
745 | *data = nat_bits; | ||
746 | } | ||
747 | return 0; | ||
748 | } | ||
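/*
 * Editor's note, inferred from the loop bounds above: bit N of *data
 * is the NaT bit of general register N.  Bits 4-7 come from the
 * preserved registers r4-r7 via the unwinder; the scratch registers'
 * bits are packed from/to the ar.unat image saved in pt_regs by
 * ia64_get_scratch_nat_bits()/ia64_put_scratch_nat_bits().
 */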
749 | |||
750 | static int | ||
751 | access_uarea (struct task_struct *child, unsigned long addr, | ||
752 | unsigned long *data, int write_access) | ||
753 | { | ||
754 | unsigned long *ptr, regnum, urbs_end, rnat_addr, cfm; | ||
755 | struct switch_stack *sw; | ||
756 | struct pt_regs *pt; | ||
757 | # define pt_reg_addr(pt, reg) ((void *) \ | ||
758 | ((unsigned long) (pt) \ | ||
759 | + offsetof(struct pt_regs, reg))) | ||
760 | |||
761 | |||
762 | pt = ia64_task_regs(child); | ||
763 | sw = (struct switch_stack *) (child->thread.ksp + 16); | ||
764 | |||
765 | if ((addr & 0x7) != 0) { | ||
766 | dprintk("ptrace: unaligned register address 0x%lx\n", addr); | ||
767 | return -1; | ||
768 | } | ||
769 | |||
770 | if (addr < PT_F127 + 16) { | ||
771 | /* accessing fph */ | ||
772 | if (write_access) | ||
773 | ia64_sync_fph(child); | ||
774 | else | ||
775 | ia64_flush_fph(child); | ||
776 | ptr = (unsigned long *) | ||
777 | ((unsigned long) &child->thread.fph + addr); | ||
778 | } else if ((addr >= PT_F10) && (addr < PT_F11 + 16)) { | ||
779 | /* scratch registers untouched by kernel (saved in pt_regs) */ | ||
780 | ptr = pt_reg_addr(pt, f10) + (addr - PT_F10); | ||
781 | } else if (addr >= PT_F12 && addr < PT_F15 + 16) { | ||
782 | /* | ||
783 | * Scratch registers untouched by kernel (saved in | ||
784 | * switch_stack). | ||
785 | */ | ||
786 | ptr = (unsigned long *) ((long) sw | ||
787 | + (addr - PT_NAT_BITS - 32)); | ||
788 | } else if (addr < PT_AR_LC + 8) { | ||
789 | /* preserved state: */ | ||
790 | struct unw_frame_info info; | ||
791 | char nat = 0; | ||
792 | int ret; | ||
793 | |||
794 | unw_init_from_blocked_task(&info, child); | ||
795 | if (unw_unwind_to_user(&info) < 0) | ||
796 | return -1; | ||
797 | |||
798 | switch (addr) { | ||
799 | case PT_NAT_BITS: | ||
800 | return access_nat_bits(child, pt, &info, | ||
801 | data, write_access); | ||
802 | |||
803 | case PT_R4: case PT_R5: case PT_R6: case PT_R7: | ||
804 | if (write_access) { | ||
805 | /* read NaT bit first: */ | ||
806 | unsigned long dummy; | ||
807 | |||
808 | ret = unw_get_gr(&info, (addr - PT_R4)/8 + 4, | ||
809 | &dummy, &nat); | ||
810 | if (ret < 0) | ||
811 | return ret; | ||
812 | } | ||
813 | return unw_access_gr(&info, (addr - PT_R4)/8 + 4, data, | ||
814 | &nat, write_access); | ||
815 | |||
816 | case PT_B1: case PT_B2: case PT_B3: | ||
817 | case PT_B4: case PT_B5: | ||
818 | return unw_access_br(&info, (addr - PT_B1)/8 + 1, data, | ||
819 | write_access); | ||
820 | |||
821 | case PT_AR_EC: | ||
822 | return unw_access_ar(&info, UNW_AR_EC, data, | ||
823 | write_access); | ||
824 | |||
825 | case PT_AR_LC: | ||
826 | return unw_access_ar(&info, UNW_AR_LC, data, | ||
827 | write_access); | ||
828 | |||
829 | default: | ||
830 | if (addr >= PT_F2 && addr < PT_F5 + 16) | ||
831 | return access_fr(&info, (addr - PT_F2)/16 + 2, | ||
832 | (addr & 8) != 0, data, | ||
833 | write_access); | ||
834 | else if (addr >= PT_F16 && addr < PT_F31 + 16) | ||
835 | return access_fr(&info, | ||
836 | (addr - PT_F16)/16 + 16, | ||
837 | (addr & 8) != 0, | ||
838 | data, write_access); | ||
839 | else { | ||
840 | dprintk("ptrace: rejecting access to register " | ||
841 | "address 0x%lx\n", addr); | ||
842 | return -1; | ||
843 | } | ||
844 | } | ||
845 | } else if (addr < PT_F9+16) { | ||
846 | /* scratch state */ | ||
847 | switch (addr) { | ||
848 | case PT_AR_BSP: | ||
849 | /* | ||
850 | * By convention, we use PT_AR_BSP to refer to | ||
851 | * the end of the user-level backing store. | ||
852 | * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof) | ||
853 | * to get the real value of ar.bsp at the time | ||
854 | * the kernel was entered. | ||
855 | * | ||
856 | * Furthermore, when changing the contents of | ||
857 | * PT_AR_BSP (or PT_CFM) we MUST copy any | ||
858 | * users-level stacked registers that are | ||
859 | * stored on the kernel stack back to | ||
860 | * user-space because otherwise, we might end | ||
861 | * up clobbering kernel stacked registers. | ||
862 | * Also, if this happens while the task is | ||
863 | * blocked in a system call, we convert the | ||
864 | * state such that the non-system-call exit | ||
865 | * path is used. This ensures that the proper | ||
866 | * state will be picked up when resuming | ||
867 | * execution. However, it *also* means that | ||
868 | * once we write PT_AR_BSP/PT_CFM, it won't be | ||
869 | * possible to modify the syscall arguments of | ||
870 | * the pending system call any longer. This | ||
871 | * shouldn't be an issue because modifying | ||
872 | * PT_AR_BSP/PT_CFM generally implies that | ||
873 | * we're either abandoning the pending system | ||
874 | * call or that we defer its re-execution | ||
875 | * (e.g., due to GDB doing an inferior | ||
876 | * function call). | ||
877 | */ | ||
878 | urbs_end = ia64_get_user_rbs_end(child, pt, &cfm); | ||
879 | if (write_access) { | ||
880 | if (*data != urbs_end) { | ||
881 | if (ia64_sync_user_rbs(child, sw, | ||
882 | pt->ar_bspstore, | ||
883 | urbs_end) < 0) | ||
884 | return -1; | ||
885 | if (in_syscall(pt)) | ||
886 | convert_to_non_syscall(child, | ||
887 | pt, | ||
888 | cfm); | ||
889 | /* | ||
890 | * Simulate user-level write | ||
891 | * of ar.bsp: | ||
892 | */ | ||
893 | pt->loadrs = 0; | ||
894 | pt->ar_bspstore = *data; | ||
895 | } | ||
896 | } else | ||
897 | *data = urbs_end; | ||
898 | return 0; | ||
899 | |||
900 | case PT_CFM: | ||
901 | urbs_end = ia64_get_user_rbs_end(child, pt, &cfm); | ||
902 | if (write_access) { | ||
903 | if (((cfm ^ *data) & PFM_MASK) != 0) { | ||
904 | if (ia64_sync_user_rbs(child, sw, | ||
905 | pt->ar_bspstore, | ||
906 | urbs_end) < 0) | ||
907 | return -1; | ||
908 | if (in_syscall(pt)) | ||
909 | convert_to_non_syscall(child, | ||
910 | pt, | ||
911 | cfm); | ||
912 | pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK) | ||
913 | | (*data & PFM_MASK)); | ||
914 | } | ||
915 | } else | ||
916 | *data = cfm; | ||
917 | return 0; | ||
918 | |||
919 | case PT_CR_IPSR: | ||
920 | if (write_access) | ||
921 | pt->cr_ipsr = ((*data & IPSR_MASK) | ||
922 | | (pt->cr_ipsr & ~IPSR_MASK)); | ||
923 | else | ||
924 | *data = (pt->cr_ipsr & IPSR_MASK); | ||
925 | return 0; | ||
926 | |||
927 | case PT_AR_RNAT: | ||
928 | urbs_end = ia64_get_user_rbs_end(child, pt, NULL); | ||
929 | rnat_addr = (long) ia64_rse_rnat_addr((long *) | ||
930 | urbs_end); | ||
931 | if (write_access) | ||
932 | return ia64_poke(child, sw, urbs_end, | ||
933 | rnat_addr, *data); | ||
934 | else | ||
935 | return ia64_peek(child, sw, urbs_end, | ||
936 | rnat_addr, data); | ||
937 | |||
938 | case PT_R1: | ||
939 | ptr = pt_reg_addr(pt, r1); | ||
940 | break; | ||
941 | case PT_R2: case PT_R3: | ||
942 | ptr = pt_reg_addr(pt, r2) + (addr - PT_R2); | ||
943 | break; | ||
944 | case PT_R8: case PT_R9: case PT_R10: case PT_R11: | ||
945 | ptr = pt_reg_addr(pt, r8) + (addr - PT_R8); | ||
946 | break; | ||
947 | case PT_R12: case PT_R13: | ||
948 | ptr = pt_reg_addr(pt, r12) + (addr - PT_R12); | ||
949 | break; | ||
950 | case PT_R14: | ||
951 | ptr = pt_reg_addr(pt, r14); | ||
952 | break; | ||
953 | case PT_R15: | ||
954 | ptr = pt_reg_addr(pt, r15); | ||
955 | break; | ||
956 | case PT_R16: case PT_R17: case PT_R18: case PT_R19: | ||
957 | case PT_R20: case PT_R21: case PT_R22: case PT_R23: | ||
958 | case PT_R24: case PT_R25: case PT_R26: case PT_R27: | ||
959 | case PT_R28: case PT_R29: case PT_R30: case PT_R31: | ||
960 | ptr = pt_reg_addr(pt, r16) + (addr - PT_R16); | ||
961 | break; | ||
962 | case PT_B0: | ||
963 | ptr = pt_reg_addr(pt, b0); | ||
964 | break; | ||
965 | case PT_B6: | ||
966 | ptr = pt_reg_addr(pt, b6); | ||
967 | break; | ||
968 | case PT_B7: | ||
969 | ptr = pt_reg_addr(pt, b7); | ||
970 | break; | ||
971 | case PT_F6: case PT_F6+8: case PT_F7: case PT_F7+8: | ||
972 | case PT_F8: case PT_F8+8: case PT_F9: case PT_F9+8: | ||
973 | ptr = pt_reg_addr(pt, f6) + (addr - PT_F6); | ||
974 | break; | ||
975 | case PT_AR_BSPSTORE: | ||
976 | ptr = pt_reg_addr(pt, ar_bspstore); | ||
977 | break; | ||
978 | case PT_AR_RSC: | ||
979 | ptr = pt_reg_addr(pt, ar_rsc); | ||
980 | break; | ||
981 | case PT_AR_UNAT: | ||
982 | ptr = pt_reg_addr(pt, ar_unat); | ||
983 | break; | ||
984 | case PT_AR_PFS: | ||
985 | ptr = pt_reg_addr(pt, ar_pfs); | ||
986 | break; | ||
987 | case PT_AR_CCV: | ||
988 | ptr = pt_reg_addr(pt, ar_ccv); | ||
989 | break; | ||
990 | case PT_AR_FPSR: | ||
991 | ptr = pt_reg_addr(pt, ar_fpsr); | ||
992 | break; | ||
993 | case PT_CR_IIP: | ||
994 | ptr = pt_reg_addr(pt, cr_iip); | ||
995 | break; | ||
996 | case PT_PR: | ||
997 | ptr = pt_reg_addr(pt, pr); | ||
998 | break; | ||
999 | /* scratch register */ | ||
1000 | |||
1001 | default: | ||
1002 | /* disallow accessing anything else... */ | ||
1003 | dprintk("ptrace: rejecting access to register " | ||
1004 | "address 0x%lx\n", addr); | ||
1005 | return -1; | ||
1006 | } | ||
1007 | } else if (addr <= PT_AR_SSD) { | ||
1008 | ptr = pt_reg_addr(pt, ar_csd) + (addr - PT_AR_CSD); | ||
1009 | } else { | ||
1010 | /* access debug registers */ | ||
1011 | |||
1012 | if (addr >= PT_IBR) { | ||
1013 | regnum = (addr - PT_IBR) >> 3; | ||
1014 | ptr = &child->thread.ibr[0]; | ||
1015 | } else { | ||
1016 | regnum = (addr - PT_DBR) >> 3; | ||
1017 | ptr = &child->thread.dbr[0]; | ||
1018 | } | ||
1019 | |||
1020 | if (regnum >= 8) { | ||
1021 | dprintk("ptrace: rejecting access to register " | ||
1022 | "address 0x%lx\n", addr); | ||
1023 | return -1; | ||
1024 | } | ||
1025 | #ifdef CONFIG_PERFMON | ||
1026 | /* | ||
1027 | * Check if debug registers are used by perfmon. This | ||
1028 | * test must be done once we know that we can do the | ||
1029 | * operation, i.e. the arguments are all valid, but | ||
1030 | * before we start modifying the state. | ||
1031 | * | ||
1032 | * Perfmon needs to keep a count of how many processes | ||
1033 | * are trying to modify the debug registers for system | ||
1034 | * wide monitoring sessions. | ||
1035 | * | ||
1036 | * We also include read access here, because they may | ||
1037 | * cause the PMU-installed debug register state | ||
1038 | * (dbr[], ibr[]) to be reset. The two arrays are also | ||
1039 | * used by perfmon, but we do not use | ||
1040 | * IA64_THREAD_DBG_VALID. The registers are restored | ||
1041 | * by the PMU context switch code. | ||
1042 | */ | ||
1043 | if (pfm_use_debug_registers(child)) return -1; | ||
1044 | #endif | ||
1045 | |||
1046 | if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) { | ||
1047 | child->thread.flags |= IA64_THREAD_DBG_VALID; | ||
1048 | memset(child->thread.dbr, 0, | ||
1049 | sizeof(child->thread.dbr)); | ||
1050 | memset(child->thread.ibr, 0, | ||
1051 | sizeof(child->thread.ibr)); | ||
1052 | } | ||
1053 | |||
1054 | ptr += regnum; | ||
1055 | |||
1056 | if ((regnum & 1) && write_access) { | ||
1057 | /* don't let the user set kernel-level breakpoints: */ | ||
1058 | *ptr = *data & ~(7UL << 56); | ||
1059 | return 0; | ||
1060 | } | ||
1061 | } | ||
1062 | if (write_access) | ||
1063 | *ptr = *data; | ||
1064 | else | ||
1065 | *data = *ptr; | ||
1066 | return 0; | ||
1067 | } | ||
1068 | |||
1069 | static long | ||
1070 | ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) | ||
1071 | { | ||
1072 | unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val; | ||
1073 | struct unw_frame_info info; | ||
1074 | struct ia64_fpreg fpval; | ||
1075 | struct switch_stack *sw; | ||
1076 | struct pt_regs *pt; | ||
1077 | long ret, retval = 0; | ||
1078 | char nat = 0; | ||
1079 | int i; | ||
1080 | |||
1081 | if (!access_ok(VERIFY_WRITE, ppr, sizeof(struct pt_all_user_regs))) | ||
1082 | return -EIO; | ||
1083 | |||
1084 | pt = ia64_task_regs(child); | ||
1085 | sw = (struct switch_stack *) (child->thread.ksp + 16); | ||
1086 | unw_init_from_blocked_task(&info, child); | ||
1087 | if (unw_unwind_to_user(&info) < 0) { | ||
1088 | return -EIO; | ||
1089 | } | ||
1090 | |||
1091 | if (((unsigned long) ppr & 0x7) != 0) { | ||
1092 | dprintk("ptrace:unaligned register address %p\n", ppr); | ||
1093 | return -EIO; | ||
1094 | } | ||
1095 | |||
1096 | if (access_uarea(child, PT_CR_IPSR, &psr, 0) < 0 | ||
1097 | || access_uarea(child, PT_AR_EC, &ec, 0) < 0 | ||
1098 | || access_uarea(child, PT_AR_LC, &lc, 0) < 0 | ||
1099 | || access_uarea(child, PT_AR_RNAT, &rnat, 0) < 0 | ||
1100 | || access_uarea(child, PT_AR_BSP, &bsp, 0) < 0 | ||
1101 | || access_uarea(child, PT_CFM, &cfm, 0) < 0 | ||
1102 | || access_uarea(child, PT_NAT_BITS, &nat_bits, 0) < 0) | ||
1103 | return -EIO; | ||
1104 | |||
1105 | /* control regs */ | ||
1106 | |||
1107 | retval |= __put_user(pt->cr_iip, &ppr->cr_iip); | ||
1108 | retval |= __put_user(psr, &ppr->cr_ipsr); | ||
1109 | |||
1110 | /* app regs */ | ||
1111 | |||
1112 | retval |= __put_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]); | ||
1113 | retval |= __put_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]); | ||
1114 | retval |= __put_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]); | ||
1115 | retval |= __put_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]); | ||
1116 | retval |= __put_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]); | ||
1117 | retval |= __put_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]); | ||
1118 | |||
1119 | retval |= __put_user(ec, &ppr->ar[PT_AUR_EC]); | ||
1120 | retval |= __put_user(lc, &ppr->ar[PT_AUR_LC]); | ||
1121 | retval |= __put_user(rnat, &ppr->ar[PT_AUR_RNAT]); | ||
1122 | retval |= __put_user(bsp, &ppr->ar[PT_AUR_BSP]); | ||
1123 | retval |= __put_user(cfm, &ppr->cfm); | ||
1124 | |||
1125 | /* gr1-gr3 */ | ||
1126 | |||
1127 | retval |= __copy_to_user(&ppr->gr[1], &pt->r1, sizeof(long)); | ||
1128 | retval |= __copy_to_user(&ppr->gr[2], &pt->r2, sizeof(long) *2); | ||
1129 | |||
1130 | /* gr4-gr7 */ | ||
1131 | |||
1132 | for (i = 4; i < 8; i++) { | ||
1133 | if (unw_access_gr(&info, i, &val, &nat, 0) < 0) | ||
1134 | return -EIO; | ||
1135 | retval |= __put_user(val, &ppr->gr[i]); | ||
1136 | } | ||
1137 | |||
1138 | /* gr8-gr11 */ | ||
1139 | |||
1140 | retval |= __copy_to_user(&ppr->gr[8], &pt->r8, sizeof(long) * 4); | ||
1141 | |||
1142 | /* gr12-gr15 */ | ||
1143 | |||
1144 | retval |= __copy_to_user(&ppr->gr[12], &pt->r12, sizeof(long) * 2); | ||
1145 | retval |= __copy_to_user(&ppr->gr[14], &pt->r14, sizeof(long)); | ||
1146 | retval |= __copy_to_user(&ppr->gr[15], &pt->r15, sizeof(long)); | ||
1147 | |||
1148 | /* gr16-gr31 */ | ||
1149 | |||
1150 | retval |= __copy_to_user(&ppr->gr[16], &pt->r16, sizeof(long) * 16); | ||
1151 | |||
1152 | /* b0 */ | ||
1153 | |||
1154 | retval |= __put_user(pt->b0, &ppr->br[0]); | ||
1155 | |||
1156 | /* b1-b5 */ | ||
1157 | |||
1158 | for (i = 1; i < 6; i++) { | ||
1159 | if (unw_access_br(&info, i, &val, 0) < 0) | ||
1160 | return -EIO; | ||
1161 | retval |= __put_user(val, &ppr->br[i]); | ||
1162 | } | ||
1163 | |||
1164 | /* b6-b7 */ | ||
1165 | |||
1166 | retval |= __put_user(pt->b6, &ppr->br[6]); | ||
1167 | retval |= __put_user(pt->b7, &ppr->br[7]); | ||
1168 | |||
1169 | /* fr2-fr5 */ | ||
1170 | |||
1171 | for (i = 2; i < 6; i++) { | ||
1172 | if (unw_get_fr(&info, i, &fpval) < 0) | ||
1173 | return -EIO; | ||
1174 | retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval)); | ||
1175 | } | ||
1176 | |||
1177 | /* fr6-fr11 */ | ||
1178 | |||
1179 | retval |= __copy_to_user(&ppr->fr[6], &pt->f6, | ||
1180 | sizeof(struct ia64_fpreg) * 6); | ||
1181 | |||
1182 | /* fp scratch regs(12-15) */ | ||
1183 | |||
1184 | retval |= __copy_to_user(&ppr->fr[12], &sw->f12, | ||
1185 | sizeof(struct ia64_fpreg) * 4); | ||
1186 | |||
1187 | /* fr16-fr31 */ | ||
1188 | |||
1189 | for (i = 16; i < 32; i++) { | ||
1190 | if (unw_get_fr(&info, i, &fpval) < 0) | ||
1191 | return -EIO; | ||
1192 | retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval)); | ||
1193 | } | ||
1194 | |||
1195 | /* fph */ | ||
1196 | |||
1197 | ia64_flush_fph(child); | ||
1198 | retval |= __copy_to_user(&ppr->fr[32], &child->thread.fph, | ||
1199 | sizeof(ppr->fr[32]) * 96); | ||
1200 | |||
1201 | /* preds */ | ||
1202 | |||
1203 | retval |= __put_user(pt->pr, &ppr->pr); | ||
1204 | |||
1205 | /* nat bits */ | ||
1206 | |||
1207 | retval |= __put_user(nat_bits, &ppr->nat); | ||
1208 | |||
1209 | ret = retval ? -EIO : 0; | ||
1210 | return ret; | ||
1211 | } | ||
1212 | |||
1213 | static long | ||
1214 | ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) | ||
1215 | { | ||
1216 | unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val = 0; | ||
1217 | struct unw_frame_info info; | ||
1218 | struct switch_stack *sw; | ||
1219 | struct ia64_fpreg fpval; | ||
1220 | struct pt_regs *pt; | ||
1221 | long ret, retval = 0; | ||
1222 | int i; | ||
1223 | |||
1224 | memset(&fpval, 0, sizeof(fpval)); | ||
1225 | |||
1226 | if (!access_ok(VERIFY_READ, ppr, sizeof(struct pt_all_user_regs))) | ||
1227 | return -EIO; | ||
1228 | |||
1229 | pt = ia64_task_regs(child); | ||
1230 | sw = (struct switch_stack *) (child->thread.ksp + 16); | ||
1231 | unw_init_from_blocked_task(&info, child); | ||
1232 | if (unw_unwind_to_user(&info) < 0) { | ||
1233 | return -EIO; | ||
1234 | } | ||
1235 | |||
1236 | if (((unsigned long) ppr & 0x7) != 0) { | ||
1237 | dprintk("ptrace:unaligned register address %p\n", ppr); | ||
1238 | return -EIO; | ||
1239 | } | ||
1240 | |||
1241 | /* control regs */ | ||
1242 | |||
1243 | retval |= __get_user(pt->cr_iip, &ppr->cr_iip); | ||
1244 | retval |= __get_user(psr, &ppr->cr_ipsr); | ||
1245 | |||
1246 | /* app regs */ | ||
1247 | |||
1248 | retval |= __get_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]); | ||
1249 | retval |= __get_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]); | ||
1250 | retval |= __get_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]); | ||
1251 | retval |= __get_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]); | ||
1252 | retval |= __get_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]); | ||
1253 | retval |= __get_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]); | ||
1254 | |||
1255 | retval |= __get_user(ec, &ppr->ar[PT_AUR_EC]); | ||
1256 | retval |= __get_user(lc, &ppr->ar[PT_AUR_LC]); | ||
1257 | retval |= __get_user(rnat, &ppr->ar[PT_AUR_RNAT]); | ||
1258 | retval |= __get_user(bsp, &ppr->ar[PT_AUR_BSP]); | ||
1259 | retval |= __get_user(cfm, &ppr->cfm); | ||
1260 | |||
1261 | /* gr1-gr3 */ | ||
1262 | |||
1263 | retval |= __copy_from_user(&pt->r1, &ppr->gr[1], sizeof(long)); | ||
1264 | retval |= __copy_from_user(&pt->r2, &ppr->gr[2], sizeof(long) * 2); | ||
1265 | |||
1266 | /* gr4-gr7 */ | ||
1267 | |||
1268 | for (i = 4; i < 8; i++) { | ||
1269 | retval |= __get_user(val, &ppr->gr[i]); | ||
1270 | /* NaT bit will be set via PT_NAT_BITS: */ | ||
1271 | if (unw_set_gr(&info, i, val, 0) < 0) | ||
1272 | return -EIO; | ||
1273 | } | ||
1274 | |||
1275 | /* gr8-gr11 */ | ||
1276 | |||
1277 | retval |= __copy_from_user(&pt->r8, &ppr->gr[8], sizeof(long) * 4); | ||
1278 | |||
1279 | /* gr12-gr15 */ | ||
1280 | |||
1281 | retval |= __copy_from_user(&pt->r12, &ppr->gr[12], sizeof(long) * 2); | ||
1282 | retval |= __copy_from_user(&pt->r14, &ppr->gr[14], sizeof(long)); | ||
1283 | retval |= __copy_from_user(&pt->r15, &ppr->gr[15], sizeof(long)); | ||
1284 | |||
1285 | /* gr16-gr31 */ | ||
1286 | |||
1287 | retval |= __copy_from_user(&pt->r16, &ppr->gr[16], sizeof(long) * 16); | ||
1288 | |||
1289 | /* b0 */ | ||
1290 | |||
1291 | retval |= __get_user(pt->b0, &ppr->br[0]); | ||
1292 | |||
1293 | /* b1-b5 */ | ||
1294 | |||
1295 | for (i = 1; i < 6; i++) { | ||
1296 | retval |= __get_user(val, &ppr->br[i]); | ||
1297 | unw_set_br(&info, i, val); | ||
1298 | } | ||
1299 | |||
1300 | /* b6-b7 */ | ||
1301 | |||
1302 | retval |= __get_user(pt->b6, &ppr->br[6]); | ||
1303 | retval |= __get_user(pt->b7, &ppr->br[7]); | ||
1304 | |||
1305 | /* fr2-fr5 */ | ||
1306 | |||
1307 | for (i = 2; i < 6; i++) { | ||
1308 | retval |= __copy_from_user(&fpval, &ppr->fr[i], sizeof(fpval)); | ||
1309 | if (unw_set_fr(&info, i, fpval) < 0) | ||
1310 | return -EIO; | ||
1311 | } | ||
1312 | |||
1313 | /* fr6-fr11 */ | ||
1314 | |||
1315 | retval |= __copy_from_user(&pt->f6, &ppr->fr[6], | ||
1316 | sizeof(ppr->fr[6]) * 6); | ||
1317 | |||
1318 | /* fp scratch regs(12-15) */ | ||
1319 | |||
1320 | retval |= __copy_from_user(&sw->f12, &ppr->fr[12], | ||
1321 | sizeof(ppr->fr[12]) * 4); | ||
1322 | |||
1323 | /* fr16-fr31 */ | ||
1324 | |||
1325 | for (i = 16; i < 32; i++) { | ||
1326 | retval |= __copy_from_user(&fpval, &ppr->fr[i], | ||
1327 | sizeof(fpval)); | ||
1328 | if (unw_set_fr(&info, i, fpval) < 0) | ||
1329 | return -EIO; | ||
1330 | } | ||
1331 | |||
1332 | /* fph */ | ||
1333 | |||
1334 | ia64_sync_fph(child); | ||
1335 | retval |= __copy_from_user(&child->thread.fph, &ppr->fr[32], | ||
1336 | sizeof(ppr->fr[32]) * 96); | ||
1337 | |||
1338 | /* preds */ | ||
1339 | |||
1340 | retval |= __get_user(pt->pr, &ppr->pr); | ||
1341 | |||
1342 | /* nat bits */ | ||
1343 | |||
1344 | retval |= __get_user(nat_bits, &ppr->nat); | ||
1345 | |||
1346 | retval |= access_uarea(child, PT_CR_IPSR, &psr, 1); | ||
1347 | retval |= access_uarea(child, PT_AR_EC, &ec, 1); | ||
1348 | retval |= access_uarea(child, PT_AR_LC, &lc, 1); | ||
1349 | retval |= access_uarea(child, PT_AR_RNAT, &rnat, 1); | ||
1350 | retval |= access_uarea(child, PT_AR_BSP, &bsp, 1); | ||
1351 | retval |= access_uarea(child, PT_CFM, &cfm, 1); | ||
1352 | retval |= access_uarea(child, PT_NAT_BITS, &nat_bits, 1); | ||
1353 | |||
1354 | ret = retval ? -EIO : 0; | ||
1355 | return ret; | ||
1356 | } | ||
1357 | |||
1358 | /* | ||
1359 | * Called by kernel/ptrace.c when detaching. | ||
1360 | * | ||
1361 | * Make sure the single step bit is not set. | ||
1362 | */ | ||
1363 | void | ||
1364 | ptrace_disable (struct task_struct *child) | ||
1365 | { | ||
1366 | struct ia64_psr *child_psr = ia64_psr(ia64_task_regs(child)); | ||
1367 | |||
1368 | /* make sure the single step/taken-branch trap bits are not set: */ | ||
1369 | child_psr->ss = 0; | ||
1370 | child_psr->tb = 0; | ||
1371 | } | ||
1372 | |||
1373 | asmlinkage long | ||
1374 | sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data) | ||
1375 | { | ||
1376 | struct pt_regs *pt; | ||
1377 | unsigned long urbs_end, peek_or_poke; | ||
1378 | struct task_struct *child; | ||
1379 | struct switch_stack *sw; | ||
1380 | long ret; | ||
1381 | |||
1382 | lock_kernel(); | ||
1383 | ret = -EPERM; | ||
1384 | if (request == PTRACE_TRACEME) { | ||
1385 | /* are we already being traced? */ | ||
1386 | if (current->ptrace & PT_PTRACED) | ||
1387 | goto out; | ||
1388 | ret = security_ptrace(current->parent, current); | ||
1389 | if (ret) | ||
1390 | goto out; | ||
1391 | current->ptrace |= PT_PTRACED; | ||
1392 | ret = 0; | ||
1393 | goto out; | ||
1394 | } | ||
1395 | |||
1396 | peek_or_poke = (request == PTRACE_PEEKTEXT | ||
1397 | || request == PTRACE_PEEKDATA | ||
1398 | || request == PTRACE_POKETEXT | ||
1399 | || request == PTRACE_POKEDATA); | ||
1400 | ret = -ESRCH; | ||
1401 | read_lock(&tasklist_lock); | ||
1402 | { | ||
1403 | child = find_task_by_pid(pid); | ||
1404 | if (child) { | ||
1405 | if (peek_or_poke) | ||
1406 | child = find_thread_for_addr(child, addr); | ||
1407 | get_task_struct(child); | ||
1408 | } | ||
1409 | } | ||
1410 | read_unlock(&tasklist_lock); | ||
1411 | if (!child) | ||
1412 | goto out; | ||
1413 | ret = -EPERM; | ||
1414 | if (pid == 1) /* no messing around with init! */ | ||
1415 | goto out_tsk; | ||
1416 | |||
1417 | if (request == PTRACE_ATTACH) { | ||
1418 | ret = ptrace_attach(child); | ||
1419 | goto out_tsk; | ||
1420 | } | ||
1421 | |||
1422 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | ||
1423 | if (ret < 0) | ||
1424 | goto out_tsk; | ||
1425 | |||
1426 | pt = ia64_task_regs(child); | ||
1427 | sw = (struct switch_stack *) (child->thread.ksp + 16); | ||
1428 | |||
1429 | switch (request) { | ||
1430 | case PTRACE_PEEKTEXT: | ||
1431 | case PTRACE_PEEKDATA: | ||
1432 | /* read word at location addr */ | ||
1433 | urbs_end = ia64_get_user_rbs_end(child, pt, NULL); | ||
1434 | ret = ia64_peek(child, sw, urbs_end, addr, &data); | ||
1435 | if (ret == 0) { | ||
1436 | ret = data; | ||
1437 | /* ensure "ret" is not mistaken as an error code: */ | ||
1438 | force_successful_syscall_return(); | ||
1439 | } | ||
1440 | goto out_tsk; | ||
1441 | |||
1442 | case PTRACE_POKETEXT: | ||
1443 | case PTRACE_POKEDATA: | ||
1444 | /* write the word at location addr */ | ||
1445 | urbs_end = ia64_get_user_rbs_end(child, pt, NULL); | ||
1446 | ret = ia64_poke(child, sw, urbs_end, addr, data); | ||
1447 | goto out_tsk; | ||
1448 | |||
1449 | case PTRACE_PEEKUSR: | ||
1450 | /* read the word at addr in the USER area */ | ||
1451 | if (access_uarea(child, addr, &data, 0) < 0) { | ||
1452 | ret = -EIO; | ||
1453 | goto out_tsk; | ||
1454 | } | ||
1455 | ret = data; | ||
1456 | /* ensure "ret" is not mistaken as an error code */ | ||
1457 | force_successful_syscall_return(); | ||
1458 | goto out_tsk; | ||
1459 | |||
1460 | case PTRACE_POKEUSR: | ||
1461 | /* write the word at addr in the USER area */ | ||
1462 | if (access_uarea(child, addr, &data, 1) < 0) { | ||
1463 | ret = -EIO; | ||
1464 | goto out_tsk; | ||
1465 | } | ||
1466 | ret = 0; | ||
1467 | goto out_tsk; | ||
1468 | |||
1469 | case PTRACE_OLD_GETSIGINFO: | ||
1470 | /* for backwards-compatibility */ | ||
1471 | ret = ptrace_request(child, PTRACE_GETSIGINFO, addr, data); | ||
1472 | goto out_tsk; | ||
1473 | |||
1474 | case PTRACE_OLD_SETSIGINFO: | ||
1475 | /* for backwards-compatibility */ | ||
1476 | ret = ptrace_request(child, PTRACE_SETSIGINFO, addr, data); | ||
1477 | goto out_tsk; | ||
1478 | |||
1479 | case PTRACE_SYSCALL: | ||
1480 | /* continue and stop at next (return from) syscall */ | ||
1481 | case PTRACE_CONT: | ||
1482 | /* restart after signal. */ | ||
1483 | ret = -EIO; | ||
1484 | if (data > _NSIG) | ||
1485 | goto out_tsk; | ||
1486 | if (request == PTRACE_SYSCALL) | ||
1487 | set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
1488 | else | ||
1489 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
1490 | child->exit_code = data; | ||
1491 | |||
1492 | /* | ||
1493 | * Make sure the single step/taken-branch trap bits | ||
1494 | * are not set: | ||
1495 | */ | ||
1496 | ia64_psr(pt)->ss = 0; | ||
1497 | ia64_psr(pt)->tb = 0; | ||
1498 | |||
1499 | wake_up_process(child); | ||
1500 | ret = 0; | ||
1501 | goto out_tsk; | ||
1502 | |||
1503 | case PTRACE_KILL: | ||
1504 | /* | ||
1505 | * Make the child exit. Best I can do is send it a | ||
1506 | * sigkill. Perhaps it should be put in the status | ||
1507 | * that it wants to exit. | ||
1508 | */ | ||
1509 | if (child->exit_state == EXIT_ZOMBIE) | ||
1510 | /* already dead */ | ||
1511 | goto out_tsk; | ||
1512 | child->exit_code = SIGKILL; | ||
1513 | |||
1514 | ptrace_disable(child); | ||
1515 | wake_up_process(child); | ||
1516 | ret = 0; | ||
1517 | goto out_tsk; | ||
1518 | |||
1519 | case PTRACE_SINGLESTEP: | ||
1520 | /* let child execute for one instruction */ | ||
1521 | case PTRACE_SINGLEBLOCK: | ||
1522 | ret = -EIO; | ||
1523 | if (data > _NSIG) | ||
1524 | goto out_tsk; | ||
1525 | |||
1526 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
1527 | if (request == PTRACE_SINGLESTEP) { | ||
1528 | ia64_psr(pt)->ss = 1; | ||
1529 | } else { | ||
1530 | ia64_psr(pt)->tb = 1; | ||
1531 | } | ||
1532 | child->exit_code = data; | ||
1533 | |||
1534 | /* give it a chance to run. */ | ||
1535 | wake_up_process(child); | ||
1536 | ret = 0; | ||
1537 | goto out_tsk; | ||
1538 | |||
1539 | case PTRACE_DETACH: | ||
1540 | /* detach a process that was attached. */ | ||
1541 | ret = ptrace_detach(child, data); | ||
1542 | goto out_tsk; | ||
1543 | |||
1544 | case PTRACE_GETREGS: | ||
1545 | ret = ptrace_getregs(child, | ||
1546 | (struct pt_all_user_regs __user *) data); | ||
1547 | goto out_tsk; | ||
1548 | |||
1549 | case PTRACE_SETREGS: | ||
1550 | ret = ptrace_setregs(child, | ||
1551 | (struct pt_all_user_regs __user *) data); | ||
1552 | goto out_tsk; | ||
1553 | |||
1554 | default: | ||
1555 | ret = ptrace_request(child, request, addr, data); | ||
1556 | goto out_tsk; | ||
1557 | } | ||
1558 | out_tsk: | ||
1559 | put_task_struct(child); | ||
1560 | out: | ||
1561 | unlock_kernel(); | ||
1562 | return ret; | ||
1563 | } | ||
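/*
 * Editor's user-space sketch of the accessors above; hypothetical
 * tracer code, not part of this file.  PT_CR_IIP is the uarea offset
 * from <asm/ptrace_offsets.h>, and glibc spells PTRACE_PEEKUSR as
 * PTRACE_PEEKUSER:
 *
 *	#include <errno.h>
 *	#include <sys/ptrace.h>
 *	#include <asm/ptrace_offsets.h>
 *
 *	errno = 0;
 *	long ip = ptrace(PTRACE_PEEKUSER, pid, PT_CR_IIP, 0);
 *	if (ip == -1 && errno)
 *		perror("ptrace");  /* -1 alone is ambiguous */
 */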
1564 | |||
1565 | |||
1566 | void | ||
1567 | syscall_trace (void) | ||
1568 | { | ||
1569 | if (!test_thread_flag(TIF_SYSCALL_TRACE)) | ||
1570 | return; | ||
1571 | if (!(current->ptrace & PT_PTRACED)) | ||
1572 | return; | ||
1573 | /* | ||
1574 | * The 0x80 provides a way for the tracing parent to | ||
1575 | * distinguish between a syscall stop and SIGTRAP delivery. | ||
1576 | */ | ||
1577 | ptrace_notify(SIGTRAP | ||
1578 | | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); | ||
1579 | |||
1580 | /* | ||
1581 | * This isn't the same as continuing with a signal, but it | ||
1582 | * will do for normal use. strace only continues with a | ||
1583 | * signal if the stopping signal is not SIGTRAP. -brl | ||
1584 | */ | ||
1585 | if (current->exit_code) { | ||
1586 | send_sig(current->exit_code, current, 1); | ||
1587 | current->exit_code = 0; | ||
1588 | } | ||
1589 | } | ||
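/*
 * Editor's tracer-side sketch, not part of the original source: with
 * PTRACE_O_TRACESYSGOOD set, the 0x80 bit added above lets the parent
 * tell a syscall stop from a genuine SIGTRAP:
 *
 *	ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACESYSGOOD);
 *	waitpid(pid, &status, 0);
 *	if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
 *		;  /* syscall entry/exit stop, not a real SIGTRAP */
 */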
1590 | |||
1591 | /* "asmlinkage" so the input arguments are preserved... */ | ||
1592 | |||
1593 | asmlinkage void | ||
1594 | syscall_trace_enter (long arg0, long arg1, long arg2, long arg3, | ||
1595 | long arg4, long arg5, long arg6, long arg7, | ||
1596 | struct pt_regs regs) | ||
1597 | { | ||
1598 | long syscall; | ||
1599 | |||
1600 | if (unlikely(current->audit_context)) { | ||
1601 | if (IS_IA32_PROCESS(&regs)) | ||
1602 | syscall = regs.r1; | ||
1603 | else | ||
1604 | syscall = regs.r15; | ||
1605 | |||
1606 | audit_syscall_entry(current, syscall, arg0, arg1, arg2, arg3); | ||
1607 | } | ||
1608 | |||
1609 | if (test_thread_flag(TIF_SYSCALL_TRACE) | ||
1610 | && (current->ptrace & PT_PTRACED)) | ||
1611 | syscall_trace(); | ||
1612 | } | ||
1613 | |||
1614 | /* "asmlinkage" so the input arguments are preserved... */ | ||
1615 | |||
1616 | asmlinkage void | ||
1617 | syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, | ||
1618 | long arg4, long arg5, long arg6, long arg7, | ||
1619 | struct pt_regs regs) | ||
1620 | { | ||
1621 | if (unlikely(current->audit_context)) | ||
1622 | audit_syscall_exit(current, regs.r8); | ||
1623 | |||
1624 | if (test_thread_flag(TIF_SYSCALL_TRACE) | ||
1625 | && (current->ptrace & PT_PTRACED)) | ||
1626 | syscall_trace(); | ||
1627 | } | ||
diff --git a/arch/ia64/kernel/sal.c b/arch/ia64/kernel/sal.c new file mode 100644 index 000000000000..acc0f132f86c --- /dev/null +++ b/arch/ia64/kernel/sal.c | |||
@@ -0,0 +1,302 @@ | |||
1 | /* | ||
2 | * System Abstraction Layer (SAL) interface routines. | ||
3 | * | ||
4 | * Copyright (C) 1998, 1999, 2001, 2003 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | * Copyright (C) 1999 VA Linux Systems | ||
7 | * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> | ||
8 | */ | ||
9 | #include <linux/config.h> | ||
10 | |||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/spinlock.h> | ||
15 | #include <linux/string.h> | ||
16 | |||
17 | #include <asm/page.h> | ||
18 | #include <asm/sal.h> | ||
19 | #include <asm/pal.h> | ||
20 | |||
21 | __cacheline_aligned DEFINE_SPINLOCK(sal_lock); | ||
22 | unsigned long sal_platform_features; | ||
23 | |||
24 | unsigned short sal_revision; | ||
25 | unsigned short sal_version; | ||
26 | |||
27 | #define SAL_MAJOR(x) ((x) >> 8) | ||
28 | #define SAL_MINOR(x) ((x) & 0xff) | ||
29 | |||
30 | static struct { | ||
31 | void *addr; /* function entry point */ | ||
32 | void *gpval; /* gp value to use */ | ||
33 | } pdesc; | ||
34 | |||
35 | static long | ||
36 | default_handler (void) | ||
37 | { | ||
38 | return -1; | ||
39 | } | ||
40 | |||
41 | ia64_sal_handler ia64_sal = (ia64_sal_handler) default_handler; | ||
42 | ia64_sal_desc_ptc_t *ia64_ptc_domain_info; | ||
43 | |||
44 | const char * | ||
45 | ia64_sal_strerror (long status) | ||
46 | { | ||
47 | const char *str; | ||
48 | switch (status) { | ||
49 | case 0: str = "Call completed without error"; break; | ||
50 | case 1: str = "Effect a warm boot of the system to complete " | ||
51 | "the update"; break; | ||
52 | case -1: str = "Not implemented"; break; | ||
53 | case -2: str = "Invalid argument"; break; | ||
54 | case -3: str = "Call completed with error"; break; | ||
55 | case -4: str = "Virtual address not registered"; break; | ||
56 | case -5: str = "No information available"; break; | ||
57 | case -6: str = "Insufficient space to add the entry"; break; | ||
58 | case -7: str = "Invalid entry_addr value"; break; | ||
59 | case -8: str = "Invalid interrupt vector"; break; | ||
60 | case -9: str = "Requested memory not available"; break; | ||
61 | case -10: str = "Unable to write to the NVM device"; break; | ||
62 | case -11: str = "Invalid partition type specified"; break; | ||
63 | case -12: str = "Invalid NVM_Object id specified"; break; | ||
64 | case -13: str = "NVM_Object already has the maximum number " | ||
65 | "of partitions"; break; | ||
66 | case -14: str = "Insufficient space in partition for the " | ||
67 | "requested write sub-function"; break; | ||
68 | case -15: str = "Insufficient data buffer space for the " | ||
69 | "requested read record sub-function"; break; | ||
70 | case -16: str = "Scratch buffer required for the write/delete " | ||
71 | "sub-function"; break; | ||
72 | case -17: str = "Insufficient space in the NVM_Object for the " | ||
73 | "requested create sub-function"; break; | ||
74 | case -18: str = "Invalid value specified in the partition_rec " | ||
75 | "argument"; break; | ||
76 | case -19: str = "Record oriented I/O not supported for this " | ||
77 | "partition"; break; | ||
78 | case -20: str = "Bad format of record to be written or " | ||
79 | "required keyword variable not " | ||
80 | "specified"; break; | ||
81 | default: str = "Unknown SAL status code"; break; | ||
82 | } | ||
83 | return str; | ||
84 | } | ||
85 | |||
86 | void __init | ||
87 | ia64_sal_handler_init (void *entry_point, void *gpval) | ||
88 | { | ||
89 | /* fill in the SAL procedure descriptor and point ia64_sal to it: */ | ||
90 | pdesc.addr = entry_point; | ||
91 | pdesc.gpval = gpval; | ||
92 | ia64_sal = (ia64_sal_handler) &pdesc; | ||
93 | } | ||
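/*
 * Editor's note on why this works (standard ia64 software conventions,
 * stated here as background): an indirect function pointer names a
 * two-word descriptor { entry, gp }, so a call through ia64_sal
 * compiles to roughly:
 *
 *	ld8 r2 = [r14], 8	// r14 = ia64_sal; load entry (pdesc.addr)
 *	ld8 gp = [r14]		// load gp value (pdesc.gpval)
 *	mov b6 = r2
 *	br.call.sptk.many rp = b6
 *
 * Pointing ia64_sal at pdesc is therefore all that's needed to direct
 * every SAL call to the firmware entry point with the firmware's gp.
 */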
94 | |||
95 | static void __init | ||
96 | check_versions (struct ia64_sal_systab *systab) | ||
97 | { | ||
98 | sal_revision = (systab->sal_rev_major << 8) | systab->sal_rev_minor; | ||
99 | sal_version = (systab->sal_b_rev_major << 8) | systab->sal_b_rev_minor; | ||
100 | |||
101 | /* Check for broken firmware */ | ||
102 | if ((sal_revision == SAL_VERSION_CODE(49, 29)) | ||
103 | && (sal_version == SAL_VERSION_CODE(49, 29))) | ||
104 | { | ||
105 | /* | ||
106 | * Old firmware for zx2000 prototypes has this weird version number; | ||
107 | * reset it to something sane. | ||
108 | */ | ||
109 | sal_revision = SAL_VERSION_CODE(2, 8); | ||
110 | sal_version = SAL_VERSION_CODE(0, 0); | ||
111 | } | ||
112 | } | ||
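/*
 * Editor's worked example, assuming SAL_VERSION_CODE() packs the two
 * BCD-coded bytes as (major << 8) | minor, matching SAL_MAJOR()/
 * SAL_MINOR() above: the broken zx2000 prototype firmware reports
 * 0x4929 ("49.29") for both values; after the override, sal_revision
 * is 0x0208, which ia64_sal_init() below prints via %x as "SAL 2.8".
 */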
113 | |||
114 | static void __init | ||
115 | sal_desc_entry_point (void *p) | ||
116 | { | ||
117 | struct ia64_sal_desc_entry_point *ep = p; | ||
118 | ia64_pal_handler_init(__va(ep->pal_proc)); | ||
119 | ia64_sal_handler_init(__va(ep->sal_proc), __va(ep->gp)); | ||
120 | } | ||
121 | |||
122 | #ifdef CONFIG_SMP | ||
123 | static void __init | ||
124 | set_smp_redirect (int flag) | ||
125 | { | ||
126 | #ifndef CONFIG_HOTPLUG_CPU | ||
127 | if (no_int_routing) | ||
128 | smp_int_redirect &= ~flag; | ||
129 | else | ||
130 | smp_int_redirect |= flag; | ||
131 | #else | ||
132 | /* | ||
133 | * For CPU hotplug we don't want to do any chipset-supported | ||
134 | * interrupt redirection, because that would require stopping | ||
135 | * all interrupts and hard-binding each irq to a cpu. Later, | ||
136 | * when the interrupt fires, we would need to set the redirect | ||
137 | * hint again in the vector. This is cumbersome for something | ||
138 | * that the user-mode irq balancer solves anyway. | ||
139 | */ | ||
140 | no_int_routing = 1; | ||
141 | smp_int_redirect &= ~flag; | ||
142 | #endif | ||
143 | } | ||
144 | #else | ||
145 | #define set_smp_redirect(flag) do { } while (0) | ||
146 | #endif | ||
147 | |||
148 | static void __init | ||
149 | sal_desc_platform_feature (void *p) | ||
150 | { | ||
151 | struct ia64_sal_desc_platform_feature *pf = p; | ||
152 | sal_platform_features = pf->feature_mask; | ||
153 | |||
154 | printk(KERN_INFO "SAL Platform features:"); | ||
155 | if (!sal_platform_features) { | ||
156 | printk(" None\n"); | ||
157 | return; | ||
158 | } | ||
159 | |||
160 | if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_BUS_LOCK) | ||
161 | printk(" BusLock"); | ||
162 | if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT) { | ||
163 | printk(" IRQ_Redirection"); | ||
164 | set_smp_redirect(SMP_IRQ_REDIRECTION); | ||
165 | } | ||
166 | if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT) { | ||
167 | printk(" IPI_Redirection"); | ||
168 | set_smp_redirect(SMP_IPI_REDIRECTION); | ||
169 | } | ||
170 | if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT) | ||
171 | printk(" ITC_Drift"); | ||
172 | printk("\n"); | ||
173 | } | ||
174 | |||
175 | #ifdef CONFIG_SMP | ||
176 | static void __init | ||
177 | sal_desc_ap_wakeup (void *p) | ||
178 | { | ||
179 | struct ia64_sal_desc_ap_wakeup *ap = p; | ||
180 | |||
181 | switch (ap->mechanism) { | ||
182 | case IA64_SAL_AP_EXTERNAL_INT: | ||
183 | ap_wakeup_vector = ap->vector; | ||
184 | printk(KERN_INFO "SAL: AP wakeup using external interrupt " | ||
185 | "vector 0x%lx\n", ap_wakeup_vector); | ||
186 | break; | ||
187 | default: | ||
188 | printk(KERN_ERR "SAL: AP wakeup mechanism unsupported!\n"); | ||
189 | break; | ||
190 | } | ||
191 | } | ||
192 | |||
193 | static void __init | ||
194 | chk_nointroute_opt(void) | ||
195 | { | ||
196 | char *cp; | ||
197 | extern char saved_command_line[]; | ||
198 | |||
199 | for (cp = saved_command_line; *cp; ) { | ||
200 | if (memcmp(cp, "nointroute", 10) == 0) { | ||
201 | no_int_routing = 1; | ||
202 | printk ("no_int_routing on\n"); | ||
203 | break; | ||
204 | } else { | ||
205 | while (*cp != ' ' && *cp) | ||
206 | ++cp; | ||
207 | while (*cp == ' ') | ||
208 | ++cp; | ||
209 | } | ||
210 | } | ||
211 | } | ||
212 | |||
213 | #else | ||
214 | static void __init sal_desc_ap_wakeup(void *p) { } | ||
215 | #endif | ||
216 | |||
217 | void __init | ||
218 | ia64_sal_init (struct ia64_sal_systab *systab) | ||
219 | { | ||
220 | char *p; | ||
221 | int i; | ||
222 | |||
223 | if (!systab) { | ||
224 | printk(KERN_WARNING "Hmm, no SAL System Table.\n"); | ||
225 | return; | ||
226 | } | ||
227 | |||
228 | if (strncmp(systab->signature, "SST_", 4) != 0) | ||
229 | printk(KERN_ERR "bad signature in system table!"); | ||
230 | |||
231 | check_versions(systab); | ||
232 | #ifdef CONFIG_SMP | ||
233 | chk_nointroute_opt(); | ||
234 | #endif | ||
235 | |||
236 | /* revisions are coded in BCD, so %x does the job for us */ | ||
237 | printk(KERN_INFO "SAL %x.%x: %.32s %.32s%sversion %x.%x\n", | ||
238 | SAL_MAJOR(sal_revision), SAL_MINOR(sal_revision), | ||
239 | systab->oem_id, systab->product_id, | ||
240 | systab->product_id[0] ? " " : "", | ||
241 | SAL_MAJOR(sal_version), SAL_MINOR(sal_version)); | ||
242 | |||
243 | p = (char *) (systab + 1); | ||
244 | for (i = 0; i < systab->entry_count; i++) { | ||
245 | /* | ||
246 | * The first byte of each entry type contains the type | ||
247 | * descriptor. | ||
248 | */ | ||
249 | switch (*p) { | ||
250 | case SAL_DESC_ENTRY_POINT: | ||
251 | sal_desc_entry_point(p); | ||
252 | break; | ||
253 | case SAL_DESC_PLATFORM_FEATURE: | ||
254 | sal_desc_platform_feature(p); | ||
255 | break; | ||
256 | case SAL_DESC_PTC: | ||
257 | ia64_ptc_domain_info = (ia64_sal_desc_ptc_t *)p; | ||
258 | break; | ||
259 | case SAL_DESC_AP_WAKEUP: | ||
260 | sal_desc_ap_wakeup(p); | ||
261 | break; | ||
262 | } | ||
263 | p += SAL_DESC_SIZE(*p); | ||
264 | } | ||
265 | } | ||
266 | |||
267 | int | ||
268 | ia64_sal_oemcall(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1, | ||
269 | u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7) | ||
270 | { | ||
271 | if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX) | ||
272 | return -1; | ||
273 | SAL_CALL(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6, arg7); | ||
274 | return 0; | ||
275 | } | ||
276 | EXPORT_SYMBOL(ia64_sal_oemcall); | ||
277 | |||
278 | int | ||
279 | ia64_sal_oemcall_nolock(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1, | ||
280 | u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, | ||
281 | u64 arg7) | ||
282 | { | ||
283 | if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX) | ||
284 | return -1; | ||
285 | SAL_CALL_NOLOCK(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6, | ||
286 | arg7); | ||
287 | return 0; | ||
288 | } | ||
289 | EXPORT_SYMBOL(ia64_sal_oemcall_nolock); | ||
290 | |||
291 | int | ||
292 | ia64_sal_oemcall_reentrant(struct ia64_sal_retval *isrvp, u64 oemfunc, | ||
293 | u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, | ||
294 | u64 arg6, u64 arg7) | ||
295 | { | ||
296 | if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX) | ||
297 | return -1; | ||
298 | SAL_CALL_REENTRANT(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6, | ||
299 | arg7); | ||
300 | return 0; | ||
301 | } | ||
302 | EXPORT_SYMBOL(ia64_sal_oemcall_reentrant); | ||
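/*
 * Editor's usage sketch, not part of the original source.  The OEM
 * function number 0x02000001 is made up; real values are vendor-
 * defined and must lie in [IA64_SAL_OEMFUNC_MIN, IA64_SAL_OEMFUNC_MAX]
 * or the wrappers above return -1 without making the call:
 */
#if 0	/* illustration only */
	struct ia64_sal_retval isrv;

	if (ia64_sal_oemcall(&isrv, 0x02000001, 0, 0, 0, 0, 0, 0, 0) < 0
	    || isrv.status < 0)
		return -EIO;
	/* any results come back in isrv.v0, isrv.v1, isrv.v2 */
#endif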
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c new file mode 100644 index 000000000000..d227fabecd02 --- /dev/null +++ b/arch/ia64/kernel/salinfo.c | |||
@@ -0,0 +1,629 @@ | |||
1 | /* | ||
2 | * salinfo.c | ||
3 | * | ||
4 | * Creates entries in /proc/sal for various system features. | ||
5 | * | ||
6 | * Copyright (c) 2003 Silicon Graphics, Inc. All rights reserved. | ||
7 | * Copyright (c) 2003 Hewlett-Packard Co | ||
8 | * Bjorn Helgaas <bjorn.helgaas@hp.com> | ||
9 | * | ||
10 | * 10/30/2001 jbarnes@sgi.com copied much of Stephane's palinfo | ||
11 | * code to create this file | ||
12 | * Oct 23 2003 kaos@sgi.com | ||
13 | * Replace IPI with set_cpus_allowed() to read a record from the required cpu. | ||
14 | * Redesign salinfo log processing to separate interrupt and user space | ||
15 | * contexts. | ||
16 | * Cache the record across multi-block reads from user space. | ||
17 | * Support > 64 cpus. | ||
18 | * Delete module_exit and MOD_INC/DEC_COUNT, salinfo cannot be a module. | ||
19 | * | ||
20 | * Jan 28 2004 kaos@sgi.com | ||
21 | * Periodically check for outstanding MCA or INIT records. | ||
22 | * | ||
23 | * Dec 5 2004 kaos@sgi.com | ||
24 | * Standardize which records are cleared automatically. | ||
25 | */ | ||
26 | |||
27 | #include <linux/types.h> | ||
28 | #include <linux/proc_fs.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/smp.h> | ||
31 | #include <linux/smp_lock.h> | ||
32 | #include <linux/timer.h> | ||
33 | #include <linux/vmalloc.h> | ||
34 | |||
35 | #include <asm/semaphore.h> | ||
36 | #include <asm/sal.h> | ||
37 | #include <asm/uaccess.h> | ||
38 | |||
39 | MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>"); | ||
40 | MODULE_DESCRIPTION("/proc interface to IA-64 SAL features"); | ||
41 | MODULE_LICENSE("GPL"); | ||
42 | |||
43 | static int salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data); | ||
44 | |||
45 | typedef struct { | ||
46 | const char *name; /* name of the proc entry */ | ||
47 | unsigned long feature; /* feature bit */ | ||
48 | struct proc_dir_entry *entry; /* registered entry (removal) */ | ||
49 | } salinfo_entry_t; | ||
50 | |||
51 | /* | ||
52 | * List {name,feature} pairs for every entry in /proc/sal/<feature> | ||
53 | * that this module exports | ||
54 | */ | ||
55 | static salinfo_entry_t salinfo_entries[]={ | ||
56 | { "bus_lock", IA64_SAL_PLATFORM_FEATURE_BUS_LOCK, }, | ||
57 | { "irq_redirection", IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT, }, | ||
58 | { "ipi_redirection", IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT, }, | ||
59 | { "itc_drift", IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT, }, | ||
60 | }; | ||
61 | |||
62 | #define NR_SALINFO_ENTRIES ARRAY_SIZE(salinfo_entries) | ||
63 | |||
64 | static char *salinfo_log_name[] = { | ||
65 | "mca", | ||
66 | "init", | ||
67 | "cmc", | ||
68 | "cpe", | ||
69 | }; | ||
70 | |||
71 | static struct proc_dir_entry *salinfo_proc_entries[ | ||
72 | ARRAY_SIZE(salinfo_entries) + /* /proc/sal/bus_lock */ | ||
73 | ARRAY_SIZE(salinfo_log_name) + /* /proc/sal/{mca,...} */ | ||
74 | (2 * ARRAY_SIZE(salinfo_log_name)) + /* /proc/sal/mca/{event,data} */ | ||
75 | 1]; /* /proc/sal */ | ||
76 | |||
77 | /* Some records we get ourselves, some are accessed as saved data in buffers | ||
78 | * that are owned by mca.c. | ||
79 | */ | ||
80 | struct salinfo_data_saved { | ||
81 | u8* buffer; | ||
82 | u64 size; | ||
83 | u64 id; | ||
84 | int cpu; | ||
85 | }; | ||
86 | |||
87 | /* State transitions. Actions are :- | ||
88 | * Write "read <cpunum>" to the data file. | ||
89 | * Write "clear <cpunum>" to the data file. | ||
90 | * Write "oemdata <cpunum> <offset> to the data file. | ||
91 | * Read from the data file. | ||
92 | * Close the data file. | ||
93 | * | ||
94 | * Start state is NO_DATA. | ||
95 | * | ||
96 | * NO_DATA | ||
97 | * write "read <cpunum>" -> NO_DATA or LOG_RECORD. | ||
98 | * write "clear <cpunum>" -> NO_DATA or LOG_RECORD. | ||
99 | * write "oemdata <cpunum> <offset> -> return -EINVAL. | ||
100 | * read data -> return EOF. | ||
101 | * close -> unchanged. Free record areas. | ||
102 | * | ||
103 | * LOG_RECORD | ||
104 | * write "read <cpunum>" -> NO_DATA or LOG_RECORD. | ||
105 | * write "clear <cpunum>" -> NO_DATA or LOG_RECORD. | ||
106 | * write "oemdata <cpunum> <offset> -> format the oem data, goto OEMDATA. | ||
107 | * read data -> return the INIT/MCA/CMC/CPE record. | ||
108 | * close -> unchanged. Keep record areas. | ||
109 | * | ||
110 | * OEMDATA | ||
111 | * write "read <cpunum>" -> NO_DATA or LOG_RECORD. | ||
112 | * write "clear <cpunum>" -> NO_DATA or LOG_RECORD. | ||
113 | * write "oemdata <cpunum> <offset> -> format the oem data, goto OEMDATA. | ||
114 | * read data -> return the formatted oemdata. | ||
115 | * close -> unchanged. Keep record areas. | ||
116 | * | ||
117 | * Closing the data file does not change the state. This allows shell scripts | ||
118 | * to manipulate salinfo data: each shell redirection opens the file, does one | ||
119 | * action, then closes it again. The record areas are only freed at close when | ||
120 | * the state is NO_DATA. | ||
121 | */ | ||
122 | enum salinfo_state { | ||
123 | STATE_NO_DATA, | ||
124 | STATE_LOG_RECORD, | ||
125 | STATE_OEMDATA, | ||
126 | }; | ||
127 | |||
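
A user-space consumer drives this state machine through /proc/sal/<type>/{event,data}. The sketch below is illustrative only (not part of this file; error handling trimmed, buffer size arbitrary, and the files require CAP_SYS_ADMIN): block on the event file, which yields a "read <cpunum>" command, echo that command into the data file, read back the record, then clear it:

    /* Illustrative user-space consumer, assuming the /proc layout above. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            char cmd[32], buf[64 * 1024];
            int cpu, event, data;
            ssize_t n;

            event = open("/proc/sal/mca/event", O_RDONLY); /* blocks until an event */
            data = open("/proc/sal/mca/data", O_RDWR);     /* single-open, root only */
            if (event < 0 || data < 0)
                    return 1;
            for (;;) {
                    n = read(event, cmd, sizeof(cmd) - 1); /* yields "read <cpunum>\n" */
                    if (n <= 0)
                            break;
                    cmd[n] = '\0';
                    if (sscanf(cmd, "read %d", &cpu) != 1)
                            continue;
                    write(data, cmd, strlen(cmd));         /* -> STATE_LOG_RECORD */
                    lseek(data, 0, SEEK_SET);
                    n = read(data, buf, sizeof(buf));      /* the raw SAL record */
                    /* ... decode or store the record here ... */
                    snprintf(cmd, sizeof(cmd), "clear %d\n", cpu);
                    write(data, cmd, strlen(cmd));         /* -> NO_DATA (or LOG_RECORD
                                                              if another record appears) */
            }
            return 0;
    }
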
128 | struct salinfo_data { | ||
129 | volatile cpumask_t cpu_event; /* which cpus have outstanding events */ | ||
130 | struct semaphore sem; /* count of cpus with outstanding events (bits set in cpu_event) */ | ||
131 | u8 *log_buffer; | ||
132 | u64 log_size; | ||
133 | u8 *oemdata; /* decoded oem data */ | ||
134 | u64 oemdata_size; | ||
135 | int open; /* single-open to prevent races */ | ||
136 | u8 type; | ||
137 | u8 saved_num; /* using a saved record? */ | ||
138 | enum salinfo_state state :8; /* processing state */ | ||
139 | u8 padding; | ||
140 | int cpu_check; /* next CPU to check */ | ||
141 | struct salinfo_data_saved data_saved[5];/* save last 5 records from mca.c, must be < 255 */ | ||
142 | }; | ||
143 | |||
144 | static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)]; | ||
145 | |||
146 | static spinlock_t data_lock, data_saved_lock; | ||
147 | |||
148 | /** salinfo_platform_oemdata - optional callback to decode oemdata from an error | ||
149 | * record. | ||
150 | * @sect_header: pointer to the start of the section to decode. | ||
151 | * @oemdata: returns vmalloc area containing the decoded output. | ||
152 | * @oemdata_size: returns length of decoded output (strlen). | ||
153 | * | ||
154 | * Description: If user space asks for oem data to be decoded by the kernel | ||
155 | * and/or prom and the platform has set salinfo_platform_oemdata to the address | ||
156 | * of a platform-specific routine, then that routine is called. salinfo_platform_oemdata | ||
157 | * vmalloc's and formats its output area, returning the address of the text | ||
158 | * and its strlen. Returns 0 for success, -ve for error. The callback is | ||
159 | * invoked on the cpu that generated the error record. | ||
160 | */ | ||
161 | int (*salinfo_platform_oemdata)(const u8 *sect_header, u8 **oemdata, u64 *oemdata_size); | ||
162 | |||
163 | struct salinfo_platform_oemdata_parms { | ||
164 | const u8 *efi_guid; | ||
165 | u8 **oemdata; | ||
166 | u64 *oemdata_size; | ||
167 | int ret; | ||
168 | }; | ||
169 | |||
170 | static void | ||
171 | salinfo_platform_oemdata_cpu(void *context) | ||
172 | { | ||
173 | struct salinfo_platform_oemdata_parms *parms = context; | ||
174 | parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size); | ||
175 | } | ||
176 | |||
177 | static void | ||
178 | shift1_data_saved (struct salinfo_data *data, int shift) | ||
179 | { | ||
180 | memcpy(data->data_saved+shift, data->data_saved+shift+1, | ||
181 | (ARRAY_SIZE(data->data_saved) - (shift+1)) * sizeof(data->data_saved[0])); | ||
182 | memset(data->data_saved + ARRAY_SIZE(data->data_saved) - 1, 0, | ||
183 | sizeof(data->data_saved[0])); | ||
184 | } | ||
185 | |||
186 | /* This routine is invoked in interrupt context. Note: mca.c enables | ||
187 | * interrupts before calling this code for CMC/CPE. MCA and INIT events are | ||
188 | * not irq safe; do not call any routines that use spinlocks, as they may deadlock. | ||
189 | * MCA and INIT records are recorded, a timer event will look for any | ||
190 | * outstanding events and wake up the user space code. | ||
191 | * | ||
192 | * The buffer passed from mca.c points to the output from ia64_log_get. This is | ||
193 | * a persistent buffer but its contents can change between the interrupt and | ||
194 | * when user space processes the record. Save the record id to identify | ||
195 | * changes. | ||
196 | */ | ||
197 | void | ||
198 | salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe) | ||
199 | { | ||
200 | struct salinfo_data *data = salinfo_data + type; | ||
201 | struct salinfo_data_saved *data_saved; | ||
202 | unsigned long flags = 0; | ||
203 | int i; | ||
204 | int saved_size = ARRAY_SIZE(data->data_saved); | ||
205 | |||
206 | BUG_ON(type >= ARRAY_SIZE(salinfo_log_name)); | ||
207 | |||
208 | if (irqsafe) | ||
209 | spin_lock_irqsave(&data_saved_lock, flags); | ||
210 | for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) { | ||
211 | if (!data_saved->buffer) | ||
212 | break; | ||
213 | } | ||
214 | if (i == saved_size) { | ||
215 | if (!data->saved_num) { | ||
216 | shift1_data_saved(data, 0); | ||
217 | data_saved = data->data_saved + saved_size - 1; | ||
218 | } else | ||
219 | data_saved = NULL; | ||
220 | } | ||
221 | if (data_saved) { | ||
222 | data_saved->cpu = smp_processor_id(); | ||
223 | data_saved->id = ((sal_log_record_header_t *)buffer)->id; | ||
224 | data_saved->size = size; | ||
225 | data_saved->buffer = buffer; | ||
226 | } | ||
227 | if (irqsafe) | ||
228 | spin_unlock_irqrestore(&data_saved_lock, flags); | ||
229 | |||
230 | if (!test_and_set_bit(smp_processor_id(), &data->cpu_event)) { | ||
231 | if (irqsafe) | ||
232 | up(&data->sem); | ||
233 | } | ||
234 | } | ||
235 | |||
236 | /* Check for outstanding MCA/INIT records every minute (arbitrary) */ | ||
237 | #define SALINFO_TIMER_DELAY (60*HZ) | ||
238 | static struct timer_list salinfo_timer; | ||
239 | |||
240 | static void | ||
241 | salinfo_timeout_check(struct salinfo_data *data) | ||
242 | { | ||
243 | int i; | ||
244 | if (!data->open) | ||
245 | return; | ||
246 | for (i = 0; i < NR_CPUS; ++i) { | ||
247 | if (test_bit(i, &data->cpu_event)) { | ||
248 | /* double up() is not a problem, user space will see no | ||
249 | * records for the additional "events". | ||
250 | */ | ||
251 | up(&data->sem); | ||
252 | } | ||
253 | } | ||
254 | } | ||
255 | |||
256 | static void | ||
257 | salinfo_timeout (unsigned long arg) | ||
258 | { | ||
259 | salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA); | ||
260 | salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT); | ||
261 | salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY; | ||
262 | add_timer(&salinfo_timer); | ||
263 | } | ||
264 | |||
265 | static int | ||
266 | salinfo_event_open(struct inode *inode, struct file *file) | ||
267 | { | ||
268 | if (!capable(CAP_SYS_ADMIN)) | ||
269 | return -EPERM; | ||
270 | return 0; | ||
271 | } | ||
272 | |||
273 | static ssize_t | ||
274 | salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) | ||
275 | { | ||
276 | struct inode *inode = file->f_dentry->d_inode; | ||
277 | struct proc_dir_entry *entry = PDE(inode); | ||
278 | struct salinfo_data *data = entry->data; | ||
279 | char cmd[32]; | ||
280 | size_t size; | ||
281 | int i, n, cpu = -1; | ||
282 | |||
283 | retry: | ||
284 | if (down_trylock(&data->sem)) { | ||
285 | if (file->f_flags & O_NONBLOCK) | ||
286 | return -EAGAIN; | ||
287 | if (down_interruptible(&data->sem)) | ||
288 | return -ERESTARTSYS; | ||
289 | } | ||
290 | |||
291 | n = data->cpu_check; | ||
292 | for (i = 0; i < NR_CPUS; i++) { | ||
293 | if (test_bit(n, &data->cpu_event)) { | ||
294 | cpu = n; | ||
295 | break; | ||
296 | } | ||
297 | if (++n == NR_CPUS) | ||
298 | n = 0; | ||
299 | } | ||
300 | |||
301 | if (cpu == -1) | ||
302 | goto retry; | ||
303 | |||
304 | /* events are sticky until the user says "clear" */ | ||
305 | up(&data->sem); | ||
306 | |||
307 | /* for next read, start checking at next CPU */ | ||
308 | data->cpu_check = cpu; | ||
309 | if (++data->cpu_check == NR_CPUS) | ||
310 | data->cpu_check = 0; | ||
311 | |||
312 | snprintf(cmd, sizeof(cmd), "read %d\n", cpu); | ||
313 | |||
314 | size = strlen(cmd); | ||
315 | if (size > count) | ||
316 | size = count; | ||
317 | if (copy_to_user(buffer, cmd, size)) | ||
318 | return -EFAULT; | ||
319 | |||
320 | return size; | ||
321 | } | ||
322 | |||
323 | static struct file_operations salinfo_event_fops = { | ||
324 | .open = salinfo_event_open, | ||
325 | .read = salinfo_event_read, | ||
326 | }; | ||
327 | |||
328 | static int | ||
329 | salinfo_log_open(struct inode *inode, struct file *file) | ||
330 | { | ||
331 | struct proc_dir_entry *entry = PDE(inode); | ||
332 | struct salinfo_data *data = entry->data; | ||
333 | |||
334 | if (!capable(CAP_SYS_ADMIN)) | ||
335 | return -EPERM; | ||
336 | |||
337 | spin_lock(&data_lock); | ||
338 | if (data->open) { | ||
339 | spin_unlock(&data_lock); | ||
340 | return -EBUSY; | ||
341 | } | ||
342 | data->open = 1; | ||
343 | spin_unlock(&data_lock); | ||
344 | |||
345 | if (data->state == STATE_NO_DATA && | ||
346 | !(data->log_buffer = vmalloc(ia64_sal_get_state_info_size(data->type)))) { | ||
347 | data->open = 0; | ||
348 | return -ENOMEM; | ||
349 | } | ||
350 | |||
351 | return 0; | ||
352 | } | ||
353 | |||
354 | static int | ||
355 | salinfo_log_release(struct inode *inode, struct file *file) | ||
356 | { | ||
357 | struct proc_dir_entry *entry = PDE(inode); | ||
358 | struct salinfo_data *data = entry->data; | ||
359 | |||
360 | if (data->state == STATE_NO_DATA) { | ||
361 | vfree(data->log_buffer); | ||
362 | vfree(data->oemdata); | ||
363 | data->log_buffer = NULL; | ||
364 | data->oemdata = NULL; | ||
365 | } | ||
366 | spin_lock(&data_lock); | ||
367 | data->open = 0; | ||
368 | spin_unlock(&data_lock); | ||
369 | return 0; | ||
370 | } | ||
371 | |||
372 | static void | ||
373 | call_on_cpu(int cpu, void (*fn)(void *), void *arg) | ||
374 | { | ||
375 | cpumask_t save_cpus_allowed, new_cpus_allowed; | ||
376 | memcpy(&save_cpus_allowed, ¤t->cpus_allowed, sizeof(save_cpus_allowed)); | ||
377 | memset(&new_cpus_allowed, 0, sizeof(new_cpus_allowed)); | ||
378 | set_bit(cpu, &new_cpus_allowed); | ||
379 | set_cpus_allowed(current, new_cpus_allowed); | ||
380 | (*fn)(arg); | ||
381 | set_cpus_allowed(current, save_cpus_allowed); | ||
382 | } | ||
383 | |||
384 | static void | ||
385 | salinfo_log_read_cpu(void *context) | ||
386 | { | ||
387 | struct salinfo_data *data = context; | ||
388 | sal_log_record_header_t *rh; | ||
389 | data->log_size = ia64_sal_get_state_info(data->type, (u64 *) data->log_buffer); | ||
390 | rh = (sal_log_record_header_t *)(data->log_buffer); | ||
391 | /* Clear corrected errors as they are read from SAL */ | ||
392 | if (rh->severity == sal_log_severity_corrected) | ||
393 | ia64_sal_clear_state_info(data->type); | ||
394 | } | ||
395 | |||
396 | static void | ||
397 | salinfo_log_new_read(int cpu, struct salinfo_data *data) | ||
398 | { | ||
399 | struct salinfo_data_saved *data_saved; | ||
400 | unsigned long flags; | ||
401 | int i; | ||
402 | int saved_size = ARRAY_SIZE(data->data_saved); | ||
403 | |||
404 | data->saved_num = 0; | ||
405 | spin_lock_irqsave(&data_saved_lock, flags); | ||
406 | retry: | ||
407 | for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) { | ||
408 | if (data_saved->buffer && data_saved->cpu == cpu) { | ||
409 | sal_log_record_header_t *rh = (sal_log_record_header_t *)(data_saved->buffer); | ||
410 | data->log_size = data_saved->size; | ||
411 | memcpy(data->log_buffer, rh, data->log_size); | ||
412 | barrier(); /* id check must not be moved */ | ||
413 | if (rh->id == data_saved->id) { | ||
414 | data->saved_num = i+1; | ||
415 | break; | ||
416 | } | ||
417 | /* saved record changed by mca.c since interrupt, discard it */ | ||
418 | shift1_data_saved(data, i); | ||
419 | goto retry; | ||
420 | } | ||
421 | } | ||
422 | spin_unlock_irqrestore(&data_saved_lock, flags); | ||
423 | |||
424 | if (!data->saved_num) | ||
425 | call_on_cpu(cpu, salinfo_log_read_cpu, data); | ||
426 | if (!data->log_size) { | ||
427 | data->state = STATE_NO_DATA; | ||
428 | clear_bit(cpu, &data->cpu_event); | ||
429 | } else { | ||
430 | data->state = STATE_LOG_RECORD; | ||
431 | } | ||
432 | } | ||
433 | |||
434 | static ssize_t | ||
435 | salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) | ||
436 | { | ||
437 | struct inode *inode = file->f_dentry->d_inode; | ||
438 | struct proc_dir_entry *entry = PDE(inode); | ||
439 | struct salinfo_data *data = entry->data; | ||
440 | u8 *buf; | ||
441 | u64 bufsize; | ||
442 | |||
443 | if (data->state == STATE_LOG_RECORD) { | ||
444 | buf = data->log_buffer; | ||
445 | bufsize = data->log_size; | ||
446 | } else if (data->state == STATE_OEMDATA) { | ||
447 | buf = data->oemdata; | ||
448 | bufsize = data->oemdata_size; | ||
449 | } else { | ||
450 | buf = NULL; | ||
451 | bufsize = 0; | ||
452 | } | ||
453 | return simple_read_from_buffer(buffer, count, ppos, buf, bufsize); | ||
454 | } | ||
455 | |||
456 | static void | ||
457 | salinfo_log_clear_cpu(void *context) | ||
458 | { | ||
459 | struct salinfo_data *data = context; | ||
460 | ia64_sal_clear_state_info(data->type); | ||
461 | } | ||
462 | |||
463 | static int | ||
464 | salinfo_log_clear(struct salinfo_data *data, int cpu) | ||
465 | { | ||
466 | sal_log_record_header_t *rh; | ||
467 | data->state = STATE_NO_DATA; | ||
468 | if (!test_bit(cpu, &data->cpu_event)) | ||
469 | return 0; | ||
470 | down(&data->sem); | ||
471 | clear_bit(cpu, &data->cpu_event); | ||
472 | if (data->saved_num) { | ||
473 | unsigned long flags; | ||
474 | spin_lock_irqsave(&data_saved_lock, flags); | ||
475 | shift1_data_saved(data, data->saved_num - 1 ); | ||
476 | data->saved_num = 0; | ||
477 | spin_unlock_irqrestore(&data_saved_lock, flags); | ||
478 | } | ||
479 | rh = (sal_log_record_header_t *)(data->log_buffer); | ||
480 | /* Corrected errors have already been cleared from SAL */ | ||
481 | if (rh->severity != sal_log_severity_corrected) | ||
482 | call_on_cpu(cpu, salinfo_log_clear_cpu, data); | ||
483 | /* clearing a record may make a new record visible */ | ||
484 | salinfo_log_new_read(cpu, data); | ||
485 | if (data->state == STATE_LOG_RECORD && | ||
486 | !test_and_set_bit(cpu, &data->cpu_event)) | ||
487 | up(&data->sem); | ||
488 | return 0; | ||
489 | } | ||
490 | |||
491 | static ssize_t | ||
492 | salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) | ||
493 | { | ||
494 | struct inode *inode = file->f_dentry->d_inode; | ||
495 | struct proc_dir_entry *entry = PDE(inode); | ||
496 | struct salinfo_data *data = entry->data; | ||
497 | char cmd[32]; | ||
498 | size_t size; | ||
499 | u32 offset; | ||
500 | int cpu; | ||
501 | |||
502 | size = sizeof(cmd); | ||
503 | if (count < size) | ||
504 | size = count; | ||
505 | if (copy_from_user(cmd, buffer, size)) | ||
506 | return -EFAULT; | ||
507 | |||
508 | if (sscanf(cmd, "read %d", &cpu) == 1) { | ||
509 | salinfo_log_new_read(cpu, data); | ||
510 | } else if (sscanf(cmd, "clear %d", &cpu) == 1) { | ||
511 | int ret; | ||
512 | if ((ret = salinfo_log_clear(data, cpu))) | ||
513 | count = ret; | ||
514 | } else if (sscanf(cmd, "oemdata %d %d", &cpu, &offset) == 2) { | ||
515 | if (data->state != STATE_LOG_RECORD && data->state != STATE_OEMDATA) | ||
516 | return -EINVAL; | ||
517 | if (offset > data->log_size - sizeof(efi_guid_t)) | ||
518 | return -EINVAL; | ||
519 | data->state = STATE_OEMDATA; | ||
520 | if (salinfo_platform_oemdata) { | ||
521 | struct salinfo_platform_oemdata_parms parms = { | ||
522 | .efi_guid = data->log_buffer + offset, | ||
523 | .oemdata = &data->oemdata, | ||
524 | .oemdata_size = &data->oemdata_size | ||
525 | }; | ||
526 | call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms); | ||
527 | if (parms.ret) | ||
528 | count = parms.ret; | ||
529 | } else | ||
530 | data->oemdata_size = 0; | ||
531 | } else | ||
532 | return -EINVAL; | ||
533 | |||
534 | return count; | ||
535 | } | ||
536 | |||
537 | static struct file_operations salinfo_data_fops = { | ||
538 | .open = salinfo_log_open, | ||
539 | .release = salinfo_log_release, | ||
540 | .read = salinfo_log_read, | ||
541 | .write = salinfo_log_write, | ||
542 | }; | ||
543 | |||
544 | static int __init | ||
545 | salinfo_init(void) | ||
546 | { | ||
547 | struct proc_dir_entry *salinfo_dir; /* /proc/sal dir entry */ | ||
548 | struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */ | ||
549 | struct proc_dir_entry *dir, *entry; | ||
550 | struct salinfo_data *data; | ||
551 | int i, j, online; | ||
552 | |||
553 | salinfo_dir = proc_mkdir("sal", NULL); | ||
554 | if (!salinfo_dir) | ||
555 | return 0; | ||
556 | |||
557 | for (i=0; i < NR_SALINFO_ENTRIES; i++) { | ||
558 | /* pass the feature bit in question as misc data */ | ||
559 | *sdir++ = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir, | ||
560 | salinfo_read, (void *)salinfo_entries[i].feature); | ||
561 | } | ||
562 | |||
563 | for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) { | ||
564 | data = salinfo_data + i; | ||
565 | data->type = i; | ||
566 | sema_init(&data->sem, 0); | ||
567 | dir = proc_mkdir(salinfo_log_name[i], salinfo_dir); | ||
568 | if (!dir) | ||
569 | continue; | ||
570 | |||
571 | entry = create_proc_entry("event", S_IRUSR, dir); | ||
572 | if (!entry) | ||
573 | continue; | ||
574 | entry->data = data; | ||
575 | entry->proc_fops = &salinfo_event_fops; | ||
576 | *sdir++ = entry; | ||
577 | |||
578 | entry = create_proc_entry("data", S_IRUSR | S_IWUSR, dir); | ||
579 | if (!entry) | ||
580 | continue; | ||
581 | entry->data = data; | ||
582 | entry->proc_fops = &salinfo_data_fops; | ||
583 | *sdir++ = entry; | ||
584 | |||
585 | /* we missed any events before now, so mark every online cpu */ | ||
586 | online = 0; | ||
587 | for (j = 0; j < NR_CPUS; j++) | ||
588 | if (cpu_online(j)) { | ||
589 | set_bit(j, &data->cpu_event); | ||
590 | ++online; | ||
591 | } | ||
592 | sema_init(&data->sem, online); | ||
593 | |||
594 | *sdir++ = dir; | ||
595 | } | ||
596 | |||
597 | *sdir++ = salinfo_dir; | ||
598 | |||
599 | init_timer(&salinfo_timer); | ||
600 | salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY; | ||
601 | salinfo_timer.function = &salinfo_timeout; | ||
602 | add_timer(&salinfo_timer); | ||
603 | |||
604 | return 0; | ||
605 | } | ||
606 | |||
607 | /* | ||
608 | * 'data' contains an integer that corresponds to the feature we're | ||
609 | * testing | ||
610 | */ | ||
611 | static int | ||
612 | salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data) | ||
613 | { | ||
614 | int len = 0; | ||
615 | |||
616 | len = sprintf(page, (sal_platform_features & (unsigned long)data) ? "1\n" : "0\n"); | ||
617 | |||
618 | if (len <= off+count) *eof = 1; | ||
619 | |||
620 | *start = page + off; | ||
621 | len -= off; | ||
622 | |||
623 | if (len>count) len = count; | ||
624 | if (len<0) len = 0; | ||
625 | |||
626 | return len; | ||
627 | } | ||
628 | |||
629 | module_init(salinfo_init); | ||
diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c new file mode 100644 index 000000000000..2724ef3fbae2 --- /dev/null +++ b/arch/ia64/kernel/semaphore.c | |||
@@ -0,0 +1,165 @@ | |||
1 | /* | ||
2 | * IA-64 semaphore implementation (derived from x86 version). | ||
3 | * | ||
4 | * Copyright (C) 1999-2000, 2002 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | */ | ||
7 | |||
8 | /* | ||
9 | * Semaphores are implemented using a two-way counter: The "count" | ||
10 | * variable is decremented for each process that tries to acquire the | ||
11 | * semaphore, while the "sleepers" variable is a count of such | ||
12 | * acquires. | ||
13 | * | ||
14 | * Notably, the inline "up()" and "down()" functions can efficiently | ||
15 | * test if they need to do any extra work (up needs to do something | ||
16 | * only if count was negative before the increment operation). | ||
17 | * | ||
18 | * "sleeping" and the contention routine ordering is protected | ||
19 | * by the spinlock in the semaphore's waitqueue head. | ||
20 | * | ||
21 | * Note that these functions are only called when there is contention | ||
22 | * on the lock, and as such all this is the "non-critical" part of the | ||
23 | * whole semaphore business. The critical part is the inline stuff in | ||
24 | * <asm/semaphore.h> where we want to avoid any extra jumps and calls. | ||
25 | */ | ||
26 | #include <linux/sched.h> | ||
27 | #include <linux/init.h> | ||
28 | |||
29 | #include <asm/errno.h> | ||
30 | #include <asm/semaphore.h> | ||
31 | |||
32 | /* | ||
33 | * Logic: | ||
34 | * - Only on a boundary condition do we need to care. When we go | ||
35 | * from a negative count to a non-negative, we wake people up. | ||
36 | * - When we go from a non-negative count to a negative one, we | ||
37 | * (a) synchronize with the "sleepers" count and (b) make sure | ||
38 | * that we're on the wakeup list before we synchronize so that | ||
39 | * we cannot lose wakeup events. | ||
40 | */ | ||
41 | |||
42 | void | ||
43 | __up (struct semaphore *sem) | ||
44 | { | ||
45 | wake_up(&sem->wait); | ||
46 | } | ||
47 | |||
48 | void __sched __down (struct semaphore *sem) | ||
49 | { | ||
50 | struct task_struct *tsk = current; | ||
51 | DECLARE_WAITQUEUE(wait, tsk); | ||
52 | unsigned long flags; | ||
53 | |||
54 | tsk->state = TASK_UNINTERRUPTIBLE; | ||
55 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
56 | add_wait_queue_exclusive_locked(&sem->wait, &wait); | ||
57 | |||
58 | sem->sleepers++; | ||
59 | for (;;) { | ||
60 | int sleepers = sem->sleepers; | ||
61 | |||
62 | /* | ||
63 | * Add "everybody else" into it. They aren't | ||
64 | * playing, because we own the spinlock in | ||
65 | * the wait_queue_head. | ||
66 | */ | ||
67 | if (!atomic_add_negative(sleepers - 1, &sem->count)) { | ||
68 | sem->sleepers = 0; | ||
69 | break; | ||
70 | } | ||
71 | sem->sleepers = 1; /* us - see -1 above */ | ||
72 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
73 | |||
74 | schedule(); | ||
75 | |||
76 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
77 | tsk->state = TASK_UNINTERRUPTIBLE; | ||
78 | } | ||
79 | remove_wait_queue_locked(&sem->wait, &wait); | ||
80 | wake_up_locked(&sem->wait); | ||
81 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
82 | tsk->state = TASK_RUNNING; | ||
83 | } | ||
84 | |||
85 | int __sched __down_interruptible (struct semaphore * sem) | ||
86 | { | ||
87 | int retval = 0; | ||
88 | struct task_struct *tsk = current; | ||
89 | DECLARE_WAITQUEUE(wait, tsk); | ||
90 | unsigned long flags; | ||
91 | |||
92 | tsk->state = TASK_INTERRUPTIBLE; | ||
93 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
94 | add_wait_queue_exclusive_locked(&sem->wait, &wait); | ||
95 | |||
96 | sem->sleepers ++; | ||
97 | for (;;) { | ||
98 | int sleepers = sem->sleepers; | ||
99 | |||
100 | /* | ||
101 | * With signals pending, this turns into | ||
102 | * the trylock failure case - we won't be | ||
103 | * sleeping, and we can't get the lock as | ||
104 | * it has contention. Just correct the count | ||
105 | * and exit. | ||
106 | */ | ||
107 | if (signal_pending(current)) { | ||
108 | retval = -EINTR; | ||
109 | sem->sleepers = 0; | ||
110 | atomic_add(sleepers, &sem->count); | ||
111 | break; | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * Add "everybody else" into it. They aren't | ||
116 | * playing, because we own the spinlock in | ||
117 | * wait_queue_head. The "-1" is because we're | ||
118 | * still hoping to get the semaphore. | ||
119 | */ | ||
120 | if (!atomic_add_negative(sleepers - 1, &sem->count)) { | ||
121 | sem->sleepers = 0; | ||
122 | break; | ||
123 | } | ||
124 | sem->sleepers = 1; /* us - see -1 above */ | ||
125 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
126 | |||
127 | schedule(); | ||
128 | |||
129 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
130 | tsk->state = TASK_INTERRUPTIBLE; | ||
131 | } | ||
132 | remove_wait_queue_locked(&sem->wait, &wait); | ||
133 | wake_up_locked(&sem->wait); | ||
134 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
135 | |||
136 | tsk->state = TASK_RUNNING; | ||
137 | return retval; | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Trylock failed - make sure we correct for having decremented the | ||
142 | * count. | ||
143 | */ | ||
144 | int | ||
145 | __down_trylock (struct semaphore *sem) | ||
146 | { | ||
147 | unsigned long flags; | ||
148 | int sleepers; | ||
149 | |||
150 | spin_lock_irqsave(&sem->wait.lock, flags); | ||
151 | sleepers = sem->sleepers + 1; | ||
152 | sem->sleepers = 0; | ||
153 | |||
154 | /* | ||
155 | * Add "everybody else" and us into it. They aren't | ||
156 | * playing, because we own the spinlock in the | ||
157 | * wait_queue_head. | ||
158 | */ | ||
159 | if (!atomic_add_negative(sleepers, &sem->count)) { | ||
160 | wake_up_locked(&sem->wait); | ||
161 | } | ||
162 | |||
163 | spin_unlock_irqrestore(&sem->wait.lock, flags); | ||
164 | return 1; | ||
165 | } | ||
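
The count/sleepers arithmetic above is subtle enough to be worth replaying by hand. The following toy, single-threaded model (plain ints stand in for the atomic count; no locking, not kernel code) traces one contended acquire through __down() and up():

    /* Toy model of the count/sleepers bookkeeping; illustrative only. */
    #include <assert.h>
    #include <stdio.h>

    static int count = 1;           /* free semaphore */
    static int sleepers = 0;

    static int try_down(void)       /* inline down(): decrement, < 0 means contention */
    {
            return --count >= 0;
    }

    int main(void)
    {
            assert(try_down());             /* task A acquires: count 0 */
            assert(!try_down());            /* task B contends: count -1, enters __down() */

            sleepers++;                     /* B in __down(): sleepers 1 */
            count += sleepers - 1;          /* atomic_add_negative(0, count): still -1 */
            assert(count < 0);
            sleepers = 1;                   /* B sleeps */

            count++;                        /* A's up(): count 0, B is woken */

            count += sleepers - 1;          /* B retries: add 0, count 0, non-negative */
            assert(count >= 0);
            sleepers = 0;                   /* B now owns the semaphore */

            count++;                        /* B's up(): count 1, semaphore free again */
            printf("final count=%d sleepers=%d\n", count, sleepers);
            return 0;
    }
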
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c new file mode 100644 index 000000000000..f05650c801d2 --- /dev/null +++ b/arch/ia64/kernel/setup.c | |||
@@ -0,0 +1,723 @@ | |||
1 | /* | ||
2 | * Architecture-specific setup. | ||
3 | * | ||
4 | * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | * Stephane Eranian <eranian@hpl.hp.com> | ||
7 | * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com> | ||
8 | * Copyright (C) 1999 VA Linux Systems | ||
9 | * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> | ||
10 | * | ||
11 | * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo(). | ||
12 | * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map | ||
13 | * 03/31/00 R.Seth cpu_initialized and current->processor fixes | ||
14 | * 02/04/00 D.Mosberger some more get_cpuinfo fixes... | ||
15 | * 02/01/00 R.Seth fixed get_cpuinfo for SMP | ||
16 | * 01/07/99 S.Eranian added the support for command line argument | ||
17 | * 06/24/99 W.Drummond added boot_cpu_data. | ||
18 | */ | ||
19 | #include <linux/config.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/init.h> | ||
22 | |||
23 | #include <linux/acpi.h> | ||
24 | #include <linux/bootmem.h> | ||
25 | #include <linux/console.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/reboot.h> | ||
29 | #include <linux/sched.h> | ||
30 | #include <linux/seq_file.h> | ||
31 | #include <linux/string.h> | ||
32 | #include <linux/threads.h> | ||
33 | #include <linux/tty.h> | ||
34 | #include <linux/serial.h> | ||
35 | #include <linux/serial_core.h> | ||
36 | #include <linux/efi.h> | ||
37 | #include <linux/initrd.h> | ||
38 | |||
39 | #include <asm/ia32.h> | ||
40 | #include <asm/machvec.h> | ||
41 | #include <asm/mca.h> | ||
42 | #include <asm/meminit.h> | ||
43 | #include <asm/page.h> | ||
44 | #include <asm/patch.h> | ||
45 | #include <asm/pgtable.h> | ||
46 | #include <asm/processor.h> | ||
47 | #include <asm/sal.h> | ||
48 | #include <asm/sections.h> | ||
49 | #include <asm/serial.h> | ||
50 | #include <asm/setup.h> | ||
51 | #include <asm/smp.h> | ||
52 | #include <asm/system.h> | ||
53 | #include <asm/unistd.h> | ||
54 | |||
55 | #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE) | ||
56 | # error "struct cpuinfo_ia64 too big!" | ||
57 | #endif | ||
58 | |||
59 | #ifdef CONFIG_SMP | ||
60 | unsigned long __per_cpu_offset[NR_CPUS]; | ||
61 | EXPORT_SYMBOL(__per_cpu_offset); | ||
62 | #endif | ||
63 | |||
64 | DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info); | ||
65 | DEFINE_PER_CPU(unsigned long, local_per_cpu_offset); | ||
66 | DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8); | ||
67 | unsigned long ia64_cycles_per_usec; | ||
68 | struct ia64_boot_param *ia64_boot_param; | ||
69 | struct screen_info screen_info; | ||
70 | |||
71 | unsigned long ia64_max_cacheline_size; | ||
72 | unsigned long ia64_iobase; /* virtual address for I/O accesses */ | ||
73 | EXPORT_SYMBOL(ia64_iobase); | ||
74 | struct io_space io_space[MAX_IO_SPACES]; | ||
75 | EXPORT_SYMBOL(io_space); | ||
76 | unsigned int num_io_spaces; | ||
77 | |||
78 | /* | ||
79 | * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). This | ||
80 | * mask specifies a mask of address bits that must be 0 in order for two buffers to be | ||
81 | * mergeable by the I/O MMU (i.e., the end address of the first buffer and the start | ||
82 | * address of the second buffer must be aligned to (merge_mask+1) in order to be | ||
83 | * mergeable). By default, we assume there is no I/O MMU which can merge physically | ||
84 | * discontiguous buffers, so we set the merge_mask to ~0UL, which corresponds to a iommu | ||
85 | * page-size of 2^64. | ||
86 | */ | ||
87 | unsigned long ia64_max_iommu_merge_mask = ~0UL; | ||
88 | EXPORT_SYMBOL(ia64_max_iommu_merge_mask); | ||
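
Restated in code, the rule in the comment above says two buffers may be merged only if the end of the first and the start of the second both have no bits set under merge_mask, i.e. both are aligned to merge_mask + 1. A hedged helper (our name, not a kernel interface):

    /* Illustrative helper; not a kernel API. With the default merge_mask of
     * ~0UL this never returns true, matching the "no merging I/O MMU"
     * assumption. */
    static inline int buffers_mergeable(unsigned long end1, unsigned long start2,
                                        unsigned long merge_mask)
    {
            return ((end1 | start2) & merge_mask) == 0;
    }

    /* e.g. with merge_mask = 0xfff (4 KiB iommu pages):
     *   buffers_mergeable(0x2000, 0x2000, 0xfff) -> 1
     *   buffers_mergeable(0x2100, 0x2200, 0xfff) -> 0
     */
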
89 | |||
90 | /* | ||
91 | * We use a special marker for the end of memory and it uses the extra (+1) slot | ||
92 | */ | ||
93 | struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1]; | ||
94 | int num_rsvd_regions; | ||
95 | |||
96 | |||
97 | /* | ||
98 | * Filter incoming memory segments based on the primitive map created from the boot | ||
99 | * parameters. Segments contained in the map are removed from the memory ranges. A | ||
100 | * caller-specified function is called with the memory ranges that remain after filtering. | ||
101 | * This routine does not assume the incoming segments are sorted. | ||
102 | */ | ||
103 | int | ||
104 | filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) | ||
105 | { | ||
106 | unsigned long range_start, range_end, prev_start; | ||
107 | void (*func)(unsigned long, unsigned long, int); | ||
108 | int i; | ||
109 | |||
110 | #if IGNORE_PFN0 | ||
111 | if (start == PAGE_OFFSET) { | ||
112 | printk(KERN_WARNING "warning: skipping physical page 0\n"); | ||
113 | start += PAGE_SIZE; | ||
114 | if (start >= end) return 0; | ||
115 | } | ||
116 | #endif | ||
117 | /* | ||
118 | * lowest possible address (the walker uses virtual addresses) | ||
119 | */ | ||
120 | prev_start = PAGE_OFFSET; | ||
121 | func = arg; | ||
122 | |||
123 | for (i = 0; i < num_rsvd_regions; ++i) { | ||
124 | range_start = max(start, prev_start); | ||
125 | range_end = min(end, rsvd_region[i].start); | ||
126 | |||
127 | if (range_start < range_end) | ||
128 | call_pernode_memory(__pa(range_start), range_end - range_start, func); | ||
129 | |||
130 | /* nothing more available in this segment */ | ||
131 | if (range_end == end) return 0; | ||
132 | |||
133 | prev_start = rsvd_region[i].end; | ||
134 | } | ||
135 | /* end of memory marker allows full processing inside loop body */ | ||
136 | return 0; | ||
137 | } | ||
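
A concrete trace makes the gap walk above easier to follow. The standalone model below (names and numbers are ours) feeds one incoming segment [0x1000, 0x9000) past a single reserved region [0x4000, 0x5000); the callback sees the two surrounding ranges:

    /* Standalone model of the reserved-region gap walk; illustrative only. */
    #include <stdio.h>

    struct region { unsigned long start, end; };

    /* Sorted reserved regions, terminated by the end-of-memory marker. */
    static struct region rsvd[] = {
            { 0x4000, 0x5000 },             /* e.g. boot params */
            { ~0UL, ~0UL },                 /* end-of-memory marker */
    };

    static void emit(unsigned long s, unsigned long e)
    {
            printf("usable: [0x%lx, 0x%lx)\n", s, e);
    }

    int main(void)
    {
            unsigned long start = 0x1000, end = 0x9000, prev = 0;
            int i;

            for (i = 0; i < 2; i++) {
                    unsigned long rs = start > prev ? start : prev;
                    unsigned long re = end < rsvd[i].start ? end : rsvd[i].start;
                    if (rs < re)
                            emit(rs, re);   /* prints [0x1000,0x4000), then [0x5000,0x9000) */
                    if (re == end)          /* nothing more in this segment */
                            return 0;
                    prev = rsvd[i].end;
            }
            return 0;
    }
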
138 | |||
139 | static void | ||
140 | sort_regions (struct rsvd_region *rsvd_region, int max) | ||
141 | { | ||
142 | int j; | ||
143 | |||
144 | /* simple bubble sorting */ | ||
145 | while (max--) { | ||
146 | for (j = 0; j < max; ++j) { | ||
147 | if (rsvd_region[j].start > rsvd_region[j+1].start) { | ||
148 | struct rsvd_region tmp; | ||
149 | tmp = rsvd_region[j]; | ||
150 | rsvd_region[j] = rsvd_region[j + 1]; | ||
151 | rsvd_region[j + 1] = tmp; | ||
152 | } | ||
153 | } | ||
154 | } | ||
155 | } | ||
156 | |||
157 | /** | ||
158 | * reserve_memory - setup reserved memory areas | ||
159 | * | ||
160 | * Setup the reserved memory areas set aside for the boot parameters, | ||
161 | * initrd, etc. There are currently %IA64_MAX_RSVD_REGIONS defined, | ||
162 | * see include/asm-ia64/meminit.h if you need to define more. | ||
163 | */ | ||
164 | void | ||
165 | reserve_memory (void) | ||
166 | { | ||
167 | int n = 0; | ||
168 | |||
169 | /* | ||
170 | * none of the entries in this table overlap | ||
171 | */ | ||
172 | rsvd_region[n].start = (unsigned long) ia64_boot_param; | ||
173 | rsvd_region[n].end = rsvd_region[n].start + sizeof(*ia64_boot_param); | ||
174 | n++; | ||
175 | |||
176 | rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->efi_memmap); | ||
177 | rsvd_region[n].end = rsvd_region[n].start + ia64_boot_param->efi_memmap_size; | ||
178 | n++; | ||
179 | |||
180 | rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->command_line); | ||
181 | rsvd_region[n].end = (rsvd_region[n].start | ||
182 | + strlen(__va(ia64_boot_param->command_line)) + 1); | ||
183 | n++; | ||
184 | |||
185 | rsvd_region[n].start = (unsigned long) ia64_imva((void *)KERNEL_START); | ||
186 | rsvd_region[n].end = (unsigned long) ia64_imva(_end); | ||
187 | n++; | ||
188 | |||
189 | #ifdef CONFIG_BLK_DEV_INITRD | ||
190 | if (ia64_boot_param->initrd_start) { | ||
191 | rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start); | ||
192 | rsvd_region[n].end = rsvd_region[n].start + ia64_boot_param->initrd_size; | ||
193 | n++; | ||
194 | } | ||
195 | #endif | ||
196 | |||
197 | /* end of memory marker */ | ||
198 | rsvd_region[n].start = ~0UL; | ||
199 | rsvd_region[n].end = ~0UL; | ||
200 | n++; | ||
201 | |||
202 | num_rsvd_regions = n; | ||
203 | |||
204 | sort_regions(rsvd_region, num_rsvd_regions); | ||
205 | } | ||
206 | |||
207 | /** | ||
208 | * find_initrd - get initrd parameters from the boot parameter structure | ||
209 | * | ||
210 | * Grab the initrd start and end from the boot parameter struct given us by | ||
211 | * the boot loader. | ||
212 | */ | ||
213 | void | ||
214 | find_initrd (void) | ||
215 | { | ||
216 | #ifdef CONFIG_BLK_DEV_INITRD | ||
217 | if (ia64_boot_param->initrd_start) { | ||
218 | initrd_start = (unsigned long)__va(ia64_boot_param->initrd_start); | ||
219 | initrd_end = initrd_start+ia64_boot_param->initrd_size; | ||
220 | |||
221 | printk(KERN_INFO "Initial ramdisk at: 0x%lx (%lu bytes)\n", | ||
222 | initrd_start, ia64_boot_param->initrd_size); | ||
223 | } | ||
224 | #endif | ||
225 | } | ||
226 | |||
227 | static void __init | ||
228 | io_port_init (void) | ||
229 | { | ||
230 | extern unsigned long ia64_iobase; | ||
231 | unsigned long phys_iobase; | ||
232 | |||
233 | /* | ||
234 | * Set `iobase' to the appropriate address in region 6 (uncached access range). | ||
235 | * | ||
236 | * The EFI memory map is the "preferred" location to get the I/O port space base, | ||
237 | * rather the relying on AR.KR0. This should become more clear in future SAL | ||
238 | * specs. We'll fall back to getting it out of AR.KR0 if no appropriate entry is | ||
239 | * found in the memory map. | ||
240 | */ | ||
241 | phys_iobase = efi_get_iobase(); | ||
242 | if (phys_iobase) | ||
243 | /* set AR.KR0 since this is all we use it for anyway */ | ||
244 | ia64_set_kr(IA64_KR_IO_BASE, phys_iobase); | ||
245 | else { | ||
246 | phys_iobase = ia64_get_kr(IA64_KR_IO_BASE); | ||
247 | printk(KERN_INFO "No I/O port range found in EFI memory map, falling back " | ||
248 | "to AR.KR0\n"); | ||
249 | printk(KERN_INFO "I/O port base = 0x%lx\n", phys_iobase); | ||
250 | } | ||
251 | ia64_iobase = (unsigned long) ioremap(phys_iobase, 0); | ||
252 | |||
253 | /* setup legacy IO port space */ | ||
254 | io_space[0].mmio_base = ia64_iobase; | ||
255 | io_space[0].sparse = 1; | ||
256 | num_io_spaces = 1; | ||
257 | } | ||
258 | |||
259 | /** | ||
260 | * early_console_setup - setup debugging console | ||
261 | * | ||
262 | * Consoles started here require little enough setup that we can start using | ||
263 | * them very early in the boot process, either right after the machine | ||
264 | * vector initialization, or even before if the drivers can detect their hw. | ||
265 | * | ||
266 | * Returns non-zero if a console couldn't be set up. | ||
267 | */ | ||
268 | static inline int __init | ||
269 | early_console_setup (char *cmdline) | ||
270 | { | ||
271 | #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE | ||
272 | { | ||
273 | extern int sn_serial_console_early_setup(void); | ||
274 | if (!sn_serial_console_early_setup()) | ||
275 | return 0; | ||
276 | } | ||
277 | #endif | ||
278 | #ifdef CONFIG_EFI_PCDP | ||
279 | if (!efi_setup_pcdp_console(cmdline)) | ||
280 | return 0; | ||
281 | #endif | ||
282 | #ifdef CONFIG_SERIAL_8250_CONSOLE | ||
283 | if (!early_serial_console_init(cmdline)) | ||
284 | return 0; | ||
285 | #endif | ||
286 | |||
287 | return -1; | ||
288 | } | ||
289 | |||
290 | static inline void | ||
291 | mark_bsp_online (void) | ||
292 | { | ||
293 | #ifdef CONFIG_SMP | ||
294 | /* If we register an early console, allow CPU 0 to printk */ | ||
295 | cpu_set(smp_processor_id(), cpu_online_map); | ||
296 | #endif | ||
297 | } | ||
298 | |||
299 | void __init | ||
300 | setup_arch (char **cmdline_p) | ||
301 | { | ||
302 | unw_init(); | ||
303 | |||
304 | ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); | ||
305 | |||
306 | *cmdline_p = __va(ia64_boot_param->command_line); | ||
307 | strlcpy(saved_command_line, *cmdline_p, COMMAND_LINE_SIZE); | ||
308 | |||
309 | efi_init(); | ||
310 | io_port_init(); | ||
311 | |||
312 | #ifdef CONFIG_IA64_GENERIC | ||
313 | { | ||
314 | const char *mvec_name = strstr (*cmdline_p, "machvec="); | ||
315 | char str[64]; | ||
316 | |||
317 | if (mvec_name) { | ||
318 | const char *end; | ||
319 | size_t len; | ||
320 | |||
321 | mvec_name += 8; | ||
322 | end = strchr (mvec_name, ' '); | ||
323 | if (end) | ||
324 | len = end - mvec_name; | ||
325 | else | ||
326 | len = strlen (mvec_name); | ||
327 | len = min(len, sizeof (str) - 1); | ||
328 | strncpy (str, mvec_name, len); | ||
329 | str[len] = '\0'; | ||
330 | mvec_name = str; | ||
331 | } else | ||
332 | mvec_name = acpi_get_sysname(); | ||
333 | machvec_init(mvec_name); | ||
334 | } | ||
335 | #endif | ||
336 | |||
337 | if (early_console_setup(*cmdline_p) == 0) | ||
338 | mark_bsp_online(); | ||
339 | |||
340 | #ifdef CONFIG_ACPI_BOOT | ||
341 | /* Initialize the ACPI boot-time table parser */ | ||
342 | acpi_table_init(); | ||
343 | # ifdef CONFIG_ACPI_NUMA | ||
344 | acpi_numa_init(); | ||
345 | # endif | ||
346 | #else | ||
347 | # ifdef CONFIG_SMP | ||
348 | smp_build_cpu_map(); /* happens, e.g., with the Ski simulator */ | ||
349 | # endif | ||
350 | #endif /* CONFIG_ACPI_BOOT */ | ||
351 | |||
352 | find_memory(); | ||
353 | |||
354 | /* process SAL system table: */ | ||
355 | ia64_sal_init(efi.sal_systab); | ||
356 | |||
357 | #ifdef CONFIG_SMP | ||
358 | cpu_physical_id(0) = hard_smp_processor_id(); | ||
359 | #endif | ||
360 | |||
361 | cpu_init(); /* initialize the bootstrap CPU */ | ||
362 | |||
363 | #ifdef CONFIG_ACPI_BOOT | ||
364 | acpi_boot_init(); | ||
365 | #endif | ||
366 | |||
367 | #ifdef CONFIG_VT | ||
368 | if (!conswitchp) { | ||
369 | # if defined(CONFIG_DUMMY_CONSOLE) | ||
370 | conswitchp = &dummy_con; | ||
371 | # endif | ||
372 | # if defined(CONFIG_VGA_CONSOLE) | ||
373 | /* | ||
374 | * Non-legacy systems may route legacy VGA MMIO range to system | ||
375 | * memory. vga_con probes the MMIO hole, so memory looks like | ||
376 | * a VGA device to it. The EFI memory map can tell us if it's | ||
377 | * memory so we can avoid this problem. | ||
378 | */ | ||
379 | if (efi_mem_type(0xA0000) != EFI_CONVENTIONAL_MEMORY) | ||
380 | conswitchp = &vga_con; | ||
381 | # endif | ||
382 | } | ||
383 | #endif | ||
384 | |||
385 | /* enable IA-64 Machine Check Abort Handling unless disabled */ | ||
386 | if (!strstr(saved_command_line, "nomca")) | ||
387 | ia64_mca_init(); | ||
388 | |||
389 | platform_setup(cmdline_p); | ||
390 | paging_init(); | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | * Display cpu info for all cpu's. | ||
395 | */ | ||
396 | static int | ||
397 | show_cpuinfo (struct seq_file *m, void *v) | ||
398 | { | ||
399 | #ifdef CONFIG_SMP | ||
400 | # define lpj c->loops_per_jiffy | ||
401 | # define cpunum c->cpu | ||
402 | #else | ||
403 | # define lpj loops_per_jiffy | ||
404 | # define cpunum 0 | ||
405 | #endif | ||
406 | static struct { | ||
407 | unsigned long mask; | ||
408 | const char *feature_name; | ||
409 | } feature_bits[] = { | ||
410 | { 1UL << 0, "branchlong" }, | ||
411 | { 1UL << 1, "spontaneous deferral"}, | ||
412 | { 1UL << 2, "16-byte atomic ops" } | ||
413 | }; | ||
414 | char family[32], features[128], *cp, sep; | ||
415 | struct cpuinfo_ia64 *c = v; | ||
416 | unsigned long mask; | ||
417 | int i; | ||
418 | |||
419 | mask = c->features; | ||
420 | |||
421 | switch (c->family) { | ||
422 | case 0x07: memcpy(family, "Itanium", 8); break; | ||
423 | case 0x1f: memcpy(family, "Itanium 2", 10); break; | ||
424 | default: sprintf(family, "%u", c->family); break; | ||
425 | } | ||
426 | |||
427 | /* build the feature string: */ | ||
428 | memcpy(features, " standard", 10); | ||
429 | cp = features; | ||
430 | sep = 0; | ||
431 | for (i = 0; i < (int) ARRAY_SIZE(feature_bits); ++i) { | ||
432 | if (mask & feature_bits[i].mask) { | ||
433 | if (sep) | ||
434 | *cp++ = sep; | ||
435 | sep = ','; | ||
436 | *cp++ = ' '; | ||
437 | strcpy(cp, feature_bits[i].feature_name); | ||
438 | cp += strlen(feature_bits[i].feature_name); | ||
439 | mask &= ~feature_bits[i].mask; | ||
440 | } | ||
441 | } | ||
442 | if (mask) { | ||
443 | /* print unknown features as a hex value: */ | ||
444 | if (sep) | ||
445 | *cp++ = sep; | ||
446 | sprintf(cp, " 0x%lx", mask); | ||
447 | } | ||
448 | |||
449 | seq_printf(m, | ||
450 | "processor : %d\n" | ||
451 | "vendor : %s\n" | ||
452 | "arch : IA-64\n" | ||
453 | "family : %s\n" | ||
454 | "model : %u\n" | ||
455 | "revision : %u\n" | ||
456 | "archrev : %u\n" | ||
457 | "features :%s\n" /* don't change this---it _is_ right! */ | ||
458 | "cpu number : %lu\n" | ||
459 | "cpu regs : %u\n" | ||
460 | "cpu MHz : %lu.%06lu\n" | ||
461 | "itc MHz : %lu.%06lu\n" | ||
462 | "BogoMIPS : %lu.%02lu\n\n", | ||
463 | cpunum, c->vendor, family, c->model, c->revision, c->archrev, | ||
464 | features, c->ppn, c->number, | ||
465 | c->proc_freq / 1000000, c->proc_freq % 1000000, | ||
466 | c->itc_freq / 1000000, c->itc_freq % 1000000, | ||
467 | lpj*HZ/500000, (lpj*HZ/5000) % 100); | ||
468 | return 0; | ||
469 | } | ||
470 | |||
471 | static void * | ||
472 | c_start (struct seq_file *m, loff_t *pos) | ||
473 | { | ||
474 | #ifdef CONFIG_SMP | ||
475 | while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map)) | ||
476 | ++*pos; | ||
477 | #endif | ||
478 | return *pos < NR_CPUS ? cpu_data(*pos) : NULL; | ||
479 | } | ||
480 | |||
481 | static void * | ||
482 | c_next (struct seq_file *m, void *v, loff_t *pos) | ||
483 | { | ||
484 | ++*pos; | ||
485 | return c_start(m, pos); | ||
486 | } | ||
487 | |||
488 | static void | ||
489 | c_stop (struct seq_file *m, void *v) | ||
490 | { | ||
491 | } | ||
492 | |||
493 | struct seq_operations cpuinfo_op = { | ||
494 | .start = c_start, | ||
495 | .next = c_next, | ||
496 | .stop = c_stop, | ||
497 | .show = show_cpuinfo | ||
498 | }; | ||
499 | |||
500 | void | ||
501 | identify_cpu (struct cpuinfo_ia64 *c) | ||
502 | { | ||
503 | union { | ||
504 | unsigned long bits[5]; | ||
505 | struct { | ||
506 | /* id 0 & 1: */ | ||
507 | char vendor[16]; | ||
508 | |||
509 | /* id 2 */ | ||
510 | u64 ppn; /* processor serial number */ | ||
511 | |||
512 | /* id 3: */ | ||
513 | unsigned number : 8; | ||
514 | unsigned revision : 8; | ||
515 | unsigned model : 8; | ||
516 | unsigned family : 8; | ||
517 | unsigned archrev : 8; | ||
518 | unsigned reserved : 24; | ||
519 | |||
520 | /* id 4: */ | ||
521 | u64 features; | ||
522 | } field; | ||
523 | } cpuid; | ||
524 | pal_vm_info_1_u_t vm1; | ||
525 | pal_vm_info_2_u_t vm2; | ||
526 | pal_status_t status; | ||
527 | unsigned long impl_va_msb = 50, phys_addr_size = 44; /* Itanium defaults */ | ||
528 | int i; | ||
529 | |||
530 | for (i = 0; i < 5; ++i) | ||
531 | cpuid.bits[i] = ia64_get_cpuid(i); | ||
532 | |||
533 | memcpy(c->vendor, cpuid.field.vendor, 16); | ||
534 | #ifdef CONFIG_SMP | ||
535 | c->cpu = smp_processor_id(); | ||
536 | #endif | ||
537 | c->ppn = cpuid.field.ppn; | ||
538 | c->number = cpuid.field.number; | ||
539 | c->revision = cpuid.field.revision; | ||
540 | c->model = cpuid.field.model; | ||
541 | c->family = cpuid.field.family; | ||
542 | c->archrev = cpuid.field.archrev; | ||
543 | c->features = cpuid.field.features; | ||
544 | |||
545 | status = ia64_pal_vm_summary(&vm1, &vm2); | ||
546 | if (status == PAL_STATUS_SUCCESS) { | ||
547 | impl_va_msb = vm2.pal_vm_info_2_s.impl_va_msb; | ||
548 | phys_addr_size = vm1.pal_vm_info_1_s.phys_add_size; | ||
549 | } | ||
550 | c->unimpl_va_mask = ~((7L<<61) | ((1L << (impl_va_msb + 1)) - 1)); | ||
551 | c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); | ||
552 | } | ||
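
The closing mask arithmetic can be sanity-checked against the Itanium defaults quoted above (impl_va_msb = 50, phys_addr_size = 44). A small standalone check, mirroring the kernel expressions with unsigned constants so the shifts stay well-defined:

    /* Checks the unimplemented-address-mask arithmetic; illustrative only. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long impl_va_msb = 50, phys_addr_size = 44;

            /* implemented VA: bits 0..50 plus the 3 region bits 61..63;
             * the unimplemented mask therefore selects bits 51..60 */
            unsigned long unimpl_va =
                    ~((7UL << 61) | ((1UL << (impl_va_msb + 1)) - 1));

            /* implemented PA: bits 0..43 plus the UC bit 63;
             * the unimplemented mask therefore selects bits 44..62 */
            unsigned long unimpl_pa =
                    ~((1UL << 63) | ((1UL << phys_addr_size) - 1));

            printf("unimpl_va_mask = 0x%016lx\n", unimpl_va); /* 0x1ff8000000000000 */
            printf("unimpl_pa_mask = 0x%016lx\n", unimpl_pa); /* 0x7ffff00000000000 */
            return 0;
    }
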
553 | |||
554 | void | ||
555 | setup_per_cpu_areas (void) | ||
556 | { | ||
557 | /* start_kernel() requires this... */ | ||
558 | } | ||
559 | |||
560 | static void | ||
561 | get_max_cacheline_size (void) | ||
562 | { | ||
563 | unsigned long line_size, max = 1; | ||
564 | u64 l, levels, unique_caches; | ||
565 | pal_cache_config_info_t cci; | ||
566 | s64 status; | ||
567 | |||
568 | status = ia64_pal_cache_summary(&levels, &unique_caches); | ||
569 | if (status != 0) { | ||
570 | printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n", | ||
571 | __FUNCTION__, status); | ||
572 | max = SMP_CACHE_BYTES; | ||
573 | goto out; | ||
574 | } | ||
575 | |||
576 | for (l = 0; l < levels; ++l) { | ||
577 | status = ia64_pal_cache_config_info(l, /* cache_type (data_or_unified)= */ 2, | ||
578 | &cci); | ||
579 | if (status != 0) { | ||
580 | printk(KERN_ERR | ||
581 | "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n", | ||
582 | __FUNCTION__, l, status); | ||
583 | max = SMP_CACHE_BYTES; continue; /* cci was not filled in, don't use it */ | ||
584 | } | ||
585 | line_size = 1 << cci.pcci_line_size; | ||
586 | if (line_size > max) | ||
587 | max = line_size; | ||
588 | } | ||
589 | out: | ||
590 | if (max > ia64_max_cacheline_size) | ||
591 | ia64_max_cacheline_size = max; | ||
592 | } | ||
593 | |||
594 | /* | ||
595 | * cpu_init() initializes state that is per-CPU. This function acts | ||
596 | * as a 'CPU state barrier'; nothing should get across. | ||
597 | */ | ||
598 | void | ||
599 | cpu_init (void) | ||
600 | { | ||
601 | extern void __devinit ia64_mmu_init (void *); | ||
602 | unsigned long num_phys_stacked; | ||
603 | pal_vm_info_2_u_t vmi; | ||
604 | unsigned int max_ctx; | ||
605 | struct cpuinfo_ia64 *cpu_info; | ||
606 | void *cpu_data; | ||
607 | |||
608 | cpu_data = per_cpu_init(); | ||
609 | |||
610 | /* | ||
611 | * We set ar.k3 so that assembly code in MCA handler can compute | ||
612 | * physical addresses of per cpu variables with a simple: | ||
613 | * phys = ar.k3 + &per_cpu_var | ||
614 | */ | ||
615 | ia64_set_kr(IA64_KR_PER_CPU_DATA, | ||
616 | ia64_tpa(cpu_data) - (long) __per_cpu_start); | ||
617 | |||
618 | get_max_cacheline_size(); | ||
619 | |||
620 | /* | ||
621 | * We can't pass "local_cpu_data" to identify_cpu() because we haven't called | ||
622 | * ia64_mmu_init() yet. And we can't call ia64_mmu_init() first because it | ||
623 | * depends on the data returned by identify_cpu(). We break the dependency by | ||
624 | * accessing cpu_data() through the canonical per-CPU address. | ||
625 | */ | ||
626 | cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(cpu_info) - __per_cpu_start); | ||
627 | identify_cpu(cpu_info); | ||
628 | |||
629 | #ifdef CONFIG_MCKINLEY | ||
630 | { | ||
631 | # define FEATURE_SET 16 | ||
632 | struct ia64_pal_retval iprv; | ||
633 | |||
634 | if (cpu_info->family == 0x1f) { | ||
635 | PAL_CALL_PHYS(iprv, PAL_PROC_GET_FEATURES, 0, FEATURE_SET, 0); | ||
636 | if ((iprv.status == 0) && (iprv.v0 & 0x80) && (iprv.v2 & 0x80)) | ||
637 | PAL_CALL_PHYS(iprv, PAL_PROC_SET_FEATURES, | ||
638 | (iprv.v1 | 0x80), FEATURE_SET, 0); | ||
639 | } | ||
640 | } | ||
641 | #endif | ||
642 | |||
643 | /* Clear the stack memory reserved for pt_regs: */ | ||
644 | memset(ia64_task_regs(current), 0, sizeof(struct pt_regs)); | ||
645 | |||
646 | ia64_set_kr(IA64_KR_FPU_OWNER, 0); | ||
647 | |||
648 | /* | ||
649 | * Initialize the page-table base register to a global | ||
650 | * directory with all zeroes. This ensures that we can handle | ||
651 | * TLB misses to user address space even before the first user | ||
652 | * address space has been created. This may happen, e.g., due to | ||
653 | * aggressive use of lfetch.fault. | ||
654 | */ | ||
655 | ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page))); | ||
656 | |||
657 | /* | ||
658 | * Initialize default control register to defer all speculative faults. The | ||
659 | * kernel MUST NOT depend on a particular setting of these bits (in other words, | ||
660 | * the kernel must have recovery code for all speculative accesses). Turn on | ||
661 | * dcr.lc as per recommendation by the architecture team. Most IA-32 apps | ||
662 | * shouldn't be affected by this (moral: keep your ia32 locks aligned and you'll | ||
663 | * be fine). | ||
664 | */ | ||
665 | ia64_setreg(_IA64_REG_CR_DCR, ( IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR | ||
666 | | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC)); | ||
667 | atomic_inc(&init_mm.mm_count); | ||
668 | current->active_mm = &init_mm; | ||
669 | if (current->mm) | ||
670 | BUG(); | ||
671 | |||
672 | ia64_mmu_init(ia64_imva(cpu_data)); | ||
673 | ia64_mca_cpu_init(ia64_imva(cpu_data)); | ||
674 | |||
675 | #ifdef CONFIG_IA32_SUPPORT | ||
676 | ia32_cpu_init(); | ||
677 | #endif | ||
678 | |||
679 | /* Clear ITC to eliminate sched_clock() overflows in human time. */ | ||
680 | ia64_set_itc(0); | ||
681 | |||
682 | /* disable all local interrupt sources: */ | ||
683 | ia64_set_itv(1 << 16); | ||
684 | ia64_set_lrr0(1 << 16); | ||
685 | ia64_set_lrr1(1 << 16); | ||
686 | ia64_setreg(_IA64_REG_CR_PMV, 1 << 16); | ||
687 | ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16); | ||
688 | |||
689 | /* clear TPR & XTP to enable all interrupt classes: */ | ||
690 | ia64_setreg(_IA64_REG_CR_TPR, 0); | ||
691 | #ifdef CONFIG_SMP | ||
692 | normal_xtp(); | ||
693 | #endif | ||
694 | |||
695 | /* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */ | ||
696 | if (ia64_pal_vm_summary(NULL, &vmi) == 0) | ||
697 | max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1; | ||
698 | else { | ||
699 | printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n"); | ||
700 | max_ctx = (1U << 15) - 1; /* use architected minimum */ | ||
701 | } | ||
702 | while (max_ctx < ia64_ctx.max_ctx) { | ||
703 | unsigned int old = ia64_ctx.max_ctx; | ||
704 | if (cmpxchg(&ia64_ctx.max_ctx, old, max_ctx) == old) | ||
705 | break; | ||
706 | } | ||
707 | |||
708 | if (ia64_pal_rse_info(&num_phys_stacked, NULL) != 0) { | ||
709 | printk(KERN_WARNING "cpu_init: PAL RSE info failed; assuming 96 physical " | ||
710 | "stacked regs\n"); | ||
711 | num_phys_stacked = 96; | ||
712 | } | ||
713 | /* size of physical stacked register partition plus 8 bytes: */ | ||
714 | __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8; | ||
715 | platform_cpu_init(); | ||
716 | } | ||
717 | |||
718 | void | ||
719 | check_bugs (void) | ||
720 | { | ||
721 | ia64_patch_mckinley_e9((unsigned long) __start___mckinley_e9_bundles, | ||
722 | (unsigned long) __end___mckinley_e9_bundles); | ||
723 | } | ||
diff --git a/arch/ia64/kernel/sigframe.h b/arch/ia64/kernel/sigframe.h new file mode 100644 index 000000000000..37b986cb86e0 --- /dev/null +++ b/arch/ia64/kernel/sigframe.h | |||
@@ -0,0 +1,25 @@ | |||
1 | struct sigscratch { | ||
2 | unsigned long scratch_unat; /* ar.unat for the general registers saved in pt */ | ||
3 | unsigned long ar_pfs; /* for syscalls, the user-level function-state */ | ||
4 | struct pt_regs pt; | ||
5 | }; | ||
6 | |||
7 | struct sigframe { | ||
8 | /* | ||
9 | * Place signal handler args where user-level unwinder can find them easily. | ||
10 | * DO NOT MOVE THESE. They are part of the IA-64 Linux ABI and there is | ||
11 | * user-level code that depends on their presence! | ||
12 | */ | ||
13 | unsigned long arg0; /* signum */ | ||
14 | unsigned long arg1; /* siginfo pointer */ | ||
15 | unsigned long arg2; /* sigcontext pointer */ | ||
16 | /* | ||
17 | * End of architected state. | ||
18 | */ | ||
19 | |||
20 | void __user *handler; /* pointer to the plabel of the signal handler */ | ||
21 | struct siginfo info; | ||
22 | struct sigcontext sc; | ||
23 | }; | ||
24 | |||
25 | extern long ia64_do_signal (sigset_t *, struct sigscratch *, long); | ||
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c new file mode 100644 index 000000000000..6891d86937d9 --- /dev/null +++ b/arch/ia64/kernel/signal.c | |||
@@ -0,0 +1,691 @@ | |||
1 | /* | ||
2 | * Architecture-specific signal handling support. | ||
3 | * | ||
4 | * Copyright (C) 1999-2004 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * Derived from i386 and Alpha versions. | ||
8 | */ | ||
9 | |||
10 | #include <linux/config.h> | ||
11 | #include <linux/errno.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/mm.h> | ||
14 | #include <linux/ptrace.h> | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/signal.h> | ||
17 | #include <linux/smp.h> | ||
18 | #include <linux/smp_lock.h> | ||
19 | #include <linux/stddef.h> | ||
20 | #include <linux/tty.h> | ||
21 | #include <linux/binfmts.h> | ||
22 | #include <linux/unistd.h> | ||
23 | #include <linux/wait.h> | ||
24 | |||
25 | #include <asm/ia32.h> | ||
26 | #include <asm/intrinsics.h> | ||
27 | #include <asm/uaccess.h> | ||
28 | #include <asm/rse.h> | ||
29 | #include <asm/sigcontext.h> | ||
30 | |||
31 | #include "sigframe.h" | ||
32 | |||
33 | #define DEBUG_SIG 0 | ||
34 | #define STACK_ALIGN 16 /* minimal alignment for stack pointer */ | ||
35 | #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) | ||
36 | |||
37 | #if _NSIG_WORDS > 1 | ||
38 | # define PUT_SIGSET(k,u) __copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t)) | ||
39 | # define GET_SIGSET(k,u) __copy_from_user((k)->sig, (u)->sig, sizeof(sigset_t)) | ||
40 | #else | ||
41 | # define PUT_SIGSET(k,u) __put_user((k)->sig[0], &(u)->sig[0]) | ||
42 | # define GET_SIGSET(k,u) __get_user((k)->sig[0], &(u)->sig[0]) | ||
43 | #endif | ||
44 | |||
45 | long | ||
46 | ia64_rt_sigsuspend (sigset_t __user *uset, size_t sigsetsize, struct sigscratch *scr) | ||
47 | { | ||
48 | sigset_t oldset, set; | ||
49 | |||
50 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
51 | if (sigsetsize != sizeof(sigset_t)) | ||
52 | return -EINVAL; | ||
53 | |||
54 | if (!access_ok(VERIFY_READ, uset, sigsetsize)) | ||
55 | return -EFAULT; | ||
56 | |||
57 | if (GET_SIGSET(&set, uset)) | ||
58 | return -EFAULT; | ||
59 | |||
60 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
61 | |||
62 | spin_lock_irq(¤t->sighand->siglock); | ||
63 | { | ||
64 | oldset = current->blocked; | ||
65 | current->blocked = set; | ||
66 | recalc_sigpending(); | ||
67 | } | ||
68 | spin_unlock_irq(¤t->sighand->siglock); | ||
69 | |||
70 | /* | ||
71 | * The return below usually returns to the signal handler. We need to | ||
72 | * pre-set the correct error code here to ensure that the right values | ||
73 | * get saved in sigcontext by ia64_do_signal. | ||
74 | */ | ||
75 | scr->pt.r8 = EINTR; | ||
76 | scr->pt.r10 = -1; | ||
77 | |||
78 | while (1) { | ||
79 | current->state = TASK_INTERRUPTIBLE; | ||
80 | schedule(); | ||
81 | if (ia64_do_signal(&oldset, scr, 1)) | ||
82 | return -EINTR; | ||
83 | } | ||
84 | } | ||
85 | |||
86 | asmlinkage long | ||
87 | sys_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, long arg2, | ||
88 | long arg3, long arg4, long arg5, long arg6, long arg7, | ||
89 | struct pt_regs regs) | ||
90 | { | ||
91 | return do_sigaltstack(uss, uoss, regs.r12); | ||
92 | } | ||
93 | |||
94 | static long | ||
95 | restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) | ||
96 | { | ||
97 | unsigned long ip, flags, nat, um, cfm; | ||
98 | long err; | ||
99 | |||
100 | /* Always make any pending restarted system calls return -EINTR */ | ||
101 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | ||
102 | |||
103 | /* restore scratch that always gets updated during signal delivery: */ | ||
104 | err = __get_user(flags, &sc->sc_flags); | ||
105 | err |= __get_user(nat, &sc->sc_nat); | ||
106 | err |= __get_user(ip, &sc->sc_ip); /* instruction pointer */ | ||
107 | err |= __get_user(cfm, &sc->sc_cfm); | ||
108 | err |= __get_user(um, &sc->sc_um); /* user mask */ | ||
109 | err |= __get_user(scr->pt.ar_rsc, &sc->sc_ar_rsc); | ||
110 | err |= __get_user(scr->pt.ar_unat, &sc->sc_ar_unat); | ||
111 | err |= __get_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr); | ||
112 | err |= __get_user(scr->pt.ar_pfs, &sc->sc_ar_pfs); | ||
113 | err |= __get_user(scr->pt.pr, &sc->sc_pr); /* predicates */ | ||
114 | err |= __get_user(scr->pt.b0, &sc->sc_br[0]); /* b0 (rp) */ | ||
115 | err |= __get_user(scr->pt.b6, &sc->sc_br[6]); /* b6 */ | ||
116 | err |= __copy_from_user(&scr->pt.r1, &sc->sc_gr[1], 8); /* r1 */ | ||
117 | err |= __copy_from_user(&scr->pt.r8, &sc->sc_gr[8], 4*8); /* r8-r11 */ | ||
118 | err |= __copy_from_user(&scr->pt.r12, &sc->sc_gr[12], 2*8); /* r12-r13 */ | ||
119 | err |= __copy_from_user(&scr->pt.r15, &sc->sc_gr[15], 8); /* r15 */ | ||
120 | |||
121 | scr->pt.cr_ifs = cfm | (1UL << 63); | ||
122 | |||
123 | /* establish new instruction pointer: */ | ||
124 | scr->pt.cr_iip = ip & ~0x3UL; | ||
125 | ia64_psr(&scr->pt)->ri = ip & 0x3; | ||
126 | scr->pt.cr_ipsr = (scr->pt.cr_ipsr & ~IA64_PSR_UM) | (um & IA64_PSR_UM); | ||
127 | |||
128 | scr->scratch_unat = ia64_put_scratch_nat_bits(&scr->pt, nat); | ||
129 | |||
130 | if (!(flags & IA64_SC_FLAG_IN_SYSCALL)) { | ||
131 | /* Restore most scratch-state only when not in syscall. */ | ||
132 | err |= __get_user(scr->pt.ar_ccv, &sc->sc_ar_ccv); /* ar.ccv */ | ||
133 | err |= __get_user(scr->pt.b7, &sc->sc_br[7]); /* b7 */ | ||
134 | err |= __get_user(scr->pt.r14, &sc->sc_gr[14]); /* r14 */ | ||
135 | err |= __copy_from_user(&scr->pt.ar_csd, &sc->sc_ar25, 2*8); /* ar.csd & ar.ssd */ | ||
136 | err |= __copy_from_user(&scr->pt.r2, &sc->sc_gr[2], 2*8); /* r2-r3 */ | ||
137 | err |= __copy_from_user(&scr->pt.r16, &sc->sc_gr[16], 16*8); /* r16-r31 */ | ||
138 | } | ||
139 | |||
140 | if ((flags & IA64_SC_FLAG_FPH_VALID) != 0) { | ||
141 | struct ia64_psr *psr = ia64_psr(&scr->pt); | ||
142 | |||
143 | __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16); | ||
144 | psr->mfh = 0; /* drop signal handler's fph contents... */ | ||
145 | if (psr->dfh) | ||
146 | ia64_drop_fpu(current); | ||
147 | else { | ||
148 | /* We already own the local fph, otherwise psr->dfh wouldn't be 0. */ | ||
149 | __ia64_load_fpu(current->thread.fph); | ||
150 | ia64_set_local_fpu_owner(current); | ||
151 | } | ||
152 | } | ||
153 | return err; | ||
154 | } | ||
155 | |||
156 | int | ||
157 | copy_siginfo_to_user (siginfo_t __user *to, siginfo_t *from) | ||
158 | { | ||
159 | if (!access_ok(VERIFY_WRITE, to, sizeof(siginfo_t))) | ||
160 | return -EFAULT; | ||
161 | if (from->si_code < 0) { | ||
162 | if (__copy_to_user(to, from, sizeof(siginfo_t))) | ||
163 | return -EFAULT; | ||
164 | return 0; | ||
165 | } else { | ||
166 | int err; | ||
167 | |||
168 | /* | ||
169 | * If you change siginfo_t structure, please be sure this code is fixed | ||
170 | * accordingly. It should never copy any pad contained in the structure | ||
171 | * to avoid security leaks, but must copy the generic 3 ints plus the | ||
172 | * relevant union member. | ||
173 | */ | ||
174 | err = __put_user(from->si_signo, &to->si_signo); | ||
175 | err |= __put_user(from->si_errno, &to->si_errno); | ||
176 | err |= __put_user((short)from->si_code, &to->si_code); | ||
177 | switch (from->si_code >> 16) { | ||
178 | case __SI_FAULT >> 16: | ||
179 | err |= __put_user(from->si_flags, &to->si_flags); | ||
180 | err |= __put_user(from->si_isr, &to->si_isr); /* fall through */ | ||
181 | case __SI_POLL >> 16: | ||
182 | err |= __put_user(from->si_addr, &to->si_addr); | ||
183 | err |= __put_user(from->si_imm, &to->si_imm); | ||
184 | break; | ||
185 | case __SI_TIMER >> 16: | ||
186 | err |= __put_user(from->si_tid, &to->si_tid); | ||
187 | err |= __put_user(from->si_overrun, &to->si_overrun); | ||
188 | err |= __put_user(from->si_ptr, &to->si_ptr); | ||
189 | break; | ||
190 | case __SI_RT >> 16: /* Not generated by the kernel as of now. */ | ||
191 | case __SI_MESGQ >> 16: | ||
192 | err |= __put_user(from->si_uid, &to->si_uid); | ||
193 | err |= __put_user(from->si_pid, &to->si_pid); | ||
194 | err |= __put_user(from->si_ptr, &to->si_ptr); | ||
195 | break; | ||
196 | case __SI_CHLD >> 16: | ||
197 | err |= __put_user(from->si_utime, &to->si_utime); | ||
198 | err |= __put_user(from->si_stime, &to->si_stime); | ||
199 | err |= __put_user(from->si_status, &to->si_status); /* fall through */ | ||
200 | default: | ||
201 | err |= __put_user(from->si_uid, &to->si_uid); | ||
202 | err |= __put_user(from->si_pid, &to->si_pid); | ||
203 | break; | ||
204 | } | ||
205 | return err; | ||
206 | } | ||
207 | } | ||
208 | |||
209 | long | ||
210 | ia64_rt_sigreturn (struct sigscratch *scr) | ||
211 | { | ||
212 | extern char ia64_strace_leave_kernel, ia64_leave_kernel; | ||
213 | struct sigcontext __user *sc; | ||
214 | struct siginfo si; | ||
215 | sigset_t set; | ||
216 | long retval; | ||
217 | |||
218 | sc = &((struct sigframe __user *) (scr->pt.r12 + 16))->sc; | ||
219 | |||
220 | /* | ||
221 | * When we return to the previously executing context, r8 and r10 have already | ||
222 | * been set up the way we want them. Indeed, if the signal wasn't delivered while | ||
223 | * in a system call, we must not touch r8 or r10 as otherwise user-level state | ||
224 | * could be corrupted. | ||
225 | */ | ||
226 | retval = (long) &ia64_leave_kernel; | ||
227 | if (test_thread_flag(TIF_SYSCALL_TRACE)) | ||
228 | /* | ||
229 | * strace expects to be notified after sigreturn returns even though the | ||
230 | * context to which we return may not be in the middle of a syscall. | ||
231 | * Thus, the return-value that strace displays for sigreturn is | ||
232 | * meaningless. | ||
233 | */ | ||
234 | retval = (long) &ia64_strace_leave_kernel; | ||
235 | |||
236 | if (!access_ok(VERIFY_READ, sc, sizeof(*sc))) | ||
237 | goto give_sigsegv; | ||
238 | |||
239 | if (GET_SIGSET(&set, &sc->sc_mask)) | ||
240 | goto give_sigsegv; | ||
241 | |||
242 | sigdelsetmask(&set, ~_BLOCKABLE); | ||
243 | |||
244 | spin_lock_irq(¤t->sighand->siglock); | ||
245 | { | ||
246 | current->blocked = set; | ||
247 | recalc_sigpending(); | ||
248 | } | ||
249 | spin_unlock_irq(¤t->sighand->siglock); | ||
250 | |||
251 | if (restore_sigcontext(sc, scr)) | ||
252 | goto give_sigsegv; | ||
253 | |||
254 | #if DEBUG_SIG | ||
255 | printk("SIG return (%s:%d): sp=%lx ip=%lx\n", | ||
256 | current->comm, current->pid, scr->pt.r12, scr->pt.cr_iip); | ||
257 | #endif | ||
258 | /* | ||
259 | * It is more difficult to avoid calling this function than to | ||
260 | * call it and ignore errors. | ||
261 | */ | ||
262 | do_sigaltstack(&sc->sc_stack, NULL, scr->pt.r12); | ||
263 | return retval; | ||
264 | |||
265 | give_sigsegv: | ||
266 | si.si_signo = SIGSEGV; | ||
267 | si.si_errno = 0; | ||
268 | si.si_code = SI_KERNEL; | ||
269 | si.si_pid = current->pid; | ||
270 | si.si_uid = current->uid; | ||
271 | si.si_addr = sc; | ||
272 | force_sig_info(SIGSEGV, &si, current); | ||
273 | return retval; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * This does just the minimum required setup of sigcontext. | ||
278 | * Specifically, it only installs data that is either not knowable at | ||
279 | * the user-level or that gets modified before execution in the | ||
280 | * trampoline starts. Everything else is done at the user-level. | ||
281 | */ | ||
282 | static long | ||
283 | setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratch *scr) | ||
284 | { | ||
285 | unsigned long flags = 0, ifs, cfm, nat; | ||
286 | long err; | ||
287 | |||
288 | ifs = scr->pt.cr_ifs; | ||
289 | |||
290 | if (on_sig_stack((unsigned long) sc)) | ||
291 | flags |= IA64_SC_FLAG_ONSTACK; | ||
292 | if ((ifs & (1UL << 63)) == 0) | ||
293 | /* if cr_ifs doesn't have the valid bit set, we got here through a syscall */ | ||
294 | flags |= IA64_SC_FLAG_IN_SYSCALL; | ||
295 | cfm = ifs & ((1UL << 38) - 1); | ||
296 | ia64_flush_fph(current); | ||
297 | if ((current->thread.flags & IA64_THREAD_FPH_VALID)) { | ||
298 | flags |= IA64_SC_FLAG_FPH_VALID; | ||
299 | __copy_to_user(&sc->sc_fr[32], current->thread.fph, 96*16); | ||
300 | } | ||
301 | |||
302 | nat = ia64_get_scratch_nat_bits(&scr->pt, scr->scratch_unat); | ||
303 | |||
304 | err = __put_user(flags, &sc->sc_flags); | ||
305 | err |= __put_user(nat, &sc->sc_nat); | ||
306 | err |= PUT_SIGSET(mask, &sc->sc_mask); | ||
307 | err |= __put_user(cfm, &sc->sc_cfm); | ||
308 | err |= __put_user(scr->pt.cr_ipsr & IA64_PSR_UM, &sc->sc_um); | ||
309 | err |= __put_user(scr->pt.ar_rsc, &sc->sc_ar_rsc); | ||
310 | err |= __put_user(scr->pt.ar_unat, &sc->sc_ar_unat); /* ar.unat */ | ||
311 | err |= __put_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr); /* ar.fpsr */ | ||
312 | err |= __put_user(scr->pt.ar_pfs, &sc->sc_ar_pfs); | ||
313 | err |= __put_user(scr->pt.pr, &sc->sc_pr); /* predicates */ | ||
314 | err |= __put_user(scr->pt.b0, &sc->sc_br[0]); /* b0 (rp) */ | ||
315 | err |= __put_user(scr->pt.b6, &sc->sc_br[6]); /* b6 */ | ||
316 | err |= __copy_to_user(&sc->sc_gr[1], &scr->pt.r1, 8); /* r1 */ | ||
317 | err |= __copy_to_user(&sc->sc_gr[8], &scr->pt.r8, 4*8); /* r8-r11 */ | ||
318 | err |= __copy_to_user(&sc->sc_gr[12], &scr->pt.r12, 2*8); /* r12-r13 */ | ||
319 | err |= __copy_to_user(&sc->sc_gr[15], &scr->pt.r15, 8); /* r15 */ | ||
320 | err |= __put_user(scr->pt.cr_iip + ia64_psr(&scr->pt)->ri, &sc->sc_ip); | ||
321 | |||
322 | if (flags & IA64_SC_FLAG_IN_SYSCALL) { | ||
323 | /* Clear scratch registers if the signal interrupted a system call. */ | ||
324 | err |= __put_user(0, &sc->sc_ar_ccv); /* ar.ccv */ | ||
325 | err |= __put_user(0, &sc->sc_br[7]); /* b7 */ | ||
326 | err |= __put_user(0, &sc->sc_gr[14]); /* r14 */ | ||
327 | err |= __clear_user(&sc->sc_ar25, 2*8); /* ar.csd & ar.ssd */ | ||
328 | err |= __clear_user(&sc->sc_gr[2], 2*8); /* r2-r3 */ | ||
329 | err |= __clear_user(&sc->sc_gr[16], 16*8); /* r16-r31 */ | ||
330 | } else { | ||
331 | /* Copy scratch regs to sigcontext if the signal didn't interrupt a syscall. */ | ||
332 | err |= __put_user(scr->pt.ar_ccv, &sc->sc_ar_ccv); /* ar.ccv */ | ||
333 | err |= __put_user(scr->pt.b7, &sc->sc_br[7]); /* b7 */ | ||
334 | err |= __put_user(scr->pt.r14, &sc->sc_gr[14]); /* r14 */ | ||
335 | err |= __copy_to_user(&sc->sc_ar25, &scr->pt.ar_csd, 2*8); /* ar.csd & ar.ssd */ | ||
336 | err |= __copy_to_user(&sc->sc_gr[2], &scr->pt.r2, 2*8); /* r2-r3 */ | ||
337 | err |= __copy_to_user(&sc->sc_gr[16], &scr->pt.r16, 16*8); /* r16-r31 */ | ||
338 | } | ||
339 | return err; | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * Check whether the register-backing store is already on the signal stack. | ||
344 | */ | ||
345 | static inline int | ||
346 | rbs_on_sig_stack (unsigned long bsp) | ||
347 | { | ||
348 | return (bsp - current->sas_ss_sp < current->sas_ss_size); | ||
349 | } | ||
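rbs_on_sig_stack() gets a two-sided range check out of one unsigned comparison: when bsp is below sas_ss_sp, the subtraction wraps to a huge value and the test fails, so no separate lower-bound compare is needed. A standalone sketch of the idiom (function name and values hypothetical):

#include <assert.h>

static int in_range(unsigned long addr, unsigned long base, unsigned long size)
{
	return addr - base < size;	/* one compare covers both bounds */
}

int main(void)
{
	assert(in_range(0x2010, 0x2000, 0x100));	/* inside */
	assert(!in_range(0x1fff, 0x2000, 0x100));	/* below base: wraps */
	assert(!in_range(0x2100, 0x2000, 0x100));	/* one past the end */
	return 0;
}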
350 | |||
351 | static long | ||
352 | force_sigsegv_info (int sig, void __user *addr) | ||
353 | { | ||
354 | unsigned long flags; | ||
355 | struct siginfo si; | ||
356 | |||
357 | if (sig == SIGSEGV) { | ||
358 | /* | ||
359 | * Acquiring siglock around the sa_handler-update is almost | ||
360 | * certainly overkill, but this isn't a | ||
361 | * performance-critical path and I'd rather play it safe | ||
362 | * here than having to debug a nasty race if and when | ||
363 | * something changes in kernel/signal.c that would make it | ||
364 | * no longer safe to modify sa_handler without holding the | ||
365 | * lock. | ||
366 | */ | ||
367 | spin_lock_irqsave(¤t->sighand->siglock, flags); | ||
368 | current->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; | ||
369 | spin_unlock_irqrestore(¤t->sighand->siglock, flags); | ||
370 | } | ||
371 | si.si_signo = SIGSEGV; | ||
372 | si.si_errno = 0; | ||
373 | si.si_code = SI_KERNEL; | ||
374 | si.si_pid = current->pid; | ||
375 | si.si_uid = current->uid; | ||
376 | si.si_addr = addr; | ||
377 | force_sig_info(SIGSEGV, &si, current); | ||
378 | return 0; | ||
379 | } | ||
380 | |||
381 | static long | ||
382 | setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, | ||
383 | struct sigscratch *scr) | ||
384 | { | ||
385 | extern char __kernel_sigtramp[]; | ||
386 | unsigned long tramp_addr, new_rbs = 0; | ||
387 | struct sigframe __user *frame; | ||
388 | long err; | ||
389 | |||
390 | frame = (void __user *) scr->pt.r12; | ||
391 | tramp_addr = (unsigned long) __kernel_sigtramp; | ||
392 | if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) { | ||
393 | frame = (void __user *) ((current->sas_ss_sp + current->sas_ss_size) | ||
394 | & ~(STACK_ALIGN - 1)); | ||
395 | /* | ||
396 | * We need to check for the register stack being on the signal stack | ||
397 | * separately, because it's switched separately (memory stack is switched | ||
398 | * in the kernel, register stack is switched in the signal trampoline). | ||
399 | */ | ||
400 | if (!rbs_on_sig_stack(scr->pt.ar_bspstore)) | ||
401 | new_rbs = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1); | ||
402 | } | ||
403 | frame = (void __user *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1)); | ||
404 | |||
405 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | ||
406 | return force_sigsegv_info(sig, frame); | ||
407 | |||
408 | err = __put_user(sig, &frame->arg0); | ||
409 | err |= __put_user(&frame->info, &frame->arg1); | ||
410 | err |= __put_user(&frame->sc, &frame->arg2); | ||
411 | err |= __put_user(new_rbs, &frame->sc.sc_rbs_base); | ||
412 | err |= __put_user(0, &frame->sc.sc_loadrs); /* initialize to zero */ | ||
413 | err |= __put_user(ka->sa.sa_handler, &frame->handler); | ||
414 | |||
415 | err |= copy_siginfo_to_user(&frame->info, info); | ||
416 | |||
417 | err |= __put_user(current->sas_ss_sp, &frame->sc.sc_stack.ss_sp); | ||
418 | err |= __put_user(current->sas_ss_size, &frame->sc.sc_stack.ss_size); | ||
419 | err |= __put_user(sas_ss_flags(scr->pt.r12), &frame->sc.sc_stack.ss_flags); | ||
420 | err |= setup_sigcontext(&frame->sc, set, scr); | ||
421 | |||
422 | if (unlikely(err)) | ||
423 | return force_sigsegv_info(sig, frame); | ||
424 | |||
425 | scr->pt.r12 = (unsigned long) frame - 16; /* new stack pointer */ | ||
426 | scr->pt.ar_fpsr = FPSR_DEFAULT; /* reset fpsr for signal handler */ | ||
427 | scr->pt.cr_iip = tramp_addr; | ||
428 | ia64_psr(&scr->pt)->ri = 0; /* start executing in first slot */ | ||
429 | ia64_psr(&scr->pt)->be = 0; /* force little-endian byte-order */ | ||
430 | /* | ||
431 | * Force the interruption function mask to zero. This has no effect when a | ||
432 | * system-call got interrupted by a signal (since, in that case, scr->pt_cr_ifs is | ||
433 | * ignored), but it has the desirable effect of making it possible to deliver a | ||
434 | * signal with an incomplete register frame (which happens when a mandatory RSE | ||
435 | load faults). Furthermore, it has no negative effect on getting the user's | ||
436 | * dirty partition preserved, because that's governed by scr->pt.loadrs. | ||
437 | */ | ||
438 | scr->pt.cr_ifs = (1UL << 63); | ||
439 | |||
440 | /* | ||
441 | * Note: this affects only the NaT bits of the scratch regs (the ones saved in | ||
442 | * pt_regs), which is exactly what we want. | ||
443 | */ | ||
444 | scr->scratch_unat = 0; /* ensure the NaT bit of r12 is clear */ | ||
445 | |||
446 | #if DEBUG_SIG | ||
447 | printk("SIG deliver (%s:%d): sig=%d sp=%lx ip=%lx handler=%p\n", | ||
448 | current->comm, current->pid, sig, scr->pt.r12, frame->sc.sc_ip, frame->handler); | ||
449 | #endif | ||
450 | return 1; | ||
451 | } | ||
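setup_frame() leans on the usual power-of-two alignment tricks: x & ~(A - 1) rounds down to a multiple of A (for the signal-stack top), and (x + A - 1) & ~(A - 1) rounds up (for the frame size). A quick standalone check of both (editorial sketch):

#include <assert.h>

#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	assert(ALIGN_DOWN(0x123fUL, 16) == 0x1230);
	assert(ALIGN_UP(0x1231UL, 16) == 0x1240);
	assert(ALIGN_UP(0x1240UL, 16) == 0x1240);	/* already aligned */
	return 0;
}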
452 | |||
453 | static long | ||
454 | handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, | ||
455 | struct sigscratch *scr) | ||
456 | { | ||
457 | if (IS_IA32_PROCESS(&scr->pt)) { | ||
458 | /* send signal to IA-32 process */ | ||
459 | if (!ia32_setup_frame1(sig, ka, info, oldset, &scr->pt)) | ||
460 | return 0; | ||
461 | } else | ||
462 | /* send signal to IA-64 process */ | ||
463 | if (!setup_frame(sig, ka, info, oldset, scr)) | ||
464 | return 0; | ||
465 | |||
466 | if (!(ka->sa.sa_flags & SA_NODEFER)) { | ||
467 | spin_lock_irq(¤t->sighand->siglock); | ||
468 | { | ||
469 | sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); | ||
470 | sigaddset(¤t->blocked, sig); | ||
471 | recalc_sigpending(); | ||
472 | } | ||
473 | spin_unlock_irq(¤t->sighand->siglock); | ||
474 | } | ||
475 | return 1; | ||
476 | } | ||
477 | |||
478 | /* | ||
479 | * Note that `init' is a special process: it doesn't get signals it doesn't want to | ||
480 | * handle. Thus you cannot kill init even with a SIGKILL even by mistake. | ||
481 | */ | ||
482 | long | ||
483 | ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) | ||
484 | { | ||
485 | struct k_sigaction ka; | ||
486 | siginfo_t info; | ||
487 | long restart = in_syscall; | ||
488 | long errno = scr->pt.r8; | ||
489 | # define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) | ||
490 | |||
491 | /* | ||
492 | * In the ia64_leave_kernel code path, we want the common case to go fast, which | ||
493 | * is why we may in certain cases get here from kernel mode. Just return without | ||
494 | * doing anything if so. | ||
495 | */ | ||
496 | if (!user_mode(&scr->pt)) | ||
497 | return 0; | ||
498 | |||
499 | if (!oldset) | ||
500 | oldset = ¤t->blocked; | ||
501 | |||
502 | /* | ||
503 | * This only loops in the rare cases of handle_signal() failing, in which case we | ||
504 | * need to push through a forced SIGSEGV. | ||
505 | */ | ||
506 | while (1) { | ||
507 | int signr = get_signal_to_deliver(&info, &ka, &scr->pt, NULL); | ||
508 | |||
509 | /* | ||
510 | * get_signal_to_deliver() may have run a debugger (via notify_parent()) | ||
511 | * and the debugger may have modified the state (e.g., to arrange for an | ||
512 | * inferior call), thus it's important to check for restarting _after_ | ||
513 | * get_signal_to_deliver(). | ||
514 | */ | ||
515 | if (IS_IA32_PROCESS(&scr->pt)) { | ||
516 | if (in_syscall) { | ||
517 | if (errno >= 0) | ||
518 | restart = 0; | ||
519 | else | ||
520 | errno = -errno; | ||
521 | } | ||
522 | } else if ((long) scr->pt.r10 != -1) | ||
523 | /* | ||
524 | * A system call has to be restarted only if one of the error codes | ||
525 | * ERESTARTNOHAND, ERESTARTSYS, or ERESTARTNOINTR is returned. If r10 | ||
526 | * isn't -1 then r8 doesn't hold an error code and we don't need to | ||
527 | * restart the syscall, so we can clear the "restart" flag here. | ||
528 | */ | ||
529 | restart = 0; | ||
530 | |||
531 | if (signr <= 0) | ||
532 | break; | ||
533 | |||
534 | if (unlikely(restart)) { | ||
535 | switch (errno) { | ||
536 | case ERESTART_RESTARTBLOCK: | ||
537 | case ERESTARTNOHAND: | ||
538 | scr->pt.r8 = ERR_CODE(EINTR); | ||
539 | /* note: scr->pt.r10 is already -1 */ | ||
540 | break; | ||
541 | |||
542 | case ERESTARTSYS: | ||
543 | if ((ka.sa.sa_flags & SA_RESTART) == 0) { | ||
544 | scr->pt.r8 = ERR_CODE(EINTR); | ||
545 | /* note: scr->pt.r10 is already -1 */ | ||
546 | break; | ||
547 | } | ||
548 | case ERESTARTNOINTR: | ||
549 | if (IS_IA32_PROCESS(&scr->pt)) { | ||
550 | scr->pt.r8 = scr->pt.r1; | ||
551 | scr->pt.cr_iip -= 2; | ||
552 | } else | ||
553 | ia64_decrement_ip(&scr->pt); | ||
554 | restart = 0; /* don't restart twice if handle_signal() fails... */ | ||
555 | } | ||
556 | } | ||
557 | |||
558 | /* | ||
559 | * Whee! Actually deliver the signal. If the delivery failed, we need to | ||
560 | * continue to iterate in this loop so we can deliver the SIGSEGV... | ||
561 | */ | ||
562 | if (handle_signal(signr, &ka, &info, oldset, scr)) | ||
563 | return 1; | ||
564 | } | ||
565 | |||
566 | /* Did we come from a system call? */ | ||
567 | if (restart) { | ||
568 | /* Restart the system call - no handlers present */ | ||
569 | if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR | ||
570 | || errno == ERESTART_RESTARTBLOCK) | ||
571 | { | ||
572 | if (IS_IA32_PROCESS(&scr->pt)) { | ||
573 | scr->pt.r8 = scr->pt.r1; | ||
574 | scr->pt.cr_iip -= 2; | ||
575 | if (errno == ERESTART_RESTARTBLOCK) | ||
576 | scr->pt.r8 = 0; /* x86 version of __NR_restart_syscall */ | ||
577 | } else { | ||
578 | /* | ||
579 | * Note: the syscall number is in r15 which is saved in | ||
580 | * pt_regs so all we need to do here is adjust ip so that | ||
581 | * the "break" instruction gets re-executed. | ||
582 | */ | ||
583 | ia64_decrement_ip(&scr->pt); | ||
584 | if (errno == ERESTART_RESTARTBLOCK) | ||
585 | scr->pt.r15 = __NR_restart_syscall; | ||
586 | } | ||
587 | } | ||
588 | } | ||
589 | return 0; | ||
590 | } | ||
591 | |||
592 | /* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it | ||
593 | * could not be delivered. It is important that the target process is not | ||
594 | * allowed to do any more work in user space. Possible cases for the target | ||
595 | * process: | ||
596 | * | ||
597 | * - It is sleeping and will wake up soon. Store the data in the current task, | ||
598 | * the signal will be sent when the current task returns from the next | ||
599 | * interrupt. | ||
600 | * | ||
601 | * - It is running in user context. Store the data in the current task, the | ||
602 | * signal will be sent when the current task returns from the next interrupt. | ||
603 | * | ||
604 | * - It is running in kernel context on this or another cpu and will return to | ||
605 | * user context. Store the data in the target task, the signal will be sent | ||
606 | * to itself when the target task returns to user space. | ||
607 | * | ||
608 | * - It is running in kernel context on this cpu and will sleep before | ||
609 | * returning to user context. Because this is also the current task, the | ||
610 | * signal will not get delivered and the task could sleep indefinitely. | ||
611 | * Store the data in the idle task for this cpu, the signal will be sent | ||
612 | * after the idle task processes its next interrupt. | ||
613 | * | ||
614 | * To cover all cases, store the data in the target task, the current task and | ||
615 | * the idle task on this cpu. Whatever happens, the signal will be delivered | ||
616 | * to the target task before it can do any useful user space work. Multiple | ||
617 | * deliveries have no unwanted side effects. | ||
618 | * | ||
619 | * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts | ||
620 | * disabled. It must not take any locks nor use kernel structures or services | ||
621 | * that require locks. | ||
622 | */ | ||
623 | |||
624 | /* To ensure that we get the right pid, check its start time. To avoid extra | ||
625 | * include files in thread_info.h, convert the task start_time to unsigned long, | ||
626 | * giving us a cycle time of > 580 years. | ||
627 | */ | ||
628 | static inline unsigned long | ||
629 | start_time_ul(const struct task_struct *t) | ||
630 | { | ||
631 | return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec; | ||
632 | } | ||
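A back-of-the-envelope check of the "> 580 years" claim above: a 64-bit nanosecond counter wraps after 2^64 ns, which the snippet below (editorial, not kernel code) evaluates to roughly 584 years.

#include <stdio.h>

int main(void)
{
	double wrap_ns = 18446744073709551616.0;	/* 2^64 */
	double ns_per_year = 1e9 * 60 * 60 * 24 * 365.25;

	printf("%.1f years\n", wrap_ns / ns_per_year);	/* ~584.5 */
	return 0;
}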
633 | |||
634 | void | ||
635 | set_sigdelayed(pid_t pid, int signo, int code, void __user *addr) | ||
636 | { | ||
637 | struct task_struct *t; | ||
638 | unsigned long start_time = 0; | ||
639 | int i; | ||
640 | |||
641 | for (i = 1; i <= 3; ++i) { | ||
642 | switch (i) { | ||
643 | case 1: | ||
644 | t = find_task_by_pid(pid); | ||
645 | if (t) | ||
646 | start_time = start_time_ul(t); | ||
647 | break; | ||
648 | case 2: | ||
649 | t = current; | ||
650 | break; | ||
651 | default: | ||
652 | t = idle_task(smp_processor_id()); | ||
653 | break; | ||
654 | } | ||
655 | |||
656 | if (!t) | ||
657 | return; | ||
658 | t->thread_info->sigdelayed.signo = signo; | ||
659 | t->thread_info->sigdelayed.code = code; | ||
660 | t->thread_info->sigdelayed.addr = addr; | ||
661 | t->thread_info->sigdelayed.start_time = start_time; | ||
662 | t->thread_info->sigdelayed.pid = pid; | ||
663 | wmb(); | ||
664 | set_tsk_thread_flag(t, TIF_SIGDELAYED); | ||
665 | } | ||
666 | } | ||
667 | |||
668 | /* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that | ||
669 | * was detected in MCA/INIT/NMI/PMI context where it could not be delivered. | ||
670 | */ | ||
671 | |||
672 | void | ||
673 | do_sigdelayed(void) | ||
674 | { | ||
675 | struct siginfo siginfo; | ||
676 | pid_t pid; | ||
677 | struct task_struct *t; | ||
678 | |||
679 | clear_thread_flag(TIF_SIGDELAYED); | ||
680 | memset(&siginfo, 0, sizeof(siginfo)); | ||
681 | siginfo.si_signo = current_thread_info()->sigdelayed.signo; | ||
682 | siginfo.si_code = current_thread_info()->sigdelayed.code; | ||
683 | siginfo.si_addr = current_thread_info()->sigdelayed.addr; | ||
684 | pid = current_thread_info()->sigdelayed.pid; | ||
685 | t = find_task_by_pid(pid); | ||
686 | if (!t) | ||
687 | return; | ||
688 | if (current_thread_info()->sigdelayed.start_time != start_time_ul(t)) | ||
689 | return; | ||
690 | force_sig_info(siginfo.si_signo, &siginfo, t); | ||
691 | } | ||
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c new file mode 100644 index 000000000000..953095e2ce15 --- /dev/null +++ b/arch/ia64/kernel/smp.c | |||
@@ -0,0 +1,376 @@ | |||
1 | /* | ||
2 | * SMP Support | ||
3 | * | ||
4 | * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> | ||
5 | * Copyright (C) 1999, 2001, 2003 David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * Lots of stuff stolen from arch/alpha/kernel/smp.c | ||
8 | * | ||
9 | * 01/05/16 Rohit Seth <rohit.seth@intel.com> IA64-SMP functions. Reorganized | ||
10 | * the existing code (on the lines of x86 port). | ||
11 | * 00/09/11 David Mosberger <davidm@hpl.hp.com> Do loops_per_jiffy | ||
12 | * calibration on each CPU. | ||
13 | * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> fixed logical processor id | ||
14 | * 00/03/31 Rohit Seth <rohit.seth@intel.com> Fixes for Bootstrap Processor | ||
15 | * & cpu_online_map now gets done here (instead of setup.c) | ||
16 | * 99/10/05 davidm Update to bring it in sync with new command-line processing | ||
17 | * scheme. | ||
18 | * 10/13/00 Goutham Rao <goutham.rao@intel.com> Updated smp_call_function and | ||
19 | * smp_call_function_single to resend IPI on timeouts | ||
20 | */ | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/init.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/smp.h> | ||
27 | #include <linux/kernel_stat.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/cache.h> | ||
30 | #include <linux/delay.h> | ||
31 | #include <linux/efi.h> | ||
32 | #include <linux/bitops.h> | ||
33 | |||
34 | #include <asm/atomic.h> | ||
35 | #include <asm/current.h> | ||
36 | #include <asm/delay.h> | ||
37 | #include <asm/machvec.h> | ||
38 | #include <asm/io.h> | ||
39 | #include <asm/irq.h> | ||
40 | #include <asm/page.h> | ||
41 | #include <asm/pgalloc.h> | ||
42 | #include <asm/pgtable.h> | ||
43 | #include <asm/processor.h> | ||
44 | #include <asm/ptrace.h> | ||
45 | #include <asm/sal.h> | ||
46 | #include <asm/system.h> | ||
47 | #include <asm/tlbflush.h> | ||
48 | #include <asm/unistd.h> | ||
49 | #include <asm/mca.h> | ||
50 | |||
51 | /* | ||
52 | * Structure and data for smp_call_function(). This is designed to minimise static memory | ||
53 | * requirements. It also looks cleaner. | ||
54 | */ | ||
55 | static __cacheline_aligned DEFINE_SPINLOCK(call_lock); | ||
56 | |||
57 | struct call_data_struct { | ||
58 | void (*func) (void *info); | ||
59 | void *info; | ||
60 | long wait; | ||
61 | atomic_t started; | ||
62 | atomic_t finished; | ||
63 | }; | ||
64 | |||
65 | static volatile struct call_data_struct *call_data; | ||
66 | |||
67 | #define IPI_CALL_FUNC 0 | ||
68 | #define IPI_CPU_STOP 1 | ||
69 | |||
70 | /* This needs to be cacheline aligned because it is written to by *other* CPUs. */ | ||
71 | static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned; | ||
72 | |||
73 | extern void cpu_halt (void); | ||
74 | |||
75 | void | ||
76 | lock_ipi_calllock(void) | ||
77 | { | ||
78 | spin_lock_irq(&call_lock); | ||
79 | } | ||
80 | |||
81 | void | ||
82 | unlock_ipi_calllock(void) | ||
83 | { | ||
84 | spin_unlock_irq(&call_lock); | ||
85 | } | ||
86 | |||
87 | static void | ||
88 | stop_this_cpu (void) | ||
89 | { | ||
90 | /* | ||
91 | * Remove this CPU: | ||
92 | */ | ||
93 | cpu_clear(smp_processor_id(), cpu_online_map); | ||
94 | max_xtp(); | ||
95 | local_irq_disable(); | ||
96 | cpu_halt(); | ||
97 | } | ||
98 | |||
99 | void | ||
100 | cpu_die(void) | ||
101 | { | ||
102 | max_xtp(); | ||
103 | local_irq_disable(); | ||
104 | cpu_halt(); | ||
105 | /* Should never be here */ | ||
106 | BUG(); | ||
107 | for (;;); | ||
108 | } | ||
109 | |||
110 | irqreturn_t | ||
111 | handle_IPI (int irq, void *dev_id, struct pt_regs *regs) | ||
112 | { | ||
113 | int this_cpu = get_cpu(); | ||
114 | unsigned long *pending_ipis = &__ia64_per_cpu_var(ipi_operation); | ||
115 | unsigned long ops; | ||
116 | |||
117 | mb(); /* Order interrupt and bit testing. */ | ||
118 | while ((ops = xchg(pending_ipis, 0)) != 0) { | ||
119 | mb(); /* Order bit clearing and data access. */ | ||
120 | do { | ||
121 | unsigned long which; | ||
122 | |||
123 | which = ffz(~ops); | ||
124 | ops &= ~(1 << which); | ||
125 | |||
126 | switch (which) { | ||
127 | case IPI_CALL_FUNC: | ||
128 | { | ||
129 | struct call_data_struct *data; | ||
130 | void (*func)(void *info); | ||
131 | void *info; | ||
132 | int wait; | ||
133 | |||
134 | /* release the 'pointer lock' */ | ||
135 | data = (struct call_data_struct *) call_data; | ||
136 | func = data->func; | ||
137 | info = data->info; | ||
138 | wait = data->wait; | ||
139 | |||
140 | mb(); | ||
141 | atomic_inc(&data->started); | ||
142 | /* | ||
143 | * At this point the structure may be gone unless | ||
144 | * wait is true. | ||
145 | */ | ||
146 | (*func)(info); | ||
147 | |||
148 | /* Notify the sending CPU that the task is done. */ | ||
149 | mb(); | ||
150 | if (wait) | ||
151 | atomic_inc(&data->finished); | ||
152 | } | ||
153 | break; | ||
154 | |||
155 | case IPI_CPU_STOP: | ||
156 | stop_this_cpu(); | ||
157 | break; | ||
158 | |||
159 | default: | ||
160 | printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which); | ||
161 | break; | ||
162 | } | ||
163 | } while (ops); | ||
164 | mb(); /* Order data access and bit testing. */ | ||
165 | } | ||
166 | put_cpu(); | ||
167 | return IRQ_HANDLED; | ||
168 | } | ||
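The xchg()/ffz() loop in handle_IPI() atomically snapshots every pending operation bit at once, then dispatches them lowest bit first; ffz(~ops) is simply "index of the first set bit of ops". A user-space sketch of the drain pattern, with the bit search written out since plain C has no ffz() (editorial illustration):

#include <assert.h>

static unsigned long first_set(unsigned long ops)
{
	unsigned long which = 0;

	while (!(ops & (1UL << which)))
		++which;	/* caller guarantees ops != 0 */
	return which;
}

int main(void)
{
	unsigned long ops = (1UL << 0) | (1UL << 3);	/* two pending ops */
	unsigned long handled = 0;

	do {
		unsigned long which = first_set(ops);

		ops &= ~(1UL << which);
		handled |= 1UL << which;	/* "dispatch" op `which` */
	} while (ops);
	assert(handled == ((1UL << 0) | (1UL << 3)));
	return 0;
}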
169 | |||
170 | /* | ||
171 | * Called with preemption disabled. | ||
172 | */ | ||
173 | static inline void | ||
174 | send_IPI_single (int dest_cpu, int op) | ||
175 | { | ||
176 | set_bit(op, &per_cpu(ipi_operation, dest_cpu)); | ||
177 | platform_send_ipi(dest_cpu, IA64_IPI_VECTOR, IA64_IPI_DM_INT, 0); | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * Called with preemption disabled. | ||
182 | */ | ||
183 | static inline void | ||
184 | send_IPI_allbutself (int op) | ||
185 | { | ||
186 | unsigned int i; | ||
187 | |||
188 | for (i = 0; i < NR_CPUS; i++) { | ||
189 | if (cpu_online(i) && i != smp_processor_id()) | ||
190 | send_IPI_single(i, op); | ||
191 | } | ||
192 | } | ||
193 | |||
194 | /* | ||
195 | * Called with preemption disabled. | ||
196 | */ | ||
197 | static inline void | ||
198 | send_IPI_all (int op) | ||
199 | { | ||
200 | int i; | ||
201 | |||
202 | for (i = 0; i < NR_CPUS; i++) | ||
203 | if (cpu_online(i)) | ||
204 | send_IPI_single(i, op); | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * Called with preemption disabled. | ||
209 | */ | ||
210 | static inline void | ||
211 | send_IPI_self (int op) | ||
212 | { | ||
213 | send_IPI_single(smp_processor_id(), op); | ||
214 | } | ||
215 | |||
216 | /* | ||
217 | * Called with preemption disabled. | ||
218 | */ | ||
219 | void | ||
220 | smp_send_reschedule (int cpu) | ||
221 | { | ||
222 | platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); | ||
223 | } | ||
224 | |||
225 | void | ||
226 | smp_flush_tlb_all (void) | ||
227 | { | ||
228 | on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1); | ||
229 | } | ||
230 | |||
231 | void | ||
232 | smp_flush_tlb_mm (struct mm_struct *mm) | ||
233 | { | ||
234 | /* this happens for the common case of a single-threaded fork(): */ | ||
235 | if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1)) | ||
236 | { | ||
237 | local_finish_flush_tlb_mm(mm); | ||
238 | return; | ||
239 | } | ||
240 | |||
241 | /* | ||
242 | * We could optimize this further by using mm->cpu_vm_mask to track which CPUs | ||
243 | * have been running in the address space. It's not clear that this is worth the | ||
244 | * trouble though: to avoid races, we have to raise the IPI on the target CPU | ||
245 | * anyhow, and once a CPU is interrupted, the cost of local_flush_tlb_all() is | ||
246 | * rather trivial. | ||
247 | */ | ||
248 | on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1, 1); | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Run a function on another CPU | ||
253 | * <func> The function to run. This must be fast and non-blocking. | ||
254 | * <info> An arbitrary pointer to pass to the function. | ||
255 | * <nonatomic> Currently unused. | ||
256 | * <wait> If true, wait until function has completed on other CPUs. | ||
257 | * [RETURNS] 0 on success, else a negative status code. | ||
258 | * | ||
259 | * Does not return until the remote CPU is nearly ready to execute <func> | ||
260 | * or is already executing it or has finished. | ||
261 | */ | ||
262 | |||
263 | int | ||
264 | smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int nonatomic, | ||
265 | int wait) | ||
266 | { | ||
267 | struct call_data_struct data; | ||
268 | int cpus = 1; | ||
269 | int me = get_cpu(); /* prevent preemption and reschedule on another processor */ | ||
270 | |||
271 | if (cpuid == me) { | ||
272 | printk("%s: trying to call self\n", __FUNCTION__); | ||
273 | put_cpu(); | ||
274 | return -EBUSY; | ||
275 | } | ||
276 | |||
277 | data.func = func; | ||
278 | data.info = info; | ||
279 | atomic_set(&data.started, 0); | ||
280 | data.wait = wait; | ||
281 | if (wait) | ||
282 | atomic_set(&data.finished, 0); | ||
283 | |||
284 | spin_lock_bh(&call_lock); | ||
285 | |||
286 | call_data = &data; | ||
287 | mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */ | ||
288 | send_IPI_single(cpuid, IPI_CALL_FUNC); | ||
289 | |||
290 | /* Wait for response */ | ||
291 | while (atomic_read(&data.started) != cpus) | ||
292 | cpu_relax(); | ||
293 | |||
294 | if (wait) | ||
295 | while (atomic_read(&data.finished) != cpus) | ||
296 | cpu_relax(); | ||
297 | call_data = NULL; | ||
298 | |||
299 | spin_unlock_bh(&call_lock); | ||
300 | put_cpu(); | ||
301 | return 0; | ||
302 | } | ||
303 | EXPORT_SYMBOL(smp_call_function_single); | ||
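A hypothetical caller of smp_call_function_single() as declared above; the callback and its purpose are invented for illustration. The callback runs on the target CPU in interrupt context, so it must be fast, touch only state that is safe in that context, and never sleep:

static void drain_local_counters(void *info)
{
	/* executes on the target CPU as an IPI handler */
}

static int drain_cpu(int cpu)
{
	/* nonatomic is unused; wait=1 blocks until the callback is done */
	return smp_call_function_single(cpu, drain_local_counters, NULL, 0, 1);
}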
304 | |||
305 | /* | ||
306 | * this function sends a 'generic call function' IPI to all other CPUs | ||
307 | * in the system. | ||
308 | */ | ||
309 | |||
310 | /* | ||
311 | * [SUMMARY] Run a function on all other CPUs. | ||
312 | * <func> The function to run. This must be fast and non-blocking. | ||
313 | * <info> An arbitrary pointer to pass to the function. | ||
314 | * <nonatomic> currently unused. | ||
315 | * <wait> If true, wait (atomically) until function has completed on other CPUs. | ||
316 | * [RETURNS] 0 on success, else a negative status code. | ||
317 | * | ||
318 | * Does not return until remote CPUs are nearly ready to execute <func>, are | ||
319 | * already executing it, or have finished. | ||
320 | * | ||
321 | * You must not call this function with disabled interrupts or from a | ||
322 | * hardware interrupt handler or from a bottom half handler. | ||
323 | */ | ||
324 | int | ||
325 | smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait) | ||
326 | { | ||
327 | struct call_data_struct data; | ||
328 | int cpus = num_online_cpus()-1; | ||
329 | |||
330 | if (!cpus) | ||
331 | return 0; | ||
332 | |||
333 | /* Can deadlock when called with interrupts disabled */ | ||
334 | WARN_ON(irqs_disabled()); | ||
335 | |||
336 | data.func = func; | ||
337 | data.info = info; | ||
338 | atomic_set(&data.started, 0); | ||
339 | data.wait = wait; | ||
340 | if (wait) | ||
341 | atomic_set(&data.finished, 0); | ||
342 | |||
343 | spin_lock(&call_lock); | ||
344 | |||
345 | call_data = &data; | ||
346 | mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */ | ||
347 | send_IPI_allbutself(IPI_CALL_FUNC); | ||
348 | |||
349 | /* Wait for response */ | ||
350 | while (atomic_read(&data.started) != cpus) | ||
351 | cpu_relax(); | ||
352 | |||
353 | if (wait) | ||
354 | while (atomic_read(&data.finished) != cpus) | ||
355 | cpu_relax(); | ||
356 | call_data = NULL; | ||
357 | |||
358 | spin_unlock(&call_lock); | ||
359 | return 0; | ||
360 | } | ||
361 | EXPORT_SYMBOL(smp_call_function); | ||
362 | |||
363 | /* | ||
364 | * this function calls the 'stop' function on all other CPUs in the system. | ||
365 | */ | ||
366 | void | ||
367 | smp_send_stop (void) | ||
368 | { | ||
369 | send_IPI_allbutself(IPI_CPU_STOP); | ||
370 | } | ||
371 | |||
372 | int __init | ||
373 | setup_profiling_timer (unsigned int multiplier) | ||
374 | { | ||
375 | return -EINVAL; | ||
376 | } | ||
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c new file mode 100644 index 000000000000..5318f0cbfc26 --- /dev/null +++ b/arch/ia64/kernel/smpboot.c | |||
@@ -0,0 +1,692 @@ | |||
1 | /* | ||
2 | * SMP boot-related support | ||
3 | * | ||
4 | * Copyright (C) 1998-2003 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * 01/05/16 Rohit Seth <rohit.seth@intel.com> Moved SMP booting functions from smp.c to here. | ||
8 | * 01/04/27 David Mosberger <davidm@hpl.hp.com> Added ITC synching code. | ||
9 | * 02/07/31 David Mosberger <davidm@hpl.hp.com> Switch over to hotplug-CPU boot-sequence. | ||
10 | * smp_boot_cpus()/smp_commence() is replaced by | ||
11 | * smp_prepare_cpus()/__cpu_up()/smp_cpus_done(). | ||
12 | */ | ||
13 | #include <linux/config.h> | ||
14 | |||
15 | #include <linux/module.h> | ||
16 | #include <linux/acpi.h> | ||
17 | #include <linux/bootmem.h> | ||
18 | #include <linux/cpu.h> | ||
19 | #include <linux/delay.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/interrupt.h> | ||
22 | #include <linux/irq.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/kernel_stat.h> | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/notifier.h> | ||
27 | #include <linux/smp.h> | ||
28 | #include <linux/smp_lock.h> | ||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/efi.h> | ||
31 | #include <linux/percpu.h> | ||
32 | #include <linux/bitops.h> | ||
33 | |||
34 | #include <asm/atomic.h> | ||
35 | #include <asm/cache.h> | ||
36 | #include <asm/current.h> | ||
37 | #include <asm/delay.h> | ||
38 | #include <asm/ia32.h> | ||
39 | #include <asm/io.h> | ||
40 | #include <asm/irq.h> | ||
41 | #include <asm/machvec.h> | ||
42 | #include <asm/mca.h> | ||
43 | #include <asm/page.h> | ||
44 | #include <asm/pgalloc.h> | ||
45 | #include <asm/pgtable.h> | ||
46 | #include <asm/processor.h> | ||
47 | #include <asm/ptrace.h> | ||
48 | #include <asm/sal.h> | ||
49 | #include <asm/system.h> | ||
50 | #include <asm/tlbflush.h> | ||
51 | #include <asm/unistd.h> | ||
52 | |||
53 | #define SMP_DEBUG 0 | ||
54 | |||
55 | #if SMP_DEBUG | ||
56 | #define Dprintk(x...) printk(x) | ||
57 | #else | ||
58 | #define Dprintk(x...) | ||
59 | #endif | ||
60 | |||
61 | |||
62 | /* | ||
63 | * ITC synchronization related stuff: | ||
64 | */ | ||
65 | #define MASTER 0 | ||
66 | #define SLAVE (SMP_CACHE_BYTES/8) | ||
67 | |||
68 | #define NUM_ROUNDS 64 /* magic value */ | ||
69 | #define NUM_ITERS 5 /* likewise */ | ||
70 | |||
71 | static DEFINE_SPINLOCK(itc_sync_lock); | ||
72 | static volatile unsigned long go[SLAVE + 1]; | ||
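MASTER and SLAVE above index the go[] array SMP_CACHE_BYTES apart, so the two flags sit on different cache lines and the master's and slave's spin loops do not false-share. The same layout spelled out as a struct (editorial sketch; 128 is an assumed line size):

struct itc_sync_flags {
	volatile unsigned long master;
	char pad[128 - sizeof(unsigned long)];	/* push slave to the next line */
	volatile unsigned long slave;
};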
73 | |||
74 | #define DEBUG_ITC_SYNC 0 | ||
75 | |||
76 | extern void __devinit calibrate_delay (void); | ||
77 | extern void start_ap (void); | ||
78 | extern unsigned long ia64_iobase; | ||
79 | |||
80 | task_t *task_for_booting_cpu; | ||
81 | |||
82 | /* | ||
83 | * State for each CPU | ||
84 | */ | ||
85 | DEFINE_PER_CPU(int, cpu_state); | ||
86 | |||
87 | /* Bitmasks of currently online, and possible CPUs */ | ||
88 | cpumask_t cpu_online_map; | ||
89 | EXPORT_SYMBOL(cpu_online_map); | ||
90 | cpumask_t cpu_possible_map; | ||
91 | EXPORT_SYMBOL(cpu_possible_map); | ||
92 | |||
93 | /* which logical CPU number maps to which CPU (physical APIC ID) */ | ||
94 | volatile int ia64_cpu_to_sapicid[NR_CPUS]; | ||
95 | EXPORT_SYMBOL(ia64_cpu_to_sapicid); | ||
96 | |||
97 | static volatile cpumask_t cpu_callin_map; | ||
98 | |||
99 | struct smp_boot_data smp_boot_data __initdata; | ||
100 | |||
101 | unsigned long ap_wakeup_vector = -1; /* External interrupt used to wake up APs */ | ||
102 | |||
103 | char __initdata no_int_routing; | ||
104 | |||
105 | unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */ | ||
106 | |||
107 | static int __init | ||
108 | nointroute (char *str) | ||
109 | { | ||
110 | no_int_routing = 1; | ||
111 | printk ("no_int_routing on\n"); | ||
112 | return 1; | ||
113 | } | ||
114 | |||
115 | __setup("nointroute", nointroute); | ||
116 | |||
117 | void | ||
118 | sync_master (void *arg) | ||
119 | { | ||
120 | unsigned long flags, i; | ||
121 | |||
122 | go[MASTER] = 0; | ||
123 | |||
124 | local_irq_save(flags); | ||
125 | { | ||
126 | for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { | ||
127 | while (!go[MASTER]); | ||
128 | go[MASTER] = 0; | ||
129 | go[SLAVE] = ia64_get_itc(); | ||
130 | } | ||
131 | } | ||
132 | local_irq_restore(flags); | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * Return the number of cycles by which our itc differs from the itc on the master | ||
137 | * (time-keeper) CPU. A positive number indicates our itc is ahead of the master, | ||
138 | * negative that it is behind. | ||
139 | */ | ||
140 | static inline long | ||
141 | get_delta (long *rt, long *master) | ||
142 | { | ||
143 | unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; | ||
144 | unsigned long tcenter, t0, t1, tm; | ||
145 | long i; | ||
146 | |||
147 | for (i = 0; i < NUM_ITERS; ++i) { | ||
148 | t0 = ia64_get_itc(); | ||
149 | go[MASTER] = 1; | ||
150 | while (!(tm = go[SLAVE])); | ||
151 | go[SLAVE] = 0; | ||
152 | t1 = ia64_get_itc(); | ||
153 | |||
154 | if (t1 - t0 < best_t1 - best_t0) | ||
155 | best_t0 = t0, best_t1 = t1, best_tm = tm; | ||
156 | } | ||
157 | |||
158 | *rt = best_t1 - best_t0; | ||
159 | *master = best_tm - best_t0; | ||
160 | |||
161 | /* average best_t0 and best_t1 without overflow: */ | ||
162 | tcenter = (best_t0/2 + best_t1/2); | ||
163 | if (best_t0 % 2 + best_t1 % 2 == 2) | ||
164 | ++tcenter; | ||
165 | return tcenter - best_tm; | ||
166 | } | ||
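The tcenter computation in get_delta() averages two unsigned values without overflow: a/2 + b/2 drops the carry when both operands are odd, so it is added back explicitly rather than computing (a + b)/2, which could wrap. A standalone check (editorial sketch):

#include <assert.h>
#include <limits.h>

static unsigned long avg(unsigned long a, unsigned long b)
{
	return a/2 + b/2 + (a % 2 + b % 2 == 2);
}

int main(void)
{
	assert(avg(3, 5) == 4);
	assert(avg(3, 4) == 3);				/* rounds down, like (3+4)/2 */
	assert(avg(ULONG_MAX, ULONG_MAX) == ULONG_MAX);	/* no wraparound */
	return 0;
}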
167 | |||
168 | /* | ||
169 | * Synchronize ar.itc of the current (slave) CPU with the ar.itc of the MASTER CPU | ||
170 | * (normally the time-keeper CPU). We use a closed loop to eliminate the possibility of | ||
171 | * unaccounted-for errors (such as getting a machine check in the middle of a calibration | ||
172 | * step). The basic idea is for the slave to ask the master what itc value it has and to | ||
173 | * read its own itc before and after the master responds. Each iteration gives us three | ||
174 | * timestamps: | ||
175 | * | ||
176 | * slave master | ||
177 | * | ||
178 | * t0 ---\ | ||
179 | * ---\ | ||
180 | * ---> | ||
181 | * tm | ||
182 | * /--- | ||
183 | * /--- | ||
184 | * t1 <--- | ||
185 | * | ||
186 | * | ||
187 | * The goal is to adjust the slave's ar.itc such that tm falls exactly half-way between t0 | ||
188 | * and t1. If we achieve this, the clocks are synchronized provided the interconnect | ||
189 | * between the slave and the master is symmetric. Even if the interconnect were | ||
190 | * asymmetric, we would still know that the synchronization error is smaller than the | ||
191 | * roundtrip latency (t1 - t0). | ||
192 | * | ||
193 | * When the interconnect is quiet and symmetric, this lets us synchronize the itc to | ||
194 | * within one or two cycles. However, we can only *guarantee* that the synchronization is | ||
195 | * accurate to within a round-trip time, which is typically in the range of several | ||
196 | * hundred cycles (e.g., ~500 cycles). In practice, this means that the itc's are usually | ||
197 | * almost perfectly synchronized, but we shouldn't assume that the accuracy is much better | ||
198 | * than half a micro second or so. | ||
199 | */ | ||
200 | void | ||
201 | ia64_sync_itc (unsigned int master) | ||
202 | { | ||
203 | long i, delta, adj, adjust_latency = 0, done = 0; | ||
204 | unsigned long flags, rt, master_time_stamp, bound; | ||
205 | #if DEBUG_ITC_SYNC | ||
206 | struct { | ||
207 | long rt; /* roundtrip time */ | ||
208 | long master; /* master's timestamp */ | ||
209 | long diff; /* difference between midpoint and master's timestamp */ | ||
210 | long lat; /* estimate of itc adjustment latency */ | ||
211 | } t[NUM_ROUNDS]; | ||
212 | #endif | ||
213 | |||
214 | /* | ||
215 | * Make sure local timer ticks are disabled while we sync. If | ||
216 | * they were enabled, we'd have to worry about nasty issues | ||
217 | * like setting the ITC ahead of (or a long time before) the | ||
218 | * next scheduled tick. | ||
219 | */ | ||
220 | BUG_ON((ia64_get_itv() & (1 << 16)) == 0); | ||
221 | |||
222 | go[MASTER] = 1; | ||
223 | |||
224 | if (smp_call_function_single(master, sync_master, NULL, 1, 0) < 0) { | ||
225 | printk(KERN_ERR "sync_itc: failed to get attention of CPU %u!\n", master); | ||
226 | return; | ||
227 | } | ||
228 | |||
229 | while (go[MASTER]); /* wait for master to be ready */ | ||
230 | |||
231 | spin_lock_irqsave(&itc_sync_lock, flags); | ||
232 | { | ||
233 | for (i = 0; i < NUM_ROUNDS; ++i) { | ||
234 | delta = get_delta(&rt, &master_time_stamp); | ||
235 | if (delta == 0) { | ||
236 | done = 1; /* let's lock on to this... */ | ||
237 | bound = rt; | ||
238 | } | ||
239 | |||
240 | if (!done) { | ||
241 | if (i > 0) { | ||
242 | adjust_latency += -delta; | ||
243 | adj = -delta + adjust_latency/4; | ||
244 | } else | ||
245 | adj = -delta; | ||
246 | |||
247 | ia64_set_itc(ia64_get_itc() + adj); | ||
248 | } | ||
249 | #if DEBUG_ITC_SYNC | ||
250 | t[i].rt = rt; | ||
251 | t[i].master = master_time_stamp; | ||
252 | t[i].diff = delta; | ||
253 | t[i].lat = adjust_latency/4; | ||
254 | #endif | ||
255 | } | ||
256 | } | ||
257 | spin_unlock_irqrestore(&itc_sync_lock, flags); | ||
258 | |||
259 | #if DEBUG_ITC_SYNC | ||
260 | for (i = 0; i < NUM_ROUNDS; ++i) | ||
261 | printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", | ||
262 | t[i].rt, t[i].master, t[i].diff, t[i].lat); | ||
263 | #endif | ||
264 | |||
265 | printk(KERN_INFO "CPU %d: synchronized ITC with CPU %u (last diff %ld cycles, " | ||
266 | "maxerr %lu cycles)\n", smp_processor_id(), master, delta, rt); | ||
267 | } | ||
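A worked instance of the midpoint rule ia64_sync_itc() implements: with t0 = 100, tm = 115 and t1 = 140, the midpoint of t0 and t1 is 120, so get_delta() reports the slave as 5 cycles ahead and the adjustment loop subtracts 5 from the slave's itc (numbers invented for illustration):

#include <stdio.h>

int main(void)
{
	long t0 = 100, tm = 115, t1 = 140;
	long tcenter = t0/2 + t1/2 + (t0 % 2 + t1 % 2 == 2);

	printf("delta = %ld\n", tcenter - tm);	/* prints 5 */
	return 0;
}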
268 | |||
269 | /* | ||
270 | * Ideally sets up per-cpu profiling hooks. Doesn't do much now... | ||
271 | */ | ||
272 | static inline void __devinit | ||
273 | smp_setup_percpu_timer (void) | ||
274 | { | ||
275 | } | ||
276 | |||
277 | static void __devinit | ||
278 | smp_callin (void) | ||
279 | { | ||
280 | int cpuid, phys_id; | ||
281 | extern void ia64_init_itm(void); | ||
282 | |||
283 | #ifdef CONFIG_PERFMON | ||
284 | extern void pfm_init_percpu(void); | ||
285 | #endif | ||
286 | |||
287 | cpuid = smp_processor_id(); | ||
288 | phys_id = hard_smp_processor_id(); | ||
289 | |||
290 | if (cpu_online(cpuid)) { | ||
291 | printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n", | ||
292 | phys_id, cpuid); | ||
293 | BUG(); | ||
294 | } | ||
295 | |||
296 | lock_ipi_calllock(); | ||
297 | cpu_set(cpuid, cpu_online_map); | ||
298 | unlock_ipi_calllock(); | ||
299 | |||
300 | smp_setup_percpu_timer(); | ||
301 | |||
302 | ia64_mca_cmc_vector_setup(); /* Setup vector on AP */ | ||
303 | |||
304 | #ifdef CONFIG_PERFMON | ||
305 | pfm_init_percpu(); | ||
306 | #endif | ||
307 | |||
308 | local_irq_enable(); | ||
309 | |||
310 | if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { | ||
311 | /* | ||
312 | * Synchronize the ITC with the BP. Need to do this after irqs are | ||
313 | * enabled because ia64_sync_itc() calls smp_call_function_single(), which | ||
314 | * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls | ||
315 | * local_bh_enable(), which bugs out if irqs are not enabled... | ||
316 | */ | ||
317 | Dprintk("Going to syncup ITC with BP.\n"); | ||
318 | ia64_sync_itc(0); | ||
319 | } | ||
320 | |||
321 | /* | ||
322 | * Get our bogomips. | ||
323 | */ | ||
324 | ia64_init_itm(); | ||
325 | calibrate_delay(); | ||
326 | local_cpu_data->loops_per_jiffy = loops_per_jiffy; | ||
327 | |||
328 | #ifdef CONFIG_IA32_SUPPORT | ||
329 | ia32_gdt_init(); | ||
330 | #endif | ||
331 | |||
332 | /* | ||
333 | * Allow the master to continue. | ||
334 | */ | ||
335 | cpu_set(cpuid, cpu_callin_map); | ||
336 | Dprintk("Stack on CPU %d at about %p\n", cpuid, &cpuid); | ||
337 | } | ||
338 | |||
339 | |||
340 | /* | ||
341 | * Activate a secondary processor. head.S calls this. | ||
342 | */ | ||
343 | int __devinit | ||
344 | start_secondary (void *unused) | ||
345 | { | ||
346 | /* Early console may use I/O ports */ | ||
347 | ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase)); | ||
348 | |||
349 | Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id()); | ||
350 | efi_map_pal_code(); | ||
351 | cpu_init(); | ||
352 | smp_callin(); | ||
353 | |||
354 | cpu_idle(); | ||
355 | return 0; | ||
356 | } | ||
357 | |||
358 | struct pt_regs * __devinit idle_regs(struct pt_regs *regs) | ||
359 | { | ||
360 | return NULL; | ||
361 | } | ||
362 | |||
363 | struct create_idle { | ||
364 | struct task_struct *idle; | ||
365 | struct completion done; | ||
366 | int cpu; | ||
367 | }; | ||
368 | |||
369 | void | ||
370 | do_fork_idle(void *_c_idle) | ||
371 | { | ||
372 | struct create_idle *c_idle = _c_idle; | ||
373 | |||
374 | c_idle->idle = fork_idle(c_idle->cpu); | ||
375 | complete(&c_idle->done); | ||
376 | } | ||
377 | |||
378 | static int __devinit | ||
379 | do_boot_cpu (int sapicid, int cpu) | ||
380 | { | ||
381 | int timeout; | ||
382 | struct create_idle c_idle = { | ||
383 | .cpu = cpu, | ||
384 | .done = COMPLETION_INITIALIZER(c_idle.done), | ||
385 | }; | ||
386 | DECLARE_WORK(work, do_fork_idle, &c_idle); | ||
387 | /* | ||
388 | * We can't use kernel_thread since we must avoid rescheduling the child. | ||
389 | */ | ||
390 | if (!keventd_up() || current_is_keventd()) | ||
391 | work.func(work.data); | ||
392 | else { | ||
393 | schedule_work(&work); | ||
394 | wait_for_completion(&c_idle.done); | ||
395 | } | ||
396 | |||
397 | if (IS_ERR(c_idle.idle)) | ||
398 | panic("failed fork for CPU %d", cpu); | ||
399 | task_for_booting_cpu = c_idle.idle; | ||
400 | |||
401 | Dprintk("Sending wakeup vector %lu to AP 0x%x/0x%x.\n", ap_wakeup_vector, cpu, sapicid); | ||
402 | |||
403 | platform_send_ipi(cpu, ap_wakeup_vector, IA64_IPI_DM_INT, 0); | ||
404 | |||
405 | /* | ||
406 | * Wait 10s total for the AP to start | ||
407 | */ | ||
408 | Dprintk("Waiting on callin_map ..."); | ||
409 | for (timeout = 0; timeout < 100000; timeout++) { | ||
410 | if (cpu_isset(cpu, cpu_callin_map)) | ||
411 | break; /* It has booted */ | ||
412 | udelay(100); | ||
413 | } | ||
414 | Dprintk("\n"); | ||
415 | |||
416 | if (!cpu_isset(cpu, cpu_callin_map)) { | ||
417 | printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid); | ||
418 | ia64_cpu_to_sapicid[cpu] = -1; | ||
419 | cpu_clear(cpu, cpu_online_map); /* was set in smp_callin() */ | ||
420 | return -EINVAL; | ||
421 | } | ||
422 | return 0; | ||
423 | } | ||
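The callin wait in do_boot_cpu() is a plain bounded poll, and the loop constants match the "Wait 10s total" comment: 100000 iterations of udelay(100) is 10 seconds, as the one-liner below confirms (editorial check):

#include <stdio.h>

int main(void)
{
	/* 100000 polls x 100us per udelay() */
	printf("%g s\n", 100000 * 100 / 1e6);	/* prints 10 */
	return 0;
}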
424 | |||
425 | static int __init | ||
426 | decay (char *str) | ||
427 | { | ||
428 | int ticks; | ||
429 | get_option (&str, &ticks); | ||
430 | return 1; | ||
431 | } | ||
432 | |||
433 | __setup("decay=", decay); | ||
434 | |||
435 | /* | ||
436 | * Initialize the logical CPU number to SAPICID mapping | ||
437 | */ | ||
438 | void __init | ||
439 | smp_build_cpu_map (void) | ||
440 | { | ||
441 | int sapicid, cpu, i; | ||
442 | int boot_cpu_id = hard_smp_processor_id(); | ||
443 | |||
444 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
445 | ia64_cpu_to_sapicid[cpu] = -1; | ||
446 | #ifdef CONFIG_HOTPLUG_CPU | ||
447 | cpu_set(cpu, cpu_possible_map); | ||
448 | #endif | ||
449 | } | ||
450 | |||
451 | ia64_cpu_to_sapicid[0] = boot_cpu_id; | ||
452 | cpus_clear(cpu_present_map); | ||
453 | cpu_set(0, cpu_present_map); | ||
454 | cpu_set(0, cpu_possible_map); | ||
455 | for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { | ||
456 | sapicid = smp_boot_data.cpu_phys_id[i]; | ||
457 | if (sapicid == boot_cpu_id) | ||
458 | continue; | ||
459 | cpu_set(cpu, cpu_present_map); | ||
460 | cpu_set(cpu, cpu_possible_map); | ||
461 | ia64_cpu_to_sapicid[cpu] = sapicid; | ||
462 | cpu++; | ||
463 | } | ||
464 | } | ||
465 | |||
466 | #ifdef CONFIG_NUMA | ||
467 | |||
468 | /* on which node is each logical CPU (one cacheline even for 64 CPUs) */ | ||
469 | u8 cpu_to_node_map[NR_CPUS] __cacheline_aligned; | ||
470 | EXPORT_SYMBOL(cpu_to_node_map); | ||
471 | /* which logical CPUs are on which nodes */ | ||
472 | cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; | ||
473 | |||
474 | /* | ||
475 | * Build cpu to node mapping and initialize the per node cpu masks. | ||
476 | */ | ||
477 | void __init | ||
478 | build_cpu_to_node_map (void) | ||
479 | { | ||
480 | int cpu, i, node; | ||
481 | |||
482 | for (node = 0; node < MAX_NUMNODES; node++) | ||
483 | cpus_clear(node_to_cpu_mask[node]); | ||
484 | for (cpu = 0; cpu < NR_CPUS; ++cpu) { | ||
485 | /* | ||
486 | * All Itanium NUMA platforms I know use ACPI, so maybe we | ||
487 | * can drop this ifdef completely. [EF] | ||
488 | */ | ||
489 | #ifdef CONFIG_ACPI_NUMA | ||
490 | node = -1; | ||
491 | for (i = 0; i < NR_CPUS; ++i) | ||
492 | if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { | ||
493 | node = node_cpuid[i].nid; | ||
494 | break; | ||
495 | } | ||
496 | #else | ||
497 | # error Fixme: Don't know how to build the CPU-to-node map. | ||
498 | #endif | ||
499 | cpu_to_node_map[cpu] = (node >= 0) ? node : 0; | ||
500 | if (node >= 0) | ||
501 | cpu_set(cpu, node_to_cpu_mask[node]); | ||
502 | } | ||
503 | } | ||
504 | |||
505 | #endif /* CONFIG_NUMA */ | ||
506 | |||
507 | /* | ||
508 | * Cycle through the APs sending Wakeup IPIs to boot each. | ||
509 | */ | ||
510 | void __init | ||
511 | smp_prepare_cpus (unsigned int max_cpus) | ||
512 | { | ||
513 | int boot_cpu_id = hard_smp_processor_id(); | ||
514 | |||
515 | /* | ||
516 | * Initialize the per-CPU profiling counter/multiplier | ||
517 | */ | ||
518 | |||
519 | smp_setup_percpu_timer(); | ||
520 | |||
521 | /* | ||
522 | * We have the boot CPU online for sure. | ||
523 | */ | ||
524 | cpu_set(0, cpu_online_map); | ||
525 | cpu_set(0, cpu_callin_map); | ||
526 | |||
527 | local_cpu_data->loops_per_jiffy = loops_per_jiffy; | ||
528 | ia64_cpu_to_sapicid[0] = boot_cpu_id; | ||
529 | |||
530 | printk(KERN_INFO "Boot processor id 0x%x/0x%x\n", 0, boot_cpu_id); | ||
531 | |||
532 | current_thread_info()->cpu = 0; | ||
533 | |||
534 | /* | ||
535 | * If SMP should be disabled, then really disable it! | ||
536 | */ | ||
537 | if (!max_cpus) { | ||
538 | printk(KERN_INFO "SMP mode deactivated.\n"); | ||
539 | cpus_clear(cpu_online_map); | ||
540 | cpus_clear(cpu_present_map); | ||
541 | cpus_clear(cpu_possible_map); | ||
542 | cpu_set(0, cpu_online_map); | ||
543 | cpu_set(0, cpu_present_map); | ||
544 | cpu_set(0, cpu_possible_map); | ||
545 | return; | ||
546 | } | ||
547 | } | ||
548 | |||
549 | void __devinit smp_prepare_boot_cpu(void) | ||
550 | { | ||
551 | cpu_set(smp_processor_id(), cpu_online_map); | ||
552 | cpu_set(smp_processor_id(), cpu_callin_map); | ||
553 | } | ||
554 | |||
555 | #ifdef CONFIG_HOTPLUG_CPU | ||
556 | extern void fixup_irqs(void); | ||
557 | /* must be called with cpucontrol mutex held */ | ||
558 | static int __devinit cpu_enable(unsigned int cpu) | ||
559 | { | ||
560 | per_cpu(cpu_state,cpu) = CPU_UP_PREPARE; | ||
561 | wmb(); | ||
562 | |||
563 | while (!cpu_online(cpu)) | ||
564 | cpu_relax(); | ||
565 | return 0; | ||
566 | } | ||
567 | |||
568 | int __cpu_disable(void) | ||
569 | { | ||
570 | int cpu = smp_processor_id(); | ||
571 | |||
572 | /* | ||
573 | * don't permit offlining the boot processor for now | ||
574 | */ | ||
575 | if (cpu == 0) | ||
576 | return -EBUSY; | ||
577 | |||
578 | fixup_irqs(); | ||
579 | local_flush_tlb_all(); | ||
580 | printk("Disabled cpu %u\n", smp_processor_id()); | ||
581 | return 0; | ||
582 | } | ||
583 | |||
584 | void __cpu_die(unsigned int cpu) | ||
585 | { | ||
586 | unsigned int i; | ||
587 | |||
588 | for (i = 0; i < 100; i++) { | ||
589 | /* They ack this in play_dead by setting CPU_DEAD */ | ||
590 | if (per_cpu(cpu_state, cpu) == CPU_DEAD) | ||
591 | { | ||
592 | /* | ||
593 | * TBD: Enable this when physical removal | ||
594 | * or when the processor is put in | ||
595 | * SAL_BOOT_RENDEZ mode | ||
596 | * cpu_clear(cpu, cpu_callin_map); | ||
597 | */ | ||
598 | return; | ||
599 | } | ||
600 | msleep(100); | ||
601 | } | ||
602 | printk(KERN_ERR "CPU %u didn't die...\n", cpu); | ||
603 | } | ||
604 | #else /* !CONFIG_HOTPLUG_CPU */ | ||
605 | static int __devinit cpu_enable(unsigned int cpu) | ||
606 | { | ||
607 | return 0; | ||
608 | } | ||
609 | |||
610 | int __cpu_disable(void) | ||
611 | { | ||
612 | return -ENOSYS; | ||
613 | } | ||
614 | |||
615 | void __cpu_die(unsigned int cpu) | ||
616 | { | ||
617 | /* We said "no" in __cpu_disable */ | ||
618 | BUG(); | ||
619 | } | ||
620 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
621 | |||
622 | void | ||
623 | smp_cpus_done (unsigned int dummy) | ||
624 | { | ||
625 | int cpu; | ||
626 | unsigned long bogosum = 0; | ||
627 | |||
628 | /* | ||
629 | * Allow the user to impress friends. | ||
630 | */ | ||
631 | |||
632 | for (cpu = 0; cpu < NR_CPUS; cpu++) | ||
633 | if (cpu_online(cpu)) | ||
634 | bogosum += cpu_data(cpu)->loops_per_jiffy; | ||
635 | |||
636 | printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", | ||
637 | (int)num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); | ||
638 | } | ||
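The two printk arguments above are the integer and two-digit fractional parts of bogosum * HZ / 500000, the usual BogoMIPS convention (the delay loop is commonly counted as two "bogo-instructions" per iteration, hence the 500000). A quick standalone check with invented numbers:

#include <stdio.h>

int main(void)
{
    unsigned long HZ = 1024;         /* hypothetical tick rate */
    unsigned long bogosum = 997376;  /* hypothetical sum of loops_per_jiffy */

    /* same expressions as the printk above; integer division throughout */
    printf("%lu.%02lu BogoMIPS\n",
           bogosum / (500000 / HZ), (bogosum / (5000 / HZ)) % 100);
    return 0;                        /* prints 2043.44 BogoMIPS */
}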
639 | |||
640 | int __devinit | ||
641 | __cpu_up (unsigned int cpu) | ||
642 | { | ||
643 | int ret; | ||
644 | int sapicid; | ||
645 | |||
646 | sapicid = ia64_cpu_to_sapicid[cpu]; | ||
647 | if (sapicid == -1) | ||
648 | return -EINVAL; | ||
649 | |||
650 | /* | ||
651 | * Already booted... just enable and get out of the idle loop | ||
652 | */ | ||
653 | if (cpu_isset(cpu, cpu_callin_map)) | ||
654 | { | ||
655 | cpu_enable(cpu); | ||
656 | local_irq_enable(); | ||
657 | while (!cpu_isset(cpu, cpu_online_map)) | ||
658 | mb(); | ||
659 | return 0; | ||
660 | } | ||
661 | /* Processor goes to start_secondary(), sets online flag */ | ||
662 | ret = do_boot_cpu(sapicid, cpu); | ||
663 | if (ret < 0) | ||
664 | return ret; | ||
665 | |||
666 | return 0; | ||
667 | } | ||
668 | |||
669 | /* | ||
670 | * Assume that CPUs have been discovered by some platform-dependent interface. For | ||
671 | * SoftSDV/Lion, that would be ACPI. | ||
672 | * | ||
673 | * Setup of the IPI irq handler is done in irq.c:init_IRQ_SMP(). | ||
674 | */ | ||
675 | void __init | ||
676 | init_smp_config(void) | ||
677 | { | ||
678 | struct fptr { | ||
679 | unsigned long fp; | ||
680 | unsigned long gp; | ||
681 | } *ap_startup; | ||
682 | long sal_ret; | ||
683 | |||
684 | /* Tell SAL where to drop the APs. */ | ||
685 | ap_startup = (struct fptr *) start_ap; | ||
686 | sal_ret = ia64_sal_set_vectors(SAL_VECTOR_OS_BOOT_RENDEZ, | ||
687 | ia64_tpa(ap_startup->fp), ia64_tpa(ap_startup->gp), 0, 0, 0, 0); | ||
688 | if (sal_ret < 0) | ||
689 | printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n", | ||
690 | ia64_sal_strerror(sal_ret)); | ||
691 | } | ||
692 | |||
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c new file mode 100644 index 000000000000..3ac216e1c8bb --- /dev/null +++ b/arch/ia64/kernel/sys_ia64.c | |||
@@ -0,0 +1,298 @@ | |||
1 | /* | ||
2 | * This file contains various system calls that have different calling | ||
3 | * conventions on different platforms. | ||
4 | * | ||
5 | * Copyright (C) 1999-2000, 2002-2003, 2005 Hewlett-Packard Co | ||
6 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
7 | */ | ||
8 | #include <linux/config.h> | ||
9 | #include <linux/errno.h> | ||
10 | #include <linux/fs.h> | ||
11 | #include <linux/mm.h> | ||
12 | #include <linux/mman.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/shm.h> | ||
15 | #include <linux/file.h> /* doh, must come after sched.h... */ | ||
16 | #include <linux/smp.h> | ||
17 | #include <linux/smp_lock.h> | ||
18 | #include <linux/syscalls.h> | ||
19 | #include <linux/highuid.h> | ||
20 | #include <linux/hugetlb.h> | ||
21 | |||
22 | #include <asm/shmparam.h> | ||
23 | #include <asm/uaccess.h> | ||
24 | |||
25 | unsigned long | ||
26 | arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len, | ||
27 | unsigned long pgoff, unsigned long flags) | ||
28 | { | ||
29 | long map_shared = (flags & MAP_SHARED); | ||
30 | unsigned long start_addr, align_mask = PAGE_SIZE - 1; | ||
31 | struct mm_struct *mm = current->mm; | ||
32 | struct vm_area_struct *vma; | ||
33 | |||
34 | if (len > RGN_MAP_LIMIT) | ||
35 | return -ENOMEM; | ||
36 | |||
37 | #ifdef CONFIG_HUGETLB_PAGE | ||
38 | if (REGION_NUMBER(addr) == REGION_HPAGE) | ||
39 | addr = 0; | ||
40 | #endif | ||
41 | if (!addr) | ||
42 | addr = mm->free_area_cache; | ||
43 | |||
44 | if (map_shared && (TASK_SIZE > 0xfffffffful)) | ||
45 | /* | ||
46 | * For 64-bit tasks, align shared segments to 1MB to avoid potential | ||
47 | * performance penalty due to virtual aliasing (see ASDM). For 32-bit | ||
48 | * tasks, we prefer to avoid exhausting the address space too quickly by | ||
49 | * limiting alignment to a single page. | ||
50 | */ | ||
51 | align_mask = SHMLBA - 1; | ||
52 | |||
53 | full_search: | ||
54 | start_addr = addr = (addr + align_mask) & ~align_mask; | ||
55 | |||
56 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
57 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
58 | if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) { | ||
59 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
60 | /* Start a new search --- just in case we missed some holes. */ | ||
61 | addr = TASK_UNMAPPED_BASE; | ||
62 | goto full_search; | ||
63 | } | ||
64 | return -ENOMEM; | ||
65 | } | ||
66 | if (!vma || addr + len <= vma->vm_start) { | ||
67 | /* Remember the address where we stopped this search: */ | ||
68 | mm->free_area_cache = addr + len; | ||
69 | return addr; | ||
70 | } | ||
71 | addr = (vma->vm_end + align_mask) & ~align_mask; | ||
72 | } | ||
73 | } | ||
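The search above relies on the standard round-up-to-power-of-two idiom, (addr + mask) & ~mask with mask = alignment - 1. A two-assert sanity check (the 0x4000 alignment is just an example value):

#include <assert.h>

static unsigned long align_up(unsigned long addr, unsigned long align)
{
    unsigned long mask = align - 1;     /* align must be a power of two */
    return (addr + mask) & ~mask;
}

int main(void)
{
    assert(align_up(0x12345, 0x4000) == 0x14000);   /* rounded up */
    assert(align_up(0x14000, 0x4000) == 0x14000);   /* already aligned */
    return 0;
}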
74 | |||
75 | asmlinkage long | ||
76 | ia64_getpriority (int which, int who) | ||
77 | { | ||
78 | long prio; | ||
79 | |||
80 | prio = sys_getpriority(which, who); | ||
81 | if (prio >= 0) { | ||
82 | force_successful_syscall_return(); | ||
83 | prio = 20 - prio; | ||
84 | } | ||
85 | return prio; | ||
86 | } | ||
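Background for the 20 - prio above: the generic sys_getpriority() returns 20 - nice so that a successful result (1..40) can never collide with a negative error code; this wrapper undoes the bias because force_successful_syscall_return() lets it report success out of band. A round-trip check:

#include <assert.h>

int main(void)
{
    for (int nice = -20; nice <= 19; nice++) {
        int encoded = 20 - nice;        /* what sys_getpriority() returns */
        assert(encoded >= 1 && encoded <= 40);  /* never looks like -errno */
        assert(20 - encoded == nice);   /* what this wrapper hands back */
    }
    return 0;
}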
87 | |||
88 | /* XXX obsolete, but leave it here until the old libc is gone... */ | ||
89 | asmlinkage unsigned long | ||
90 | sys_getpagesize (void) | ||
91 | { | ||
92 | return PAGE_SIZE; | ||
93 | } | ||
94 | |||
95 | asmlinkage unsigned long | ||
96 | ia64_shmat (int shmid, void __user *shmaddr, int shmflg) | ||
97 | { | ||
98 | unsigned long raddr; | ||
99 | int retval; | ||
100 | |||
101 | retval = do_shmat(shmid, shmaddr, shmflg, &raddr); | ||
102 | if (retval < 0) | ||
103 | return retval; | ||
104 | |||
105 | force_successful_syscall_return(); | ||
106 | return raddr; | ||
107 | } | ||
108 | |||
109 | asmlinkage unsigned long | ||
110 | ia64_brk (unsigned long brk) | ||
111 | { | ||
112 | unsigned long rlim, retval, newbrk, oldbrk; | ||
113 | struct mm_struct *mm = current->mm; | ||
114 | |||
115 | /* | ||
116 | * Most of this replicates the code in sys_brk() except for an additional safety | ||
117 | * check and the clearing of r8. However, we can't call sys_brk() because we need | ||
118 | * to acquire the mmap_sem before we can do the test... | ||
119 | */ | ||
120 | down_write(&mm->mmap_sem); | ||
121 | |||
122 | if (brk < mm->end_code) | ||
123 | goto out; | ||
124 | newbrk = PAGE_ALIGN(brk); | ||
125 | oldbrk = PAGE_ALIGN(mm->brk); | ||
126 | if (oldbrk == newbrk) | ||
127 | goto set_brk; | ||
128 | |||
129 | /* Always allow shrinking brk. */ | ||
130 | if (brk <= mm->brk) { | ||
131 | if (!do_munmap(mm, newbrk, oldbrk-newbrk)) | ||
132 | goto set_brk; | ||
133 | goto out; | ||
134 | } | ||
135 | |||
136 | /* Check against unimplemented/unmapped addresses: */ | ||
137 | if ((newbrk - oldbrk) > RGN_MAP_LIMIT || REGION_OFFSET(newbrk) > RGN_MAP_LIMIT) | ||
138 | goto out; | ||
139 | |||
140 | /* Check against rlimit.. */ | ||
141 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | ||
142 | if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) | ||
143 | goto out; | ||
144 | |||
145 | /* Check against existing mmap mappings. */ | ||
146 | if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) | ||
147 | goto out; | ||
148 | |||
149 | /* Ok, looks good - let it rip. */ | ||
150 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) | ||
151 | goto out; | ||
152 | set_brk: | ||
153 | mm->brk = brk; | ||
154 | out: | ||
155 | retval = mm->brk; | ||
156 | up_write(&mm->mmap_sem); | ||
157 | force_successful_syscall_return(); | ||
158 | return retval; | ||
159 | } | ||
160 | |||
161 | /* | ||
162 | * On IA-64, we return the two file descriptors in ret0 and ret1 (r8 | ||
163 | * and r9) as this is faster than doing a copy_to_user(). | ||
164 | */ | ||
165 | asmlinkage long | ||
166 | sys_pipe (void) | ||
167 | { | ||
168 | struct pt_regs *regs = ia64_task_regs(current); | ||
169 | int fd[2]; | ||
170 | int retval; | ||
171 | |||
172 | retval = do_pipe(fd); | ||
173 | if (retval) | ||
174 | goto out; | ||
175 | retval = fd[0]; | ||
176 | regs->r9 = fd[1]; | ||
177 | out: | ||
178 | return retval; | ||
179 | } | ||
180 | |||
181 | static inline unsigned long | ||
182 | do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff) | ||
183 | { | ||
184 | unsigned long roff; | ||
185 | struct file *file = NULL; | ||
186 | |||
187 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
188 | if (!(flags & MAP_ANONYMOUS)) { | ||
189 | file = fget(fd); | ||
190 | if (!file) | ||
191 | return -EBADF; | ||
192 | |||
193 | if (!file->f_op || !file->f_op->mmap) { | ||
194 | addr = -ENODEV; | ||
195 | goto out; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | /* | ||
200 | * A zero mmap always succeeds in Linux, independent of whether or not the | ||
201 | * remaining arguments are valid. | ||
202 | */ | ||
203 | if (len == 0) | ||
204 | goto out; | ||
205 | |||
206 | /* Careful about overflows.. */ | ||
207 | len = PAGE_ALIGN(len); | ||
208 | if (!len || len > TASK_SIZE) { | ||
209 | addr = -EINVAL; | ||
210 | goto out; | ||
211 | } | ||
212 | |||
213 | /* | ||
214 | * Don't permit mappings into unmapped space, the virtual page table of a region, | ||
215 | * or across a region boundary. Note: RGN_MAP_LIMIT is equal to 2^n-PAGE_SIZE | ||
216 | * (for some integer n <= 61) and len > 0. | ||
217 | */ | ||
218 | roff = REGION_OFFSET(addr); | ||
219 | if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len))) { | ||
220 | addr = -EINVAL; | ||
221 | goto out; | ||
222 | } | ||
223 | |||
224 | down_write(¤t->mm->mmap_sem); | ||
225 | addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
226 | up_write(¤t->mm->mmap_sem); | ||
227 | |||
228 | out: if (file) | ||
229 | fput(file); | ||
230 | return addr; | ||
231 | } | ||
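The boundary test above is deliberately written as roff > RGN_MAP_LIMIT - len rather than roff + len > RGN_MAP_LIMIT: the addition can wrap, letting a huge offset slip through, while the subtraction cannot underflow because len > RGN_MAP_LIMIT was rejected first. A demonstration with a stand-in limit:

#include <assert.h>
#include <limits.h>

int main(void)
{
    unsigned long limit = ULONG_MAX / 4;   /* stand-in for RGN_MAP_LIMIT */
    unsigned long off = ULONG_MAX - 10;    /* absurdly large offset */
    unsigned long len = 100;               /* small, so len <= limit holds */

    assert(off + len == 89);               /* wrapped around */
    assert(!(off + len > limit));          /* naive form wrongly accepts */
    assert(off > limit - len);             /* the form used above rejects */
    return 0;
}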
232 | |||
233 | /* | ||
234 | * mmap2() is like mmap() except that the offset is expressed in units | ||
235 | * of PAGE_SIZE (instead of bytes). This allows mmap2() to map (pieces | ||
236 | * of) files that are larger than the address space of the CPU. | ||
237 | */ | ||
238 | asmlinkage unsigned long | ||
239 | sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff) | ||
240 | { | ||
241 | addr = do_mmap2(addr, len, prot, flags, fd, pgoff); | ||
242 | if (!IS_ERR((void *) addr)) | ||
243 | force_successful_syscall_return(); | ||
244 | return addr; | ||
245 | } | ||
246 | |||
247 | asmlinkage unsigned long | ||
248 | sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off) | ||
249 | { | ||
250 | if (offset_in_page(off) != 0) | ||
251 | return -EINVAL; | ||
252 | |||
253 | addr = do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); | ||
254 | if (!IS_ERR((void *) addr)) | ||
255 | force_successful_syscall_return(); | ||
256 | return addr; | ||
257 | } | ||
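Both entry points funnel into do_mmap2() with a page-granular offset; sys_mmap() merely verifies the byte offset is page aligned and divides by the page size. The payoff of page units is range: even a 32-bit pgoff reaches 2^32 pages of file. A quick arithmetic check (the 4 KiB page size is an arbitrary example):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t page_size = 4096;                  /* 2^12, example value */
    uint64_t pages = UINT32_MAX + 1ULL;         /* 2^32 page-units */
    uint64_t max_byte = pages * page_size;      /* highest reachable offset */

    assert(max_byte == 1ULL << 44);             /* 16 TiB */
    assert(max_byte > UINT32_MAX);              /* vs. 4 GiB with byte units */
    return 0;
}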
258 | |||
259 | asmlinkage unsigned long | ||
260 | ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, | ||
261 | unsigned long new_addr) | ||
262 | { | ||
263 | extern unsigned long do_mremap (unsigned long addr, | ||
264 | unsigned long old_len, | ||
265 | unsigned long new_len, | ||
266 | unsigned long flags, | ||
267 | unsigned long new_addr); | ||
268 | |||
269 | down_write(¤t->mm->mmap_sem); | ||
270 | { | ||
271 | addr = do_mremap(addr, old_len, new_len, flags, new_addr); | ||
272 | } | ||
273 | up_write(¤t->mm->mmap_sem); | ||
274 | |||
275 | if (IS_ERR((void *) addr)) | ||
276 | return addr; | ||
277 | |||
278 | force_successful_syscall_return(); | ||
279 | return addr; | ||
280 | } | ||
281 | |||
282 | #ifndef CONFIG_PCI | ||
283 | |||
284 | asmlinkage long | ||
285 | sys_pciconfig_read (unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, | ||
286 | void *buf) | ||
287 | { | ||
288 | return -ENOSYS; | ||
289 | } | ||
290 | |||
291 | asmlinkage long | ||
292 | sys_pciconfig_write (unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, | ||
293 | void *buf) | ||
294 | { | ||
295 | return -ENOSYS; | ||
296 | } | ||
297 | |||
298 | #endif /* CONFIG_PCI */ | ||
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c new file mode 100644 index 000000000000..8b8a5a45b621 --- /dev/null +++ b/arch/ia64/kernel/time.c | |||
@@ -0,0 +1,255 @@ | |||
1 | /* | ||
2 | * linux/arch/ia64/kernel/time.c | ||
3 | * | ||
4 | * Copyright (C) 1998-2003 Hewlett-Packard Co | ||
5 | * Stephane Eranian <eranian@hpl.hp.com> | ||
6 | * David Mosberger <davidm@hpl.hp.com> | ||
7 | * Copyright (C) 1999 Don Dugger <don.dugger@intel.com> | ||
8 | * Copyright (C) 1999-2000 VA Linux Systems | ||
9 | * Copyright (C) 1999-2000 Walt Drummond <drummond@valinux.com> | ||
10 | */ | ||
11 | #include <linux/config.h> | ||
12 | |||
13 | #include <linux/cpu.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/profile.h> | ||
18 | #include <linux/sched.h> | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/interrupt.h> | ||
21 | #include <linux/efi.h> | ||
23 | #include <linux/timex.h> | ||
24 | |||
25 | #include <asm/machvec.h> | ||
26 | #include <asm/delay.h> | ||
27 | #include <asm/hw_irq.h> | ||
28 | #include <asm/ptrace.h> | ||
29 | #include <asm/sal.h> | ||
30 | #include <asm/sections.h> | ||
31 | #include <asm/system.h> | ||
32 | |||
33 | extern unsigned long wall_jiffies; | ||
34 | |||
35 | u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; | ||
36 | |||
37 | EXPORT_SYMBOL(jiffies_64); | ||
38 | |||
39 | #define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ | ||
40 | |||
41 | #ifdef CONFIG_IA64_DEBUG_IRQ | ||
42 | |||
43 | unsigned long last_cli_ip; | ||
44 | EXPORT_SYMBOL(last_cli_ip); | ||
45 | |||
46 | #endif | ||
47 | |||
48 | static struct time_interpolator itc_interpolator = { | ||
49 | .shift = 16, | ||
50 | .mask = 0xffffffffffffffffLL, | ||
51 | .source = TIME_SOURCE_CPU | ||
52 | }; | ||
53 | |||
54 | static irqreturn_t | ||
55 | timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) | ||
56 | { | ||
57 | unsigned long new_itm; | ||
58 | |||
59 | if (unlikely(cpu_is_offline(smp_processor_id()))) { | ||
60 | return IRQ_HANDLED; | ||
61 | } | ||
62 | |||
63 | platform_timer_interrupt(irq, dev_id, regs); | ||
64 | |||
65 | new_itm = local_cpu_data->itm_next; | ||
66 | |||
67 | if (!time_after(ia64_get_itc(), new_itm)) | ||
68 | printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n", | ||
69 | ia64_get_itc(), new_itm); | ||
70 | |||
71 | profile_tick(CPU_PROFILING, regs); | ||
72 | |||
73 | while (1) { | ||
74 | update_process_times(user_mode(regs)); | ||
75 | |||
76 | new_itm += local_cpu_data->itm_delta; | ||
77 | |||
78 | if (smp_processor_id() == TIME_KEEPER_ID) { | ||
79 | /* | ||
80 | * Here we are in the timer irq handler. We have irqs locally | ||
81 | * disabled, but we don't know if the timer_bh is running on | ||
82 | * another CPU. We need to avoid an SMP race by acquiring the | ||
83 | * xtime_lock. | ||
84 | */ | ||
85 | write_seqlock(&xtime_lock); | ||
86 | do_timer(regs); | ||
87 | local_cpu_data->itm_next = new_itm; | ||
88 | write_sequnlock(&xtime_lock); | ||
89 | } else | ||
90 | local_cpu_data->itm_next = new_itm; | ||
91 | |||
92 | if (time_after(new_itm, ia64_get_itc())) | ||
93 | break; | ||
94 | } | ||
95 | |||
96 | do { | ||
97 | /* | ||
98 | * If we're too close to the next clock tick for | ||
99 | * comfort, we increase the safety margin by | ||
100 | * intentionally dropping the next tick(s). We do NOT | ||
101 | * update itm.next because that would force us to call | ||
102 | * do_timer() which in turn would let our clock run | ||
103 | * too fast (with the potentially devastating effect | ||
104 | * of losing monotonicity of time). | ||
105 | */ | ||
106 | while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) | ||
107 | new_itm += local_cpu_data->itm_delta; | ||
108 | ia64_set_itm(new_itm); | ||
109 | /* double check, in case we got hit by a (slow) PMI: */ | ||
110 | } while (time_after_eq(ia64_get_itc(), new_itm)); | ||
111 | return IRQ_HANDLED; | ||
112 | } | ||
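The closing do/while above never replays missed ticks; it pushes the match value forward in whole deltas until it leads the counter by at least half a tick. The same rule in isolation (ignoring counter wrap, which the kernel's time_after() handles):

#include <assert.h>

static unsigned long catch_up(unsigned long now, unsigned long next,
                              unsigned long delta)
{
    while (!(next > now + delta / 2))   /* too close for comfort */
        next += delta;                  /* drop the tick, don't replay it */
    return next;
}

int main(void)
{
    assert(catch_up(1000, 990, 100) == 1090);   /* stale: bumped past 1050 */
    assert(catch_up(1000, 1060, 100) == 1060);  /* comfortable: untouched */
    return 0;
}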
113 | |||
114 | /* | ||
115 | * Encapsulate access to the itm structure for SMP. | ||
116 | */ | ||
117 | void | ||
118 | ia64_cpu_local_tick (void) | ||
119 | { | ||
120 | int cpu = smp_processor_id(); | ||
121 | unsigned long shift = 0, delta; | ||
122 | |||
123 | /* arrange for the cycle counter to generate a timer interrupt: */ | ||
124 | ia64_set_itv(IA64_TIMER_VECTOR); | ||
125 | |||
126 | delta = local_cpu_data->itm_delta; | ||
127 | /* | ||
128 | * Stagger the timer tick for each CPU so they don't occur all at (almost) the | ||
129 | * same time: | ||
130 | */ | ||
131 | if (cpu) { | ||
132 | unsigned long hi = 1UL << ia64_fls(cpu); | ||
133 | shift = (2*(cpu - hi) + 1) * delta/hi/2; | ||
134 | } | ||
135 | local_cpu_data->itm_next = ia64_get_itc() + delta + shift; | ||
136 | ia64_set_itm(local_cpu_data->itm_next); | ||
137 | } | ||
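The shift expression above spreads the per-CPU ticks by binary subdivision: CPU 1 fires half a tick after CPU 0, CPUs 2 and 3 at the quarter marks, CPUs 4..7 at the odd eighths, and so on. A standalone printout of the sequence (fls_ul() is a portable stand-in for ia64_fls()):

#include <stdio.h>

static unsigned long fls_ul(unsigned long x)   /* highest set bit, x >= 1 */
{
    unsigned long pos = 0;
    while (x >>= 1)
        pos++;
    return pos;
}

int main(void)
{
    unsigned long delta = 1000000;   /* hypothetical cycles per tick */

    for (unsigned long cpu = 1; cpu < 8; cpu++) {
        unsigned long hi = 1UL << fls_ul(cpu);
        unsigned long shift = (2 * (cpu - hi) + 1) * delta / hi / 2;
        printf("cpu %lu: +%lu cycles\n", cpu, shift);
        /* 500000, 250000, 750000, 125000, 375000, 625000, 875000 */
    }
    return 0;
}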
138 | |||
139 | static int nojitter; | ||
140 | |||
141 | static int __init nojitter_setup(char *str) | ||
142 | { | ||
143 | nojitter = 1; | ||
144 | printk("Jitter checking for ITC timers disabled\n"); | ||
145 | return 1; | ||
146 | } | ||
147 | |||
148 | __setup("nojitter", nojitter_setup); | ||
149 | |||
150 | |||
151 | void __devinit | ||
152 | ia64_init_itm (void) | ||
153 | { | ||
154 | unsigned long platform_base_freq, itc_freq; | ||
155 | struct pal_freq_ratio itc_ratio, proc_ratio; | ||
156 | long status, platform_base_drift, itc_drift; | ||
157 | |||
158 | /* | ||
159 | * According to SAL v2.6, we need to use a SAL call to determine the platform base | ||
160 | * frequency and then a PAL call to determine the frequency ratio between the ITC | ||
161 | * and the base frequency. | ||
162 | */ | ||
163 | status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM, | ||
164 | &platform_base_freq, &platform_base_drift); | ||
165 | if (status != 0) { | ||
166 | printk(KERN_ERR "SAL_FREQ_BASE_PLATFORM failed: %s\n", ia64_sal_strerror(status)); | ||
167 | } else { | ||
168 | status = ia64_pal_freq_ratios(&proc_ratio, NULL, &itc_ratio); | ||
169 | if (status != 0) | ||
170 | printk(KERN_ERR "PAL_FREQ_RATIOS failed with status=%ld\n", status); | ||
171 | } | ||
172 | if (status != 0) { | ||
173 | /* invent "random" values */ | ||
174 | printk(KERN_ERR | ||
175 | "SAL/PAL failed to obtain frequency info---inventing reasonable values\n"); | ||
176 | platform_base_freq = 100000000; | ||
177 | platform_base_drift = -1; /* no drift info */ | ||
178 | itc_ratio.num = 3; | ||
179 | itc_ratio.den = 1; | ||
180 | } | ||
181 | if (platform_base_freq < 40000000) { | ||
182 | printk(KERN_ERR "Platform base frequency %lu bogus---resetting to 75MHz!\n", | ||
183 | platform_base_freq); | ||
184 | platform_base_freq = 75000000; | ||
185 | platform_base_drift = -1; | ||
186 | } | ||
187 | if (!proc_ratio.den) | ||
188 | proc_ratio.den = 1; /* avoid division by zero */ | ||
189 | if (!itc_ratio.den) | ||
190 | itc_ratio.den = 1; /* avoid division by zero */ | ||
191 | |||
192 | itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; | ||
193 | |||
194 | local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; | ||
195 | printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%lu/%lu, " | ||
196 | "ITC freq=%lu.%03luMHz", smp_processor_id(), | ||
197 | platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, | ||
198 | itc_ratio.num, itc_ratio.den, itc_freq / 1000000, (itc_freq / 1000) % 1000); | ||
199 | |||
200 | if (platform_base_drift != -1) { | ||
201 | itc_drift = platform_base_drift*itc_ratio.num/itc_ratio.den; | ||
202 | printk("+/-%ldppm\n", itc_drift); | ||
203 | } else { | ||
204 | itc_drift = -1; | ||
205 | printk("\n"); | ||
206 | } | ||
207 | |||
208 | local_cpu_data->proc_freq = (platform_base_freq*proc_ratio.num)/proc_ratio.den; | ||
209 | local_cpu_data->itc_freq = itc_freq; | ||
210 | local_cpu_data->cyc_per_usec = (itc_freq + USEC_PER_SEC/2) / USEC_PER_SEC; | ||
211 | local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<<IA64_NSEC_PER_CYC_SHIFT) | ||
212 | + itc_freq/2)/itc_freq; | ||
213 | |||
214 | if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { | ||
215 | itc_interpolator.frequency = local_cpu_data->itc_freq; | ||
216 | itc_interpolator.drift = itc_drift; | ||
217 | #ifdef CONFIG_SMP | ||
218 | /* On IA64 in an SMP configuration ITCs are never accurately synchronized. | ||
219 | * Jitter compensation requires a cmpxchg which may limit | ||
220 | * the scalability of the syscalls for retrieving time. | ||
221 | * The ITC synchronization is usually successful to within a few | ||
222 | * ITC ticks but this is not a sure thing. If you need to improve | ||
223 | * timer performance in SMP situations then boot the kernel with the | ||
224 | * "nojitter" option. However, doing so may result in time fluctuating (maybe | ||
225 | * even going backward) if the ITC offsets between the individual CPUs | ||
226 | * are too large. | ||
227 | */ | ||
228 | if (!nojitter) itc_interpolator.jitter = 1; | ||
229 | #endif | ||
230 | register_time_interpolator(&itc_interpolator); | ||
231 | } | ||
232 | |||
233 | /* Setup the CPU local timer tick */ | ||
234 | ia64_cpu_local_tick(); | ||
235 | } | ||
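nsec_per_cyc above is a round-to-nearest fixed-point reciprocal of the ITC frequency, with IA64_NSEC_PER_CYC_SHIFT fractional bits; converting cycles to nanoseconds then costs one multiply and one shift. A worked check (the shift value 30 and the 400 MHz frequency are assumptions for illustration):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define SHIFT 30    /* stands in for IA64_NSEC_PER_CYC_SHIFT */

int main(void)
{
    uint64_t itc_freq = 400000000;   /* hypothetical 400 MHz ITC */

    /* same rounding as above: ((ns_per_sec << SHIFT) + f/2) / f */
    uint64_t nsec_per_cyc = ((NSEC_PER_SEC << SHIFT) + itc_freq / 2) / itc_freq;

    uint64_t ns = (1000 * nsec_per_cyc) >> SHIFT;   /* 1000 cycles -> ns */
    printf("1000 cycles ~= %llu ns\n", (unsigned long long)ns);  /* 2500 */
    return 0;
}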
236 | |||
237 | static struct irqaction timer_irqaction = { | ||
238 | .handler = timer_interrupt, | ||
239 | .flags = SA_INTERRUPT, | ||
240 | .name = "timer" | ||
241 | }; | ||
242 | |||
243 | void __init | ||
244 | time_init (void) | ||
245 | { | ||
246 | register_percpu_irq(IA64_TIMER_VECTOR, &timer_irqaction); | ||
247 | efi_gettimeofday(&xtime); | ||
248 | ia64_init_itm(); | ||
249 | |||
250 | /* | ||
251 | * Initialize wall_to_monotonic such that adding it to xtime will yield zero, the | ||
252 | * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC). | ||
253 | */ | ||
254 | set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); | ||
255 | } | ||
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c new file mode 100644 index 000000000000..f1aafd4c05f9 --- /dev/null +++ b/arch/ia64/kernel/topology.c | |||
@@ -0,0 +1,92 @@ | |||
1 | /* | ||
2 | * This file is subject to the terms and conditions of the GNU General Public | ||
3 | * License. See the file "COPYING" in the main directory of this archive | ||
4 | * for more details. | ||
5 | * | ||
6 | * This file contains NUMA specific variables and functions which can | ||
7 | * be split away from DISCONTIGMEM and are used on NUMA machines with | ||
8 | * contiguous memory. | ||
9 | * 2002/08/07 Erich Focht <efocht@ess.nec.de> | ||
10 | * Populate cpu entries in sysfs for non-numa systems as well | ||
11 | * Intel Corporation - Ashok Raj | ||
12 | */ | ||
13 | |||
14 | #include <linux/config.h> | ||
15 | #include <linux/cpu.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/node.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/bootmem.h> | ||
21 | #include <linux/nodemask.h> | ||
22 | #include <asm/mmzone.h> | ||
23 | #include <asm/numa.h> | ||
24 | #include <asm/cpu.h> | ||
25 | |||
26 | #ifdef CONFIG_NUMA | ||
27 | static struct node *sysfs_nodes; | ||
28 | #endif | ||
29 | static struct ia64_cpu *sysfs_cpus; | ||
30 | |||
31 | int arch_register_cpu(int num) | ||
32 | { | ||
33 | struct node *parent = NULL; | ||
34 | |||
35 | #ifdef CONFIG_NUMA | ||
36 | parent = &sysfs_nodes[cpu_to_node(num)]; | ||
37 | #endif /* CONFIG_NUMA */ | ||
38 | |||
39 | return register_cpu(&sysfs_cpus[num].cpu, num, parent); | ||
40 | } | ||
41 | |||
42 | #ifdef CONFIG_HOTPLUG_CPU | ||
43 | |||
44 | void arch_unregister_cpu(int num) | ||
45 | { | ||
46 | struct node *parent = NULL; | ||
47 | |||
48 | #ifdef CONFIG_NUMA | ||
49 | int node = cpu_to_node(num); | ||
50 | parent = &sysfs_nodes[node]; | ||
51 | #endif /* CONFIG_NUMA */ | ||
52 | |||
53 | return unregister_cpu(&sysfs_cpus[num].cpu, parent); | ||
54 | } | ||
55 | EXPORT_SYMBOL(arch_register_cpu); | ||
56 | EXPORT_SYMBOL(arch_unregister_cpu); | ||
57 | #endif /*CONFIG_HOTPLUG_CPU*/ | ||
58 | |||
59 | |||
60 | static int __init topology_init(void) | ||
61 | { | ||
62 | int i, err = 0; | ||
63 | |||
64 | #ifdef CONFIG_NUMA | ||
65 | sysfs_nodes = kmalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL); | ||
66 | if (!sysfs_nodes) { | ||
67 | err = -ENOMEM; | ||
68 | goto out; | ||
69 | } | ||
70 | memset(sysfs_nodes, 0, sizeof(struct node) * MAX_NUMNODES); | ||
71 | |||
72 | /* MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? */ | ||
73 | for_each_online_node(i) | ||
74 | if ((err = register_node(&sysfs_nodes[i], i, 0))) | ||
75 | goto out; | ||
76 | #endif | ||
77 | |||
78 | sysfs_cpus = kmalloc(sizeof(struct ia64_cpu) * NR_CPUS, GFP_KERNEL); | ||
79 | if (!sysfs_cpus) { | ||
80 | err = -ENOMEM; | ||
81 | goto out; | ||
82 | } | ||
83 | memset(sysfs_cpus, 0, sizeof(struct ia64_cpu) * NR_CPUS); | ||
84 | |||
85 | for_each_present_cpu(i) | ||
86 | if ((err = arch_register_cpu(i))) | ||
87 | goto out; | ||
88 | out: | ||
89 | return err; | ||
90 | } | ||
91 | |||
92 | __initcall(topology_init); | ||
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c new file mode 100644 index 000000000000..e82ad78081b3 --- /dev/null +++ b/arch/ia64/kernel/traps.c | |||
@@ -0,0 +1,609 @@ | |||
1 | /* | ||
2 | * Architecture-specific trap handling. | ||
3 | * | ||
4 | * Copyright (C) 1998-2003 Hewlett-Packard Co | ||
5 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
6 | * | ||
7 | * 05/12/00 grao <goutham.rao@intel.com> : added isr in siginfo for SIGFPE | ||
8 | */ | ||
9 | |||
10 | #include <linux/config.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/tty.h> | ||
15 | #include <linux/vt_kern.h> /* For unblank_screen() */ | ||
16 | #include <linux/module.h> /* for EXPORT_SYMBOL */ | ||
17 | #include <linux/hardirq.h> | ||
18 | |||
19 | #include <asm/fpswa.h> | ||
20 | #include <asm/ia32.h> | ||
21 | #include <asm/intrinsics.h> | ||
22 | #include <asm/processor.h> | ||
23 | #include <asm/uaccess.h> | ||
24 | |||
25 | extern spinlock_t timerlist_lock; | ||
26 | |||
27 | fpswa_interface_t *fpswa_interface; | ||
28 | EXPORT_SYMBOL(fpswa_interface); | ||
29 | |||
30 | void __init | ||
31 | trap_init (void) | ||
32 | { | ||
33 | if (ia64_boot_param->fpswa) | ||
34 | /* FPSWA fixup: make the interface pointer a kernel virtual address: */ | ||
35 | fpswa_interface = __va(ia64_boot_param->fpswa); | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * Unlock any spinlocks which will prevent us from getting the message out (timerlist_lock | ||
40 | * is acquired through the console unblank code) | ||
41 | */ | ||
42 | void | ||
43 | bust_spinlocks (int yes) | ||
44 | { | ||
45 | int loglevel_save = console_loglevel; | ||
46 | |||
47 | if (yes) { | ||
48 | oops_in_progress = 1; | ||
49 | return; | ||
50 | } | ||
51 | |||
52 | #ifdef CONFIG_VT | ||
53 | unblank_screen(); | ||
54 | #endif | ||
55 | oops_in_progress = 0; | ||
56 | /* | ||
57 | * OK, the message is on the console. Now we call printk() without | ||
58 | * oops_in_progress set so that printk will give klogd a poke. Hold onto | ||
59 | * your hats... | ||
60 | */ | ||
61 | console_loglevel = 15; /* NMI oopser may have shut the console up */ | ||
62 | printk(" "); | ||
63 | console_loglevel = loglevel_save; | ||
64 | } | ||
65 | |||
66 | void | ||
67 | die (const char *str, struct pt_regs *regs, long err) | ||
68 | { | ||
69 | static struct { | ||
70 | spinlock_t lock; | ||
71 | u32 lock_owner; | ||
72 | int lock_owner_depth; | ||
73 | } die = { | ||
74 | .lock = SPIN_LOCK_UNLOCKED, | ||
75 | .lock_owner = -1, | ||
76 | .lock_owner_depth = 0 | ||
77 | }; | ||
78 | static int die_counter; | ||
79 | |||
80 | if (die.lock_owner != smp_processor_id()) { | ||
81 | console_verbose(); | ||
82 | spin_lock_irq(&die.lock); | ||
83 | die.lock_owner = smp_processor_id(); | ||
84 | die.lock_owner_depth = 0; | ||
85 | bust_spinlocks(1); | ||
86 | } | ||
87 | |||
88 | if (++die.lock_owner_depth < 3) { | ||
89 | printk("%s[%d]: %s %ld [%d]\n", | ||
90 | current->comm, current->pid, str, err, ++die_counter); | ||
91 | show_regs(regs); | ||
92 | } else | ||
93 | printk(KERN_ERR "Recursive die() failure, output suppressed\n"); | ||
94 | |||
95 | bust_spinlocks(0); | ||
96 | die.lock_owner = -1; | ||
97 | spin_unlock_irq(&die.lock); | ||
98 | do_exit(SIGSEGV); | ||
99 | } | ||
100 | |||
101 | void | ||
102 | die_if_kernel (char *str, struct pt_regs *regs, long err) | ||
103 | { | ||
104 | if (!user_mode(regs)) | ||
105 | die(str, regs, err); | ||
106 | } | ||
107 | |||
108 | void | ||
109 | ia64_bad_break (unsigned long break_num, struct pt_regs *regs) | ||
110 | { | ||
111 | siginfo_t siginfo; | ||
112 | int sig, code; | ||
113 | |||
114 | /* SIGILL, SIGFPE, SIGSEGV, and SIGBUS want these field initialized: */ | ||
115 | siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); | ||
116 | siginfo.si_imm = break_num; | ||
117 | siginfo.si_flags = 0; /* clear __ISR_VALID */ | ||
118 | siginfo.si_isr = 0; | ||
119 | |||
120 | switch (break_num) { | ||
121 | case 0: /* unknown error (used by GCC for __builtin_abort()) */ | ||
122 | die_if_kernel("bugcheck!", regs, break_num); | ||
123 | sig = SIGILL; code = ILL_ILLOPC; | ||
124 | break; | ||
125 | |||
126 | case 1: /* integer divide by zero */ | ||
127 | sig = SIGFPE; code = FPE_INTDIV; | ||
128 | break; | ||
129 | |||
130 | case 2: /* integer overflow */ | ||
131 | sig = SIGFPE; code = FPE_INTOVF; | ||
132 | break; | ||
133 | |||
134 | case 3: /* range check/bounds check */ | ||
135 | sig = SIGFPE; code = FPE_FLTSUB; | ||
136 | break; | ||
137 | |||
138 | case 4: /* null pointer dereference */ | ||
139 | sig = SIGSEGV; code = SEGV_MAPERR; | ||
140 | break; | ||
141 | |||
142 | case 5: /* misaligned data */ | ||
143 | sig = SIGSEGV; code = BUS_ADRALN; | ||
144 | break; | ||
145 | |||
146 | case 6: /* decimal overflow */ | ||
147 | sig = SIGFPE; code = __FPE_DECOVF; | ||
148 | break; | ||
149 | |||
150 | case 7: /* decimal divide by zero */ | ||
151 | sig = SIGFPE; code = __FPE_DECDIV; | ||
152 | break; | ||
153 | |||
154 | case 8: /* packed decimal error */ | ||
155 | sig = SIGFPE; code = __FPE_DECERR; | ||
156 | break; | ||
157 | |||
158 | case 9: /* invalid ASCII digit */ | ||
159 | sig = SIGFPE; code = __FPE_INVASC; | ||
160 | break; | ||
161 | |||
162 | case 10: /* invalid decimal digit */ | ||
163 | sig = SIGFPE; code = __FPE_INVDEC; | ||
164 | break; | ||
165 | |||
166 | case 11: /* paragraph stack overflow */ | ||
167 | sig = SIGSEGV; code = __SEGV_PSTKOVF; | ||
168 | break; | ||
169 | |||
170 | case 0x3f000 ... 0x3ffff: /* bundle-update in progress */ | ||
171 | sig = SIGILL; code = __ILL_BNDMOD; | ||
172 | break; | ||
173 | |||
174 | default: | ||
175 | if (break_num < 0x40000 || break_num > 0x100000) | ||
176 | die_if_kernel("Bad break", regs, break_num); | ||
177 | |||
178 | if (break_num < 0x80000) { | ||
179 | sig = SIGILL; code = __ILL_BREAK; | ||
180 | } else { | ||
181 | sig = SIGTRAP; code = TRAP_BRKPT; | ||
182 | } | ||
183 | } | ||
184 | siginfo.si_signo = sig; | ||
185 | siginfo.si_errno = 0; | ||
186 | siginfo.si_code = code; | ||
187 | force_sig_info(sig, &siginfo, current); | ||
188 | } | ||
189 | |||
190 | /* | ||
191 | * disabled_fph_fault() is called when a user-level process attempts to access f32..f127 | ||
192 | * and it doesn't own the fp-high register partition. When this happens, we save the | ||
193 | * current fph partition in the task_struct of the fpu-owner (if necessary) and then load | ||
194 | * the fp-high partition of the current task (if necessary). Note that the kernel has | ||
195 | * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes | ||
196 | * care of clearing psr.dfh. | ||
197 | */ | ||
198 | static inline void | ||
199 | disabled_fph_fault (struct pt_regs *regs) | ||
200 | { | ||
201 | struct ia64_psr *psr = ia64_psr(regs); | ||
202 | |||
203 | /* first, grant user-level access to fph partition: */ | ||
204 | psr->dfh = 0; | ||
205 | #ifndef CONFIG_SMP | ||
206 | { | ||
207 | struct task_struct *fpu_owner | ||
208 | = (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER); | ||
209 | |||
210 | if (ia64_is_local_fpu_owner(current)) | ||
211 | return; | ||
212 | |||
213 | if (fpu_owner) | ||
214 | ia64_flush_fph(fpu_owner); | ||
215 | } | ||
216 | #endif /* !CONFIG_SMP */ | ||
217 | ia64_set_local_fpu_owner(current); | ||
218 | if ((current->thread.flags & IA64_THREAD_FPH_VALID) != 0) { | ||
219 | __ia64_load_fpu(current->thread.fph); | ||
220 | psr->mfh = 0; | ||
221 | } else { | ||
222 | __ia64_init_fpu(); | ||
223 | /* | ||
224 | * Set mfh because the state in thread.fph does not match the state in | ||
225 | * the fph partition. | ||
226 | */ | ||
227 | psr->mfh = 1; | ||
228 | } | ||
229 | } | ||
230 | |||
231 | static inline int | ||
232 | fp_emulate (int fp_fault, void *bundle, long *ipsr, long *fpsr, long *isr, long *pr, long *ifs, | ||
233 | struct pt_regs *regs) | ||
234 | { | ||
235 | fp_state_t fp_state; | ||
236 | fpswa_ret_t ret; | ||
237 | |||
238 | if (!fpswa_interface) | ||
239 | return -1; | ||
240 | |||
241 | memset(&fp_state, 0, sizeof(fp_state_t)); | ||
242 | |||
243 | /* | ||
244 | * compute fp_state. only FP registers f6 - f11 are used by the | ||
245 | * kernel, so set those bits in the mask and set the low volatile | ||
246 | * pointer to point to these registers. | ||
247 | */ | ||
248 | fp_state.bitmask_low64 = 0xfc0; /* bit6..bit11 */ | ||
249 | |||
250 | fp_state.fp_state_low_volatile = (fp_state_low_volatile_t *) ®s->f6; | ||
251 | /* | ||
252 | * unsigned long (*EFI_FPSWA) ( | ||
253 | * unsigned long trap_type, | ||
254 | * void *Bundle, | ||
255 | * unsigned long *pipsr, | ||
256 | * unsigned long *pfsr, | ||
257 | * unsigned long *pisr, | ||
258 | * unsigned long *ppreds, | ||
259 | * unsigned long *pifs, | ||
260 | * void *fp_state); | ||
261 | */ | ||
262 | ret = (*fpswa_interface->fpswa)((unsigned long) fp_fault, bundle, | ||
263 | (unsigned long *) ipsr, (unsigned long *) fpsr, | ||
264 | (unsigned long *) isr, (unsigned long *) pr, | ||
265 | (unsigned long *) ifs, &fp_state); | ||
266 | |||
267 | return ret.status; | ||
268 | } | ||
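The 0xfc0 literal above is just one bit per register for f6..f11, i.e. six bits starting at bit 6. Trivial to verify:

#include <assert.h>

int main(void)
{
    unsigned long mask = 0;

    for (int reg = 6; reg <= 11; reg++)   /* the FRs the kernel clobbers */
        mask |= 1UL << reg;

    assert(mask == 0xfc0);                /* matches bitmask_low64 above */
    return 0;
}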
269 | |||
270 | /* | ||
271 | * Handle floating-point assist faults and traps. | ||
272 | */ | ||
273 | static int | ||
274 | handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) | ||
275 | { | ||
276 | long exception, bundle[2]; | ||
277 | unsigned long fault_ip; | ||
278 | struct siginfo siginfo; | ||
279 | static int fpu_swa_count = 0; | ||
280 | static unsigned long last_time; | ||
281 | |||
282 | fault_ip = regs->cr_iip; | ||
283 | if (!fp_fault && (ia64_psr(regs)->ri == 0)) | ||
284 | fault_ip -= 16; | ||
285 | if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle))) | ||
286 | return -1; | ||
287 | |||
288 | if (jiffies - last_time > 5*HZ) | ||
289 | fpu_swa_count = 0; | ||
290 | if ((fpu_swa_count < 4) && !(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) { | ||
291 | last_time = jiffies; | ||
292 | ++fpu_swa_count; | ||
293 | printk(KERN_WARNING | ||
294 | "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", | ||
295 | current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr); | ||
296 | } | ||
297 | |||
298 | exception = fp_emulate(fp_fault, bundle, ®s->cr_ipsr, ®s->ar_fpsr, &isr, ®s->pr, | ||
299 | ®s->cr_ifs, regs); | ||
300 | if (fp_fault) { | ||
301 | if (exception == 0) { | ||
302 | /* emulation was successful */ | ||
303 | ia64_increment_ip(regs); | ||
304 | } else if (exception == -1) { | ||
305 | printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n"); | ||
306 | return -1; | ||
307 | } else { | ||
308 | /* is next instruction a trap? */ | ||
309 | if (exception & 2) { | ||
310 | ia64_increment_ip(regs); | ||
311 | } | ||
312 | siginfo.si_signo = SIGFPE; | ||
313 | siginfo.si_errno = 0; | ||
314 | siginfo.si_code = __SI_FAULT; /* default code */ | ||
315 | siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); | ||
316 | if (isr & 0x11) { | ||
317 | siginfo.si_code = FPE_FLTINV; | ||
318 | } else if (isr & 0x22) { | ||
319 | /* denormal operand gets the same si_code as underflow | ||
320 | * see arch/i386/kernel/traps.c:math_error() */ | ||
321 | siginfo.si_code = FPE_FLTUND; | ||
322 | } else if (isr & 0x44) { | ||
323 | siginfo.si_code = FPE_FLTDIV; | ||
324 | } | ||
325 | siginfo.si_isr = isr; | ||
326 | siginfo.si_flags = __ISR_VALID; | ||
327 | siginfo.si_imm = 0; | ||
328 | force_sig_info(SIGFPE, &siginfo, current); | ||
329 | } | ||
330 | } else { | ||
331 | if (exception == -1) { | ||
332 | printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n"); | ||
333 | return -1; | ||
334 | } else if (exception != 0) { | ||
335 | /* raise exception */ | ||
336 | siginfo.si_signo = SIGFPE; | ||
337 | siginfo.si_errno = 0; | ||
338 | siginfo.si_code = __SI_FAULT; /* default code */ | ||
339 | siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); | ||
340 | if (isr & 0x880) { | ||
341 | siginfo.si_code = FPE_FLTOVF; | ||
342 | } else if (isr & 0x1100) { | ||
343 | siginfo.si_code = FPE_FLTUND; | ||
344 | } else if (isr & 0x2200) { | ||
345 | siginfo.si_code = FPE_FLTRES; | ||
346 | } | ||
347 | siginfo.si_isr = isr; | ||
348 | siginfo.si_flags = __ISR_VALID; | ||
349 | siginfo.si_imm = 0; | ||
350 | force_sig_info(SIGFPE, &siginfo, current); | ||
351 | } | ||
352 | } | ||
353 | return 0; | ||
354 | } | ||
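The fpu_swa_count/last_time pair above is a small rate limiter: at most four warnings, and the budget reopens once five seconds (5*HZ jiffies) pass since the last print. The same shape in user space, with time() standing in for jiffies (names are illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static bool should_print(void)
{
    static int count;
    static time_t last;
    time_t now = time(NULL);

    if (now - last > 5)
        count = 0;          /* quiet long enough: reopen the budget */
    if (count < 4) {
        last = now;         /* refreshed per print, so the window slides */
        count++;
        return true;
    }
    return false;
}

int main(void)
{
    for (int i = 0; i < 6; i++)     /* prints 4 warnings, throttles 2 */
        if (should_print())
            printf("warning %d\n", i);
    return 0;
}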
355 | |||
356 | struct illegal_op_return { | ||
357 | unsigned long fkt, arg1, arg2, arg3; | ||
358 | }; | ||
359 | |||
360 | struct illegal_op_return | ||
361 | ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3, | ||
362 | long arg4, long arg5, long arg6, long arg7, | ||
363 | struct pt_regs regs) | ||
364 | { | ||
365 | struct illegal_op_return rv; | ||
366 | struct siginfo si; | ||
367 | char buf[128]; | ||
368 | |||
369 | #ifdef CONFIG_IA64_BRL_EMU | ||
370 | { | ||
371 | extern struct illegal_op_return ia64_emulate_brl (struct pt_regs *, unsigned long); | ||
372 | |||
373 | rv = ia64_emulate_brl(®s, ec); | ||
374 | if (rv.fkt != (unsigned long) -1) | ||
375 | return rv; | ||
376 | } | ||
377 | #endif | ||
378 | |||
379 | sprintf(buf, "IA-64 Illegal operation fault"); | ||
380 | die_if_kernel(buf, ®s, 0); | ||
381 | |||
382 | memset(&si, 0, sizeof(si)); | ||
383 | si.si_signo = SIGILL; | ||
384 | si.si_code = ILL_ILLOPC; | ||
385 | si.si_addr = (void __user *) (regs.cr_iip + ia64_psr(®s)->ri); | ||
386 | force_sig_info(SIGILL, &si, current); | ||
387 | rv.fkt = 0; | ||
388 | return rv; | ||
389 | } | ||
390 | |||
391 | void | ||
392 | ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, | ||
393 | unsigned long iim, unsigned long itir, long arg5, long arg6, | ||
394 | long arg7, struct pt_regs regs) | ||
395 | { | ||
396 | unsigned long code, error = isr, iip; | ||
397 | struct siginfo siginfo; | ||
398 | char buf[128]; | ||
399 | int result, sig; | ||
400 | static const char *reason[] = { | ||
401 | "IA-64 Illegal Operation fault", | ||
402 | "IA-64 Privileged Operation fault", | ||
403 | "IA-64 Privileged Register fault", | ||
404 | "IA-64 Reserved Register/Field fault", | ||
405 | "Disabled Instruction Set Transition fault", | ||
406 | "Unknown fault 5", "Unknown fault 6", "Unknown fault 7", "Illegal Hazard fault", | ||
407 | "Unknown fault 9", "Unknown fault 10", "Unknown fault 11", "Unknown fault 12", | ||
408 | "Unknown fault 13", "Unknown fault 14", "Unknown fault 15" | ||
409 | }; | ||
410 | |||
411 | if ((isr & IA64_ISR_NA) && ((isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) { | ||
412 | /* | ||
413 | * This fault was due to lfetch.fault, set "ed" bit in the psr to cancel | ||
414 | * the lfetch. | ||
415 | */ | ||
416 | ia64_psr(®s)->ed = 1; | ||
417 | return; | ||
418 | } | ||
419 | |||
420 | iip = regs.cr_iip + ia64_psr(®s)->ri; | ||
421 | |||
422 | switch (vector) { | ||
423 | case 24: /* General Exception */ | ||
424 | code = (isr >> 4) & 0xf; | ||
425 | sprintf(buf, "General Exception: %s%s", reason[code], | ||
426 | (code == 3) ? ((isr & (1UL << 37)) | ||
427 | ? " (RSE access)" : " (data access)") : ""); | ||
428 | if (code == 8) { | ||
429 | # ifdef CONFIG_IA64_PRINT_HAZARDS | ||
430 | printk("%s[%d]: possible hazard @ ip=%016lx (pr = %016lx)\n", | ||
431 | current->comm, current->pid, | ||
432 | regs.cr_iip + ia64_psr(®s)->ri, regs.pr); | ||
433 | # endif | ||
434 | return; | ||
435 | } | ||
436 | break; | ||
437 | |||
438 | case 25: /* Disabled FP-Register */ | ||
439 | if (isr & 2) { | ||
440 | disabled_fph_fault(®s); | ||
441 | return; | ||
442 | } | ||
443 | sprintf(buf, "Disabled FPL fault---not supposed to happen!"); | ||
444 | break; | ||
445 | |||
446 | case 26: /* NaT Consumption */ | ||
447 | if (user_mode(®s)) { | ||
448 | void __user *addr; | ||
449 | |||
450 | if (((isr >> 4) & 0xf) == 2) { | ||
451 | /* NaT page consumption */ | ||
452 | sig = SIGSEGV; | ||
453 | code = SEGV_ACCERR; | ||
454 | addr = (void __user *) ifa; | ||
455 | } else { | ||
456 | /* register NaT consumption */ | ||
457 | sig = SIGILL; | ||
458 | code = ILL_ILLOPN; | ||
459 | addr = (void __user *) (regs.cr_iip | ||
460 | + ia64_psr(®s)->ri); | ||
461 | } | ||
462 | siginfo.si_signo = sig; | ||
463 | siginfo.si_code = code; | ||
464 | siginfo.si_errno = 0; | ||
465 | siginfo.si_addr = addr; | ||
466 | siginfo.si_imm = vector; | ||
467 | siginfo.si_flags = __ISR_VALID; | ||
468 | siginfo.si_isr = isr; | ||
469 | force_sig_info(sig, &siginfo, current); | ||
470 | return; | ||
471 | } else if (ia64_done_with_exception(®s)) | ||
472 | return; | ||
473 | sprintf(buf, "NaT consumption"); | ||
474 | break; | ||
475 | |||
476 | case 31: /* Unsupported Data Reference */ | ||
477 | if (user_mode(®s)) { | ||
478 | siginfo.si_signo = SIGILL; | ||
479 | siginfo.si_code = ILL_ILLOPN; | ||
480 | siginfo.si_errno = 0; | ||
481 | siginfo.si_addr = (void __user *) iip; | ||
482 | siginfo.si_imm = vector; | ||
483 | siginfo.si_flags = __ISR_VALID; | ||
484 | siginfo.si_isr = isr; | ||
485 | force_sig_info(SIGILL, &siginfo, current); | ||
486 | return; | ||
487 | } | ||
488 | sprintf(buf, "Unsupported data reference"); | ||
489 | break; | ||
490 | |||
491 | case 29: /* Debug */ | ||
492 | case 35: /* Taken Branch Trap */ | ||
493 | case 36: /* Single Step Trap */ | ||
494 | if (fsys_mode(current, ®s)) { | ||
495 | extern char __kernel_syscall_via_break[]; | ||
496 | /* | ||
497 | * Got a trap in fsys-mode: Taken Branch Trap and Single Step trap | ||
498 | * need special handling; Debug trap is not supposed to happen. | ||
499 | */ | ||
500 | if (unlikely(vector == 29)) { | ||
501 | die("Got debug trap in fsys-mode---not supposed to happen!", | ||
502 | ®s, 0); | ||
503 | return; | ||
504 | } | ||
505 | /* re-do the system call via break 0x100000: */ | ||
506 | regs.cr_iip = (unsigned long) __kernel_syscall_via_break; | ||
507 | ia64_psr(®s)->ri = 0; | ||
508 | ia64_psr(®s)->cpl = 3; | ||
509 | return; | ||
510 | } | ||
511 | switch (vector) { | ||
512 | case 29: | ||
513 | siginfo.si_code = TRAP_HWBKPT; | ||
514 | #ifdef CONFIG_ITANIUM | ||
515 | /* | ||
516 | * Erratum 10 (IFA may contain incorrect address) now has | ||
517 | * "NoFix" status. There are no plans for fixing this. | ||
518 | */ | ||
519 | if (ia64_psr(®s)->is == 0) | ||
520 | ifa = regs.cr_iip; | ||
521 | #endif | ||
522 | break; | ||
523 | case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break; | ||
524 | case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break; | ||
525 | } | ||
526 | siginfo.si_signo = SIGTRAP; | ||
527 | siginfo.si_errno = 0; | ||
528 | siginfo.si_addr = (void __user *) ifa; | ||
529 | siginfo.si_imm = 0; | ||
530 | siginfo.si_flags = __ISR_VALID; | ||
531 | siginfo.si_isr = isr; | ||
532 | force_sig_info(SIGTRAP, &siginfo, current); | ||
533 | return; | ||
534 | |||
535 | case 32: /* fp fault */ | ||
536 | case 33: /* fp trap */ | ||
537 | result = handle_fpu_swa((vector == 32) ? 1 : 0, ®s, isr); | ||
538 | if ((result < 0) || (current->thread.flags & IA64_THREAD_FPEMU_SIGFPE)) { | ||
539 | siginfo.si_signo = SIGFPE; | ||
540 | siginfo.si_errno = 0; | ||
541 | siginfo.si_code = FPE_FLTINV; | ||
542 | siginfo.si_addr = (void __user *) iip; | ||
543 | siginfo.si_flags = __ISR_VALID; | ||
544 | siginfo.si_isr = isr; | ||
545 | siginfo.si_imm = 0; | ||
546 | force_sig_info(SIGFPE, &siginfo, current); | ||
547 | } | ||
548 | return; | ||
549 | |||
550 | case 34: | ||
551 | if (isr & 0x2) { | ||
552 | /* Lower-Privilege Transfer Trap */ | ||
553 | /* | ||
554 | * Just clear PSR.lp and then return immediately: all the | ||
555 | * interesting work (e.g., signal delivery) is done in the kernel | ||
556 | * exit path. | ||
557 | */ | ||
558 | ia64_psr(®s)->lp = 0; | ||
559 | return; | ||
560 | } else { | ||
561 | /* Unimplemented Instr. Address Trap */ | ||
562 | if (user_mode(®s)) { | ||
563 | siginfo.si_signo = SIGILL; | ||
564 | siginfo.si_code = ILL_BADIADDR; | ||
565 | siginfo.si_errno = 0; | ||
566 | siginfo.si_flags = 0; | ||
567 | siginfo.si_isr = 0; | ||
568 | siginfo.si_imm = 0; | ||
569 | siginfo.si_addr = (void __user *) iip; | ||
570 | force_sig_info(SIGILL, &siginfo, current); | ||
571 | return; | ||
572 | } | ||
573 | sprintf(buf, "Unimplemented Instruction Address fault"); | ||
574 | } | ||
575 | break; | ||
576 | |||
577 | case 45: | ||
578 | #ifdef CONFIG_IA32_SUPPORT | ||
579 | if (ia32_exception(®s, isr) == 0) | ||
580 | return; | ||
581 | #endif | ||
582 | printk(KERN_ERR "Unexpected IA-32 exception (Trap 45)\n"); | ||
583 | printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n", | ||
584 | iip, ifa, isr); | ||
585 | force_sig(SIGSEGV, current); | ||
586 | break; | ||
587 | |||
588 | case 46: | ||
589 | #ifdef CONFIG_IA32_SUPPORT | ||
590 | if (ia32_intercept(®s, isr) == 0) | ||
591 | return; | ||
592 | #endif | ||
593 | printk(KERN_ERR "Unexpected IA-32 intercept trap (Trap 46)\n"); | ||
594 | printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n", | ||
595 | iip, ifa, isr, iim); | ||
596 | force_sig(SIGSEGV, current); | ||
597 | return; | ||
598 | |||
599 | case 47: | ||
600 | sprintf(buf, "IA-32 Interruption Fault (int 0x%lx)", isr >> 16); | ||
601 | break; | ||
602 | |||
603 | default: | ||
604 | sprintf(buf, "Fault %lu", vector); | ||
605 | break; | ||
606 | } | ||
607 | die_if_kernel(buf, ®s, error); | ||
608 | force_sig(SIGILL, current); | ||
609 | } | ||
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c new file mode 100644 index 000000000000..43b45b65ee5a --- /dev/null +++ b/arch/ia64/kernel/unaligned.c | |||
@@ -0,0 +1,1521 @@ | |||
1 | /* | ||
2 | * Architecture-specific unaligned trap handling. | ||
3 | * | ||
4 | * Copyright (C) 1999-2002, 2004 Hewlett-Packard Co | ||
5 | * Stephane Eranian <eranian@hpl.hp.com> | ||
6 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
7 | * | ||
8 | * 2002/12/09 Fix rotating register handling (off-by-1 error, missing fr-rotation). Fix | ||
9 | * get_rse_reg() to not leak kernel bits to user-level (reading an out-of-frame | ||
10 | * stacked register returns an undefined value; it does NOT trigger a | ||
11 | * "rsvd register fault"). | ||
12 | * 2001/10/11 Fix unaligned access to rotating registers in s/w pipelined loops. | ||
13 | * 2001/08/13 Correct size of extended floats (float_fsz) from 16 to 10 bytes. | ||
14 | * 2001/01/17 Add support for emulation of unaligned kernel accesses. | ||
15 | */ | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/sched.h> | ||
18 | #include <linux/smp_lock.h> | ||
19 | #include <linux/tty.h> | ||
20 | |||
21 | #include <asm/intrinsics.h> | ||
22 | #include <asm/processor.h> | ||
23 | #include <asm/rse.h> | ||
24 | #include <asm/uaccess.h> | ||
25 | #include <asm/unaligned.h> | ||
26 | |||
27 | extern void die_if_kernel(char *str, struct pt_regs *regs, long err) __attribute__ ((noreturn)); | ||
28 | |||
29 | #undef DEBUG_UNALIGNED_TRAP | ||
30 | |||
31 | #ifdef DEBUG_UNALIGNED_TRAP | ||
32 | # define DPRINT(a...) do { printk("%s %u: ", __FUNCTION__, __LINE__); printk (a); } while (0) | ||
33 | # define DDUMP(str,vp,len) dump(str, vp, len) | ||
34 | |||
35 | static void | ||
36 | dump (const char *str, void *vp, size_t len) | ||
37 | { | ||
38 | unsigned char *cp = vp; | ||
39 | int i; | ||
40 | |||
41 | printk("%s", str); | ||
42 | for (i = 0; i < len; ++i) | ||
43 | printk (" %02x", *cp++); | ||
44 | printk("\n"); | ||
45 | } | ||
46 | #else | ||
47 | # define DPRINT(a...) | ||
48 | # define DDUMP(str,vp,len) | ||
49 | #endif | ||
50 | |||
51 | #define IA64_FIRST_STACKED_GR 32 | ||
52 | #define IA64_FIRST_ROTATING_FR 32 | ||
53 | #define SIGN_EXT9 0xffffffffffffff00ul | ||
54 | |||
55 | /* | ||
56 | * For M-unit: | ||
57 | * | ||
58 | * opcode | m | x6 | | ||
59 | * --------|------|---------| | ||
60 | * [40-37] | [36] | [35:30] | | ||
61 | * --------|------|---------| | ||
62 | * 4 | 1 | 6 | = 11 bits | ||
63 | * -------------------------- | ||
64 | * However bits [31:30] are not directly useful to distinguish between | ||
65 | * load/store so we can use [35:32] instead, which gives the following | ||
66 | * mask ([40:32]) using 9 bits. The 'e' comes from the fact that we defer | ||
67 | * checking the m-bit until later in the load/store emulation. | ||
68 | */ | ||
69 | #define IA64_OPCODE_MASK 0x1ef | ||
70 | #define IA64_OPCODE_SHIFT 32 | ||
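The dispatch key used throughout this file is bits [40:32] of the instruction, shifted down and masked with 0x1ef: the shift discards bits [31:30] of x6, and the hole at bit 4 of the mask zeroes the m-bit, which is checked separately later. A tiny decode check (the instruction word is fabricated):

#include <assert.h>

int main(void)
{
    /* opcode=4 in bits [40:37], m=0, x6=0: should decode as LD_OP */
    unsigned long long insn = 0x4ULL << 37;

    unsigned long long key = (insn >> 32) & 0x1ef;
    assert(key == 0x080);       /* LD_OP, the plain ld dispatch value */
    return 0;
}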
71 | |||
72 | /* | ||
73 | * Table C-28 Integer Load/Store | ||
74 | * | ||
75 | * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF | ||
76 | * | ||
77 | * ld8.fill, st8.fill MUST be aligned because the RNATs are based on | ||
78 | * the address (bits [8:3]), so we must fail. | ||
79 | */ | ||
80 | #define LD_OP 0x080 | ||
81 | #define LDS_OP 0x081 | ||
82 | #define LDA_OP 0x082 | ||
83 | #define LDSA_OP 0x083 | ||
84 | #define LDBIAS_OP 0x084 | ||
85 | #define LDACQ_OP 0x085 | ||
86 | /* 0x086, 0x087 are not relevant */ | ||
87 | #define LDCCLR_OP 0x088 | ||
88 | #define LDCNC_OP 0x089 | ||
89 | #define LDCCLRACQ_OP 0x08a | ||
90 | #define ST_OP 0x08c | ||
91 | #define STREL_OP 0x08d | ||
92 | /* 0x08e,0x8f are not relevant */ | ||
93 | |||
94 | /* | ||
95 | * Table C-29 Integer Load +Reg | ||
96 | * | ||
97 | * we use the ld->m (bit [36:36]) field to determine whether or not we have | ||
98 | * a load/store of this form. | ||
99 | */ | ||
100 | |||
101 | /* | ||
102 | * Table C-30 Integer Load/Store +Imm | ||
103 | * | ||
104 | * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF | ||
105 | * | ||
106 | * ld8.fill, st8.fill must be aligned because the NaT bits are based on | ||
107 | * the address, so we must fail and the program must be fixed. | ||
108 | */ | ||
109 | #define LD_IMM_OP 0x0a0 | ||
110 | #define LDS_IMM_OP 0x0a1 | ||
111 | #define LDA_IMM_OP 0x0a2 | ||
112 | #define LDSA_IMM_OP 0x0a3 | ||
113 | #define LDBIAS_IMM_OP 0x0a4 | ||
114 | #define LDACQ_IMM_OP 0x0a5 | ||
115 | /* 0x0a6, 0xa7 are not relevant */ | ||
116 | #define LDCCLR_IMM_OP 0x0a8 | ||
117 | #define LDCNC_IMM_OP 0x0a9 | ||
118 | #define LDCCLRACQ_IMM_OP 0x0aa | ||
119 | #define ST_IMM_OP 0x0ac | ||
120 | #define STREL_IMM_OP 0x0ad | ||
121 | /* 0x0ae,0xaf are not relevant */ | ||
122 | |||
123 | /* | ||
124 | * Table C-32 Floating-point Load/Store | ||
125 | */ | ||
126 | #define LDF_OP 0x0c0 | ||
127 | #define LDFS_OP 0x0c1 | ||
128 | #define LDFA_OP 0x0c2 | ||
129 | #define LDFSA_OP 0x0c3 | ||
130 | /* 0x0c6 is irrelevant */ | ||
131 | #define LDFCCLR_OP 0x0c8 | ||
132 | #define LDFCNC_OP 0x0c9 | ||
133 | /* 0x0cb is irrelevant */ | ||
134 | #define STF_OP 0x0cc | ||
135 | |||
136 | /* | ||
137 | * Table C-33 Floating-point Load +Reg | ||
138 | * | ||
139 | * we use the ld->m (bit [36:36]) field to determine whether or not we have | ||
140 | * a load/store of this form. | ||
141 | */ | ||
142 | |||
143 | /* | ||
144 | * Table C-34 Floating-point Load/Store +Imm | ||
145 | */ | ||
146 | #define LDF_IMM_OP 0x0e0 | ||
147 | #define LDFS_IMM_OP 0x0e1 | ||
148 | #define LDFA_IMM_OP 0x0e2 | ||
149 | #define LDFSA_IMM_OP 0x0e3 | ||
150 | /* 0x0e6 is irrelevant */ | ||
151 | #define LDFCCLR_IMM_OP 0x0e8 | ||
152 | #define LDFCNC_IMM_OP 0x0e9 | ||
153 | #define STF_IMM_OP 0x0ec | ||
154 | |||
155 | typedef struct { | ||
156 | unsigned long qp:6; /* [0:5] */ | ||
157 | unsigned long r1:7; /* [6:12] */ | ||
158 | unsigned long imm:7; /* [13:19] */ | ||
159 | unsigned long r3:7; /* [20:26] */ | ||
160 | unsigned long x:1; /* [27:27] */ | ||
161 | unsigned long hint:2; /* [28:29] */ | ||
162 | unsigned long x6_sz:2; /* [30:31] */ | ||
163 | unsigned long x6_op:4; /* [32:35], x6 = x6_sz|x6_op */ | ||
164 | unsigned long m:1; /* [36:36] */ | ||
165 | unsigned long op:4; /* [37:40] */ | ||
166 | unsigned long pad:23; /* [41:63] */ | ||
167 | } load_store_t; | ||
168 | |||
169 | |||
170 | typedef enum { | ||
171 | UPD_IMMEDIATE, /* ldXZ r1=[r3],imm(9) */ | ||
172 | UPD_REG /* ldXZ r1=[r3],r2 */ | ||
173 | } update_t; | ||
174 | |||
175 | /* | ||
176 | * We use tables to keep track of the offsets of registers in the saved state. | ||
177 | * This way we save having big switch/case statements. | ||
178 | * | ||
179 | * We use bit 0 to indicate switch_stack or pt_regs. | ||
180 | * The offset is simply shifted by 1 bit. | ||
181 | * A 2-byte value should be enough to hold any kind of offset. | ||
182 | * | ||
183 | * In case the calling convention changes (and thus pt_regs/switch_stack), | ||
184 | * simply use RSW instead of RPT, or vice versa. | ||
185 | */ | ||
186 | |||
187 | #define RPO(x) ((size_t) &((struct pt_regs *)0)->x) | ||
188 | #define RSO(x) ((size_t) &((struct switch_stack *)0)->x) | ||
189 | |||
190 | #define RPT(x) (RPO(x) << 1) | ||
191 | #define RSW(x) (1| RSO(x)<<1) | ||
192 | |||
193 | #define GR_OFFS(x) (gr_info[x]>>1) | ||
194 | #define GR_IN_SW(x) (gr_info[x] & 0x1) | ||
195 | |||
196 | #define FR_OFFS(x) (fr_info[x]>>1) | ||
197 | #define FR_IN_SW(x) (fr_info[x] & 0x1) | ||
198 | |||
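| /* | ||
|  * Decoding example (follows directly from the macros above): gr_info[8] is | ||
|  * RPT(r8), i.e. offsetof(struct pt_regs, r8) << 1 with bit 0 clear, so | ||
|  * GR_IN_SW(8) == 0 (pt_regs) and GR_OFFS(8) recovers the byte offset; | ||
|  * gr_info[4] is RSW(r4), whose set bit 0 flags a switch_stack offset. | ||
|  */ | ||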
199 | static u16 gr_info[32]={ | ||
200 | 0, /* r0 is read-only: WE SHOULD NEVER GET THIS */ | ||
201 | |||
202 | RPT(r1), RPT(r2), RPT(r3), | ||
203 | |||
204 | RSW(r4), RSW(r5), RSW(r6), RSW(r7), | ||
205 | |||
206 | RPT(r8), RPT(r9), RPT(r10), RPT(r11), | ||
207 | RPT(r12), RPT(r13), RPT(r14), RPT(r15), | ||
208 | |||
209 | RPT(r16), RPT(r17), RPT(r18), RPT(r19), | ||
210 | RPT(r20), RPT(r21), RPT(r22), RPT(r23), | ||
211 | RPT(r24), RPT(r25), RPT(r26), RPT(r27), | ||
212 | RPT(r28), RPT(r29), RPT(r30), RPT(r31) | ||
213 | }; | ||
214 | |||
215 | static u16 fr_info[32]={ | ||
216 | 0, /* constant : WE SHOULD NEVER GET THIS */ | ||
217 | 0, /* constant : WE SHOULD NEVER GET THIS */ | ||
218 | |||
219 | RSW(f2), RSW(f3), RSW(f4), RSW(f5), | ||
220 | |||
221 | RPT(f6), RPT(f7), RPT(f8), RPT(f9), | ||
222 | RPT(f10), RPT(f11), | ||
223 | |||
224 | RSW(f12), RSW(f13), RSW(f14), | ||
225 | RSW(f15), RSW(f16), RSW(f17), RSW(f18), RSW(f19), | ||
226 | RSW(f20), RSW(f21), RSW(f22), RSW(f23), RSW(f24), | ||
227 | RSW(f25), RSW(f26), RSW(f27), RSW(f28), RSW(f29), | ||
228 | RSW(f30), RSW(f31) | ||
229 | }; | ||
230 | |||
231 | /* Invalidate ALAT entry for integer register REGNO. */ | ||
232 | static void | ||
233 | invala_gr (int regno) | ||
234 | { | ||
235 | # define F(reg) case reg: ia64_invala_gr(reg); break | ||
236 | |||
237 | switch (regno) { | ||
238 | F( 0); F( 1); F( 2); F( 3); F( 4); F( 5); F( 6); F( 7); | ||
239 | F( 8); F( 9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15); | ||
240 | F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23); | ||
241 | F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31); | ||
242 | F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39); | ||
243 | F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47); | ||
244 | F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55); | ||
245 | F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63); | ||
246 | F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71); | ||
247 | F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79); | ||
248 | F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87); | ||
249 | F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95); | ||
250 | F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103); | ||
251 | F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111); | ||
252 | F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119); | ||
253 | F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127); | ||
254 | } | ||
255 | # undef F | ||
256 | } | ||
257 | |||
258 | /* Invalidate ALAT entry for floating-point register REGNO. */ | ||
259 | static void | ||
260 | invala_fr (int regno) | ||
261 | { | ||
262 | # define F(reg) case reg: ia64_invala_fr(reg); break | ||
263 | |||
264 | switch (regno) { | ||
265 | F( 0); F( 1); F( 2); F( 3); F( 4); F( 5); F( 6); F( 7); | ||
266 | F( 8); F( 9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15); | ||
267 | F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23); | ||
268 | F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31); | ||
269 | F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39); | ||
270 | F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47); | ||
271 | F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55); | ||
272 | F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63); | ||
273 | F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71); | ||
274 | F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79); | ||
275 | F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87); | ||
276 | F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95); | ||
277 | F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103); | ||
278 | F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111); | ||
279 | F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119); | ||
280 | F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127); | ||
281 | } | ||
282 | # undef F | ||
283 | } | ||
284 | |||
285 | static inline unsigned long | ||
286 | rotate_reg (unsigned long sor, unsigned long rrb, unsigned long reg) | ||
287 | { | ||
288 | reg += rrb; | ||
289 | if (reg >= sor) | ||
290 | reg -= sor; | ||
291 | return reg; | ||
292 | } | ||
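| /* | ||
|  * Example: with a rotating region of sor=16 registers and rrb=3, | ||
|  * rotate_reg(16, 3, 14) returns 1, since 14 + 3 = 17 wraps around modulo sor. | ||
|  */ | ||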
293 | |||
294 | static void | ||
295 | set_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long val, int nat) | ||
296 | { | ||
297 | struct switch_stack *sw = (struct switch_stack *) regs - 1; | ||
298 | unsigned long *bsp, *bspstore, *addr, *rnat_addr, *ubs_end; | ||
299 | unsigned long *kbs = (void *) current + IA64_RBS_OFFSET; | ||
300 | unsigned long rnats, nat_mask; | ||
301 | unsigned long on_kbs; | ||
302 | long sof = (regs->cr_ifs) & 0x7f; | ||
303 | long sor = 8 * ((regs->cr_ifs >> 14) & 0xf); | ||
304 | long rrb_gr = (regs->cr_ifs >> 18) & 0x7f; | ||
305 | long ridx = r1 - 32; | ||
306 | |||
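| /* | ||
|  * For reference, per the extraction above: cr.ifs describes the current | ||
|  * register frame with sof (size of frame) in bits [6:0], sor (size of the | ||
|  * rotating region, in units of 8 registers) in [17:14] and rrb.gr in [24:18]. | ||
|  */ | ||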
307 | if (ridx >= sof) { | ||
308 | /* this should never happen, as the "rsvd register fault" has higher priority */ | ||
309 | DPRINT("ignoring write to r%lu; only %lu registers are allocated!\n", r1, sof); | ||
310 | return; | ||
311 | } | ||
312 | |||
313 | if (ridx < sor) | ||
314 | ridx = rotate_reg(sor, rrb_gr, ridx); | ||
315 | |||
316 | DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n", | ||
317 | r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx); | ||
318 | |||
319 | on_kbs = ia64_rse_num_regs(kbs, (unsigned long *) sw->ar_bspstore); | ||
320 | addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx); | ||
321 | if (addr >= kbs) { | ||
322 | /* the register is on the kernel backing store: easy... */ | ||
323 | rnat_addr = ia64_rse_rnat_addr(addr); | ||
324 | if ((unsigned long) rnat_addr >= sw->ar_bspstore) | ||
325 | rnat_addr = &sw->ar_rnat; | ||
326 | nat_mask = 1UL << ia64_rse_slot_num(addr); | ||
327 | |||
328 | *addr = val; | ||
329 | if (nat) | ||
330 | *rnat_addr |= nat_mask; | ||
331 | else | ||
332 | *rnat_addr &= ~nat_mask; | ||
333 | return; | ||
334 | } | ||
335 | |||
336 | if (!user_stack(current, regs)) { | ||
337 | DPRINT("ignoring kernel write to r%lu; register isn't on the kernel RBS!", r1); | ||
338 | return; | ||
339 | } | ||
340 | |||
341 | bspstore = (unsigned long *)regs->ar_bspstore; | ||
342 | ubs_end = ia64_rse_skip_regs(bspstore, on_kbs); | ||
343 | bsp = ia64_rse_skip_regs(ubs_end, -sof); | ||
344 | addr = ia64_rse_skip_regs(bsp, ridx); | ||
345 | |||
346 | DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr); | ||
347 | |||
348 | ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val); | ||
349 | |||
350 | rnat_addr = ia64_rse_rnat_addr(addr); | ||
351 | |||
352 | ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats); | ||
353 | DPRINT("rnat @%p = 0x%lx nat=%d old nat=%ld\n", | ||
354 | (void *) rnat_addr, rnats, nat, (rnats >> ia64_rse_slot_num(addr)) & 1); | ||
355 | |||
356 | nat_mask = 1UL << ia64_rse_slot_num(addr); | ||
357 | if (nat) | ||
358 | rnats |= nat_mask; | ||
359 | else | ||
360 | rnats &= ~nat_mask; | ||
361 | ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, rnats); | ||
362 | |||
363 | DPRINT("rnat changed to @%p = 0x%lx\n", (void *) rnat_addr, rnats); | ||
364 | } | ||
365 | |||
366 | |||
367 | static void | ||
368 | get_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long *val, int *nat) | ||
369 | { | ||
370 | struct switch_stack *sw = (struct switch_stack *) regs - 1; | ||
371 | unsigned long *bsp, *addr, *rnat_addr, *ubs_end, *bspstore; | ||
372 | unsigned long *kbs = (void *) current + IA64_RBS_OFFSET; | ||
373 | unsigned long rnats, nat_mask; | ||
374 | unsigned long on_kbs; | ||
375 | long sof = (regs->cr_ifs) & 0x7f; | ||
376 | long sor = 8 * ((regs->cr_ifs >> 14) & 0xf); | ||
377 | long rrb_gr = (regs->cr_ifs >> 18) & 0x7f; | ||
378 | long ridx = r1 - 32; | ||
379 | |||
380 | if (ridx >= sof) { | ||
381 | /* a read of an out-of-frame register returns an undefined value; 0 in our case. */ | ||
382 | DPRINT("ignoring read from r%lu; only %lu registers are allocated!\n", r1, sof); | ||
383 | goto fail; | ||
384 | } | ||
385 | |||
386 | if (ridx < sor) | ||
387 | ridx = rotate_reg(sor, rrb_gr, ridx); | ||
388 | |||
389 | DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n", | ||
390 | r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx); | ||
391 | |||
392 | on_kbs = ia64_rse_num_regs(kbs, (unsigned long *) sw->ar_bspstore); | ||
393 | addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx); | ||
394 | if (addr >= kbs) { | ||
395 | /* the register is on the kernel backing store: easy... */ | ||
396 | *val = *addr; | ||
397 | if (nat) { | ||
398 | rnat_addr = ia64_rse_rnat_addr(addr); | ||
399 | if ((unsigned long) rnat_addr >= sw->ar_bspstore) | ||
400 | rnat_addr = &sw->ar_rnat; | ||
401 | nat_mask = 1UL << ia64_rse_slot_num(addr); | ||
402 | *nat = (*rnat_addr & nat_mask) != 0; | ||
403 | } | ||
404 | return; | ||
405 | } | ||
406 | |||
407 | if (!user_stack(current, regs)) { | ||
408 | DPRINT("ignoring kernel read of r%lu; register isn't on the RBS!", r1); | ||
409 | goto fail; | ||
410 | } | ||
411 | |||
412 | bspstore = (unsigned long *)regs->ar_bspstore; | ||
413 | ubs_end = ia64_rse_skip_regs(bspstore, on_kbs); | ||
414 | bsp = ia64_rse_skip_regs(ubs_end, -sof); | ||
415 | addr = ia64_rse_skip_regs(bsp, ridx); | ||
416 | |||
417 | DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr); | ||
418 | |||
419 | ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val); | ||
420 | |||
421 | if (nat) { | ||
422 | rnat_addr = ia64_rse_rnat_addr(addr); | ||
423 | nat_mask = 1UL << ia64_rse_slot_num(addr); | ||
424 | |||
425 | DPRINT("rnat @%p = 0x%lx\n", (void *) rnat_addr, rnats); | ||
426 | |||
427 | ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats); | ||
428 | *nat = (rnats & nat_mask) != 0; | ||
429 | } | ||
430 | return; | ||
431 | |||
432 | fail: | ||
433 | *val = 0; | ||
434 | if (nat) | ||
435 | *nat = 0; | ||
436 | return; | ||
437 | } | ||
438 | |||
439 | |||
440 | static void | ||
441 | setreg (unsigned long regnum, unsigned long val, int nat, struct pt_regs *regs) | ||
442 | { | ||
443 | struct switch_stack *sw = (struct switch_stack *) regs - 1; | ||
444 | unsigned long addr; | ||
445 | unsigned long bitmask; | ||
446 | unsigned long *unat; | ||
447 | |||
448 | /* | ||
449 | * First takes care of stacked registers | ||
450 | */ | ||
451 | if (regnum >= IA64_FIRST_STACKED_GR) { | ||
452 | set_rse_reg(regs, regnum, val, nat); | ||
453 | return; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * Using r0 as a target raises a General Exception fault which has higher priority | ||
458 | * than the Unaligned Reference fault. | ||
459 | */ | ||
460 | |||
461 | /* | ||
462 | * Now look at registers in [0-31] range and init correct UNAT | ||
463 | */ | ||
464 | if (GR_IN_SW(regnum)) { | ||
465 | addr = (unsigned long)sw; | ||
466 | unat = &sw->ar_unat; | ||
467 | } else { | ||
468 | addr = (unsigned long)regs; | ||
469 | unat = &sw->caller_unat; | ||
470 | } | ||
471 | DPRINT("tmp_base=%lx switch_stack=%s offset=%d\n", | ||
472 | addr, unat==&sw->ar_unat ? "yes":"no", GR_OFFS(regnum)); | ||
473 | /* | ||
474 | * add offset from base of struct | ||
475 | * and do it ! | ||
476 | */ | ||
477 | addr += GR_OFFS(regnum); | ||
478 | |||
479 | *(unsigned long *)addr = val; | ||
480 | |||
481 | /* | ||
482 | * We need to clear the corresponding UNAT bit to fully emulate the load | ||
483 | * UNAT bit_pos = GR[r3]{8:3} form EAS-2.4 | ||
484 | */ | ||
485 | bitmask = 1UL << (addr >> 3 & 0x3f); | ||
486 | DPRINT("*0x%lx=0x%lx NaT=%d prev_unat @%p=%lx\n", addr, val, nat, (void *) unat, *unat); | ||
487 | if (nat) { | ||
488 | *unat |= bitmask; | ||
489 | } else { | ||
490 | *unat &= ~bitmask; | ||
491 | } | ||
492 | DPRINT("*0x%lx=0x%lx NaT=%d new unat: %p=%lx\n", addr, val, nat, (void *) unat,*unat); | ||
493 | } | ||
494 | |||
495 | /* | ||
496 | * Return the (rotated) index for floating point register REGNUM (REGNUM must be in the | ||
497 | * range from 32-127; the result is in the range from 0-95). | ||
498 | */ | ||
499 | static inline unsigned long | ||
500 | fph_index (struct pt_regs *regs, long regnum) | ||
501 | { | ||
502 | unsigned long rrb_fr = (regs->cr_ifs >> 25) & 0x7f; | ||
503 | return rotate_reg(96, rrb_fr, (regnum - IA64_FIRST_ROTATING_FR)); | ||
504 | } | ||
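| /* | ||
|  * Example: with rrb.fr == 0 the mapping is the identity, so fph_index() | ||
|  * maps f32 to slot 0 and f127 to slot 95 of current->thread.fph[]. | ||
|  */ | ||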
505 | |||
506 | static void | ||
507 | setfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs) | ||
508 | { | ||
509 | struct switch_stack *sw = (struct switch_stack *)regs - 1; | ||
510 | unsigned long addr; | ||
511 | |||
512 | /* | ||
513 | * From EAS-2.5: FPDisableFault has higher priority than Unaligned | ||
514 | * Fault. Thus, when we get here, we know the partition is enabled. | ||
515 | * To update f32-f127, there are three choices: | ||
516 | * | ||
517 | * (1) save f32-f127 to thread.fph and update the values there | ||
518 | * (2) use a gigantic switch statement to directly access the registers | ||
519 | * (3) generate code on the fly to update the desired register | ||
520 | * | ||
521 | * For now, we are using approach (1). | ||
522 | */ | ||
523 | if (regnum >= IA64_FIRST_ROTATING_FR) { | ||
524 | ia64_sync_fph(current); | ||
525 | current->thread.fph[fph_index(regs, regnum)] = *fpval; | ||
526 | } else { | ||
527 | /* | ||
528 | * pt_regs or switch_stack ? | ||
529 | */ | ||
530 | if (FR_IN_SW(regnum)) { | ||
531 | addr = (unsigned long)sw; | ||
532 | } else { | ||
533 | addr = (unsigned long)regs; | ||
534 | } | ||
535 | |||
536 | DPRINT("tmp_base=%lx offset=%d\n", addr, FR_OFFS(regnum)); | ||
537 | |||
538 | addr += FR_OFFS(regnum); | ||
539 | *(struct ia64_fpreg *)addr = *fpval; | ||
540 | |||
541 | /* | ||
542 | * mark the low partition as being used now | ||
543 | * | ||
544 | * It is highly unlikely that this bit is not already set, but | ||
545 | * let's do it for safety. | ||
546 | */ | ||
547 | regs->cr_ipsr |= IA64_PSR_MFL; | ||
548 | } | ||
549 | } | ||
550 | |||
551 | /* | ||
552 | * Those 2 inline functions generate the spilled versions of the constant floating point | ||
553 | * registers which can be used with stfX | ||
554 | */ | ||
555 | static inline void | ||
556 | float_spill_f0 (struct ia64_fpreg *final) | ||
557 | { | ||
558 | ia64_stf_spill(final, 0); | ||
559 | } | ||
560 | |||
561 | static inline void | ||
562 | float_spill_f1 (struct ia64_fpreg *final) | ||
563 | { | ||
564 | ia64_stf_spill(final, 1); | ||
565 | } | ||
566 | |||
567 | static void | ||
568 | getfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs) | ||
569 | { | ||
570 | struct switch_stack *sw = (struct switch_stack *) regs - 1; | ||
571 | unsigned long addr; | ||
572 | |||
573 | /* | ||
574 | * From EAS-2.5: FPDisableFault has higher priority than | ||
575 | * Unaligned Fault. Thus, when we get here, we know the partition is | ||
576 | * enabled. | ||
577 | * | ||
578 | * When regnum > 31, the register is still live and we need to force a save | ||
579 | * to current->thread.fph to get access to it. See discussion in setfpreg() | ||
580 | * for reasons and other ways of doing this. | ||
581 | */ | ||
582 | if (regnum >= IA64_FIRST_ROTATING_FR) { | ||
583 | ia64_flush_fph(current); | ||
584 | *fpval = current->thread.fph[fph_index(regs, regnum)]; | ||
585 | } else { | ||
586 | /* | ||
587 | * f0 = 0.0, f1 = 1.0. Those registers are constant and are thus | ||
588 | * not saved; we must generate their spilled form on the fly. | ||
589 | */ | ||
590 | switch(regnum) { | ||
591 | case 0: | ||
592 | float_spill_f0(fpval); | ||
593 | break; | ||
594 | case 1: | ||
595 | float_spill_f1(fpval); | ||
596 | break; | ||
597 | default: | ||
598 | /* | ||
599 | * pt_regs or switch_stack ? | ||
600 | */ | ||
601 | addr = FR_IN_SW(regnum) ? (unsigned long)sw | ||
602 | : (unsigned long)regs; | ||
603 | |||
604 | DPRINT("is_sw=%d tmp_base=%lx offset=0x%x\n", | ||
605 | FR_IN_SW(regnum), addr, FR_OFFS(regnum)); | ||
606 | |||
607 | addr += FR_OFFS(regnum); | ||
608 | *fpval = *(struct ia64_fpreg *)addr; | ||
609 | } | ||
610 | } | ||
611 | } | ||
612 | |||
613 | |||
614 | static void | ||
615 | getreg (unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs) | ||
616 | { | ||
617 | struct switch_stack *sw = (struct switch_stack *) regs - 1; | ||
618 | unsigned long addr, *unat; | ||
619 | |||
620 | if (regnum >= IA64_FIRST_STACKED_GR) { | ||
621 | get_rse_reg(regs, regnum, val, nat); | ||
622 | return; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * take care of r0 (read-only, always evaluates to 0) | ||
627 | */ | ||
628 | if (regnum == 0) { | ||
629 | *val = 0; | ||
630 | if (nat) | ||
631 | *nat = 0; | ||
632 | return; | ||
633 | } | ||
634 | |||
635 | /* | ||
636 | * Now look at registers in [0-31] range and init correct UNAT | ||
637 | */ | ||
638 | if (GR_IN_SW(regnum)) { | ||
639 | addr = (unsigned long)sw; | ||
640 | unat = &sw->ar_unat; | ||
641 | } else { | ||
642 | addr = (unsigned long)regs; | ||
643 | unat = &sw->caller_unat; | ||
644 | } | ||
645 | |||
646 | DPRINT("addr_base=%lx offset=0x%x\n", addr, GR_OFFS(regnum)); | ||
647 | |||
648 | addr += GR_OFFS(regnum); | ||
649 | |||
650 | *val = *(unsigned long *)addr; | ||
651 | |||
652 | /* | ||
653 | * do it only when requested | ||
654 | */ | ||
655 | if (nat) | ||
656 | *nat = (*unat >> (addr >> 3 & 0x3f)) & 0x1UL; | ||
657 | } | ||
658 | |||
659 | static void | ||
660 | emulate_load_updates (update_t type, load_store_t ld, struct pt_regs *regs, unsigned long ifa) | ||
661 | { | ||
662 | /* | ||
663 | * IMPORTANT: | ||
664 | * Given the way we handle unaligned speculative loads, we should | ||
665 | * not get to this point in the code but we keep this sanity check, | ||
666 | * just in case. | ||
667 | */ | ||
668 | if (ld.x6_op == 1 || ld.x6_op == 3) { | ||
669 | printk(KERN_ERR "%s: register update on speculative load, error\n", __FUNCTION__); | ||
670 | die_if_kernel("unaligned reference on speculative load with register update\n", | ||
671 | regs, 30); | ||
672 | } | ||
673 | |||
674 | |||
675 | /* | ||
676 | * at this point, we know that the base register to update is valid i.e., | ||
677 | * it's not r0 | ||
678 | */ | ||
679 | if (type == UPD_IMMEDIATE) { | ||
680 | unsigned long imm; | ||
681 | |||
682 | /* | ||
683 | * Load +Imm: ldXZ r1=[r3],imm(9) | ||
684 | * | ||
685 | * | ||
686 | * form imm9: bits [13:19] of the instruction hold the low 7 bits; | ||
687 | * ld.x supplies bit 7 and ld.m the sign (see below) | ||
688 | imm = ld.x << 7 | ld.imm; | ||
689 | |||
690 | /* | ||
691 | * sign-extend (1 + 8 bits) if m is set | ||
692 | */ | ||
693 | if (ld.m) imm |= SIGN_EXT9; | ||
694 | |||
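| /* | ||
|  * Worked example (per the decoding above): an immediate of -1 is encoded | ||
|  * with ld.imm=0x7f, ld.x=1 and ld.m=1, so imm = (1<<7)|0x7f = 0xff, which | ||
|  * the sign-extension above turns into 0xffffffffffffffff == -1. | ||
|  */ | ||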
695 | /* | ||
696 | * ifa == r3 and we know that the NaT bit on r3 was clear so | ||
697 | * we can directly use ifa. | ||
698 | */ | ||
699 | ifa += imm; | ||
700 | |||
701 | setreg(ld.r3, ifa, 0, regs); | ||
702 | |||
703 | DPRINT("ld.x=%d ld.m=%d imm=%ld r3=0x%lx\n", ld.x, ld.m, imm, ifa); | ||
704 | |||
705 | } else if (ld.m) { | ||
706 | unsigned long r2; | ||
707 | int nat_r2; | ||
708 | |||
709 | /* | ||
710 | * Load +Reg Opcode: ldXZ r1=[r3],r2 | ||
711 | * | ||
712 | * Note that we update r3 even in the case of ldfX.a | ||
713 | * (where the load does not happen) | ||
714 | * | ||
715 | * The way the load algorithm works, we know that r3 does not | ||
716 | * have its NaT bit set (would have gotten NaT consumption | ||
717 | * before getting the unaligned fault). So we can use ifa | ||
718 | * which equals r3 at this point. | ||
719 | * | ||
720 | * IMPORTANT: | ||
721 | * The above statement holds ONLY because we know that we | ||
722 | * never reach this code when trying to do a ldX.s. | ||
723 | * If we ever make it to here on an ldfX.s, the assumption above would no longer hold. | ||
724 | */ | ||
725 | getreg(ld.imm, &r2, &nat_r2, regs); | ||
726 | |||
727 | ifa += r2; | ||
728 | |||
729 | /* | ||
730 | * propagate Nat r2 -> r3 | ||
731 | */ | ||
732 | setreg(ld.r3, ifa, nat_r2, regs); | ||
733 | |||
734 | DPRINT("imm=%d r2=%ld r3=0x%lx nat_r2=%d\n",ld.imm, r2, ifa, nat_r2); | ||
735 | } | ||
736 | } | ||
737 | |||
738 | |||
739 | static int | ||
740 | emulate_load_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs) | ||
741 | { | ||
742 | unsigned int len = 1 << ld.x6_sz; | ||
743 | unsigned long val = 0; | ||
744 | |||
745 | /* | ||
746 | * r0, as target, doesn't need to be checked because Illegal Instruction | ||
747 | * faults have higher priority than unaligned faults. | ||
748 | * | ||
749 | * r0 cannot be found as the base as it would never generate an | ||
750 | * unaligned reference. | ||
751 | */ | ||
752 | |||
753 | /* | ||
754 | * For ldX.a we will emulate the load and also invalidate the ALAT entry. | ||
755 | * See comment below for explanation on how we handle ldX.a | ||
756 | */ | ||
757 | |||
758 | if (len != 2 && len != 4 && len != 8) { | ||
759 | DPRINT("unknown size: x6=%d\n", ld.x6_sz); | ||
760 | return -1; | ||
761 | } | ||
762 | /* this assumes little-endian byte-order: */ | ||
763 | if (copy_from_user(&val, (void __user *) ifa, len)) | ||
764 | return -1; | ||
765 | setreg(ld.r1, val, 0, regs); | ||
766 | |||
767 | /* | ||
768 | * check for updates on any kind of loads | ||
769 | */ | ||
770 | if (ld.op == 0x5 || ld.m) | ||
771 | emulate_load_updates(ld.op == 0x5 ? UPD_IMMEDIATE: UPD_REG, ld, regs, ifa); | ||
772 | |||
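| /* | ||
|  * (ld.op == 0x5 is the major opcode shared by the integer +Imm forms of | ||
|  * Table C-30; the +Reg forms of Table C-29 are recognized via ld.m instead.) | ||
|  */ | ||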
773 | /* | ||
774 | * handling of various loads (based on EAS2.4): | ||
775 | * | ||
776 | * ldX.acq (ordered load): | ||
777 | * - acquire semantics would have been used, so force fence instead. | ||
778 | * | ||
779 | * ldX.c.clr (check load and clear): | ||
780 | * - if we get to this handler, it's because the entry was not in the ALAT. | ||
781 | * Therefore the operation reverts to a normal load | ||
782 | * | ||
783 | * ldX.c.nc (check load no clear): | ||
784 | * - same as previous one | ||
785 | * | ||
786 | * ldX.c.clr.acq (ordered check load and clear): | ||
787 | * - same as above for c.clr part. The load needs to have acquire semantics. So | ||
788 | * we use the fence semantics which is stronger and thus ensures correctness. | ||
789 | * | ||
790 | * ldX.a (advanced load): | ||
791 | * - suppose ldX.a r1=[r3]. If we get to the unaligned trap it's because the | ||
792 | * address doesn't match requested size alignment. This means that we would | ||
793 | * possibly need more than one load to get the result. | ||
794 | * | ||
795 | * The load part can be handled just like a normal load, however the difficult | ||
796 | * part is to get the right thing into the ALAT. The critical piece of information | ||
797 | * is the base address of the load & its size. To do that, a ld.a must be executed; | ||
798 | * clearly any address can be pushed into the table by using ld1.a r1=[r3]. Now | ||
799 | * if we use the same target register, we will be okay for the check.a instruction. | ||
800 | * If we look at the store, basically a stX [r3]=r1 checks the ALAT for any entry | ||
801 | * which would overlap within [r3,r3+X] (the size of the load was stored in the | ||
802 | * ALAT). If such an entry is found the entry is invalidated. But this is not good | ||
803 | * enough, take the following example: | ||
804 | * r3=3 | ||
805 | * ld4.a r1=[r3] | ||
806 | * | ||
807 | * Could be emulated by doing: | ||
808 | * ld1.a r1=[r3],1 | ||
809 | * store to temporary; | ||
810 | * ld1.a r1=[r3],1 | ||
811 | * store & shift to temporary; | ||
812 | * ld1.a r1=[r3],1 | ||
813 | * store & shift to temporary; | ||
814 | * ld1.a r1=[r3] | ||
815 | * store & shift to temporary; | ||
816 | * r1=temporary | ||
817 | * | ||
818 | * So in this case, you would get the right value in r1 but the wrong info in | ||
819 | * the ALAT. Notice that you could do it in reverse to finish with address 3, | ||
820 | * but you would still get the size wrong. To get the size right, one needs to | ||
821 | * execute exactly the same kind of load. You could do it from an aligned | ||
822 | * temporary location, but you would get the address wrong. | ||
823 | * | ||
824 | * So no matter what, it is not possible to emulate an advanced load | ||
825 | * correctly. But is that really critical? | ||
826 | * | ||
827 | * We will always convert ld.a into a normal load with the ALAT invalidated. This | ||
828 | * enables the compiler to do optimizations where certain code paths after ld.a | ||
829 | * are not required to have ld.c/chk.a, e.g., code paths with no intervening stores. | ||
830 | * | ||
831 | * If there is a store after the advanced load, one must either do a ld.c.* or | ||
832 | * chk.a.* to reuse the value stored in the ALAT. Both can "fail" (meaning no | ||
833 | * entry found in ALAT), and that's perfectly ok because: | ||
834 | * | ||
835 | * - ld.c.*, if the entry is not present a normal load is executed | ||
836 | * - chk.a.*, if the entry is not present, execution jumps to recovery code | ||
837 | * | ||
838 | * In either case, the load can be potentially retried in another form. | ||
839 | * | ||
840 | * ALAT must be invalidated for the register (so that chk.a or ld.c don't pick | ||
841 | * up a stale entry later). The register base update MUST also be performed. | ||
842 | */ | ||
843 | |||
844 | /* | ||
845 | * when the load has the .acq completer then | ||
846 | * use ordering fence. | ||
847 | */ | ||
848 | if (ld.x6_op == 0x5 || ld.x6_op == 0xa) | ||
849 | mb(); | ||
850 | |||
851 | /* | ||
852 | * invalidate ALAT entry in case of advanced load | ||
853 | */ | ||
854 | if (ld.x6_op == 0x2) | ||
855 | invala_gr(ld.r1); | ||
856 | |||
857 | return 0; | ||
858 | } | ||
859 | |||
860 | static int | ||
861 | emulate_store_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs) | ||
862 | { | ||
863 | unsigned long r2; | ||
864 | unsigned int len = 1 << ld.x6_sz; | ||
865 | |||
866 | /* | ||
867 | * if we get to this handler, the NaT bits on both r3 and r2 have already | ||
868 | * been checked, so we don't need to do it again | ||
869 | * | ||
870 | * extract the value to be stored | ||
871 | */ | ||
872 | getreg(ld.imm, &r2, NULL, regs); | ||
873 | |||
874 | /* | ||
875 | * we rely on the macros in unaligned.h for now i.e., | ||
876 | * we let the compiler figure out how to access memory gracefully. | ||
877 | * | ||
878 | * We need this switch/case because of the way the inline function | ||
879 | * works. The code is optimized by the compiler and looks like | ||
880 | * a single switch/case. | ||
881 | */ | ||
882 | DPRINT("st%d [%lx]=%lx\n", len, ifa, r2); | ||
883 | |||
884 | if (len != 2 && len != 4 && len != 8) { | ||
885 | DPRINT("unknown size: x6=%d\n", ld.x6_sz); | ||
886 | return -1; | ||
887 | } | ||
888 | |||
889 | /* this assumes little-endian byte-order: */ | ||
890 | if (copy_to_user((void __user *) ifa, &r2, len)) | ||
891 | return -1; | ||
892 | |||
893 | /* | ||
894 | * stX [r3]=r2,imm(9) | ||
895 | * | ||
896 | * NOTE: | ||
897 | * ld.r3 can never be r0, because r0 would not generate an | ||
898 | * unaligned access. | ||
899 | */ | ||
900 | if (ld.op == 0x5) { | ||
901 | unsigned long imm; | ||
902 | |||
903 | /* | ||
904 | * form imm9: [12:6] contain first 7bits | ||
905 | */ | ||
906 | imm = ld.x << 7 | ld.r1; | ||
907 | /* | ||
908 | * sign extend (8bits) if m set | ||
909 | */ | ||
910 | if (ld.m) imm |= SIGN_EXT9; | ||
911 | /* | ||
912 | * ifa == r3 (NaT is necessarily cleared) | ||
913 | */ | ||
914 | ifa += imm; | ||
915 | |||
916 | DPRINT("imm=%lx r3=%lx\n", imm, ifa); | ||
917 | |||
918 | setreg(ld.r3, ifa, 0, regs); | ||
919 | } | ||
920 | /* | ||
921 | * we don't have alat_invalidate_multiple() so we need | ||
922 | * to do the complete flush :-<< | ||
923 | */ | ||
924 | ia64_invala(); | ||
925 | |||
926 | /* | ||
927 | * stX.rel: use fence instead of release | ||
928 | */ | ||
929 | if (ld.x6_op == 0xd) | ||
930 | mb(); | ||
931 | |||
932 | return 0; | ||
933 | } | ||
934 | |||
935 | /* | ||
936 | * floating point operations sizes in bytes | ||
937 | */ | ||
938 | static const unsigned char float_fsz[4]={ | ||
939 | 10, /* extended precision (e) */ | ||
940 | 8, /* integer (8) */ | ||
941 | 4, /* single precision (s) */ | ||
942 | 8 /* double precision (d) */ | ||
943 | }; | ||
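| /* e.g. an ldfs/stfs access (x6_sz == 2) moves float_fsz[2] == 4 bytes */ | ||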
944 | |||
945 | static inline void | ||
946 | mem2float_extended (struct ia64_fpreg *init, struct ia64_fpreg *final) | ||
947 | { | ||
948 | ia64_ldfe(6, init); | ||
949 | ia64_stop(); | ||
950 | ia64_stf_spill(final, 6); | ||
951 | } | ||
952 | |||
953 | static inline void | ||
954 | mem2float_integer (struct ia64_fpreg *init, struct ia64_fpreg *final) | ||
955 | { | ||
956 | ia64_ldf8(6, init); | ||
957 | ia64_stop(); | ||
958 | ia64_stf_spill(final, 6); | ||
959 | } | ||
960 | |||
961 | static inline void | ||
962 | mem2float_single (struct ia64_fpreg *init, struct ia64_fpreg *final) | ||
963 | { | ||
964 | ia64_ldfs(6, init); | ||
965 | ia64_stop(); | ||
966 | ia64_stf_spill(final, 6); | ||
967 | } | ||
968 | |||
969 | static inline void | ||
970 | mem2float_double (struct ia64_fpreg *init, struct ia64_fpreg *final) | ||
971 | { | ||
972 | ia64_ldfd(6, init); | ||
973 | ia64_stop(); | ||
974 | ia64_stf_spill(final, 6); | ||
975 | } | ||
976 | |||
977 | static inline void | ||
978 | float2mem_extended (struct ia64_fpreg *init, struct ia64_fpreg *final) | ||
979 | { | ||
980 | ia64_ldf_fill(6, init); | ||
981 | ia64_stop(); | ||
982 | ia64_stfe(final, 6); | ||
983 | } | ||
984 | |||
985 | static inline void | ||
986 | float2mem_integer (struct ia64_fpreg *init, struct ia64_fpreg *final) | ||
987 | { | ||
988 | ia64_ldf_fill(6, init); | ||
989 | ia64_stop(); | ||
990 | ia64_stf8(final, 6); | ||
991 | } | ||
992 | |||
993 | static inline void | ||
994 | float2mem_single (struct ia64_fpreg *init, struct ia64_fpreg *final) | ||
995 | { | ||
996 | ia64_ldf_fill(6, init); | ||
997 | ia64_stop(); | ||
998 | ia64_stfs(final, 6); | ||
999 | } | ||
1000 | |||
1001 | static inline void | ||
1002 | float2mem_double (struct ia64_fpreg *init, struct ia64_fpreg *final) | ||
1003 | { | ||
1004 | ia64_ldf_fill(6, init); | ||
1005 | ia64_stop(); | ||
1006 | ia64_stfd(final, 6); | ||
1007 | } | ||
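| /* | ||
|  * All of the helpers above stage the conversion through f6: mem2float_* | ||
|  * loads the memory image into f6 and then spills it out in the canonical | ||
|  * 16-byte register format, while float2mem_* refills f6 from the spilled | ||
|  * form and stores it back out in the requested memory format. | ||
|  */ | ||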
1008 | |||
1009 | static int | ||
1010 | emulate_load_floatpair (unsigned long ifa, load_store_t ld, struct pt_regs *regs) | ||
1011 | { | ||
1012 | struct ia64_fpreg fpr_init[2]; | ||
1013 | struct ia64_fpreg fpr_final[2]; | ||
1014 | unsigned long len = float_fsz[ld.x6_sz]; | ||
1015 | |||
1016 | /* | ||
1017 | * fr0 & fr1 don't need to be checked because Illegal Instruction faults have | ||
1018 | * higher priority than unaligned faults. | ||
1019 | * | ||
1020 | * r0 cannot be found as the base as it would never generate an unaligned | ||
1021 | * reference. | ||
1022 | */ | ||
1023 | |||
1024 | /* | ||
1025 | * make sure we get clean buffers | ||
1026 | */ | ||
1027 | memset(&fpr_init, 0, sizeof(fpr_init)); | ||
1028 | memset(&fpr_final, 0, sizeof(fpr_final)); | ||
1029 | |||
1030 | /* | ||
1031 | * ldfpX.a: we don't try to emulate anything but we must | ||
1032 | * invalidate the ALAT entry and execute updates, if any. | ||
1033 | */ | ||
1034 | if (ld.x6_op != 0x2) { | ||
1035 | /* | ||
1036 | * This assumes little-endian byte-order. Note that there is no "ldfpe" | ||
1037 | * instruction: | ||
1038 | */ | ||
1039 | if (copy_from_user(&fpr_init[0], (void __user *) ifa, len) | ||
1040 | || copy_from_user(&fpr_init[1], (void __user *) (ifa + len), len)) | ||
1041 | return -1; | ||
1042 | |||
1043 | DPRINT("ld.r1=%d ld.imm=%d x6_sz=%d\n", ld.r1, ld.imm, ld.x6_sz); | ||
1044 | DDUMP("frp_init =", &fpr_init, 2*len); | ||
1045 | /* | ||
1046 | * XXX fixme | ||
1047 | * Could optimize inlines by using ldfpX & 2 spills | ||
1048 | */ | ||
1049 | switch (ld.x6_sz) { | ||
1050 | case 0: | ||
1051 | mem2float_extended(&fpr_init[0], &fpr_final[0]); | ||
1052 | mem2float_extended(&fpr_init[1], &fpr_final[1]); | ||
1053 | break; | ||
1054 | case 1: | ||
1055 | mem2float_integer(&fpr_init[0], &fpr_final[0]); | ||
1056 | mem2float_integer(&fpr_init[1], &fpr_final[1]); | ||
1057 | break; | ||
1058 | case 2: | ||
1059 | mem2float_single(&fpr_init[0], &fpr_final[0]); | ||
1060 | mem2float_single(&fpr_init[1], &fpr_final[1]); | ||
1061 | break; | ||
1062 | case 3: | ||
1063 | mem2float_double(&fpr_init[0], &fpr_final[0]); | ||
1064 | mem2float_double(&fpr_init[1], &fpr_final[1]); | ||
1065 | break; | ||
1066 | } | ||
1067 | DDUMP("fpr_final =", &fpr_final, 2*len); | ||
1068 | /* | ||
1069 | * XXX fixme | ||
1070 | * | ||
1071 | * A possible optimization would be to drop fpr_final and directly | ||
1072 | * use the storage from the saved context i.e., the actual final | ||
1073 | * destination (pt_regs, switch_stack or thread structure). | ||
1074 | */ | ||
1075 | setfpreg(ld.r1, &fpr_final[0], regs); | ||
1076 | setfpreg(ld.imm, &fpr_final[1], regs); | ||
1077 | } | ||
1078 | |||
1079 | /* | ||
1080 | * Check for updates: only immediate updates are available for this | ||
1081 | * instruction. | ||
1082 | */ | ||
1083 | if (ld.m) { | ||
1084 | /* | ||
1085 | * the immediate is implicit given the ldsz of the operation: | ||
1086 | * single: 8 (2x4) and for all others it's 16 (2x8) | ||
1087 | */ | ||
1088 | ifa += len<<1; | ||
1089 | |||
1090 | /* | ||
1091 | * IMPORTANT: | ||
1092 | * the fact that we force the NaT of r3 to zero is ONLY valid | ||
1093 | * as long as we don't come here with a ldfpX.s. | ||
1094 | * For this reason we keep this sanity check | ||
1095 | */ | ||
1096 | if (ld.x6_op == 1 || ld.x6_op == 3) | ||
1097 | printk(KERN_ERR "%s: register update on speculative load pair, error\n", | ||
1098 | __FUNCTION__); | ||
1099 | |||
1100 | setreg(ld.r3, ifa, 0, regs); | ||
1101 | } | ||
1102 | |||
1103 | /* | ||
1104 | * Invalidate ALAT entries, if any, for both registers. | ||
1105 | */ | ||
1106 | if (ld.x6_op == 0x2) { | ||
1107 | invala_fr(ld.r1); | ||
1108 | invala_fr(ld.imm); | ||
1109 | } | ||
1110 | return 0; | ||
1111 | } | ||
1112 | |||
1113 | |||
1114 | static int | ||
1115 | emulate_load_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs) | ||
1116 | { | ||
1117 | struct ia64_fpreg fpr_init; | ||
1118 | struct ia64_fpreg fpr_final; | ||
1119 | unsigned long len = float_fsz[ld.x6_sz]; | ||
1120 | |||
1121 | /* | ||
1122 | * fr0 & fr1 don't need to be checked because Illegal Instruction | ||
1123 | * faults have higher priority than unaligned faults. | ||
1124 | * | ||
1125 | * r0 cannot be found as the base as it would never generate an | ||
1126 | * unaligned reference. | ||
1127 | */ | ||
1128 | |||
1129 | /* | ||
1130 | * make sure we get clean buffers | ||
1131 | */ | ||
1132 | memset(&fpr_init, 0, sizeof(fpr_init)); | ||
1133 | memset(&fpr_final, 0, sizeof(fpr_final)); | ||
1134 | |||
1135 | /* | ||
1136 | * ldfX.a we don't try to emulate anything but we must | ||
1137 | * invalidate the ALAT entry. | ||
1138 | * See comments in ldX for descriptions on how the various loads are handled. | ||
1139 | */ | ||
1140 | if (ld.x6_op != 0x2) { | ||
1141 | if (copy_from_user(&fpr_init, (void __user *) ifa, len)) | ||
1142 | return -1; | ||
1143 | |||
1144 | DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz); | ||
1145 | DDUMP("fpr_init =", &fpr_init, len); | ||
1146 | /* | ||
1147 | * we only do something for x6_op={0,8,9} | ||
1148 | */ | ||
1149 | switch (ld.x6_sz) { | ||
1150 | case 0: | ||
1151 | mem2float_extended(&fpr_init, &fpr_final); | ||
1152 | break; | ||
1153 | case 1: | ||
1154 | mem2float_integer(&fpr_init, &fpr_final); | ||
1155 | break; | ||
1156 | case 2: | ||
1157 | mem2float_single(&fpr_init, &fpr_final); | ||
1158 | break; | ||
1159 | case 3: | ||
1160 | mem2float_double(&fpr_init, &fpr_final); | ||
1161 | break; | ||
1162 | } | ||
1163 | DDUMP("fpr_final =", &fpr_final, len); | ||
1164 | /* | ||
1165 | * XXX fixme | ||
1166 | * | ||
1167 | * A possible optimization would be to drop fpr_final and directly | ||
1168 | * use the storage from the saved context i.e., the actual final | ||
1169 | * destination (pt_regs, switch_stack or thread structure). | ||
1170 | */ | ||
1171 | setfpreg(ld.r1, &fpr_final, regs); | ||
1172 | } | ||
1173 | |||
1174 | /* | ||
1175 | * check for updates on any loads | ||
1176 | */ | ||
1177 | if (ld.op == 0x7 || ld.m) | ||
1178 | emulate_load_updates(ld.op == 0x7 ? UPD_IMMEDIATE: UPD_REG, ld, regs, ifa); | ||
1179 | |||
1180 | /* | ||
1181 | * invalidate ALAT entry in case of advanced floating point loads | ||
1182 | */ | ||
1183 | if (ld.x6_op == 0x2) | ||
1184 | invala_fr(ld.r1); | ||
1185 | |||
1186 | return 0; | ||
1187 | } | ||
1188 | |||
1189 | |||
1190 | static int | ||
1191 | emulate_store_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs) | ||
1192 | { | ||
1193 | struct ia64_fpreg fpr_init; | ||
1194 | struct ia64_fpreg fpr_final; | ||
1195 | unsigned long len = float_fsz[ld.x6_sz]; | ||
1196 | |||
1197 | /* | ||
1198 | * make sure we get clean buffers | ||
1199 | */ | ||
1200 | memset(&fpr_init, 0, sizeof(fpr_init)); | ||
1201 | memset(&fpr_final, 0, sizeof(fpr_final)); | ||
1202 | |||
1203 | /* | ||
1204 | * if we get to this handler, the NaT bits on both r3 and r2 have already | ||
1205 | * been checked, so we don't need to do it again | ||
1206 | * | ||
1207 | * extract the value to be stored | ||
1208 | */ | ||
1209 | getfpreg(ld.imm, &fpr_init, regs); | ||
1210 | /* | ||
1211 | * during this step, we extract the spilled registers from the saved | ||
1212 | * context i.e., we refill. Then we store (no spill) to temporary | ||
1213 | * aligned location | ||
1214 | */ | ||
1215 | switch (ld.x6_sz) { | ||
1216 | case 0: | ||
1217 | float2mem_extended(&fpr_init, &fpr_final); | ||
1218 | break; | ||
1219 | case 1: | ||
1220 | float2mem_integer(&fpr_init, &fpr_final); | ||
1221 | break; | ||
1222 | case 2: | ||
1223 | float2mem_single(&fpr_init, &fpr_final); | ||
1224 | break; | ||
1225 | case 3: | ||
1226 | float2mem_double(&fpr_init, &fpr_final); | ||
1227 | break; | ||
1228 | } | ||
1229 | DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz); | ||
1230 | DDUMP("fpr_init =", &fpr_init, len); | ||
1231 | DDUMP("fpr_final =", &fpr_final, len); | ||
1232 | |||
1233 | if (copy_to_user((void __user *) ifa, &fpr_final, len)) | ||
1234 | return -1; | ||
1235 | |||
1236 | /* | ||
1237 | * stfX [r3]=r2,imm(9) | ||
1238 | * | ||
1239 | * NOTE: | ||
1240 | * ld.r3 can never be r0, because r0 would not generate an | ||
1241 | * unaligned access. | ||
1242 | */ | ||
1243 | if (ld.op == 0x7) { | ||
1244 | unsigned long imm; | ||
1245 | |||
1246 | /* | ||
1247 | * form imm9: [12:6] contain first 7bits | ||
1248 | */ | ||
1249 | imm = ld.x << 7 | ld.r1; | ||
1250 | /* | ||
1251 | * sign extend (8bits) if m set | ||
1252 | */ | ||
1253 | if (ld.m) | ||
1254 | imm |= SIGN_EXT9; | ||
1255 | /* | ||
1256 | * ifa == r3 (NaT is necessarily cleared) | ||
1257 | */ | ||
1258 | ifa += imm; | ||
1259 | |||
1260 | DPRINT("imm=%lx r3=%lx\n", imm, ifa); | ||
1261 | |||
1262 | setreg(ld.r3, ifa, 0, regs); | ||
1263 | } | ||
1264 | /* | ||
1265 | * we don't have alat_invalidate_multiple() so we need | ||
1266 | * to do the complete flush :-<< | ||
1267 | */ | ||
1268 | ia64_invala(); | ||
1269 | |||
1270 | return 0; | ||
1271 | } | ||
1272 | |||
1273 | /* | ||
1274 | * Make sure we log the unaligned access, so that the user/sysadmin can notice it and | ||
1275 | * get the program fixed. However, we don't want to do that for every access, so we | ||
1276 | * pace it with jiffies. This isn't really MP-safe, but it doesn't really have to be | ||
1277 | * either... | ||
1278 | */ | ||
1279 | static int | ||
1280 | within_logging_rate_limit (void) | ||
1281 | { | ||
1282 | static unsigned long count, last_time; | ||
1283 | |||
1284 | if (jiffies - last_time > 5*HZ) | ||
1285 | count = 0; | ||
1286 | if (++count < 5) { | ||
1287 | last_time = jiffies; | ||
1288 | return 1; | ||
1289 | } | ||
1290 | return 0; | ||
1291 | |||
1292 | } | ||
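| /* | ||
|  * Net effect of the counting above: at most four messages are logged per | ||
|  * burst, and logging then stays off until 5*HZ jiffies have passed since | ||
|  * the last message that was let through. | ||
|  */ | ||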
1293 | |||
1294 | void | ||
1295 | ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) | ||
1296 | { | ||
1297 | struct ia64_psr *ipsr = ia64_psr(regs); | ||
1298 | mm_segment_t old_fs = get_fs(); | ||
1299 | unsigned long bundle[2]; | ||
1300 | unsigned long opcode; | ||
1301 | struct siginfo si; | ||
1302 | const struct exception_table_entry *eh = NULL; | ||
1303 | union { | ||
1304 | unsigned long l; | ||
1305 | load_store_t insn; | ||
1306 | } u; | ||
1307 | int ret = -1; | ||
1308 | |||
1309 | if (ia64_psr(regs)->be) { | ||
1310 | /* we don't support big-endian accesses */ | ||
1311 | die_if_kernel("big-endian unaligned accesses are not supported", regs, 0); | ||
1312 | goto force_sigbus; | ||
1313 | } | ||
1314 | |||
1315 | /* | ||
1316 | * Treat kernel accesses for which there is an exception handler entry the same as | ||
1317 | * user-level unaligned accesses. Otherwise, a clever program could trick this | ||
1318 | * handler into reading arbitrary kernel addresses... | ||
1319 | */ | ||
1320 | if (!user_mode(regs)) | ||
1321 | eh = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri); | ||
1322 | if (user_mode(regs) || eh) { | ||
1323 | if ((current->thread.flags & IA64_THREAD_UAC_SIGBUS) != 0) | ||
1324 | goto force_sigbus; | ||
1325 | |||
1326 | if (!(current->thread.flags & IA64_THREAD_UAC_NOPRINT) | ||
1327 | && within_logging_rate_limit()) | ||
1328 | { | ||
1329 | char buf[200]; /* comm[] is at most 16 bytes... */ | ||
1330 | size_t len; | ||
1331 | |||
1332 | len = sprintf(buf, "%s(%d): unaligned access to 0x%016lx, " | ||
1333 | "ip=0x%016lx\n\r", current->comm, current->pid, | ||
1334 | ifa, regs->cr_iip + ipsr->ri); | ||
1335 | /* | ||
1336 | * Don't call tty_write_message() if we're in the kernel; we might | ||
1337 | * be holding locks... | ||
1338 | */ | ||
1339 | if (user_mode(regs)) | ||
1340 | tty_write_message(current->signal->tty, buf); | ||
1341 | buf[len-1] = '\0'; /* drop '\r' */ | ||
1342 | printk(KERN_WARNING "%s", buf); /* watch for command names containing %s */ | ||
1343 | } | ||
1344 | } else { | ||
1345 | if (within_logging_rate_limit()) | ||
1346 | printk(KERN_WARNING "kernel unaligned access to 0x%016lx, ip=0x%016lx\n", | ||
1347 | ifa, regs->cr_iip + ipsr->ri); | ||
1348 | set_fs(KERNEL_DS); | ||
1349 | } | ||
1350 | |||
1351 | DPRINT("iip=%lx ifa=%lx isr=%lx (ei=%d, sp=%d)\n", | ||
1352 | regs->cr_iip, ifa, regs->cr_ipsr, ipsr->ri, ipsr->it); | ||
1353 | |||
1354 | if (__copy_from_user(bundle, (void __user *) regs->cr_iip, 16)) | ||
1355 | goto failure; | ||
1356 | |||
1357 | /* | ||
1358 | * extract the instruction from the bundle given the slot number | ||
1359 | */ | ||
1360 | switch (ipsr->ri) { | ||
1361 | case 0: u.l = (bundle[0] >> 5); break; | ||
1362 | case 1: u.l = (bundle[0] >> 46) | (bundle[1] << 18); break; | ||
1363 | case 2: u.l = (bundle[1] >> 23); break; | ||
1364 | } | ||
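| /* | ||
|  * (An IA-64 bundle is 128 bits: a 5-bit template followed by three 41-bit | ||
|  * instruction slots, hence slot 0 starting at bit 5 and slot 1 straddling | ||
|  * the two 64-bit words fetched above.) | ||
|  */ | ||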
1365 | opcode = (u.l >> IA64_OPCODE_SHIFT) & IA64_OPCODE_MASK; | ||
1366 | |||
1367 | DPRINT("opcode=%lx ld.qp=%d ld.r1=%d ld.imm=%d ld.r3=%d ld.x=%d ld.hint=%d " | ||
1368 | "ld.x6=0x%x ld.m=%d ld.op=%d\n", opcode, u.insn.qp, u.insn.r1, u.insn.imm, | ||
1369 | u.insn.r3, u.insn.x, u.insn.hint, u.insn.x6_sz, u.insn.m, u.insn.op); | ||
1370 | |||
1371 | /* | ||
1372 | * IMPORTANT: | ||
1373 | * Notice that the switch statement DOES NOT cover all possible instructions | ||
1374 | * that DO generate unaligned references. This is done on purpose because for some | ||
1375 | * instructions it DOES NOT make sense to try and emulate the access. Sometimes it | ||
1376 | * is WRONG to try and emulate. Here is a list of instructions we don't emulate, i.e., | ||
1377 | * the program will get a signal and die: | ||
1378 | * | ||
1379 | * load/store: | ||
1380 | * - ldX.spill | ||
1381 | * - stX.spill | ||
1382 | * Reason: RNATs are based on addresses | ||
1383 | * - ld16 | ||
1384 | * - st16 | ||
1385 | * Reason: ld16 and st16 are supposed to occur in a single | ||
1386 | * memory op | ||
1387 | * | ||
1388 | * synchronization: | ||
1389 | * - cmpxchg | ||
1390 | * - fetchadd | ||
1391 | * - xchg | ||
1392 | * Reason: ATOMIC operations cannot be emulated properly using multiple | ||
1393 | * instructions. | ||
1394 | * | ||
1395 | * speculative loads: | ||
1396 | * - ldX.sZ | ||
1397 | * Reason: side effects, code must be ready to deal with failure so simpler | ||
1398 | * to let the load fail. | ||
1399 | * --------------------------------------------------------------------------------- | ||
1400 | * XXX fixme | ||
1401 | * | ||
1402 | * I would like to get rid of this switch case and do something | ||
1403 | * more elegant. | ||
1404 | */ | ||
1405 | switch (opcode) { | ||
1406 | case LDS_OP: | ||
1407 | case LDSA_OP: | ||
1408 | if (u.insn.x) | ||
1409 | /* oops, really a semaphore op (cmpxchg, etc) */ | ||
1410 | goto failure; | ||
1411 | /* no break */ | ||
1412 | case LDS_IMM_OP: | ||
1413 | case LDSA_IMM_OP: | ||
1414 | case LDFS_OP: | ||
1415 | case LDFSA_OP: | ||
1416 | case LDFS_IMM_OP: | ||
1417 | /* | ||
1418 | * The instruction will be retried with deferred exceptions turned on, and | ||
1419 | * we should get Nat bit installed | ||
1420 | * | ||
1421 | * IMPORTANT: When PSR_ED is set, the register & immediate update forms | ||
1422 | * are actually executed even though the operation failed. So we don't | ||
1423 | * need to take care of this. | ||
1424 | */ | ||
1425 | DPRINT("forcing PSR_ED\n"); | ||
1426 | regs->cr_ipsr |= IA64_PSR_ED; | ||
1427 | goto done; | ||
1428 | |||
1429 | case LD_OP: | ||
1430 | case LDA_OP: | ||
1431 | case LDBIAS_OP: | ||
1432 | case LDACQ_OP: | ||
1433 | case LDCCLR_OP: | ||
1434 | case LDCNC_OP: | ||
1435 | case LDCCLRACQ_OP: | ||
1436 | if (u.insn.x) | ||
1437 | /* oops, really a semaphore op (cmpxchg, etc) */ | ||
1438 | goto failure; | ||
1439 | /* no break */ | ||
1440 | case LD_IMM_OP: | ||
1441 | case LDA_IMM_OP: | ||
1442 | case LDBIAS_IMM_OP: | ||
1443 | case LDACQ_IMM_OP: | ||
1444 | case LDCCLR_IMM_OP: | ||
1445 | case LDCNC_IMM_OP: | ||
1446 | case LDCCLRACQ_IMM_OP: | ||
1447 | ret = emulate_load_int(ifa, u.insn, regs); | ||
1448 | break; | ||
1449 | |||
1450 | case ST_OP: | ||
1451 | case STREL_OP: | ||
1452 | if (u.insn.x) | ||
1453 | /* oops, really a semaphore op (cmpxchg, etc) */ | ||
1454 | goto failure; | ||
1455 | /* no break */ | ||
1456 | case ST_IMM_OP: | ||
1457 | case STREL_IMM_OP: | ||
1458 | ret = emulate_store_int(ifa, u.insn, regs); | ||
1459 | break; | ||
1460 | |||
1461 | case LDF_OP: | ||
1462 | case LDFA_OP: | ||
1463 | case LDFCCLR_OP: | ||
1464 | case LDFCNC_OP: | ||
1465 | case LDF_IMM_OP: | ||
1466 | case LDFA_IMM_OP: | ||
1467 | case LDFCCLR_IMM_OP: | ||
1468 | case LDFCNC_IMM_OP: | ||
1469 | if (u.insn.x) | ||
1470 | ret = emulate_load_floatpair(ifa, u.insn, regs); | ||
1471 | else | ||
1472 | ret = emulate_load_float(ifa, u.insn, regs); | ||
1473 | break; | ||
1474 | |||
1475 | case STF_OP: | ||
1476 | case STF_IMM_OP: | ||
1477 | ret = emulate_store_float(ifa, u.insn, regs); | ||
1478 | break; | ||
1479 | |||
1480 | default: | ||
1481 | goto failure; | ||
1482 | } | ||
1483 | DPRINT("ret=%d\n", ret); | ||
1484 | if (ret) | ||
1485 | goto failure; | ||
1486 | |||
1487 | if (ipsr->ri == 2) | ||
1488 | /* | ||
1489 | * given today's architecture this case is not likely to happen because a | ||
1490 | * memory access instruction (M) can never be in the last slot of a | ||
1491 | * bundle. But let's keep it for now. | ||
1492 | */ | ||
1493 | regs->cr_iip += 16; | ||
1494 | ipsr->ri = (ipsr->ri + 1) & 0x3; | ||
1495 | |||
1496 | DPRINT("ipsr->ri=%d iip=%lx\n", ipsr->ri, regs->cr_iip); | ||
1497 | done: | ||
1498 | set_fs(old_fs); /* restore original address limit */ | ||
1499 | return; | ||
1500 | |||
1501 | failure: | ||
1502 | /* something went wrong... */ | ||
1503 | if (!user_mode(regs)) { | ||
1504 | if (eh) { | ||
1505 | ia64_handle_exception(regs, eh); | ||
1506 | goto done; | ||
1507 | } | ||
1508 | die_if_kernel("error during unaligned kernel access\n", regs, ret); | ||
1509 | /* NOT_REACHED */ | ||
1510 | } | ||
1511 | force_sigbus: | ||
1512 | si.si_signo = SIGBUS; | ||
1513 | si.si_errno = 0; | ||
1514 | si.si_code = BUS_ADRALN; | ||
1515 | si.si_addr = (void __user *) ifa; | ||
1516 | si.si_flags = 0; | ||
1517 | si.si_isr = 0; | ||
1518 | si.si_imm = 0; | ||
1519 | force_sig_info(SIGBUS, &si, current); | ||
1520 | goto done; | ||
1521 | } | ||
diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c new file mode 100644 index 000000000000..d494ff647cac --- /dev/null +++ b/arch/ia64/kernel/unwind.c | |||
@@ -0,0 +1,2306 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1999-2004 Hewlett-Packard Co | ||
3 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
4 | * Copyright (C) 2003 Fenghua Yu <fenghua.yu@intel.com> | ||
5 | * - Change pt_regs_off() to make it less dependent on the pt_regs structure. | ||
6 | */ | ||
7 | /* | ||
8 | * This file implements call frame unwind support for the Linux | ||
9 | * kernel. Parsing and processing the unwind information is | ||
10 | * time-consuming, so this implementation translates the unwind | ||
11 | * descriptors into unwind scripts. These scripts are very simple | ||
12 | * (basically a sequence of assignments) and efficient to execute. | ||
13 | * They are cached for later re-use. Each script is specific for a | ||
14 | * given instruction pointer address and the set of predicate values | ||
15 | * that the script depends on (most unwind descriptors are | ||
16 | * unconditional and scripts often do not depend on predicates at | ||
17 | * all). This code is based on the unwind conventions described in | ||
18 | * the "IA-64 Software Conventions and Runtime Architecture" manual. | ||
19 | * | ||
20 | * SMP conventions: | ||
21 | * o updates to the global unwind data (in structure "unw") are serialized | ||
22 | * by the unw.lock spinlock | ||
23 | * o each unwind script has its own read-write lock; a thread must acquire | ||
24 | * a read lock before executing a script and must acquire a write lock | ||
25 | * before modifying a script | ||
26 | * o if both the unw.lock spinlock and a script's read-write lock must be | ||
27 | * acquired, then the read-write lock must be acquired first. | ||
28 | */ | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/bootmem.h> | ||
31 | #include <linux/elf.h> | ||
32 | #include <linux/kernel.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <linux/slab.h> | ||
35 | |||
36 | #include <asm/unwind.h> | ||
37 | |||
38 | #include <asm/delay.h> | ||
39 | #include <asm/page.h> | ||
40 | #include <asm/ptrace.h> | ||
41 | #include <asm/ptrace_offsets.h> | ||
42 | #include <asm/rse.h> | ||
43 | #include <asm/sections.h> | ||
44 | #include <asm/system.h> | ||
45 | #include <asm/uaccess.h> | ||
46 | |||
47 | #include "entry.h" | ||
48 | #include "unwind_i.h" | ||
49 | |||
50 | #define UNW_LOG_CACHE_SIZE 7 /* each unw_script is ~256 bytes in size */ | ||
51 | #define UNW_CACHE_SIZE (1 << UNW_LOG_CACHE_SIZE) | ||
52 | |||
53 | #define UNW_LOG_HASH_SIZE (UNW_LOG_CACHE_SIZE + 1) | ||
54 | #define UNW_HASH_SIZE (1 << UNW_LOG_HASH_SIZE) | ||
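| /* i.e. a cache of 128 scripts, hashed into 256 buckets */ | ||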
55 | |||
56 | #define UNW_STATS 0 /* WARNING: this disables interrupts for long time spans!! */ | ||
57 | |||
58 | #ifdef UNW_DEBUG | ||
59 | static unsigned int unw_debug_level = UNW_DEBUG; | ||
60 | # define UNW_DEBUG_ON(n) unw_debug_level >= n | ||
61 | /* Do not add a printk level; not all debug lines end in a newline */ | ||
62 | # define UNW_DPRINT(n, ...) if (UNW_DEBUG_ON(n)) printk(__VA_ARGS__) | ||
63 | # define inline | ||
64 | #else /* !UNW_DEBUG */ | ||
65 | # define UNW_DEBUG_ON(n) 0 | ||
66 | # define UNW_DPRINT(n, ...) | ||
67 | #endif /* UNW_DEBUG */ | ||
68 | |||
69 | #if UNW_STATS | ||
70 | # define STAT(x...) x | ||
71 | #else | ||
72 | # define STAT(x...) | ||
73 | #endif | ||
74 | |||
75 | #define alloc_reg_state() kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC) | ||
76 | #define free_reg_state(usr) kfree(usr) | ||
77 | #define alloc_labeled_state() kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC) | ||
78 | #define free_labeled_state(usr) kfree(usr) | ||
79 | |||
80 | typedef unsigned long unw_word; | ||
81 | typedef unsigned char unw_hash_index_t; | ||
82 | |||
83 | static struct { | ||
84 | spinlock_t lock; /* spinlock for unwind data */ | ||
85 | |||
86 | /* list of unwind tables (one per load-module) */ | ||
87 | struct unw_table *tables; | ||
88 | |||
89 | unsigned long r0; /* constant 0 for r0 */ | ||
90 | |||
91 | /* table of registers that prologues can save (and order in which they're saved): */ | ||
92 | const unsigned char save_order[8]; | ||
93 | |||
94 | /* maps a preserved register index (preg_index) to corresponding switch_stack offset: */ | ||
95 | unsigned short sw_off[sizeof(struct unw_frame_info) / 8]; | ||
96 | |||
97 | unsigned short lru_head; /* index of least-recently used script */ | ||
98 | unsigned short lru_tail; /* index of most-recently used script */ | ||
99 | |||
100 | /* index into unw_frame_info for preserved register i */ | ||
101 | unsigned short preg_index[UNW_NUM_REGS]; | ||
102 | |||
103 | short pt_regs_offsets[32]; | ||
104 | |||
105 | /* unwind table for the kernel: */ | ||
106 | struct unw_table kernel_table; | ||
107 | |||
108 | /* unwind table describing the gate page (kernel code that is mapped into user space): */ | ||
109 | size_t gate_table_size; | ||
110 | unsigned long *gate_table; | ||
111 | |||
112 | /* hash table that maps instruction pointer to script index: */ | ||
113 | unsigned short hash[UNW_HASH_SIZE]; | ||
114 | |||
115 | /* script cache: */ | ||
116 | struct unw_script cache[UNW_CACHE_SIZE]; | ||
117 | |||
118 | # ifdef UNW_DEBUG | ||
119 | const char *preg_name[UNW_NUM_REGS]; | ||
120 | # endif | ||
121 | # if UNW_STATS | ||
122 | struct { | ||
123 | struct { | ||
124 | int lookups; | ||
125 | int hinted_hits; | ||
126 | int normal_hits; | ||
127 | int collision_chain_traversals; | ||
128 | } cache; | ||
129 | struct { | ||
130 | unsigned long build_time; | ||
131 | unsigned long run_time; | ||
132 | unsigned long parse_time; | ||
133 | int builds; | ||
134 | int news; | ||
135 | int collisions; | ||
136 | int runs; | ||
137 | } script; | ||
138 | struct { | ||
139 | unsigned long init_time; | ||
140 | unsigned long unwind_time; | ||
141 | int inits; | ||
142 | int unwinds; | ||
143 | } api; | ||
144 | } stat; | ||
145 | # endif | ||
146 | } unw = { | ||
147 | .tables = &unw.kernel_table, | ||
148 | .lock = SPIN_LOCK_UNLOCKED, | ||
149 | .save_order = { | ||
150 | UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, | ||
151 | UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR | ||
152 | }, | ||
153 | .preg_index = { | ||
154 | offsetof(struct unw_frame_info, pri_unat_loc)/8, /* PRI_UNAT_GR */ | ||
155 | offsetof(struct unw_frame_info, pri_unat_loc)/8, /* PRI_UNAT_MEM */ | ||
156 | offsetof(struct unw_frame_info, bsp_loc)/8, | ||
157 | offsetof(struct unw_frame_info, bspstore_loc)/8, | ||
158 | offsetof(struct unw_frame_info, pfs_loc)/8, | ||
159 | offsetof(struct unw_frame_info, rnat_loc)/8, | ||
160 | offsetof(struct unw_frame_info, psp)/8, | ||
161 | offsetof(struct unw_frame_info, rp_loc)/8, | ||
162 | offsetof(struct unw_frame_info, r4)/8, | ||
163 | offsetof(struct unw_frame_info, r5)/8, | ||
164 | offsetof(struct unw_frame_info, r6)/8, | ||
165 | offsetof(struct unw_frame_info, r7)/8, | ||
166 | offsetof(struct unw_frame_info, unat_loc)/8, | ||
167 | offsetof(struct unw_frame_info, pr_loc)/8, | ||
168 | offsetof(struct unw_frame_info, lc_loc)/8, | ||
169 | offsetof(struct unw_frame_info, fpsr_loc)/8, | ||
170 | offsetof(struct unw_frame_info, b1_loc)/8, | ||
171 | offsetof(struct unw_frame_info, b2_loc)/8, | ||
172 | offsetof(struct unw_frame_info, b3_loc)/8, | ||
173 | offsetof(struct unw_frame_info, b4_loc)/8, | ||
174 | offsetof(struct unw_frame_info, b5_loc)/8, | ||
175 | offsetof(struct unw_frame_info, f2_loc)/8, | ||
176 | offsetof(struct unw_frame_info, f3_loc)/8, | ||
177 | offsetof(struct unw_frame_info, f4_loc)/8, | ||
178 | offsetof(struct unw_frame_info, f5_loc)/8, | ||
179 | offsetof(struct unw_frame_info, fr_loc[16 - 16])/8, | ||
180 | offsetof(struct unw_frame_info, fr_loc[17 - 16])/8, | ||
181 | offsetof(struct unw_frame_info, fr_loc[18 - 16])/8, | ||
182 | offsetof(struct unw_frame_info, fr_loc[19 - 16])/8, | ||
183 | offsetof(struct unw_frame_info, fr_loc[20 - 16])/8, | ||
184 | offsetof(struct unw_frame_info, fr_loc[21 - 16])/8, | ||
185 | offsetof(struct unw_frame_info, fr_loc[22 - 16])/8, | ||
186 | offsetof(struct unw_frame_info, fr_loc[23 - 16])/8, | ||
187 | offsetof(struct unw_frame_info, fr_loc[24 - 16])/8, | ||
188 | offsetof(struct unw_frame_info, fr_loc[25 - 16])/8, | ||
189 | offsetof(struct unw_frame_info, fr_loc[26 - 16])/8, | ||
190 | offsetof(struct unw_frame_info, fr_loc[27 - 16])/8, | ||
191 | offsetof(struct unw_frame_info, fr_loc[28 - 16])/8, | ||
192 | offsetof(struct unw_frame_info, fr_loc[29 - 16])/8, | ||
193 | offsetof(struct unw_frame_info, fr_loc[30 - 16])/8, | ||
194 | offsetof(struct unw_frame_info, fr_loc[31 - 16])/8, | ||
195 | }, | ||
196 | .pt_regs_offsets = { | ||
197 | [0] = -1, | ||
198 | offsetof(struct pt_regs, r1), | ||
199 | offsetof(struct pt_regs, r2), | ||
200 | offsetof(struct pt_regs, r3), | ||
201 | [4] = -1, [5] = -1, [6] = -1, [7] = -1, | ||
202 | offsetof(struct pt_regs, r8), | ||
203 | offsetof(struct pt_regs, r9), | ||
204 | offsetof(struct pt_regs, r10), | ||
205 | offsetof(struct pt_regs, r11), | ||
206 | offsetof(struct pt_regs, r12), | ||
207 | offsetof(struct pt_regs, r13), | ||
208 | offsetof(struct pt_regs, r14), | ||
209 | offsetof(struct pt_regs, r15), | ||
210 | offsetof(struct pt_regs, r16), | ||
211 | offsetof(struct pt_regs, r17), | ||
212 | offsetof(struct pt_regs, r18), | ||
213 | offsetof(struct pt_regs, r19), | ||
214 | offsetof(struct pt_regs, r20), | ||
215 | offsetof(struct pt_regs, r21), | ||
216 | offsetof(struct pt_regs, r22), | ||
217 | offsetof(struct pt_regs, r23), | ||
218 | offsetof(struct pt_regs, r24), | ||
219 | offsetof(struct pt_regs, r25), | ||
220 | offsetof(struct pt_regs, r26), | ||
221 | offsetof(struct pt_regs, r27), | ||
222 | offsetof(struct pt_regs, r28), | ||
223 | offsetof(struct pt_regs, r29), | ||
224 | offsetof(struct pt_regs, r30), | ||
225 | offsetof(struct pt_regs, r31), | ||
226 | }, | ||
227 | .hash = { [0 ... UNW_HASH_SIZE - 1] = -1 }, | ||
228 | #ifdef UNW_DEBUG | ||
229 | .preg_name = { | ||
230 | "pri_unat_gr", "pri_unat_mem", "bsp", "bspstore", "ar.pfs", "ar.rnat", "psp", "rp", | ||
231 | "r4", "r5", "r6", "r7", | ||
232 | "ar.unat", "pr", "ar.lc", "ar.fpsr", | ||
233 | "b1", "b2", "b3", "b4", "b5", | ||
234 | "f2", "f3", "f4", "f5", | ||
235 | "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", | ||
236 | "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31" | ||
237 | } | ||
238 | #endif | ||
239 | }; | ||
240 | |||
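| /* | ||
| * A sketch of how the /8 word offsets above are consumed: run_script() | ||
| * (below) treats a struct unw_frame_info as a flat array of 8-byte | ||
| * words, so a script instruction can name any save location by index | ||
| * alone, e.g.: | ||
| * | ||
| *	unsigned long *s = (unsigned long *) info; | ||
| *	s[unw.preg_index[UNW_REG_B1]] = ...;	(writes info->b1_loc) | ||
| */ | ||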
| /* True if ADDR points at unw.r0, the unwinder's only read-only (constant-zero) save location. */ | ||
241 | static inline int | ||
242 | read_only (void *addr) | ||
243 | { | ||
244 | return (unsigned long) ((char *) addr - (char *) &unw.r0) < sizeof(unw.r0); | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * Returns offset of rREG in struct pt_regs. | ||
249 | */ | ||
250 | static inline unsigned long | ||
251 | pt_regs_off (unsigned long reg) | ||
252 | { | ||
253 | short off = -1; | ||
254 | |||
255 | if (reg < ARRAY_SIZE(unw.pt_regs_offsets)) | ||
256 | off = unw.pt_regs_offsets[reg]; | ||
257 | |||
258 | if (off < 0) { | ||
259 | UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __FUNCTION__, reg); | ||
260 | off = 0; | ||
261 | } | ||
262 | return (unsigned long) off; | ||
263 | } | ||
264 | |||
265 | static inline struct pt_regs * | ||
266 | get_scratch_regs (struct unw_frame_info *info) | ||
267 | { | ||
268 | if (!info->pt) { | ||
269 | /* This should not happen with valid unwind info. */ | ||
270 | UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __FUNCTION__); | ||
271 | if (info->flags & UNW_FLAG_INTERRUPT_FRAME) | ||
272 | info->pt = (unsigned long) ((struct pt_regs *) info->psp - 1); | ||
273 | else | ||
274 | info->pt = info->sp - 16; | ||
275 | } | ||
276 | UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __FUNCTION__, info->sp, info->pt); | ||
277 | return (struct pt_regs *) info->pt; | ||
278 | } | ||
279 | |||
280 | /* Unwind accessors. */ | ||
281 | |||
282 | int | ||
283 | unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char *nat, int write) | ||
284 | { | ||
285 | unsigned long *addr, *nat_addr, nat_mask = 0, dummy_nat; | ||
286 | struct unw_ireg *ireg; | ||
287 | struct pt_regs *pt; | ||
288 | |||
289 | if ((unsigned) regnum - 1 >= 127) { | ||
290 | if (regnum == 0 && !write) { | ||
291 | *val = 0; /* read r0 always returns 0 */ | ||
292 | *nat = 0; | ||
293 | return 0; | ||
294 | } | ||
295 | UNW_DPRINT(0, "unwind.%s: trying to access non-existent r%u\n", | ||
296 | __FUNCTION__, regnum); | ||
297 | return -1; | ||
298 | } | ||
299 | |||
300 | if (regnum < 32) { | ||
301 | if (regnum >= 4 && regnum <= 7) { | ||
302 | /* access a preserved register */ | ||
303 | ireg = &info->r4 + (regnum - 4); | ||
304 | addr = ireg->loc; | ||
305 | if (addr) { | ||
306 | nat_addr = addr + ireg->nat.off; | ||
307 | switch (ireg->nat.type) { | ||
308 | case UNW_NAT_VAL: | ||
309 | /* simulate getf.sig/setf.sig */ | ||
310 | if (write) { | ||
311 | if (*nat) { | ||
312 | /* write NaTVal and be done with it */ | ||
313 | addr[0] = 0; | ||
314 | addr[1] = 0x1fffe; /* NaTVal: sign 0, exponent 0x1fffe, significand 0 */ | ||
315 | return 0; | ||
316 | } | ||
317 | addr[1] = 0x1003e; /* biased exponent used by setf.sig for integer values */ | ||
318 | } else { | ||
319 | if (addr[0] == 0 && addr[1] == 0x1fffe) { | ||
320 | /* return NaT and be done with it */ | ||
321 | *val = 0; | ||
322 | *nat = 1; | ||
323 | return 0; | ||
324 | } | ||
325 | } | ||
326 | /* fall through */ | ||
327 | case UNW_NAT_NONE: | ||
328 | dummy_nat = 0; | ||
329 | nat_addr = &dummy_nat; | ||
330 | break; | ||
331 | |||
332 | case UNW_NAT_MEMSTK: | ||
333 | nat_mask = (1UL << ((long) addr & 0x1f8)/8); | ||
334 | break; | ||
335 | |||
336 | case UNW_NAT_REGSTK: | ||
337 | nat_addr = ia64_rse_rnat_addr(addr); | ||
338 | if ((unsigned long) addr < info->regstk.limit | ||
339 | || (unsigned long) addr >= info->regstk.top) | ||
340 | { | ||
341 | UNW_DPRINT(0, "unwind.%s: %p outside of regstk " | ||
342 | "[0x%lx-0x%lx)\n", | ||
343 | __FUNCTION__, (void *) addr, | ||
344 | info->regstk.limit, | ||
345 | info->regstk.top); | ||
346 | return -1; | ||
347 | } | ||
348 | if ((unsigned long) nat_addr >= info->regstk.top) | ||
349 | nat_addr = &info->sw->ar_rnat; | ||
350 | nat_mask = (1UL << ia64_rse_slot_num(addr)); | ||
351 | break; | ||
352 | } | ||
353 | } else { | ||
354 | addr = &info->sw->r4 + (regnum - 4); | ||
355 | nat_addr = &info->sw->ar_unat; | ||
356 | nat_mask = (1UL << ((long) addr & 0x1f8)/8); | ||
357 | } | ||
358 | } else { | ||
359 | /* access a scratch register */ | ||
360 | pt = get_scratch_regs(info); | ||
361 | addr = (unsigned long *) ((unsigned long)pt + pt_regs_off(regnum)); | ||
362 | if (info->pri_unat_loc) | ||
363 | nat_addr = info->pri_unat_loc; | ||
364 | else | ||
365 | nat_addr = &info->sw->ar_unat; | ||
366 | nat_mask = (1UL << ((long) addr & 0x1f8)/8); | ||
367 | } | ||
368 | } else { | ||
369 | /* access a stacked register */ | ||
370 | addr = ia64_rse_skip_regs((unsigned long *) info->bsp, regnum - 32); | ||
371 | nat_addr = ia64_rse_rnat_addr(addr); | ||
372 | if ((unsigned long) addr < info->regstk.limit | ||
373 | || (unsigned long) addr >= info->regstk.top) | ||
374 | { | ||
375 | UNW_DPRINT(0, "unwind.%s: ignoring attempt to access register outside " | ||
376 | "of rbs\n", __FUNCTION__); | ||
377 | return -1; | ||
378 | } | ||
379 | if ((unsigned long) nat_addr >= info->regstk.top) | ||
380 | nat_addr = &info->sw->ar_rnat; | ||
381 | nat_mask = (1UL << ia64_rse_slot_num(addr)); | ||
382 | } | ||
383 | |||
384 | if (write) { | ||
385 | if (read_only(addr)) { | ||
386 | UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", | ||
387 | __FUNCTION__); | ||
388 | } else { | ||
389 | *addr = *val; | ||
390 | if (*nat) | ||
391 | *nat_addr |= nat_mask; | ||
392 | else | ||
393 | *nat_addr &= ~nat_mask; | ||
394 | } | ||
395 | } else { | ||
396 | if ((*nat_addr & nat_mask) == 0) { | ||
397 | *val = *addr; | ||
398 | *nat = 0; | ||
399 | } else { | ||
400 | *val = 0; /* if register is a NaT, *addr may contain kernel data! */ | ||
401 | *nat = 1; | ||
402 | } | ||
403 | } | ||
404 | return 0; | ||
405 | } | ||
406 | EXPORT_SYMBOL(unw_access_gr); | ||
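| /* | ||
| * Typical (illustrative) use of the accessor from unwind-aware code, | ||
| * assuming INFO was set up by one of the unw_init_* entry points: | ||
| * | ||
| *	unsigned long val; | ||
| *	char nat; | ||
| * | ||
| *	if (unw_access_gr(&info, 4, &val, &nat, 0) == 0 && !nat) | ||
| *		printk("r4 = 0x%lx\n", val); | ||
| */ | ||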
407 | |||
408 | int | ||
409 | unw_access_br (struct unw_frame_info *info, int regnum, unsigned long *val, int write) | ||
410 | { | ||
411 | unsigned long *addr; | ||
412 | struct pt_regs *pt; | ||
413 | |||
414 | switch (regnum) { | ||
415 | /* scratch: */ | ||
416 | case 0: pt = get_scratch_regs(info); addr = &pt->b0; break; | ||
417 | case 6: pt = get_scratch_regs(info); addr = &pt->b6; break; | ||
418 | case 7: pt = get_scratch_regs(info); addr = &pt->b7; break; | ||
419 | |||
420 | /* preserved: */ | ||
421 | case 1: case 2: case 3: case 4: case 5: | ||
422 | addr = *(&info->b1_loc + (regnum - 1)); | ||
423 | if (!addr) | ||
424 | addr = &info->sw->b1 + (regnum - 1); | ||
425 | break; | ||
426 | |||
427 | default: | ||
428 | UNW_DPRINT(0, "unwind.%s: trying to access non-existent b%u\n", | ||
429 | __FUNCTION__, regnum); | ||
430 | return -1; | ||
431 | } | ||
432 | if (write) | ||
433 | if (read_only(addr)) { | ||
434 | UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", | ||
435 | __FUNCTION__); | ||
436 | } else | ||
437 | *addr = *val; | ||
438 | else | ||
439 | *val = *addr; | ||
440 | return 0; | ||
441 | } | ||
442 | EXPORT_SYMBOL(unw_access_br); | ||
443 | |||
444 | int | ||
445 | unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val, int write) | ||
446 | { | ||
447 | struct ia64_fpreg *addr = NULL; | ||
448 | struct pt_regs *pt; | ||
449 | |||
450 | if ((unsigned) (regnum - 2) >= 126) { | ||
451 | UNW_DPRINT(0, "unwind.%s: trying to access non-existent f%u\n", | ||
452 | __FUNCTION__, regnum); | ||
453 | return -1; | ||
454 | } | ||
455 | |||
456 | if (regnum <= 5) { | ||
457 | addr = *(&info->f2_loc + (regnum - 2)); | ||
458 | if (!addr) | ||
459 | addr = &info->sw->f2 + (regnum - 2); | ||
460 | } else if (regnum <= 15) { | ||
461 | if (regnum <= 11) { | ||
462 | pt = get_scratch_regs(info); | ||
463 | addr = &pt->f6 + (regnum - 6); | ||
464 | } | ||
465 | else | ||
466 | addr = &info->sw->f12 + (regnum - 12); | ||
467 | } else if (regnum <= 31) { | ||
468 | addr = info->fr_loc[regnum - 16]; | ||
469 | if (!addr) | ||
470 | addr = &info->sw->f16 + (regnum - 16); | ||
471 | } else { | ||
472 | struct task_struct *t = info->task; | ||
473 | |||
| /* f32-f127 live in the task's fph partition; flush/sync them into t->thread.fph first */ | ||
474 | if (write) | ||
475 | ia64_sync_fph(t); | ||
476 | else | ||
477 | ia64_flush_fph(t); | ||
478 | addr = t->thread.fph + (regnum - 32); | ||
479 | } | ||
480 | |||
481 | if (write) | ||
482 | if (read_only(addr)) { | ||
483 | UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", | ||
484 | __FUNCTION__); | ||
485 | } else | ||
486 | *addr = *val; | ||
487 | else | ||
488 | *val = *addr; | ||
489 | return 0; | ||
490 | } | ||
491 | EXPORT_SYMBOL(unw_access_fr); | ||
492 | |||
493 | int | ||
494 | unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int write) | ||
495 | { | ||
496 | unsigned long *addr; | ||
497 | struct pt_regs *pt; | ||
498 | |||
499 | switch (regnum) { | ||
500 | case UNW_AR_BSP: | ||
501 | addr = info->bsp_loc; | ||
502 | if (!addr) | ||
503 | addr = &info->sw->ar_bspstore; | ||
504 | break; | ||
505 | |||
506 | case UNW_AR_BSPSTORE: | ||
507 | addr = info->bspstore_loc; | ||
508 | if (!addr) | ||
509 | addr = &info->sw->ar_bspstore; | ||
510 | break; | ||
511 | |||
512 | case UNW_AR_PFS: | ||
513 | addr = info->pfs_loc; | ||
514 | if (!addr) | ||
515 | addr = &info->sw->ar_pfs; | ||
516 | break; | ||
517 | |||
518 | case UNW_AR_RNAT: | ||
519 | addr = info->rnat_loc; | ||
520 | if (!addr) | ||
521 | addr = &info->sw->ar_rnat; | ||
522 | break; | ||
523 | |||
524 | case UNW_AR_UNAT: | ||
525 | addr = info->unat_loc; | ||
526 | if (!addr) | ||
527 | addr = &info->sw->ar_unat; | ||
528 | break; | ||
529 | |||
530 | case UNW_AR_LC: | ||
531 | addr = info->lc_loc; | ||
532 | if (!addr) | ||
533 | addr = &info->sw->ar_lc; | ||
534 | break; | ||
535 | |||
536 | case UNW_AR_EC: | ||
537 | if (!info->cfm_loc) | ||
538 | return -1; | ||
539 | if (write) | ||
540 | *info->cfm_loc = | ||
541 | (*info->cfm_loc & ~(0x3fUL << 52)) | ((*val & 0x3f) << 52); | ||
542 | else | ||
543 | *val = (*info->cfm_loc >> 52) & 0x3f; | ||
544 | return 0; | ||
545 | |||
546 | case UNW_AR_FPSR: | ||
547 | addr = info->fpsr_loc; | ||
548 | if (!addr) | ||
549 | addr = &info->sw->ar_fpsr; | ||
550 | break; | ||
551 | |||
552 | case UNW_AR_RSC: | ||
553 | pt = get_scratch_regs(info); | ||
554 | addr = &pt->ar_rsc; | ||
555 | break; | ||
556 | |||
557 | case UNW_AR_CCV: | ||
558 | pt = get_scratch_regs(info); | ||
559 | addr = &pt->ar_ccv; | ||
560 | break; | ||
561 | |||
562 | case UNW_AR_CSD: | ||
563 | pt = get_scratch_regs(info); | ||
564 | addr = &pt->ar_csd; | ||
565 | break; | ||
566 | |||
567 | case UNW_AR_SSD: | ||
568 | pt = get_scratch_regs(info); | ||
569 | addr = &pt->ar_ssd; | ||
570 | break; | ||
571 | |||
572 | default: | ||
573 | UNW_DPRINT(0, "unwind.%s: trying to access non-existent ar%u\n", | ||
574 | __FUNCTION__, regnum); | ||
575 | return -1; | ||
576 | } | ||
577 | |||
578 | if (write) { | ||
579 | if (read_only(addr)) { | ||
580 | UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", | ||
581 | __FUNCTION__); | ||
582 | } else | ||
583 | *addr = *val; | ||
584 | } else | ||
585 | *val = *addr; | ||
586 | return 0; | ||
587 | } | ||
588 | EXPORT_SYMBOL(unw_access_ar); | ||
589 | |||
590 | int | ||
591 | unw_access_pr (struct unw_frame_info *info, unsigned long *val, int write) | ||
592 | { | ||
593 | unsigned long *addr; | ||
594 | |||
595 | addr = info->pr_loc; | ||
596 | if (!addr) | ||
597 | addr = &info->sw->pr; | ||
598 | |||
599 | if (write) { | ||
600 | if (read_only(addr)) { | ||
601 | UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", | ||
602 | __FUNCTION__); | ||
603 | } else | ||
604 | *addr = *val; | ||
605 | } else | ||
606 | *val = *addr; | ||
607 | return 0; | ||
608 | } | ||
609 | EXPORT_SYMBOL(unw_access_pr); | ||
610 | |||
611 | |||
612 | /* Routines to manipulate the state stack. */ | ||
613 | |||
614 | static inline void | ||
615 | push (struct unw_state_record *sr) | ||
616 | { | ||
617 | struct unw_reg_state *rs; | ||
618 | |||
619 | rs = alloc_reg_state(); | ||
620 | if (!rs) { | ||
621 | printk(KERN_ERR "unwind: cannot stack reg state!\n"); | ||
622 | return; | ||
623 | } | ||
624 | memcpy(rs, &sr->curr, sizeof(*rs)); | ||
625 | sr->curr.next = rs; | ||
626 | } | ||
627 | |||
628 | static void | ||
629 | pop (struct unw_state_record *sr) | ||
630 | { | ||
631 | struct unw_reg_state *rs = sr->curr.next; | ||
632 | |||
633 | if (!rs) { | ||
634 | printk(KERN_ERR "unwind: stack underflow!\n"); | ||
635 | return; | ||
636 | } | ||
637 | memcpy(&sr->curr, rs, sizeof(*rs)); | ||
638 | free_reg_state(rs); | ||
639 | } | ||
640 | |||
641 | /* Make a copy of the state stack. Non-recursive to avoid stack overflows. */ | ||
642 | static struct unw_reg_state * | ||
643 | dup_state_stack (struct unw_reg_state *rs) | ||
644 | { | ||
645 | struct unw_reg_state *copy, *prev = NULL, *first = NULL; | ||
646 | |||
647 | while (rs) { | ||
648 | copy = alloc_reg_state(); | ||
649 | if (!copy) { | ||
650 | printk(KERN_ERR "unwind.dup_state_stack: out of memory\n"); | ||
651 | return NULL; | ||
652 | } | ||
653 | memcpy(copy, rs, sizeof(*copy)); | ||
654 | if (first) | ||
655 | prev->next = copy; | ||
656 | else | ||
657 | first = copy; | ||
658 | rs = rs->next; | ||
659 | prev = copy; | ||
660 | } | ||
661 | return first; | ||
662 | } | ||
663 | |||
664 | /* Free all stacked register states (but not RS itself). */ | ||
665 | static void | ||
666 | free_state_stack (struct unw_reg_state *rs) | ||
667 | { | ||
668 | struct unw_reg_state *p, *next; | ||
669 | |||
670 | for (p = rs->next; p != NULL; p = next) { | ||
671 | next = p->next; | ||
672 | free_reg_state(p); | ||
673 | } | ||
674 | rs->next = NULL; | ||
675 | } | ||
676 | |||
677 | /* Unwind decoder routines */ | ||
678 | |||
679 | static enum unw_register_index __attribute_const__ | ||
680 | decode_abreg (unsigned char abreg, int memory) | ||
681 | { | ||
682 | switch (abreg) { | ||
683 | case 0x04 ... 0x07: return UNW_REG_R4 + (abreg - 0x04); | ||
684 | case 0x22 ... 0x25: return UNW_REG_F2 + (abreg - 0x22); | ||
685 | case 0x30 ... 0x3f: return UNW_REG_F16 + (abreg - 0x30); | ||
686 | case 0x41 ... 0x45: return UNW_REG_B1 + (abreg - 0x41); | ||
687 | case 0x60: return UNW_REG_PR; | ||
688 | case 0x61: return UNW_REG_PSP; | ||
689 | case 0x62: return memory ? UNW_REG_PRI_UNAT_MEM : UNW_REG_PRI_UNAT_GR; | ||
690 | case 0x63: return UNW_REG_RP; | ||
691 | case 0x64: return UNW_REG_BSP; | ||
692 | case 0x65: return UNW_REG_BSPSTORE; | ||
693 | case 0x66: return UNW_REG_RNAT; | ||
694 | case 0x67: return UNW_REG_UNAT; | ||
695 | case 0x68: return UNW_REG_FPSR; | ||
696 | case 0x69: return UNW_REG_PFS; | ||
697 | case 0x6a: return UNW_REG_LC; | ||
698 | default: | ||
699 | break; | ||
700 | } | ||
701 | UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __FUNCTION__, abreg); | ||
702 | return UNW_REG_LC; | ||
703 | } | ||
704 | |||
705 | static void | ||
706 | set_reg (struct unw_reg_info *reg, enum unw_where where, int when, unsigned long val) | ||
707 | { | ||
708 | reg->val = val; | ||
709 | reg->where = where; | ||
710 | if (reg->when == UNW_WHEN_NEVER) | ||
711 | reg->when = when; | ||
712 | } | ||
713 | |||
714 | static void | ||
715 | alloc_spill_area (unsigned long *offp, unsigned long regsize, | ||
716 | struct unw_reg_info *lo, struct unw_reg_info *hi) | ||
717 | { | ||
718 | struct unw_reg_info *reg; | ||
719 | |||
720 | for (reg = hi; reg >= lo; --reg) { | ||
721 | if (reg->where == UNW_WHERE_SPILL_HOME) { | ||
722 | reg->where = UNW_WHERE_PSPREL; | ||
723 | *offp -= regsize; | ||
724 | reg->val = *offp; | ||
725 | } | ||
726 | } | ||
727 | } | ||
728 | |||
729 | static inline void | ||
730 | spill_next_when (struct unw_reg_info **regp, struct unw_reg_info *lim, unw_word t) | ||
731 | { | ||
732 | struct unw_reg_info *reg; | ||
733 | |||
734 | for (reg = *regp; reg <= lim; ++reg) { | ||
735 | if (reg->where == UNW_WHERE_SPILL_HOME) { | ||
736 | reg->when = t; | ||
737 | *regp = reg + 1; | ||
738 | return; | ||
739 | } | ||
740 | } | ||
741 | UNW_DPRINT(0, "unwind.%s: excess spill!\n", __FUNCTION__); | ||
742 | } | ||
743 | |||
744 | static inline void | ||
745 | finish_prologue (struct unw_state_record *sr) | ||
746 | { | ||
747 | struct unw_reg_info *reg; | ||
748 | unsigned long off; | ||
749 | int i; | ||
750 | |||
751 | /* | ||
752 | * First, resolve implicit register save locations (see Section "11.4.2.3 Rules | ||
753 | * for Using Unwind Descriptors", rule 3): | ||
754 | */ | ||
755 | for (i = 0; i < (int) ARRAY_SIZE(unw.save_order); ++i) { | ||
756 | reg = sr->curr.reg + unw.save_order[i]; | ||
757 | if (reg->where == UNW_WHERE_GR_SAVE) { | ||
758 | reg->where = UNW_WHERE_GR; | ||
759 | reg->val = sr->gr_save_loc++; | ||
760 | } | ||
761 | } | ||
762 | |||
763 | /* | ||
764 | * Next, compute when the fp, general, and branch registers get | ||
765 | * saved. This must come before alloc_spill_area() because | ||
766 | * we need to know which registers are spilled to their home | ||
767 | * locations. | ||
768 | */ | ||
769 | if (sr->imask) { | ||
770 | unsigned char kind, mask = 0, *cp = sr->imask; | ||
771 | int t; | ||
772 | static const unsigned char limit[3] = { | ||
773 | UNW_REG_F31, UNW_REG_R7, UNW_REG_B5 | ||
774 | }; | ||
775 | struct unw_reg_info *(regs[3]); | ||
776 | |||
777 | regs[0] = sr->curr.reg + UNW_REG_F2; | ||
778 | regs[1] = sr->curr.reg + UNW_REG_R4; | ||
779 | regs[2] = sr->curr.reg + UNW_REG_B1; | ||
780 | |||
781 | for (t = 0; t < sr->region_len; ++t) { | ||
782 | if ((t & 3) == 0) | ||
783 | mask = *cp++; | ||
784 | kind = (mask >> 2*(3-(t & 3))) & 3; | ||
785 | if (kind > 0) | ||
786 | spill_next_when(®s[kind - 1], sr->curr.reg + limit[kind - 1], | ||
787 | sr->region_start + t); | ||
788 | } | ||
789 | } | ||
790 | /* | ||
791 | * Next, lay out the memory stack spill area: | ||
792 | */ | ||
793 | if (sr->any_spills) { | ||
794 | off = sr->spill_offset; | ||
795 | alloc_spill_area(&off, 16, sr->curr.reg + UNW_REG_F2, sr->curr.reg + UNW_REG_F31); | ||
796 | alloc_spill_area(&off, 8, sr->curr.reg + UNW_REG_B1, sr->curr.reg + UNW_REG_B5); | ||
797 | alloc_spill_area(&off, 8, sr->curr.reg + UNW_REG_R4, sr->curr.reg + UNW_REG_R7); | ||
798 | } | ||
799 | } | ||
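| /* | ||
| * The imask scanned above packs one 2-bit spill kind per instruction | ||
| * slot, four slots per byte, most significant pair first: 0 = none, | ||
| * 1 = fr, 2 = gr, 3 = br. For example (illustrative value only), an | ||
| * imask byte of 0x64 == 01 10 01 00 spills an fr, then a gr, then | ||
| * another fr in the region's first three slots. | ||
| */ | ||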
800 | |||
801 | /* | ||
802 | * Region header descriptors. | ||
803 | */ | ||
804 | |||
805 | static void | ||
806 | desc_prologue (int body, unw_word rlen, unsigned char mask, unsigned char grsave, | ||
807 | struct unw_state_record *sr) | ||
808 | { | ||
809 | int i, region_start; | ||
810 | |||
811 | if (!(sr->in_body || sr->first_region)) | ||
812 | finish_prologue(sr); | ||
813 | sr->first_region = 0; | ||
814 | |||
815 | /* check if we're done: */ | ||
816 | if (sr->when_target < sr->region_start + sr->region_len) { | ||
817 | sr->done = 1; | ||
818 | return; | ||
819 | } | ||
820 | |||
821 | region_start = sr->region_start + sr->region_len; | ||
822 | |||
823 | for (i = 0; i < sr->epilogue_count; ++i) | ||
824 | pop(sr); | ||
825 | sr->epilogue_count = 0; | ||
826 | sr->epilogue_start = UNW_WHEN_NEVER; | ||
827 | |||
828 | sr->region_start = region_start; | ||
829 | sr->region_len = rlen; | ||
830 | sr->in_body = body; | ||
831 | |||
832 | if (!body) { | ||
833 | push(sr); | ||
834 | |||
835 | for (i = 0; i < 4; ++i) { | ||
836 | if (mask & 0x8) | ||
837 | set_reg(sr->curr.reg + unw.save_order[i], UNW_WHERE_GR, | ||
838 | sr->region_start + sr->region_len - 1, grsave++); | ||
839 | mask <<= 1; | ||
840 | } | ||
841 | sr->gr_save_loc = grsave; | ||
842 | sr->any_spills = 0; | ||
843 | sr->imask = NULL; | ||
844 | sr->spill_offset = 0x10; /* default to psp+16 */ | ||
845 | } | ||
846 | } | ||
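| /* | ||
| * Example (hypothetical header): a prologue_gr region with mask 0xa | ||
| * and grsave 32 marks rp and psp as saved; the loop above assigns | ||
| * rp -> r32 and psp -> r33 (mask bits are consumed in save_order: | ||
| * rp, ar.pfs, psp, pr), and gr_save_loc continues at r34. | ||
| */ | ||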
847 | |||
848 | /* | ||
849 | * Prologue descriptors. | ||
850 | */ | ||
851 | |||
852 | static inline void | ||
853 | desc_abi (unsigned char abi, unsigned char context, struct unw_state_record *sr) | ||
854 | { | ||
855 | if (abi == 3 && context == 'i') { | ||
856 | sr->flags |= UNW_FLAG_INTERRUPT_FRAME; | ||
857 | UNW_DPRINT(3, "unwind.%s: interrupt frame\n", __FUNCTION__); | ||
858 | } | ||
859 | else | ||
860 | UNW_DPRINT(0, "unwind%s: ignoring unwabi(abi=0x%x,context=0x%x)\n", | ||
861 | __FUNCTION__, abi, context); | ||
862 | } | ||
863 | |||
864 | static inline void | ||
865 | desc_br_gr (unsigned char brmask, unsigned char gr, struct unw_state_record *sr) | ||
866 | { | ||
867 | int i; | ||
868 | |||
869 | for (i = 0; i < 5; ++i) { | ||
870 | if (brmask & 1) | ||
871 | set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_GR, | ||
872 | sr->region_start + sr->region_len - 1, gr++); | ||
873 | brmask >>= 1; | ||
874 | } | ||
875 | } | ||
876 | |||
877 | static inline void | ||
878 | desc_br_mem (unsigned char brmask, struct unw_state_record *sr) | ||
879 | { | ||
880 | int i; | ||
881 | |||
882 | for (i = 0; i < 5; ++i) { | ||
883 | if (brmask & 1) { | ||
884 | set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_SPILL_HOME, | ||
885 | sr->region_start + sr->region_len - 1, 0); | ||
886 | sr->any_spills = 1; | ||
887 | } | ||
888 | brmask >>= 1; | ||
889 | } | ||
890 | } | ||
891 | |||
892 | static inline void | ||
893 | desc_frgr_mem (unsigned char grmask, unw_word frmask, struct unw_state_record *sr) | ||
894 | { | ||
895 | int i; | ||
896 | |||
897 | for (i = 0; i < 4; ++i) { | ||
898 | if ((grmask & 1) != 0) { | ||
899 | set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME, | ||
900 | sr->region_start + sr->region_len - 1, 0); | ||
901 | sr->any_spills = 1; | ||
902 | } | ||
903 | grmask >>= 1; | ||
904 | } | ||
905 | for (i = 0; i < 20; ++i) { | ||
906 | if ((frmask & 1) != 0) { | ||
907 | int base = (i < 4) ? UNW_REG_F2 : UNW_REG_F16 - 4; | ||
908 | set_reg(sr->curr.reg + base + i, UNW_WHERE_SPILL_HOME, | ||
909 | sr->region_start + sr->region_len - 1, 0); | ||
910 | sr->any_spills = 1; | ||
911 | } | ||
912 | frmask >>= 1; | ||
913 | } | ||
914 | } | ||
915 | |||
916 | static inline void | ||
917 | desc_fr_mem (unsigned char frmask, struct unw_state_record *sr) | ||
918 | { | ||
919 | int i; | ||
920 | |||
921 | for (i = 0; i < 4; ++i) { | ||
922 | if ((frmask & 1) != 0) { | ||
923 | set_reg(sr->curr.reg + UNW_REG_F2 + i, UNW_WHERE_SPILL_HOME, | ||
924 | sr->region_start + sr->region_len - 1, 0); | ||
925 | sr->any_spills = 1; | ||
926 | } | ||
927 | frmask >>= 1; | ||
928 | } | ||
929 | } | ||
930 | |||
931 | static inline void | ||
932 | desc_gr_gr (unsigned char grmask, unsigned char gr, struct unw_state_record *sr) | ||
933 | { | ||
934 | int i; | ||
935 | |||
936 | for (i = 0; i < 4; ++i) { | ||
937 | if ((grmask & 1) != 0) | ||
938 | set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_GR, | ||
939 | sr->region_start + sr->region_len - 1, gr++); | ||
940 | grmask >>= 1; | ||
941 | } | ||
942 | } | ||
943 | |||
944 | static inline void | ||
945 | desc_gr_mem (unsigned char grmask, struct unw_state_record *sr) | ||
946 | { | ||
947 | int i; | ||
948 | |||
949 | for (i = 0; i < 4; ++i) { | ||
950 | if ((grmask & 1) != 0) { | ||
951 | set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME, | ||
952 | sr->region_start + sr->region_len - 1, 0); | ||
953 | sr->any_spills = 1; | ||
954 | } | ||
955 | grmask >>= 1; | ||
956 | } | ||
957 | } | ||
958 | |||
959 | static inline void | ||
960 | desc_mem_stack_f (unw_word t, unw_word size, struct unw_state_record *sr) | ||
961 | { | ||
962 | set_reg(sr->curr.reg + UNW_REG_PSP, UNW_WHERE_NONE, | ||
963 | sr->region_start + min_t(int, t, sr->region_len - 1), 16*size); | ||
964 | } | ||
965 | |||
966 | static inline void | ||
967 | desc_mem_stack_v (unw_word t, struct unw_state_record *sr) | ||
968 | { | ||
969 | sr->curr.reg[UNW_REG_PSP].when = sr->region_start + min_t(int, t, sr->region_len - 1); | ||
970 | } | ||
971 | |||
972 | static inline void | ||
973 | desc_reg_gr (unsigned char reg, unsigned char dst, struct unw_state_record *sr) | ||
974 | { | ||
975 | set_reg(sr->curr.reg + reg, UNW_WHERE_GR, sr->region_start + sr->region_len - 1, dst); | ||
976 | } | ||
977 | |||
978 | static inline void | ||
979 | desc_reg_psprel (unsigned char reg, unw_word pspoff, struct unw_state_record *sr) | ||
980 | { | ||
981 | set_reg(sr->curr.reg + reg, UNW_WHERE_PSPREL, sr->region_start + sr->region_len - 1, | ||
982 | 0x10 - 4*pspoff); | ||
983 | } | ||
984 | |||
985 | static inline void | ||
986 | desc_reg_sprel (unsigned char reg, unw_word spoff, struct unw_state_record *sr) | ||
987 | { | ||
988 | set_reg(sr->curr.reg + reg, UNW_WHERE_SPREL, sr->region_start + sr->region_len - 1, | ||
989 | 4*spoff); | ||
990 | } | ||
991 | |||
992 | static inline void | ||
993 | desc_rp_br (unsigned char dst, struct unw_state_record *sr) | ||
994 | { | ||
995 | sr->return_link_reg = dst; | ||
996 | } | ||
997 | |||
998 | static inline void | ||
999 | desc_reg_when (unsigned char regnum, unw_word t, struct unw_state_record *sr) | ||
1000 | { | ||
1001 | struct unw_reg_info *reg = sr->curr.reg + regnum; | ||
1002 | |||
1003 | if (reg->where == UNW_WHERE_NONE) | ||
1004 | reg->where = UNW_WHERE_GR_SAVE; | ||
1005 | reg->when = sr->region_start + min_t(int, t, sr->region_len - 1); | ||
1006 | } | ||
1007 | |||
1008 | static inline void | ||
1009 | desc_spill_base (unw_word pspoff, struct unw_state_record *sr) | ||
1010 | { | ||
1011 | sr->spill_offset = 0x10 - 4*pspoff; | ||
1012 | } | ||
1013 | |||
1014 | static inline unsigned char * | ||
1015 | desc_spill_mask (unsigned char *imaskp, struct unw_state_record *sr) | ||
1016 | { | ||
1017 | sr->imask = imaskp; | ||
1018 | return imaskp + (2*sr->region_len + 7)/8; /* 2 bits per instruction, rounded up to whole bytes */ | ||
1019 | } | ||
1020 | |||
1021 | /* | ||
1022 | * Body descriptors. | ||
1023 | */ | ||
1024 | static inline void | ||
1025 | desc_epilogue (unw_word t, unw_word ecount, struct unw_state_record *sr) | ||
1026 | { | ||
1027 | sr->epilogue_start = sr->region_start + sr->region_len - 1 - t; | ||
1028 | sr->epilogue_count = ecount + 1; | ||
1029 | } | ||
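| /* | ||
| * Note how the epilogue descriptor counts backwards: the epilogue | ||
| * begins t slots before the region's last instruction, and ecount | ||
| * encodes one less than the number of state-stack levels to pop | ||
| * (see the pop() loop in desc_prologue()). | ||
| */ | ||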
1030 | |||
1031 | static inline void | ||
1032 | desc_copy_state (unw_word label, struct unw_state_record *sr) | ||
1033 | { | ||
1034 | struct unw_labeled_state *ls; | ||
1035 | |||
1036 | for (ls = sr->labeled_states; ls; ls = ls->next) { | ||
1037 | if (ls->label == label) { | ||
1038 | free_state_stack(&sr->curr); | ||
1039 | memcpy(&sr->curr, &ls->saved_state, sizeof(sr->curr)); | ||
1040 | sr->curr.next = dup_state_stack(ls->saved_state.next); | ||
1041 | return; | ||
1042 | } | ||
1043 | } | ||
1044 | printk(KERN_ERR "unwind: failed to find state labeled 0x%lx\n", label); | ||
1045 | } | ||
1046 | |||
1047 | static inline void | ||
1048 | desc_label_state (unw_word label, struct unw_state_record *sr) | ||
1049 | { | ||
1050 | struct unw_labeled_state *ls; | ||
1051 | |||
1052 | ls = alloc_labeled_state(); | ||
1053 | if (!ls) { | ||
1054 | printk(KERN_ERR "unwind.desc_label_state(): out of memory\n"); | ||
1055 | return; | ||
1056 | } | ||
1057 | ls->label = label; | ||
1058 | memcpy(&ls->saved_state, &sr->curr, sizeof(ls->saved_state)); | ||
1059 | ls->saved_state.next = dup_state_stack(sr->curr.next); | ||
1060 | |||
1061 | /* insert into list of labeled states: */ | ||
1062 | ls->next = sr->labeled_states; | ||
1063 | sr->labeled_states = ls; | ||
1064 | } | ||
1065 | |||
1066 | /* | ||
1067 | * General descriptors. | ||
1068 | */ | ||
1069 | |||
1070 | static inline int | ||
1071 | desc_is_active (unsigned char qp, unw_word t, struct unw_state_record *sr) | ||
1072 | { | ||
1073 | if (sr->when_target <= sr->region_start + min_t(int, t, sr->region_len - 1)) | ||
1074 | return 0; | ||
1075 | if (qp > 0) { | ||
1076 | if ((sr->pr_val & (1UL << qp)) == 0) | ||
1077 | return 0; | ||
1078 | sr->pr_mask |= (1UL << qp); | ||
1079 | } | ||
1080 | return 1; | ||
1081 | } | ||
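| /* | ||
| * Predicate-qualified descriptors: a spill guarded by, say, p6 (qp = 6) | ||
| * applies only if bit 6 of the frame's predicates was set. Recording | ||
| * that bit in pr_mask makes the cached script valid only for matching | ||
| * predicate values; cache_match() (below) rechecks exactly these bits. | ||
| */ | ||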
1082 | |||
1083 | static inline void | ||
1084 | desc_restore_p (unsigned char qp, unw_word t, unsigned char abreg, struct unw_state_record *sr) | ||
1085 | { | ||
1086 | struct unw_reg_info *r; | ||
1087 | |||
1088 | if (!desc_is_active(qp, t, sr)) | ||
1089 | return; | ||
1090 | |||
1091 | r = sr->curr.reg + decode_abreg(abreg, 0); | ||
1092 | r->where = UNW_WHERE_NONE; | ||
1093 | r->when = UNW_WHEN_NEVER; | ||
1094 | r->val = 0; | ||
1095 | } | ||
1096 | |||
1097 | static inline void | ||
1098 | desc_spill_reg_p (unsigned char qp, unw_word t, unsigned char abreg, unsigned char x, | ||
1099 | unsigned char ytreg, struct unw_state_record *sr) | ||
1100 | { | ||
1101 | enum unw_where where = UNW_WHERE_GR; | ||
1102 | struct unw_reg_info *r; | ||
1103 | |||
1104 | if (!desc_is_active(qp, t, sr)) | ||
1105 | return; | ||
1106 | |||
1107 | if (x) | ||
1108 | where = UNW_WHERE_BR; | ||
1109 | else if (ytreg & 0x80) | ||
1110 | where = UNW_WHERE_FR; | ||
1111 | |||
1112 | r = sr->curr.reg + decode_abreg(abreg, 0); | ||
1113 | r->where = where; | ||
1114 | r->when = sr->region_start + min_t(int, t, sr->region_len - 1); | ||
1115 | r->val = (ytreg & 0x7f); | ||
1116 | } | ||
1117 | |||
1118 | static inline void | ||
1119 | desc_spill_psprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word pspoff, | ||
1120 | struct unw_state_record *sr) | ||
1121 | { | ||
1122 | struct unw_reg_info *r; | ||
1123 | |||
1124 | if (!desc_is_active(qp, t, sr)) | ||
1125 | return; | ||
1126 | |||
1127 | r = sr->curr.reg + decode_abreg(abreg, 1); | ||
1128 | r->where = UNW_WHERE_PSPREL; | ||
1129 | r->when = sr->region_start + min_t(int, t, sr->region_len - 1); | ||
1130 | r->val = 0x10 - 4*pspoff; | ||
1131 | } | ||
1132 | |||
1133 | static inline void | ||
1134 | desc_spill_sprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word spoff, | ||
1135 | struct unw_state_record *sr) | ||
1136 | { | ||
1137 | struct unw_reg_info *r; | ||
1138 | |||
1139 | if (!desc_is_active(qp, t, sr)) | ||
1140 | return; | ||
1141 | |||
1142 | r = sr->curr.reg + decode_abreg(abreg, 1); | ||
1143 | r->where = UNW_WHERE_SPREL; | ||
1144 | r->when = sr->region_start + min_t(int, t, sr->region_len - 1); | ||
1145 | r->val = 4*spoff; | ||
1146 | } | ||
1147 | |||
1148 | #define UNW_DEC_BAD_CODE(code) printk(KERN_ERR "unwind: unknown code 0x%02x\n", \ | ||
1149 | code); | ||
1150 | |||
1151 | /* | ||
1152 | * region headers: | ||
1153 | */ | ||
1154 | #define UNW_DEC_PROLOGUE_GR(fmt,r,m,gr,arg) desc_prologue(0,r,m,gr,arg) | ||
1155 | #define UNW_DEC_PROLOGUE(fmt,b,r,arg) desc_prologue(b,r,0,32,arg) | ||
1156 | /* | ||
1157 | * prologue descriptors: | ||
1158 | */ | ||
1159 | #define UNW_DEC_ABI(fmt,a,c,arg) desc_abi(a,c,arg) | ||
1160 | #define UNW_DEC_BR_GR(fmt,b,g,arg) desc_br_gr(b,g,arg) | ||
1161 | #define UNW_DEC_BR_MEM(fmt,b,arg) desc_br_mem(b,arg) | ||
1162 | #define UNW_DEC_FRGR_MEM(fmt,g,f,arg) desc_frgr_mem(g,f,arg) | ||
1163 | #define UNW_DEC_FR_MEM(fmt,f,arg) desc_fr_mem(f,arg) | ||
1164 | #define UNW_DEC_GR_GR(fmt,m,g,arg) desc_gr_gr(m,g,arg) | ||
1165 | #define UNW_DEC_GR_MEM(fmt,m,arg) desc_gr_mem(m,arg) | ||
1166 | #define UNW_DEC_MEM_STACK_F(fmt,t,s,arg) desc_mem_stack_f(t,s,arg) | ||
1167 | #define UNW_DEC_MEM_STACK_V(fmt,t,arg) desc_mem_stack_v(t,arg) | ||
1168 | #define UNW_DEC_REG_GR(fmt,r,d,arg) desc_reg_gr(r,d,arg) | ||
1169 | #define UNW_DEC_REG_PSPREL(fmt,r,o,arg) desc_reg_psprel(r,o,arg) | ||
1170 | #define UNW_DEC_REG_SPREL(fmt,r,o,arg) desc_reg_sprel(r,o,arg) | ||
1171 | #define UNW_DEC_REG_WHEN(fmt,r,t,arg) desc_reg_when(r,t,arg) | ||
1172 | #define UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg) desc_reg_when(UNW_REG_PRI_UNAT_GR,t,arg) | ||
1173 | #define UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg) desc_reg_when(UNW_REG_PRI_UNAT_MEM,t,arg) | ||
1174 | #define UNW_DEC_PRIUNAT_GR(fmt,r,arg) desc_reg_gr(UNW_REG_PRI_UNAT_GR,r,arg) | ||
1175 | #define UNW_DEC_PRIUNAT_PSPREL(fmt,o,arg) desc_reg_psprel(UNW_REG_PRI_UNAT_MEM,o,arg) | ||
1176 | #define UNW_DEC_PRIUNAT_SPREL(fmt,o,arg) desc_reg_sprel(UNW_REG_PRI_UNAT_MEM,o,arg) | ||
1177 | #define UNW_DEC_RP_BR(fmt,d,arg) desc_rp_br(d,arg) | ||
1178 | #define UNW_DEC_SPILL_BASE(fmt,o,arg) desc_spill_base(o,arg) | ||
1179 | #define UNW_DEC_SPILL_MASK(fmt,m,arg) (m = desc_spill_mask(m,arg)) | ||
1180 | /* | ||
1181 | * body descriptors: | ||
1182 | */ | ||
1183 | #define UNW_DEC_EPILOGUE(fmt,t,c,arg) desc_epilogue(t,c,arg) | ||
1184 | #define UNW_DEC_COPY_STATE(fmt,l,arg) desc_copy_state(l,arg) | ||
1185 | #define UNW_DEC_LABEL_STATE(fmt,l,arg) desc_label_state(l,arg) | ||
1186 | /* | ||
1187 | * general unwind descriptors: | ||
1188 | */ | ||
1189 | #define UNW_DEC_SPILL_REG_P(f,p,t,a,x,y,arg) desc_spill_reg_p(p,t,a,x,y,arg) | ||
1190 | #define UNW_DEC_SPILL_REG(f,t,a,x,y,arg) desc_spill_reg_p(0,t,a,x,y,arg) | ||
1191 | #define UNW_DEC_SPILL_PSPREL_P(f,p,t,a,o,arg) desc_spill_psprel_p(p,t,a,o,arg) | ||
1192 | #define UNW_DEC_SPILL_PSPREL(f,t,a,o,arg) desc_spill_psprel_p(0,t,a,o,arg) | ||
1193 | #define UNW_DEC_SPILL_SPREL_P(f,p,t,a,o,arg) desc_spill_sprel_p(p,t,a,o,arg) | ||
1194 | #define UNW_DEC_SPILL_SPREL(f,t,a,o,arg) desc_spill_sprel_p(0,t,a,o,arg) | ||
1195 | #define UNW_DEC_RESTORE_P(f,p,t,a,arg) desc_restore_p(p,t,a,arg) | ||
1196 | #define UNW_DEC_RESTORE(f,t,a,arg) desc_restore_p(0,t,a,arg) | ||
1197 | |||
1198 | #include "unwind_decoder.c" | ||
1199 | |||
1200 | |||
1201 | /* Unwind scripts. */ | ||
1202 | |||
1203 | static inline unw_hash_index_t | ||
1204 | hash (unsigned long ip) | ||
1205 | { | ||
1206 | # define hashmagic 0x9e3779b97f4a7c16UL /* based on ((sqrt(5)-1)/2)*2^64 */ | ||
1207 | |||
1208 | return (ip >> 4)*hashmagic >> (64 - UNW_LOG_HASH_SIZE); | ||
1209 | #undef hashmagic | ||
1210 | } | ||
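| /* | ||
| * This is Fibonacci hashing: ip >> 4 drops the slot bits (all three | ||
| * slots of a 16-byte bundle share a bucket), the multiply by roughly | ||
| * 2^64/golden-ratio scrambles the remaining bits, and the final shift | ||
| * keeps the top UNW_LOG_HASH_SIZE bits as the table index. | ||
| */ | ||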
1211 | |||
1212 | static inline long | ||
1213 | cache_match (struct unw_script *script, unsigned long ip, unsigned long pr) | ||
1214 | { | ||
1215 | read_lock(&script->lock); | ||
1216 | if (ip == script->ip && ((pr ^ script->pr_val) & script->pr_mask) == 0) | ||
1217 | /* keep the read lock... */ | ||
1218 | return 1; | ||
1219 | read_unlock(&script->lock); | ||
1220 | return 0; | ||
1221 | } | ||
1222 | |||
1223 | static inline struct unw_script * | ||
1224 | script_lookup (struct unw_frame_info *info) | ||
1225 | { | ||
1226 | struct unw_script *script = unw.cache + info->hint; | ||
1227 | unsigned short index; | ||
1228 | unsigned long ip, pr; | ||
1229 | |||
1230 | if (UNW_DEBUG_ON(0)) | ||
1231 | return NULL; /* Always regenerate scripts in debug mode */ | ||
1232 | |||
1233 | STAT(++unw.stat.cache.lookups); | ||
1234 | |||
1235 | ip = info->ip; | ||
1236 | pr = info->pr; | ||
1237 | |||
1238 | if (cache_match(script, ip, pr)) { | ||
1239 | STAT(++unw.stat.cache.hinted_hits); | ||
1240 | return script; | ||
1241 | } | ||
1242 | |||
1243 | index = unw.hash[hash(ip)]; | ||
1244 | if (index >= UNW_CACHE_SIZE) | ||
1245 | return NULL; | ||
1246 | |||
1247 | script = unw.cache + index; | ||
1248 | while (1) { | ||
1249 | if (cache_match(script, ip, pr)) { | ||
1250 | /* update hint; no locking required as single-word writes are atomic */ | ||
1251 | STAT(++unw.stat.cache.normal_hits); | ||
1252 | unw.cache[info->prev_script].hint = script - unw.cache; | ||
1253 | return script; | ||
1254 | } | ||
1255 | if (script->coll_chain >= UNW_HASH_SIZE) | ||
1256 | return NULL; | ||
1257 | script = unw.cache + script->coll_chain; | ||
1258 | STAT(++unw.stat.cache.collision_chain_traversals); | ||
1259 | } | ||
1260 | } | ||
1261 | |||
1262 | /* | ||
1263 | * On return, the write-lock on the returned SCRIPT is still held. | ||
1264 | */ | ||
1265 | static inline struct unw_script * | ||
1266 | script_new (unsigned long ip) | ||
1267 | { | ||
1268 | struct unw_script *script, *prev, *tmp; | ||
1269 | unw_hash_index_t index; | ||
1270 | unsigned short head; | ||
1271 | |||
1272 | STAT(++unw.stat.script.news); | ||
1273 | |||
1274 | /* | ||
1275 | * Can't (easily) use cmpxchg() here because of the ABA problem | ||
1276 | * that is intrinsic to such compare-and-swap LRU updates... | ||
1277 | */ | ||
1278 | head = unw.lru_head; | ||
1279 | script = unw.cache + head; | ||
1280 | unw.lru_head = script->lru_chain; | ||
1281 | |||
1282 | /* | ||
1283 | * We'd deadlock here if we interrupted a thread that is holding a read lock on | ||
1284 | * script->lock. Thus, if the write_trylock() fails, we simply bail out. The | ||
1285 | * alternative would be to disable interrupts whenever we hold a read-lock, but | ||
1286 | * that seems silly. | ||
1287 | */ | ||
1288 | if (!write_trylock(&script->lock)) | ||
1289 | return NULL; | ||
1290 | |||
1291 | /* re-insert script at the tail of the LRU chain: */ | ||
1292 | unw.cache[unw.lru_tail].lru_chain = head; | ||
1293 | unw.lru_tail = head; | ||
1294 | |||
1295 | /* remove the old script from the hash table (if it's there): */ | ||
1296 | if (script->ip) { | ||
1297 | index = hash(script->ip); | ||
1298 | tmp = unw.cache + unw.hash[index]; | ||
1299 | prev = NULL; | ||
1300 | while (1) { | ||
1301 | if (tmp == script) { | ||
1302 | if (prev) | ||
1303 | prev->coll_chain = tmp->coll_chain; | ||
1304 | else | ||
1305 | unw.hash[index] = tmp->coll_chain; | ||
1306 | break; | ||
1307 | } else | ||
1308 | prev = tmp; | ||
1309 | if (tmp->coll_chain >= UNW_CACHE_SIZE) | ||
1310 | /* old script wasn't in the hash-table */ | ||
1311 | break; | ||
1312 | tmp = unw.cache + tmp->coll_chain; | ||
1313 | } | ||
1314 | } | ||
1315 | |||
1316 | /* enter new script in the hash table */ | ||
1317 | index = hash(ip); | ||
1318 | script->coll_chain = unw.hash[index]; | ||
1319 | unw.hash[index] = script - unw.cache; | ||
1320 | |||
1321 | script->ip = ip; /* set new IP while we're holding the locks */ | ||
1322 | |||
1323 | STAT(if (script->coll_chain < UNW_CACHE_SIZE) ++unw.stat.script.collisions); | ||
1324 | |||
1325 | script->flags = 0; | ||
1326 | script->hint = 0; | ||
1327 | script->count = 0; | ||
1328 | return script; | ||
1329 | } | ||
1330 | |||
1331 | static void | ||
1332 | script_finalize (struct unw_script *script, struct unw_state_record *sr) | ||
1333 | { | ||
1334 | script->pr_mask = sr->pr_mask; | ||
1335 | script->pr_val = sr->pr_val; | ||
1336 | /* | ||
1337 | * We could down-grade our write-lock on script->lock here but | ||
1338 | * the rwlock API doesn't offer atomic lock downgrading, so | ||
1339 | * we'll just keep the write-lock and release it later when | ||
1340 | * we're done using the script. | ||
1341 | */ | ||
1342 | } | ||
1343 | |||
1344 | static inline void | ||
1345 | script_emit (struct unw_script *script, struct unw_insn insn) | ||
1346 | { | ||
1347 | if (script->count >= UNW_MAX_SCRIPT_LEN) { | ||
1348 | UNW_DPRINT(0, "unwind.%s: script exceeds maximum size of %u instructions!\n", | ||
1349 | __FUNCTION__, UNW_MAX_SCRIPT_LEN); | ||
1350 | return; | ||
1351 | } | ||
1352 | script->insn[script->count++] = insn; | ||
1353 | } | ||
1354 | |||
1355 | static inline void | ||
1356 | emit_nat_info (struct unw_state_record *sr, int i, struct unw_script *script) | ||
1357 | { | ||
1358 | struct unw_reg_info *r = sr->curr.reg + i; | ||
1359 | enum unw_insn_opcode opc; | ||
1360 | struct unw_insn insn; | ||
1361 | unsigned long val = 0; | ||
1362 | |||
1363 | switch (r->where) { | ||
1364 | case UNW_WHERE_GR: | ||
1365 | if (r->val >= 32) { | ||
1366 | /* register got spilled to a stacked register */ | ||
1367 | opc = UNW_INSN_SETNAT_TYPE; | ||
1368 | val = UNW_NAT_REGSTK; | ||
1369 | } else | ||
1370 | /* register got spilled to a scratch register */ | ||
1371 | opc = UNW_INSN_SETNAT_MEMSTK; | ||
1372 | break; | ||
1373 | |||
1374 | case UNW_WHERE_FR: | ||
1375 | opc = UNW_INSN_SETNAT_TYPE; | ||
1376 | val = UNW_NAT_VAL; | ||
1377 | break; | ||
1378 | |||
1379 | case UNW_WHERE_BR: | ||
1380 | opc = UNW_INSN_SETNAT_TYPE; | ||
1381 | val = UNW_NAT_NONE; | ||
1382 | break; | ||
1383 | |||
1384 | case UNW_WHERE_PSPREL: | ||
1385 | case UNW_WHERE_SPREL: | ||
1386 | opc = UNW_INSN_SETNAT_MEMSTK; | ||
1387 | break; | ||
1388 | |||
1389 | default: | ||
1390 | UNW_DPRINT(0, "unwind.%s: don't know how to emit nat info for where = %u\n", | ||
1391 | __FUNCTION__, r->where); | ||
1392 | return; | ||
1393 | } | ||
1394 | insn.opc = opc; | ||
1395 | insn.dst = unw.preg_index[i]; | ||
1396 | insn.val = val; | ||
1397 | script_emit(script, insn); | ||
1398 | } | ||
1399 | |||
1400 | static void | ||
1401 | compile_reg (struct unw_state_record *sr, int i, struct unw_script *script) | ||
1402 | { | ||
1403 | struct unw_reg_info *r = sr->curr.reg + i; | ||
1404 | enum unw_insn_opcode opc; | ||
1405 | unsigned long val, rval; | ||
1406 | struct unw_insn insn; | ||
1407 | long need_nat_info; | ||
1408 | |||
1409 | if (r->where == UNW_WHERE_NONE || r->when >= sr->when_target) | ||
1410 | return; | ||
1411 | |||
1412 | opc = UNW_INSN_MOVE; | ||
1413 | val = rval = r->val; | ||
1414 | need_nat_info = (i >= UNW_REG_R4 && i <= UNW_REG_R7); | ||
1415 | |||
1416 | switch (r->where) { | ||
1417 | case UNW_WHERE_GR: | ||
1418 | if (rval >= 32) { | ||
1419 | opc = UNW_INSN_MOVE_STACKED; | ||
1420 | val = rval - 32; | ||
1421 | } else if (rval >= 4 && rval <= 7) { | ||
1422 | if (need_nat_info) { | ||
1423 | opc = UNW_INSN_MOVE2; | ||
1424 | need_nat_info = 0; | ||
1425 | } | ||
1426 | val = unw.preg_index[UNW_REG_R4 + (rval - 4)]; | ||
1427 | } else if (rval == 0) { | ||
1428 | opc = UNW_INSN_MOVE_CONST; | ||
1429 | val = 0; | ||
1430 | } else { | ||
1431 | /* register got spilled to a scratch register */ | ||
1432 | opc = UNW_INSN_MOVE_SCRATCH; | ||
1433 | val = pt_regs_off(rval); | ||
1434 | } | ||
1435 | break; | ||
1436 | |||
1437 | case UNW_WHERE_FR: | ||
1438 | if (rval <= 5) | ||
1439 | val = unw.preg_index[UNW_REG_F2 + (rval - 2)]; | ||
1440 | else if (rval >= 16 && rval <= 31) | ||
1441 | val = unw.preg_index[UNW_REG_F16 + (rval - 16)]; | ||
1442 | else { | ||
1443 | opc = UNW_INSN_MOVE_SCRATCH; | ||
1444 | if (rval <= 11) | ||
1445 | val = offsetof(struct pt_regs, f6) + 16*(rval - 6); | ||
1446 | else | ||
1447 | UNW_DPRINT(0, "unwind.%s: kernel may not touch f%lu\n", | ||
1448 | __FUNCTION__, rval); | ||
1449 | } | ||
1450 | break; | ||
1451 | |||
1452 | case UNW_WHERE_BR: | ||
1453 | if (rval >= 1 && rval <= 5) | ||
1454 | val = unw.preg_index[UNW_REG_B1 + (rval - 1)]; | ||
1455 | else { | ||
1456 | opc = UNW_INSN_MOVE_SCRATCH; | ||
1457 | if (rval == 0) | ||
1458 | val = offsetof(struct pt_regs, b0); | ||
1459 | else if (rval == 6) | ||
1460 | val = offsetof(struct pt_regs, b6); | ||
1461 | else | ||
1462 | val = offsetof(struct pt_regs, b7); | ||
1463 | } | ||
1464 | break; | ||
1465 | |||
1466 | case UNW_WHERE_SPREL: | ||
1467 | opc = UNW_INSN_ADD_SP; | ||
1468 | break; | ||
1469 | |||
1470 | case UNW_WHERE_PSPREL: | ||
1471 | opc = UNW_INSN_ADD_PSP; | ||
1472 | break; | ||
1473 | |||
1474 | default: | ||
1475 | UNW_DPRINT(0, "unwind%s: register %u has unexpected `where' value of %u\n", | ||
1476 | __FUNCTION__, i, r->where); | ||
1477 | break; | ||
1478 | } | ||
1479 | insn.opc = opc; | ||
1480 | insn.dst = unw.preg_index[i]; | ||
1481 | insn.val = val; | ||
1482 | script_emit(script, insn); | ||
1483 | if (need_nat_info) | ||
1484 | emit_nat_info(sr, i, script); | ||
1485 | |||
1486 | if (i == UNW_REG_PSP) { | ||
1487 | /* | ||
1488 | * info->psp must contain the _value_ of the previous | ||
1489 | * sp, not its save location. We get this by | ||
1490 | * dereferencing the value we just stored in | ||
1491 | * info->psp: | ||
1492 | */ | ||
1493 | insn.opc = UNW_INSN_LOAD; | ||
1494 | insn.dst = insn.val = unw.preg_index[UNW_REG_PSP]; | ||
1495 | script_emit(script, insn); | ||
1496 | } | ||
1497 | } | ||
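| /* | ||
| * A hypothetical compiled script, for a frame that left rp in b0 and | ||
| * allocated a fixed 32-byte memory frame, might read (sketch only): | ||
| * | ||
| *	UNW_INSN_ADD		psp    <- sp + 32 | ||
| *	UNW_INSN_MOVE_SCRATCH	rp_loc <- pt_regs + offsetof(b0) | ||
| * | ||
| * build_script() (below) emits the ADD itself; compile_reg() produces | ||
| * the per-register moves. | ||
| */ | ||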
1498 | |||
1499 | static inline const struct unw_table_entry * | ||
1500 | lookup (struct unw_table *table, unsigned long rel_ip) | ||
1501 | { | ||
1502 | const struct unw_table_entry *e = NULL; | ||
1503 | unsigned long lo, hi, mid; | ||
1504 | |||
1505 | /* do a binary search for right entry: */ | ||
1506 | for (lo = 0, hi = table->length; lo < hi; ) { | ||
1507 | mid = (lo + hi) / 2; | ||
1508 | e = &table->array[mid]; | ||
1509 | if (rel_ip < e->start_offset) | ||
1510 | hi = mid; | ||
1511 | else if (rel_ip >= e->end_offset) | ||
1512 | lo = mid + 1; | ||
1513 | else | ||
1514 | break; | ||
1515 | } | ||
1516 | if (!e || rel_ip < e->start_offset || rel_ip >= e->end_offset) /* e stays NULL for an empty table */ | ||
1517 | return NULL; | ||
1518 | return e; | ||
1519 | } | ||
1520 | |||
1521 | /* | ||
1522 | * Build an unwind script that unwinds from the state described by INFO | ||
1523 | * to the state in effect in the caller of that frame. | ||
1524 | */ | ||
1525 | static inline struct unw_script * | ||
1526 | build_script (struct unw_frame_info *info) | ||
1527 | { | ||
1528 | const struct unw_table_entry *e = NULL; | ||
1529 | struct unw_script *script = NULL; | ||
1530 | struct unw_labeled_state *ls, *next; | ||
1531 | unsigned long ip = info->ip; | ||
1532 | struct unw_state_record sr; | ||
1533 | struct unw_table *table; | ||
1534 | struct unw_reg_info *r; | ||
1535 | struct unw_insn insn; | ||
1536 | u8 *dp, *desc_end; | ||
1537 | u64 hdr; | ||
1538 | int i; | ||
1539 | STAT(unsigned long start, parse_start;) | ||
1540 | |||
1541 | STAT(++unw.stat.script.builds; start = ia64_get_itc()); | ||
1542 | |||
1543 | /* build state record */ | ||
1544 | memset(&sr, 0, sizeof(sr)); | ||
1545 | for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) | ||
1546 | r->when = UNW_WHEN_NEVER; | ||
1547 | sr.pr_val = info->pr; | ||
1548 | |||
1549 | UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __FUNCTION__, ip); | ||
1550 | script = script_new(ip); | ||
1551 | if (!script) { | ||
1552 | UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n", __FUNCTION__); | ||
1553 | STAT(unw.stat.script.build_time += ia64_get_itc() - start); | ||
1554 | return NULL; | ||
1555 | } | ||
1556 | unw.cache[info->prev_script].hint = script - unw.cache; | ||
1557 | |||
1558 | /* search the kernel's and the modules' unwind tables for IP: */ | ||
1559 | |||
1560 | STAT(parse_start = ia64_get_itc()); | ||
1561 | |||
1562 | for (table = unw.tables; table; table = table->next) { | ||
1563 | if (ip >= table->start && ip < table->end) { | ||
1564 | e = lookup(table, ip - table->segment_base); | ||
1565 | break; | ||
1566 | } | ||
1567 | } | ||
1568 | if (!e) { | ||
1569 | /* no info, return default unwinder (leaf proc, no mem stack, no saved regs) */ | ||
1570 | UNW_DPRINT(1, "unwind.%s: no unwind info for ip=0x%lx (prev ip=0x%lx)\n", | ||
1571 | __FUNCTION__, ip, unw.cache[info->prev_script].ip); | ||
1572 | sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR; | ||
1573 | sr.curr.reg[UNW_REG_RP].when = -1; | ||
1574 | sr.curr.reg[UNW_REG_RP].val = 0; | ||
1575 | compile_reg(&sr, UNW_REG_RP, script); | ||
1576 | script_finalize(script, &sr); | ||
1577 | STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start); | ||
1578 | STAT(unw.stat.script.build_time += ia64_get_itc() - start); | ||
1579 | return script; | ||
1580 | } | ||
1581 | |||
| /* convert the byte address into a "time": 3 instruction slots per 16-byte bundle, plus the slot number from ip's low bits */ | ||
1582 | sr.when_target = (3*((ip & ~0xfUL) - (table->segment_base + e->start_offset))/16 | ||
1583 | + (ip & 0xfUL)); | ||
1584 | hdr = *(u64 *) (table->segment_base + e->info_offset); | ||
1585 | dp = (u8 *) (table->segment_base + e->info_offset + 8); | ||
1586 | desc_end = dp + 8*UNW_LENGTH(hdr); | ||
1587 | |||
1588 | while (!sr.done && dp < desc_end) | ||
1589 | dp = unw_decode(dp, sr.in_body, &sr); | ||
1590 | |||
1591 | if (sr.when_target > sr.epilogue_start) { | ||
1592 | /* | ||
1593 | * sp has been restored and all values on the memory stack below | ||
1594 | * psp also have been restored. | ||
1595 | */ | ||
1596 | sr.curr.reg[UNW_REG_PSP].val = 0; | ||
1597 | sr.curr.reg[UNW_REG_PSP].where = UNW_WHERE_NONE; | ||
1598 | sr.curr.reg[UNW_REG_PSP].when = UNW_WHEN_NEVER; | ||
1599 | for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) | ||
1600 | if ((r->where == UNW_WHERE_PSPREL && r->val <= 0x10) | ||
1601 | || r->where == UNW_WHERE_SPREL) | ||
1602 | { | ||
1603 | r->val = 0; | ||
1604 | r->where = UNW_WHERE_NONE; | ||
1605 | r->when = UNW_WHEN_NEVER; | ||
1606 | } | ||
1607 | } | ||
1608 | |||
1609 | script->flags = sr.flags; | ||
1610 | |||
1611 | /* | ||
1612 | * If RP didn't get saved, generate an entry for the return link | ||
1613 | * register. | ||
1614 | */ | ||
1615 | if (sr.curr.reg[UNW_REG_RP].when >= sr.when_target) { | ||
1616 | sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR; | ||
1617 | sr.curr.reg[UNW_REG_RP].when = -1; | ||
1618 | sr.curr.reg[UNW_REG_RP].val = sr.return_link_reg; | ||
1619 | UNW_DPRINT(1, "unwind.%s: using default for rp at ip=0x%lx where=%d val=0x%lx\n", | ||
1620 | __FUNCTION__, ip, sr.curr.reg[UNW_REG_RP].where, | ||
1621 | sr.curr.reg[UNW_REG_RP].val); | ||
1622 | } | ||
1623 | |||
1624 | #ifdef UNW_DEBUG | ||
1625 | UNW_DPRINT(1, "unwind.%s: state record for func 0x%lx, t=%u:\n", | ||
1626 | __FUNCTION__, table->segment_base + e->start_offset, sr.when_target); | ||
1627 | for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) { | ||
1628 | if (r->where != UNW_WHERE_NONE || r->when != UNW_WHEN_NEVER) { | ||
1629 | UNW_DPRINT(1, " %s <- ", unw.preg_name[r - sr.curr.reg]); | ||
1630 | switch (r->where) { | ||
1631 | case UNW_WHERE_GR: UNW_DPRINT(1, "r%lu", r->val); break; | ||
1632 | case UNW_WHERE_FR: UNW_DPRINT(1, "f%lu", r->val); break; | ||
1633 | case UNW_WHERE_BR: UNW_DPRINT(1, "b%lu", r->val); break; | ||
1634 | case UNW_WHERE_SPREL: UNW_DPRINT(1, "[sp+0x%lx]", r->val); break; | ||
1635 | case UNW_WHERE_PSPREL: UNW_DPRINT(1, "[psp+0x%lx]", r->val); break; | ||
1636 | case UNW_WHERE_NONE: | ||
1637 | UNW_DPRINT(1, "%s+0x%lx", unw.preg_name[r - sr.curr.reg], r->val); | ||
1638 | break; | ||
1639 | |||
1640 | default: | ||
1641 | UNW_DPRINT(1, "BADWHERE(%d)", r->where); | ||
1642 | break; | ||
1643 | } | ||
1644 | UNW_DPRINT(1, "\t\t%d\n", r->when); | ||
1645 | } | ||
1646 | } | ||
1647 | #endif | ||
1648 | |||
1649 | STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start); | ||
1650 | |||
1651 | /* translate state record into unwinder instructions: */ | ||
1652 | |||
1653 | /* | ||
1654 | * First, set psp if we're dealing with a fixed-size frame; | ||
1655 | * subsequent instructions may depend on this value. | ||
1656 | */ | ||
1657 | if (sr.when_target > sr.curr.reg[UNW_REG_PSP].when | ||
1658 | && (sr.curr.reg[UNW_REG_PSP].where == UNW_WHERE_NONE) | ||
1659 | && sr.curr.reg[UNW_REG_PSP].val != 0) { | ||
1660 | /* new psp is sp plus frame size */ | ||
1661 | insn.opc = UNW_INSN_ADD; | ||
1662 | insn.dst = offsetof(struct unw_frame_info, psp)/8; | ||
1663 | insn.val = sr.curr.reg[UNW_REG_PSP].val; /* frame size */ | ||
1664 | script_emit(script, insn); | ||
1665 | } | ||
1666 | |||
1667 | /* determine where the primary UNaT is: */ | ||
1668 | if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_GR].when) | ||
1669 | i = UNW_REG_PRI_UNAT_MEM; | ||
1670 | else if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when) | ||
1671 | i = UNW_REG_PRI_UNAT_GR; | ||
1672 | else if (sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when > sr.curr.reg[UNW_REG_PRI_UNAT_GR].when) | ||
1673 | i = UNW_REG_PRI_UNAT_MEM; | ||
1674 | else | ||
1675 | i = UNW_REG_PRI_UNAT_GR; | ||
1676 | |||
1677 | compile_reg(&sr, i, script); | ||
1678 | |||
1679 | for (i = UNW_REG_BSP; i < UNW_NUM_REGS; ++i) | ||
1680 | compile_reg(&sr, i, script); | ||
1681 | |||
1682 | /* free labeled register states & stack: */ | ||
1683 | |||
1684 | STAT(parse_start = ia64_get_itc()); | ||
1685 | for (ls = sr.labeled_states; ls; ls = next) { | ||
1686 | next = ls->next; | ||
1687 | free_state_stack(&ls->saved_state); | ||
1688 | free_labeled_state(ls); | ||
1689 | } | ||
1690 | free_state_stack(&sr.curr); | ||
1691 | STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start); | ||
1692 | |||
1693 | script_finalize(script, &sr); | ||
1694 | STAT(unw.stat.script.build_time += ia64_get_itc() - start); | ||
1695 | return script; | ||
1696 | } | ||
1697 | |||
1698 | /* | ||
1699 | * Apply the unwinding actions encoded in SCRIPT and update STATE to | ||
1700 | * reflect the state that existed upon entry to the function that the | ||
1701 | * script describes. | ||
1702 | */ | ||
1703 | static inline void | ||
1704 | run_script (struct unw_script *script, struct unw_frame_info *state) | ||
1705 | { | ||
1706 | struct unw_insn *ip, *limit, next_insn; | ||
1707 | unsigned long opc, dst, val, off; | ||
1708 | unsigned long *s = (unsigned long *) state; | ||
1709 | STAT(unsigned long start;) | ||
1710 | |||
1711 | STAT(++unw.stat.script.runs; start = ia64_get_itc()); | ||
1712 | state->flags = script->flags; | ||
1713 | ip = script->insn; | ||
1714 | limit = script->insn + script->count; | ||
1715 | next_insn = *ip; | ||
1716 | |||
1717 | while (ip++ < limit) { | ||
1718 | opc = next_insn.opc; | ||
1719 | dst = next_insn.dst; | ||
1720 | val = next_insn.val; | ||
1721 | next_insn = *ip; | ||
1722 | |||
1723 | redo: | ||
1724 | switch (opc) { | ||
1725 | case UNW_INSN_ADD: | ||
1726 | s[dst] += val; | ||
1727 | break; | ||
1728 | |||
1729 | case UNW_INSN_MOVE2: | ||
1730 | if (!s[val]) | ||
1731 | goto lazy_init; | ||
1732 | s[dst+1] = s[val+1]; | ||
1733 | s[dst] = s[val]; | ||
1734 | break; | ||
1735 | |||
1736 | case UNW_INSN_MOVE: | ||
1737 | if (!s[val]) | ||
1738 | goto lazy_init; | ||
1739 | s[dst] = s[val]; | ||
1740 | break; | ||
1741 | |||
1742 | case UNW_INSN_MOVE_SCRATCH: | ||
1743 | if (state->pt) { | ||
1744 | s[dst] = (unsigned long) get_scratch_regs(state) + val; | ||
1745 | } else { | ||
1746 | s[dst] = 0; | ||
1747 | UNW_DPRINT(0, "unwind.%s: no state->pt, dst=%ld, val=%ld\n", | ||
1748 | __FUNCTION__, dst, val); | ||
1749 | } | ||
1750 | break; | ||
1751 | |||
1752 | case UNW_INSN_MOVE_CONST: | ||
1753 | if (val == 0) | ||
1754 | s[dst] = (unsigned long) &unw.r0; | ||
1755 | else { | ||
1756 | s[dst] = 0; | ||
1757 | UNW_DPRINT(0, "unwind.%s: UNW_INSN_MOVE_CONST bad val=%ld\n", | ||
1758 | __FUNCTION__, val); | ||
1759 | } | ||
1760 | break; | ||
1761 | |||
1762 | |||
1763 | case UNW_INSN_MOVE_STACKED: | ||
1764 | s[dst] = (unsigned long) ia64_rse_skip_regs((unsigned long *)state->bsp, | ||
1765 | val); | ||
1766 | break; | ||
1767 | |||
1768 | case UNW_INSN_ADD_PSP: | ||
1769 | s[dst] = state->psp + val; | ||
1770 | break; | ||
1771 | |||
1772 | case UNW_INSN_ADD_SP: | ||
1773 | s[dst] = state->sp + val; | ||
1774 | break; | ||
1775 | |||
1776 | case UNW_INSN_SETNAT_MEMSTK: | ||
1777 | if (!state->pri_unat_loc) | ||
1778 | state->pri_unat_loc = &state->sw->ar_unat; | ||
1779 | /* register off. is a multiple of 8, so the least 3 bits (type) are 0 */ | ||
1780 | s[dst+1] = ((unsigned long) state->pri_unat_loc - s[dst]) | UNW_NAT_MEMSTK; | ||
1781 | break; | ||
1782 | |||
1783 | case UNW_INSN_SETNAT_TYPE: | ||
1784 | s[dst+1] = val; | ||
1785 | break; | ||
1786 | |||
1787 | case UNW_INSN_LOAD: | ||
1788 | #ifdef UNW_DEBUG | ||
1789 | if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0 | ||
1790 | || s[val] < TASK_SIZE) | ||
1791 | { | ||
1792 | UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n", | ||
1793 | __FUNCTION__, s[val]); | ||
1794 | break; | ||
1795 | } | ||
1796 | #endif | ||
1797 | s[dst] = *(unsigned long *) s[val]; | ||
1798 | break; | ||
1799 | } | ||
1800 | } | ||
1801 | STAT(unw.stat.script.run_time += ia64_get_itc() - start); | ||
1802 | return; | ||
1803 | |||
1804 | lazy_init: | ||
1805 | off = unw.sw_off[val]; | ||
1806 | s[val] = (unsigned long) state->sw + off; | ||
1807 | if (off >= offsetof(struct switch_stack, r4) && off <= offsetof(struct switch_stack, r7)) | ||
1808 | /* | ||
1809 | * We're initializing a general register: init NaT info, too. Note that | ||
1810 | * the offset is a multiple of 8 which gives us the 3 bits needed for | ||
1811 | * the type field. | ||
1812 | */ | ||
1813 | s[val+1] = (offsetof(struct switch_stack, ar_unat) - off) | UNW_NAT_MEMSTK; | ||
1814 | goto redo; | ||
1815 | } | ||
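run_script() is a tiny interpreter: the frame info is viewed as a flat array of words (s), and each precompiled instruction is an (opcode, dst, val) triple applied to that array, which is what lets the hot unwind path skip re-parsing unwind descriptors for every frame. Below is a self-contained user-space sketch of the same pattern; the type and opcode names are illustrative, not the kernel's:

    #include <stdio.h>

    enum opc { OP_ADD, OP_MOVE, OP_LOAD };

    struct insn { unsigned int opc : 4; unsigned int dst : 9; signed int val : 19; };

    struct frame { unsigned long sp, psp, rp; };

    static void run(const struct insn *ip, int n, struct frame *f)
    {
        unsigned long *s = (unsigned long *) f;    /* state viewed as a word array */

        for (; n > 0; --n, ++ip) {
            switch (ip->opc) {
            case OP_ADD:                           /* cf. UNW_INSN_ADD */
                s[ip->dst] += ip->val;
                break;
            case OP_MOVE:                          /* cf. UNW_INSN_MOVE */
                s[ip->dst] = s[ip->val];
                break;
            case OP_LOAD:                          /* cf. UNW_INSN_LOAD */
                s[ip->dst] = *(unsigned long *) s[ip->val];
                break;
            }
        }
    }

    int main(void)
    {
        struct insn prog[] = { { OP_ADD, 1, 32 } };  /* "psp = sp + frame size" */
        struct frame f = { 0x1000, 0x1000, 0 };

        run(prog, 1, &f);
        printf("psp = 0x%lx\n", f.psp);              /* prints 0x1020 */
        return 0;
    }

The lazy_init path is the one wrinkle the sketch omits: in the real interpreter a zero slot means the pointer into switch_stack has not been materialized yet.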
1816 | |||
1817 | static int | ||
1818 | find_save_locs (struct unw_frame_info *info) | ||
1819 | { | ||
1820 | int have_write_lock = 0; | ||
1821 | struct unw_script *scr; | ||
1822 | unsigned long flags = 0; | ||
1823 | |||
1824 | if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) { | ||
1825 | /* don't let obviously bad addresses pollute the cache */ | ||
1826 | /* FIXME: should really be level 0 but it occurs too often. KAO */ | ||
1827 | UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip); | ||
1828 | info->rp_loc = NULL; | ||
1829 | return -1; | ||
1830 | } | ||
1831 | |||
1832 | scr = script_lookup(info); | ||
1833 | if (!scr) { | ||
1834 | spin_lock_irqsave(&unw.lock, flags); | ||
1835 | scr = build_script(info); | ||
1836 | if (!scr) { | ||
1837 | spin_unlock_irqrestore(&unw.lock, flags); | ||
1838 | UNW_DPRINT(0, | ||
1839 | "unwind.%s: failed to locate/build unwind script for ip %lx\n", | ||
1840 | __FUNCTION__, info->ip); | ||
1841 | return -1; | ||
1842 | } | ||
1843 | have_write_lock = 1; | ||
1844 | } | ||
1845 | info->hint = scr->hint; | ||
1846 | info->prev_script = scr - unw.cache; | ||
1847 | |||
1848 | run_script(scr, info); | ||
1849 | |||
1850 | if (have_write_lock) { | ||
1851 | write_unlock(&scr->lock); | ||
1852 | spin_unlock_irqrestore(&unw.lock, flags); | ||
1853 | } else | ||
1854 | read_unlock(&scr->lock); | ||
1855 | return 0; | ||
1856 | } | ||
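find_save_locs() is the classic lookup-or-build shape: the common case is a cache hit via script_lookup() (which returns with the script's read lock held), and only a miss takes the global unw.lock to build and insert the script, whose write lock is dropped after run_script(). A condensed user-space sketch of the shape follows, with the per-script rwlocks and LRU eviction elided; cache_lookup()/cache_build() are hypothetical stand-ins, and the recheck under the lock is a standard hardening step rather than a literal transcription of the kernel code:

    #include <pthread.h>
    #include <stdio.h>

    struct script { unsigned long ip; int valid; };

    static struct script cache[16];
    static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct script *cache_lookup(unsigned long ip)
    {
        struct script *s = &cache[ip % 16];
        return (s->valid && s->ip == ip) ? s : NULL;
    }

    static struct script *cache_build(unsigned long ip)
    {
        struct script *s = &cache[ip % 16];    /* evict whatever was there */
        s->ip = ip;
        s->valid = 1;
        return s;
    }

    static struct script *lookup_or_build(unsigned long ip)
    {
        struct script *scr = cache_lookup(ip);    /* fast path, no global lock */

        if (!scr) {
            pthread_mutex_lock(&cache_lock);
            scr = cache_lookup(ip);               /* recheck under the lock */
            if (!scr)
                scr = cache_build(ip);            /* slow path */
            pthread_mutex_unlock(&cache_lock);
        }
        return scr;
    }

    int main(void)
    {
        printf("built script for ip 0x%lx\n", lookup_or_build(0x4000)->ip);
        return 0;
    }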
1857 | |||
1858 | int | ||
1859 | unw_unwind (struct unw_frame_info *info) | ||
1860 | { | ||
1861 | unsigned long prev_ip, prev_sp, prev_bsp; | ||
1862 | unsigned long ip, pr, num_regs; | ||
1863 | STAT(unsigned long start, flags;) | ||
1864 | int retval; | ||
1865 | |||
1866 | STAT(local_irq_save(flags); ++unw.stat.api.unwinds; start = ia64_get_itc()); | ||
1867 | |||
1868 | prev_ip = info->ip; | ||
1869 | prev_sp = info->sp; | ||
1870 | prev_bsp = info->bsp; | ||
1871 | |||
1872 | /* restore the ip */ | ||
1873 | if (!info->rp_loc) { | ||
1874 | /* FIXME: should really be level 0 but it occurs too often. KAO */ | ||
1875 | UNW_DPRINT(1, "unwind.%s: failed to locate return link (ip=0x%lx)!\n", | ||
1876 | __FUNCTION__, info->ip); | ||
1877 | STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); | ||
1878 | return -1; | ||
1879 | } | ||
1880 | ip = info->ip = *info->rp_loc; | ||
1881 | if (ip < GATE_ADDR) { | ||
1882 | UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __FUNCTION__, ip); | ||
1883 | STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); | ||
1884 | return -1; | ||
1885 | } | ||
1886 | |||
1887 | /* restore the cfm: */ | ||
1888 | if (!info->pfs_loc) { | ||
1889 | UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __FUNCTION__); | ||
1890 | STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); | ||
1891 | return -1; | ||
1892 | } | ||
1893 | info->cfm_loc = info->pfs_loc; | ||
1894 | |||
1895 | /* restore the bsp: */ | ||
1896 | pr = info->pr; | ||
1897 | num_regs = 0; | ||
1898 | if ((info->flags & UNW_FLAG_INTERRUPT_FRAME)) { | ||
1899 | info->pt = info->sp + 16; | ||
1900 | if ((pr & (1UL << PRED_NON_SYSCALL)) != 0) | ||
1901 | num_regs = *info->cfm_loc & 0x7f; /* size of frame */ | ||
1902 | info->pfs_loc = | ||
1903 | (unsigned long *) (info->pt + offsetof(struct pt_regs, ar_pfs)); | ||
1904 | UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __FUNCTION__, info->pt); | ||
1905 | } else | ||
1906 | num_regs = (*info->cfm_loc >> 7) & 0x7f; /* size of locals */ | ||
1907 | info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->bsp, -num_regs); | ||
1908 | if (info->bsp < info->regstk.limit || info->bsp > info->regstk.top) { | ||
1909 | UNW_DPRINT(0, "unwind.%s: bsp (0x%lx) out of range [0x%lx-0x%lx]\n", | ||
1910 | __FUNCTION__, info->bsp, info->regstk.limit, info->regstk.top); | ||
1911 | STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); | ||
1912 | return -1; | ||
1913 | } | ||
1914 | |||
1915 | /* restore the sp: */ | ||
1916 | info->sp = info->psp; | ||
1917 | if (info->sp < info->memstk.top || info->sp > info->memstk.limit) { | ||
1918 | UNW_DPRINT(0, "unwind.%s: sp (0x%lx) out of range [0x%lx-0x%lx]\n", | ||
1919 | __FUNCTION__, info->sp, info->memstk.top, info->memstk.limit); | ||
1920 | STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); | ||
1921 | return -1; | ||
1922 | } | ||
1923 | |||
1924 | if (info->ip == prev_ip && info->sp == prev_sp && info->bsp == prev_bsp) { | ||
1925 | UNW_DPRINT(0, "unwind.%s: ip, sp, bsp unchanged; stopping here (ip=0x%lx)\n", | ||
1926 | __FUNCTION__, ip); | ||
1927 | STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); | ||
1928 | return -1; | ||
1929 | } | ||
1930 | |||
1931 | /* as we unwind, the saved ar.unat becomes the primary unat: */ | ||
1932 | info->pri_unat_loc = info->unat_loc; | ||
1933 | |||
1934 | /* finally, restore the predicates: */ | ||
1935 | unw_get_pr(info, &info->pr); | ||
1936 | |||
1937 | retval = find_save_locs(info); | ||
1938 | STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); | ||
1939 | return retval; | ||
1940 | } | ||
1941 | EXPORT_SYMBOL(unw_unwind); | ||
1942 | |||
1943 | int | ||
1944 | unw_unwind_to_user (struct unw_frame_info *info) | ||
1945 | { | ||
1946 | unsigned long ip, sp; | ||
1947 | |||
1948 | while (unw_unwind(info) >= 0) { | ||
1949 | if (unw_get_rp(info, &ip) < 0) { | ||
1950 | unw_get_ip(info, &ip); | ||
1951 | UNW_DPRINT(0, "unwind.%s: failed to read return pointer (ip=0x%lx)\n", | ||
1952 | __FUNCTION__, ip); | ||
1953 | return -1; | ||
1954 | } | ||
1955 | unw_get_sp(info, &sp); | ||
1956 | if (sp >= (unsigned long)info->task + IA64_STK_OFFSET) | ||
1957 | break; | ||
1958 | if (ip < FIXADDR_USER_END) | ||
1959 | return 0; | ||
1960 | } | ||
1961 | unw_get_ip(info, &ip); | ||
1962 | UNW_DPRINT(0, "unwind.%s: failed to unwind to user-level (ip=0x%lx)\n", __FUNCTION__, ip); | ||
1963 | return -1; | ||
1964 | } | ||
1965 | EXPORT_SYMBOL(unw_unwind_to_user); | ||
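For orientation, this is roughly how in-kernel users (backtrace printers and the like) drive this API; unw_init_from_blocked_task() is defined a little further down. A kernel-context sketch with a hypothetical function name:

    /* Walk a blocked task's kernel stack, innermost frame first. */
    static void dump_task_backtrace(struct task_struct *t)
    {
        struct unw_frame_info info;
        unsigned long ip;

        unw_init_from_blocked_task(&info, t);
        do {
            unw_get_ip(&info, &ip);       /* ip of the current frame */
            if (ip == 0)
                break;
            printk("  [<%016lx>]\n", ip);
        } while (unw_unwind(&info) >= 0); /* step out to the caller's frame */
    }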
1966 | |||
1967 | static void | ||
1968 | init_frame_info (struct unw_frame_info *info, struct task_struct *t, | ||
1969 | struct switch_stack *sw, unsigned long stktop) | ||
1970 | { | ||
1971 | unsigned long rbslimit, rbstop, stklimit; | ||
1972 | STAT(unsigned long start, flags;) | ||
1973 | |||
1974 | STAT(local_irq_save(flags); ++unw.stat.api.inits; start = ia64_get_itc()); | ||
1975 | |||
1976 | /* | ||
1977 | * Subtle stuff here: we _could_ unwind through the switch_stack frame but we | ||
1978 | * don't want to do that because it would be slow as each preserved register would | ||
1979 | * have to be processed. Instead, what we do here is zero out the frame info and | ||
1980 | * start the unwind process at the function that created the switch_stack frame. | ||
1981 | * When a preserved value in switch_stack needs to be accessed, run_script() will | ||
1982 | * initialize the appropriate pointer on demand. | ||
1983 | */ | ||
1984 | memset(info, 0, sizeof(*info)); | ||
1985 | |||
1986 | rbslimit = (unsigned long) t + IA64_RBS_OFFSET; | ||
1987 | rbstop = sw->ar_bspstore; | ||
1988 | if (rbstop - (unsigned long) t >= IA64_STK_OFFSET) | ||
1989 | rbstop = rbslimit; | ||
1990 | |||
1991 | stklimit = (unsigned long) t + IA64_STK_OFFSET; | ||
1992 | if (stktop <= rbstop) | ||
1993 | stktop = rbstop; | ||
1994 | |||
1995 | info->regstk.limit = rbslimit; | ||
1996 | info->regstk.top = rbstop; | ||
1997 | info->memstk.limit = stklimit; | ||
1998 | info->memstk.top = stktop; | ||
1999 | info->task = t; | ||
2000 | info->sw = sw; | ||
2001 | info->sp = info->psp = stktop; | ||
2002 | info->pr = sw->pr; | ||
2003 | UNW_DPRINT(3, "unwind.%s:\n" | ||
2004 | " task 0x%lx\n" | ||
2005 | " rbs = [0x%lx-0x%lx)\n" | ||
2006 | " stk = [0x%lx-0x%lx)\n" | ||
2007 | " pr 0x%lx\n" | ||
2008 | " sw 0x%lx\n" | ||
2009 | " sp 0x%lx\n", | ||
2010 | __FUNCTION__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit, | ||
2011 | info->pr, (unsigned long) info->sw, info->sp); | ||
2012 | STAT(unw.stat.api.init_time += ia64_get_itc() - start; local_irq_restore(flags)); | ||
2013 | } | ||
2014 | |||
2015 | void | ||
2016 | unw_init_from_interruption (struct unw_frame_info *info, struct task_struct *t, | ||
2017 | struct pt_regs *pt, struct switch_stack *sw) | ||
2018 | { | ||
2019 | unsigned long sof; | ||
2020 | |||
2021 | init_frame_info(info, t, sw, pt->r12); | ||
2022 | info->cfm_loc = &pt->cr_ifs; | ||
2023 | info->unat_loc = &pt->ar_unat; | ||
2024 | info->pfs_loc = &pt->ar_pfs; | ||
2025 | sof = *info->cfm_loc & 0x7f; | ||
2026 | info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sof); | ||
2027 | info->ip = pt->cr_iip + ia64_psr(pt)->ri; | ||
2028 | info->pt = (unsigned long) pt; | ||
2029 | UNW_DPRINT(3, "unwind.%s:\n" | ||
2030 | " bsp 0x%lx\n" | ||
2031 | " sof 0x%lx\n" | ||
2032 | " ip 0x%lx\n", | ||
2033 | __FUNCTION__, info->bsp, sof, info->ip); | ||
2034 | find_save_locs(info); | ||
2035 | } | ||
2036 | |||
2037 | void | ||
2038 | unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct switch_stack *sw) | ||
2039 | { | ||
2040 | unsigned long sol; | ||
2041 | |||
2042 | init_frame_info(info, t, sw, (unsigned long) (sw + 1) - 16); | ||
2043 | info->cfm_loc = &sw->ar_pfs; | ||
2044 | sol = (*info->cfm_loc >> 7) & 0x7f; | ||
2045 | info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sol); | ||
2046 | info->ip = sw->b0; | ||
2047 | UNW_DPRINT(3, "unwind.%s:\n" | ||
2048 | " bsp 0x%lx\n" | ||
2049 | " sol 0x%lx\n" | ||
2050 | " ip 0x%lx\n", | ||
2051 | __FUNCTION__, info->bsp, sol, info->ip); | ||
2052 | find_save_locs(info); | ||
2053 | } | ||
2054 | |||
2055 | EXPORT_SYMBOL(unw_init_frame_info); | ||
2056 | |||
2057 | void | ||
2058 | unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t) | ||
2059 | { | ||
2060 | struct switch_stack *sw = (struct switch_stack *) (t->thread.ksp + 16); | ||
2061 | |||
2062 | UNW_DPRINT(1, "unwind.%s\n", __FUNCTION__); | ||
2063 | unw_init_frame_info(info, t, sw); | ||
2064 | } | ||
2065 | EXPORT_SYMBOL(unw_init_from_blocked_task); | ||
2066 | |||
2067 | static void | ||
2068 | init_unwind_table (struct unw_table *table, const char *name, unsigned long segment_base, | ||
2069 | unsigned long gp, const void *table_start, const void *table_end) | ||
2070 | { | ||
2071 | const struct unw_table_entry *start = table_start, *end = table_end; | ||
2072 | |||
2073 | table->name = name; | ||
2074 | table->segment_base = segment_base; | ||
2075 | table->gp = gp; | ||
2076 | table->start = segment_base + start[0].start_offset; | ||
2077 | table->end = segment_base + end[-1].end_offset; | ||
2078 | table->array = start; | ||
2079 | table->length = end - start; | ||
2080 | } | ||
2081 | |||
2082 | void * | ||
2083 | unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned long gp, | ||
2084 | const void *table_start, const void *table_end) | ||
2085 | { | ||
2086 | const struct unw_table_entry *start = table_start, *end = table_end; | ||
2087 | struct unw_table *table; | ||
2088 | unsigned long flags; | ||
2089 | |||
2090 | if (end - start <= 0) { | ||
2091 | UNW_DPRINT(0, "unwind.%s: ignoring attempt to insert empty unwind table\n", | ||
2092 | __FUNCTION__); | ||
2093 | return NULL; | ||
2094 | } | ||
2095 | |||
2096 | table = kmalloc(sizeof(*table), GFP_USER); | ||
2097 | if (!table) | ||
2098 | return NULL; | ||
2099 | |||
2100 | init_unwind_table(table, name, segment_base, gp, table_start, table_end); | ||
2101 | |||
2102 | spin_lock_irqsave(&unw.lock, flags); | ||
2103 | { | ||
2104 | /* keep kernel unwind table at the front (it's searched most commonly): */ | ||
2105 | table->next = unw.tables->next; | ||
2106 | unw.tables->next = table; | ||
2107 | } | ||
2108 | spin_unlock_irqrestore(&unw.lock, flags); | ||
2109 | |||
2110 | return table; | ||
2111 | } | ||
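The returned pointer is an opaque handle that the caller (the module loader is the main client) must keep and hand back to unw_remove_unwind_table() at unload. A minimal kernel-context sketch of that pairing, with hypothetical names:

    static void *mod_unw_handle;

    static int register_mod_unwind(const char *name, unsigned long segbase,
                                   unsigned long gp,
                                   const void *start, const void *end)
    {
        mod_unw_handle = unw_add_unwind_table(name, segbase, gp, start, end);
        return mod_unw_handle ? 0 : -1;   /* NULL: empty table or no memory */
    }

    static void unregister_mod_unwind(void)
    {
        if (mod_unw_handle)
            unw_remove_unwind_table(mod_unw_handle);  /* at unload time */
    }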
2112 | |||
2113 | void | ||
2114 | unw_remove_unwind_table (void *handle) | ||
2115 | { | ||
2116 | struct unw_table *table, *prev; | ||
2117 | struct unw_script *tmp; | ||
2118 | unsigned long flags; | ||
2119 | long index; | ||
2120 | |||
2121 | if (!handle) { | ||
2122 | UNW_DPRINT(0, "unwind.%s: ignoring attempt to remove non-existent unwind table\n", | ||
2123 | __FUNCTION__); | ||
2124 | return; | ||
2125 | } | ||
2126 | |||
2127 | table = handle; | ||
2128 | if (table == &unw.kernel_table) { | ||
2129 | UNW_DPRINT(0, "unwind.%s: sorry, freeing the kernel's unwind table is a " | ||
2130 | "no-can-do!\n", __FUNCTION__); | ||
2131 | return; | ||
2132 | } | ||
2133 | |||
2134 | spin_lock_irqsave(&unw.lock, flags); | ||
2135 | { | ||
2136 | /* first, delete the table: */ | ||
2137 | |||
2138 | for (prev = (struct unw_table *) &unw.tables; prev; prev = prev->next) | ||
2139 | if (prev->next == table) | ||
2140 | break; | ||
2141 | if (!prev) { | ||
2142 | UNW_DPRINT(0, "unwind.%s: failed to find unwind table %p\n", | ||
2143 | __FUNCTION__, (void *) table); | ||
2144 | spin_unlock_irqrestore(&unw.lock, flags); | ||
2145 | return; | ||
2146 | } | ||
2147 | prev->next = table->next; | ||
2148 | } | ||
2149 | spin_unlock_irqrestore(&unw.lock, flags); | ||
2150 | |||
2151 | /* next, remove hash table entries for this table */ | ||
2152 | |||
2153 | for (index = 0; index < UNW_HASH_SIZE; ++index) {	/* unw.hash[] has UNW_HASH_SIZE slots */ | ||
2154 | tmp = unw.cache + unw.hash[index]; | ||
2155 | if (unw.hash[index] >= UNW_CACHE_SIZE | ||
2156 | || tmp->ip < table->start || tmp->ip >= table->end) | ||
2157 | continue; | ||
2158 | |||
2159 | write_lock(&tmp->lock); | ||
2160 | { | ||
2161 | if (tmp->ip >= table->start && tmp->ip < table->end) { | ||
2162 | unw.hash[index] = tmp->coll_chain; | ||
2163 | tmp->ip = 0; | ||
2164 | } | ||
2165 | } | ||
2166 | write_unlock(&tmp->lock); | ||
2167 | } | ||
2168 | |||
2169 | kfree(table); | ||
2170 | } | ||
2171 | |||
2172 | static int __init | ||
2173 | create_gate_table (void) | ||
2174 | { | ||
2175 | const struct unw_table_entry *entry, *start, *end; | ||
2176 | unsigned long *lp, segbase = GATE_ADDR; | ||
2177 | size_t info_size, size; | ||
2178 | char *info; | ||
2179 | Elf64_Phdr *punw = NULL, *phdr = (Elf64_Phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); | ||
2180 | int i; | ||
2181 | |||
2182 | for (i = 0; i < GATE_EHDR->e_phnum; ++i, ++phdr) | ||
2183 | if (phdr->p_type == PT_IA_64_UNWIND) { | ||
2184 | punw = phdr; | ||
2185 | break; | ||
2186 | } | ||
2187 | |||
2188 | if (!punw) { | ||
2189 | printk("%s: failed to find gate DSO's unwind table!\n", __FUNCTION__); | ||
2190 | return 0; | ||
2191 | } | ||
2192 | |||
2193 | start = (const struct unw_table_entry *) punw->p_vaddr; | ||
2194 | end = (struct unw_table_entry *) ((char *) start + punw->p_memsz); | ||
2195 | size = 0; | ||
2196 | |||
2197 | unw_add_unwind_table("linux-gate.so", segbase, 0, start, end); | ||
2198 | |||
2199 | for (entry = start; entry < end; ++entry) | ||
2200 | size += 3*8 + 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset)); | ||
2201 | size += 8; /* reserve space for "end of table" marker */ | ||
2202 | |||
2203 | unw.gate_table = kmalloc(size, GFP_KERNEL); | ||
2204 | if (!unw.gate_table) { | ||
2205 | unw.gate_table_size = 0; | ||
2206 | printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __FUNCTION__); | ||
2207 | return 0; | ||
2208 | } | ||
2209 | unw.gate_table_size = size; | ||
2210 | |||
2211 | lp = unw.gate_table; | ||
2212 | info = (char *) unw.gate_table + size; | ||
2213 | |||
2214 | for (entry = start; entry < end; ++entry, lp += 3) { | ||
2215 | info_size = 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset)); | ||
2216 | info -= info_size; | ||
2217 | memcpy(info, (char *) segbase + entry->info_offset, info_size); | ||
2218 | |||
2219 | lp[0] = segbase + entry->start_offset; /* start */ | ||
2220 | lp[1] = segbase + entry->end_offset; /* end */ | ||
2221 | lp[2] = info - (char *) unw.gate_table; /* info */ | ||
2222 | } | ||
2223 | *lp = 0; /* end-of-table marker */ | ||
2224 | return 0; | ||
2225 | } | ||
2226 | |||
2227 | __initcall(create_gate_table); | ||
2228 | |||
2229 | void __init | ||
2230 | unw_init (void) | ||
2231 | { | ||
2232 | extern char __gp[]; | ||
2233 | extern void unw_hash_index_t_is_too_narrow (void); | ||
2234 | long i, off; | ||
2235 | |||
2236 | if (8*sizeof(unw_hash_index_t) < UNW_LOG_HASH_SIZE) | ||
2237 | unw_hash_index_t_is_too_narrow(); | ||
2238 | |||
2239 | unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(AR_UNAT); | ||
2240 | unw.sw_off[unw.preg_index[UNW_REG_BSPSTORE]] = SW(AR_BSPSTORE); | ||
2241 | unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_PFS); | ||
2242 | unw.sw_off[unw.preg_index[UNW_REG_RP]] = SW(B0); | ||
2243 | unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(AR_UNAT); | ||
2244 | unw.sw_off[unw.preg_index[UNW_REG_PR]] = SW(PR); | ||
2245 | unw.sw_off[unw.preg_index[UNW_REG_LC]] = SW(AR_LC); | ||
2246 | unw.sw_off[unw.preg_index[UNW_REG_FPSR]] = SW(AR_FPSR); | ||
2247 | for (i = UNW_REG_R4, off = SW(R4); i <= UNW_REG_R7; ++i, off += 8) | ||
2248 | unw.sw_off[unw.preg_index[i]] = off; | ||
2249 | for (i = UNW_REG_B1, off = SW(B1); i <= UNW_REG_B5; ++i, off += 8) | ||
2250 | unw.sw_off[unw.preg_index[i]] = off; | ||
2251 | for (i = UNW_REG_F2, off = SW(F2); i <= UNW_REG_F5; ++i, off += 16) | ||
2252 | unw.sw_off[unw.preg_index[i]] = off; | ||
2253 | for (i = UNW_REG_F16, off = SW(F16); i <= UNW_REG_F31; ++i, off += 16) | ||
2254 | unw.sw_off[unw.preg_index[i]] = off; | ||
2255 | |||
2256 | for (i = 0; i < UNW_CACHE_SIZE; ++i) { | ||
2257 | if (i > 0) | ||
2258 | unw.cache[i].lru_chain = (i - 1); | ||
2259 | unw.cache[i].coll_chain = -1; | ||
2260 | rwlock_init(&unw.cache[i].lock); | ||
2261 | } | ||
2262 | unw.lru_head = UNW_CACHE_SIZE - 1; | ||
2263 | unw.lru_tail = 0; | ||
2264 | |||
2265 | init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp, | ||
2266 | __start_unwind, __end_unwind); | ||
2267 | } | ||
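Note that the cache links slots by array index rather than by pointer (the unsigned short lru_chain/coll_chain fields of struct unw_script), so each link costs two bytes. A standalone miniature of the initialization above, checking that the chain threads every slot from lru_head down to lru_tail:

    #include <assert.h>

    #define CACHE_SIZE 32

    static unsigned short lru_chain[CACHE_SIZE];

    int main(void)
    {
        unsigned short lru_head = CACHE_SIZE - 1, lru_tail = 0;
        int i, hops = 0;

        for (i = 1; i < CACHE_SIZE; ++i)
            lru_chain[i] = i - 1;      /* slot i points at the next-older slot */

        /* walking head -> tail visits every other slot exactly once */
        for (i = lru_head; i != lru_tail; i = lru_chain[i])
            ++hops;
        assert(hops == CACHE_SIZE - 1);
        return 0;
    }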
2268 | |||
2269 | /* | ||
2270 | * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED | ||
2271 | * | ||
2272 | * This system call has been deprecated. The new and improved way to get | ||
2273 | * at the kernel's unwind info is via the gate DSO. The address of the | ||
2274 | * ELF header for this DSO is passed to user-level via AT_SYSINFO_EHDR. | ||
2275 | * | ||
2276 | * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED | ||
2277 | * | ||
2278 | * This system call copies the unwind data into the buffer pointed to by BUF and returns | ||
2279 | * the size of the unwind data. If BUF_SIZE is smaller than the size of the unwind data | ||
2280 | * or if BUF is NULL, nothing is copied, but the system call still returns the size of the | ||
2281 | * unwind data. | ||
2282 | * | ||
2283 | * The first portion of the unwind data contains an unwind table and the rest contains the | ||
2284 | * associated unwind info (in no particular order). The unwind table consists of a table | ||
2285 | * of entries of the form: | ||
2286 | * | ||
2287 | * u64 start; (64-bit address of start of function) | ||
2288 | * u64 end; (64-bit address of end of function) | ||
2289 | * u64 info; (BUF-relative offset to unwind info) | ||
2290 | * | ||
2291 | * The end of the unwind table is indicated by an entry with a START address of zero. | ||
2292 | * | ||
2293 | * Please see the IA-64 Software Conventions and Runtime Architecture manual for details | ||
2294 | * on the format of the unwind info. | ||
2295 | * | ||
2296 | * ERRORS | ||
2297 | * EFAULT BUF points outside your accessible address space. | ||
2298 | */ | ||
2299 | asmlinkage long | ||
2300 | sys_getunwind (void __user *buf, size_t buf_size) | ||
2301 | { | ||
2302 | if (buf && buf_size >= unw.gate_table_size) | ||
2303 | if (copy_to_user(buf, unw.gate_table, unw.gate_table_size) != 0) | ||
2304 | return -EFAULT; | ||
2305 | return unw.gate_table_size; | ||
2306 | } | ||
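A sketch of a user-space consumer of the documented format, for illustration only given the deprecation notice above; this is ia64-specific, with __NR_getunwind coming from <asm/unistd.h> via <sys/syscall.h>:

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
        long size = syscall(__NR_getunwind, NULL, 0);  /* NULL buf: just ask for the size */
        unsigned long *tbl;
        long i;

        if (size <= 0)
            return 1;
        tbl = malloc(size);
        if (!tbl || syscall(__NR_getunwind, tbl, size) != size)
            return 1;
        for (i = 0; tbl[i] != 0; i += 3)   /* an entry with start == 0 ends the table */
            printf("func [0x%lx-0x%lx), info at offset 0x%lx\n",
                   tbl[i], tbl[i + 1], tbl[i + 2]);
        free(tbl);
        return 0;
    }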
diff --git a/arch/ia64/kernel/unwind_decoder.c b/arch/ia64/kernel/unwind_decoder.c new file mode 100644 index 000000000000..50ac2d82f9bf --- /dev/null +++ b/arch/ia64/kernel/unwind_decoder.c | |||
@@ -0,0 +1,459 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2000 Hewlett-Packard Co | ||
3 | * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> | ||
4 | * | ||
5 | * Generic IA-64 unwind info decoder. | ||
6 | * | ||
7 | * This file is used both by the Linux kernel and objdump. Please keep | ||
8 | * the two copies of this file in sync. | ||
9 | * | ||
10 | * You need to customize the decoder by defining the following | ||
11 | * macros/constants before including this file: | ||
12 | * | ||
13 | * Types: | ||
14 | * unw_word Unsigned integer type with at least 64 bits | ||
15 | * | ||
16 | * Register names: | ||
17 | * UNW_REG_BSP | ||
18 | * UNW_REG_BSPSTORE | ||
19 | * UNW_REG_FPSR | ||
20 | * UNW_REG_LC | ||
21 | * UNW_REG_PFS | ||
22 | * UNW_REG_PR | ||
23 | * UNW_REG_RNAT | ||
24 | * UNW_REG_PSP | ||
25 | * UNW_REG_RP | ||
26 | * UNW_REG_UNAT | ||
27 | * | ||
28 | * Decoder action macros: | ||
29 | * UNW_DEC_BAD_CODE(code) | ||
30 | * UNW_DEC_ABI(fmt,abi,context,arg) | ||
31 | * UNW_DEC_BR_GR(fmt,brmask,gr,arg) | ||
32 | * UNW_DEC_BR_MEM(fmt,brmask,arg) | ||
33 | * UNW_DEC_COPY_STATE(fmt,label,arg) | ||
34 | * UNW_DEC_EPILOGUE(fmt,t,ecount,arg) | ||
35 | * UNW_DEC_FRGR_MEM(fmt,grmask,frmask,arg) | ||
36 | * UNW_DEC_FR_MEM(fmt,frmask,arg) | ||
37 | * UNW_DEC_GR_GR(fmt,grmask,gr,arg) | ||
38 | * UNW_DEC_GR_MEM(fmt,grmask,arg) | ||
39 | * UNW_DEC_LABEL_STATE(fmt,label,arg) | ||
40 | * UNW_DEC_MEM_STACK_F(fmt,t,size,arg) | ||
41 | * UNW_DEC_MEM_STACK_V(fmt,t,arg) | ||
42 | * UNW_DEC_PRIUNAT_GR(fmt,r,arg) | ||
43 | * UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg) | ||
44 | * UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg) | ||
45 | * UNW_DEC_PRIUNAT_WHEN_PSPREL(fmt,pspoff,arg) | ||
46 | * UNW_DEC_PRIUNAT_WHEN_SPREL(fmt,spoff,arg) | ||
47 | * UNW_DEC_PROLOGUE(fmt,body,rlen,arg) | ||
48 | * UNW_DEC_PROLOGUE_GR(fmt,rlen,mask,grsave,arg) | ||
49 | * UNW_DEC_REG_PSPREL(fmt,reg,pspoff,arg) | ||
50 | * UNW_DEC_REG_REG(fmt,src,dst,arg) | ||
51 | * UNW_DEC_REG_SPREL(fmt,reg,spoff,arg) | ||
52 | * UNW_DEC_REG_WHEN(fmt,reg,t,arg) | ||
53 | * UNW_DEC_RESTORE(fmt,t,abreg,arg) | ||
54 | * UNW_DEC_RESTORE_P(fmt,qp,t,abreg,arg) | ||
55 | * UNW_DEC_SPILL_BASE(fmt,pspoff,arg) | ||
56 | * UNW_DEC_SPILL_MASK(fmt,imaskp,arg) | ||
57 | * UNW_DEC_SPILL_PSPREL(fmt,t,abreg,pspoff,arg) | ||
58 | * UNW_DEC_SPILL_PSPREL_P(fmt,qp,t,abreg,pspoff,arg) | ||
59 | * UNW_DEC_SPILL_REG(fmt,t,abreg,x,ytreg,arg) | ||
60 | * UNW_DEC_SPILL_REG_P(fmt,qp,t,abreg,x,ytreg,arg) | ||
61 | * UNW_DEC_SPILL_SPREL(fmt,t,abreg,spoff,arg) | ||
62 | * UNW_DEC_SPILL_SPREL_P(fmt,qp,t,abreg,pspoff,arg) | ||
63 | */ | ||
64 | |||
65 | static unw_word | ||
66 | unw_decode_uleb128 (unsigned char **dpp) | ||
67 | { | ||
68 | unsigned shift = 0; | ||
69 | unw_word byte, result = 0; | ||
70 | unsigned char *bp = *dpp; | ||
71 | |||
72 | while (1) | ||
73 | { | ||
74 | byte = *bp++; | ||
75 | result |= (byte & 0x7f) << shift; | ||
76 | if ((byte & 0x80) == 0) | ||
77 | break; | ||
78 | shift += 7; | ||
79 | } | ||
80 | *dpp = bp; | ||
81 | return result; | ||
82 | } | ||
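This is standard ULEB128 (as in DWARF): each byte carries seven payload bits, least-significant group first, with the top bit flagging continuation. A standalone round-trip check, including the matching encoder an unwind-info producer would use; for example, 624485 encodes as e5 8e 26:

    #include <assert.h>
    #include <stdio.h>

    typedef unsigned long long unw_word;

    static int uleb128_encode(unsigned char *buf, unw_word val)
    {
        int n = 0;

        do {
            unsigned char byte = val & 0x7f;

            val >>= 7;
            if (val)
                byte |= 0x80;      /* more groups follow */
            buf[n++] = byte;
        } while (val);
        return n;
    }

    static unw_word uleb128_decode(unsigned char **dpp)
    {
        unsigned shift = 0;
        unw_word byte, result = 0;
        unsigned char *bp = *dpp;

        while (1) {
            byte = *bp++;
            result |= (byte & 0x7f) << shift;
            if ((byte & 0x80) == 0)
                break;
            shift += 7;
        }
        *dpp = bp;
        return result;
    }

    int main(void)
    {
        unsigned char buf[10], *p = buf;
        int n = uleb128_encode(buf, 624485);

        printf("%d bytes: %02x %02x %02x\n", n, buf[0], buf[1], buf[2]);
        assert(uleb128_decode(&p) == 624485);
        return 0;
    }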
83 | |||
84 | static unsigned char * | ||
85 | unw_decode_x1 (unsigned char *dp, unsigned char code, void *arg) | ||
86 | { | ||
87 | unsigned char byte1, abreg; | ||
88 | unw_word t, off; | ||
89 | |||
90 | byte1 = *dp++; | ||
91 | t = unw_decode_uleb128 (&dp); | ||
92 | off = unw_decode_uleb128 (&dp); | ||
93 | abreg = (byte1 & 0x7f); | ||
94 | if (byte1 & 0x80) | ||
95 | UNW_DEC_SPILL_SPREL(X1, t, abreg, off, arg); | ||
96 | else | ||
97 | UNW_DEC_SPILL_PSPREL(X1, t, abreg, off, arg); | ||
98 | return dp; | ||
99 | } | ||
100 | |||
101 | static unsigned char * | ||
102 | unw_decode_x2 (unsigned char *dp, unsigned char code, void *arg) | ||
103 | { | ||
104 | unsigned char byte1, byte2, abreg, x, ytreg; | ||
105 | unw_word t; | ||
106 | |||
107 | byte1 = *dp++; byte2 = *dp++; | ||
108 | t = unw_decode_uleb128 (&dp); | ||
109 | abreg = (byte1 & 0x7f); | ||
110 | ytreg = byte2; | ||
111 | x = (byte1 >> 7) & 1; | ||
112 | if ((byte1 & 0x80) == 0 && ytreg == 0) | ||
113 | UNW_DEC_RESTORE(X2, t, abreg, arg); | ||
114 | else | ||
115 | UNW_DEC_SPILL_REG(X2, t, abreg, x, ytreg, arg); | ||
116 | return dp; | ||
117 | } | ||
118 | |||
119 | static unsigned char * | ||
120 | unw_decode_x3 (unsigned char *dp, unsigned char code, void *arg) | ||
121 | { | ||
122 | unsigned char byte1, byte2, abreg, qp; | ||
123 | unw_word t, off; | ||
124 | |||
125 | byte1 = *dp++; byte2 = *dp++; | ||
126 | t = unw_decode_uleb128 (&dp); | ||
127 | off = unw_decode_uleb128 (&dp); | ||
128 | |||
129 | qp = (byte1 & 0x3f); | ||
130 | abreg = (byte2 & 0x7f); | ||
131 | |||
132 | if (byte1 & 0x80) | ||
133 | UNW_DEC_SPILL_SPREL_P(X3, qp, t, abreg, off, arg); | ||
134 | else | ||
135 | UNW_DEC_SPILL_PSPREL_P(X3, qp, t, abreg, off, arg); | ||
136 | return dp; | ||
137 | } | ||
138 | |||
139 | static unsigned char * | ||
140 | unw_decode_x4 (unsigned char *dp, unsigned char code, void *arg) | ||
141 | { | ||
142 | unsigned char byte1, byte2, byte3, qp, abreg, x, ytreg; | ||
143 | unw_word t; | ||
144 | |||
145 | byte1 = *dp++; byte2 = *dp++; byte3 = *dp++; | ||
146 | t = unw_decode_uleb128 (&dp); | ||
147 | |||
148 | qp = (byte1 & 0x3f); | ||
149 | abreg = (byte2 & 0x7f); | ||
150 | x = (byte2 >> 7) & 1; | ||
151 | ytreg = byte3; | ||
152 | |||
153 | if ((byte2 & 0x80) == 0 && byte3 == 0) | ||
154 | UNW_DEC_RESTORE_P(X4, qp, t, abreg, arg); | ||
155 | else | ||
156 | UNW_DEC_SPILL_REG_P(X4, qp, t, abreg, x, ytreg, arg); | ||
157 | return dp; | ||
158 | } | ||
159 | |||
160 | static unsigned char * | ||
161 | unw_decode_r1 (unsigned char *dp, unsigned char code, void *arg) | ||
162 | { | ||
163 | int body = (code & 0x20) != 0; | ||
164 | unw_word rlen; | ||
165 | |||
166 | rlen = (code & 0x1f); | ||
167 | UNW_DEC_PROLOGUE(R1, body, rlen, arg); | ||
168 | return dp; | ||
169 | } | ||
170 | |||
171 | static unsigned char * | ||
172 | unw_decode_r2 (unsigned char *dp, unsigned char code, void *arg) | ||
173 | { | ||
174 | unsigned char byte1, mask, grsave; | ||
175 | unw_word rlen; | ||
176 | |||
177 | byte1 = *dp++; | ||
178 | |||
179 | mask = ((code & 0x7) << 1) | ((byte1 >> 7) & 1); | ||
180 | grsave = (byte1 & 0x7f); | ||
181 | rlen = unw_decode_uleb128 (&dp); | ||
182 | UNW_DEC_PROLOGUE_GR(R2, rlen, mask, grsave, arg); | ||
183 | return dp; | ||
184 | } | ||
185 | |||
186 | static unsigned char * | ||
187 | unw_decode_r3 (unsigned char *dp, unsigned char code, void *arg) | ||
188 | { | ||
189 | unw_word rlen; | ||
190 | |||
191 | rlen = unw_decode_uleb128 (&dp); | ||
192 | UNW_DEC_PROLOGUE(R3, ((code & 0x3) == 1), rlen, arg); | ||
193 | return dp; | ||
194 | } | ||
195 | |||
196 | static unsigned char * | ||
197 | unw_decode_p1 (unsigned char *dp, unsigned char code, void *arg) | ||
198 | { | ||
199 | unsigned char brmask = (code & 0x1f); | ||
200 | |||
201 | UNW_DEC_BR_MEM(P1, brmask, arg); | ||
202 | return dp; | ||
203 | } | ||
204 | |||
205 | static unsigned char * | ||
206 | unw_decode_p2_p5 (unsigned char *dp, unsigned char code, void *arg) | ||
207 | { | ||
208 | if ((code & 0x10) == 0) | ||
209 | { | ||
210 | unsigned char byte1 = *dp++; | ||
211 | |||
212 | UNW_DEC_BR_GR(P2, ((code & 0xf) << 1) | ((byte1 >> 7) & 1), | ||
213 | (byte1 & 0x7f), arg); | ||
214 | } | ||
215 | else if ((code & 0x08) == 0) | ||
216 | { | ||
217 | unsigned char byte1 = *dp++, r, dst; | ||
218 | |||
219 | r = ((code & 0x7) << 1) | ((byte1 >> 7) & 1); | ||
220 | dst = (byte1 & 0x7f); | ||
221 | switch (r) | ||
222 | { | ||
223 | case 0: UNW_DEC_REG_GR(P3, UNW_REG_PSP, dst, arg); break; | ||
224 | case 1: UNW_DEC_REG_GR(P3, UNW_REG_RP, dst, arg); break; | ||
225 | case 2: UNW_DEC_REG_GR(P3, UNW_REG_PFS, dst, arg); break; | ||
226 | case 3: UNW_DEC_REG_GR(P3, UNW_REG_PR, dst, arg); break; | ||
227 | case 4: UNW_DEC_REG_GR(P3, UNW_REG_UNAT, dst, arg); break; | ||
228 | case 5: UNW_DEC_REG_GR(P3, UNW_REG_LC, dst, arg); break; | ||
229 | case 6: UNW_DEC_RP_BR(P3, dst, arg); break; | ||
230 | case 7: UNW_DEC_REG_GR(P3, UNW_REG_RNAT, dst, arg); break; | ||
231 | case 8: UNW_DEC_REG_GR(P3, UNW_REG_BSP, dst, arg); break; | ||
232 | case 9: UNW_DEC_REG_GR(P3, UNW_REG_BSPSTORE, dst, arg); break; | ||
233 | case 10: UNW_DEC_REG_GR(P3, UNW_REG_FPSR, dst, arg); break; | ||
234 | case 11: UNW_DEC_PRIUNAT_GR(P3, dst, arg); break; | ||
235 | default: UNW_DEC_BAD_CODE(r); break; | ||
236 | } | ||
237 | } | ||
238 | else if ((code & 0x7) == 0) | ||
239 | UNW_DEC_SPILL_MASK(P4, dp, arg); | ||
240 | else if ((code & 0x7) == 1) | ||
241 | { | ||
242 | unw_word grmask, frmask, byte1, byte2, byte3; | ||
243 | |||
244 | byte1 = *dp++; byte2 = *dp++; byte3 = *dp++; | ||
245 | grmask = ((byte1 >> 4) & 0xf); | ||
246 | frmask = ((byte1 & 0xf) << 16) | (byte2 << 8) | byte3; | ||
247 | UNW_DEC_FRGR_MEM(P5, grmask, frmask, arg); | ||
248 | } | ||
249 | else | ||
250 | UNW_DEC_BAD_CODE(code); | ||
251 | return dp; | ||
252 | } | ||
253 | |||
254 | static unsigned char * | ||
255 | unw_decode_p6 (unsigned char *dp, unsigned char code, void *arg) | ||
256 | { | ||
257 | int gregs = (code & 0x10) != 0; | ||
258 | unsigned char mask = (code & 0x0f); | ||
259 | |||
260 | if (gregs) | ||
261 | UNW_DEC_GR_MEM(P6, mask, arg); | ||
262 | else | ||
263 | UNW_DEC_FR_MEM(P6, mask, arg); | ||
264 | return dp; | ||
265 | } | ||
266 | |||
267 | static unsigned char * | ||
268 | unw_decode_p7_p10 (unsigned char *dp, unsigned char code, void *arg) | ||
269 | { | ||
270 | unsigned char r, byte1, byte2; | ||
271 | unw_word t, size; | ||
272 | |||
273 | if ((code & 0x10) == 0) | ||
274 | { | ||
275 | r = (code & 0xf); | ||
276 | t = unw_decode_uleb128 (&dp); | ||
277 | switch (r) | ||
278 | { | ||
279 | case 0: | ||
280 | size = unw_decode_uleb128 (&dp); | ||
281 | UNW_DEC_MEM_STACK_F(P7, t, size, arg); | ||
282 | break; | ||
283 | |||
284 | case 1: UNW_DEC_MEM_STACK_V(P7, t, arg); break; | ||
285 | case 2: UNW_DEC_SPILL_BASE(P7, t, arg); break; | ||
286 | case 3: UNW_DEC_REG_SPREL(P7, UNW_REG_PSP, t, arg); break; | ||
287 | case 4: UNW_DEC_REG_WHEN(P7, UNW_REG_RP, t, arg); break; | ||
288 | case 5: UNW_DEC_REG_PSPREL(P7, UNW_REG_RP, t, arg); break; | ||
289 | case 6: UNW_DEC_REG_WHEN(P7, UNW_REG_PFS, t, arg); break; | ||
290 | case 7: UNW_DEC_REG_PSPREL(P7, UNW_REG_PFS, t, arg); break; | ||
291 | case 8: UNW_DEC_REG_WHEN(P7, UNW_REG_PR, t, arg); break; | ||
292 | case 9: UNW_DEC_REG_PSPREL(P7, UNW_REG_PR, t, arg); break; | ||
293 | case 10: UNW_DEC_REG_WHEN(P7, UNW_REG_LC, t, arg); break; | ||
294 | case 11: UNW_DEC_REG_PSPREL(P7, UNW_REG_LC, t, arg); break; | ||
295 | case 12: UNW_DEC_REG_WHEN(P7, UNW_REG_UNAT, t, arg); break; | ||
296 | case 13: UNW_DEC_REG_PSPREL(P7, UNW_REG_UNAT, t, arg); break; | ||
297 | case 14: UNW_DEC_REG_WHEN(P7, UNW_REG_FPSR, t, arg); break; | ||
298 | case 15: UNW_DEC_REG_PSPREL(P7, UNW_REG_FPSR, t, arg); break; | ||
299 | default: UNW_DEC_BAD_CODE(r); break; | ||
300 | } | ||
301 | } | ||
302 | else | ||
303 | { | ||
304 | switch (code & 0xf) | ||
305 | { | ||
306 | case 0x0: /* p8 */ | ||
307 | { | ||
308 | r = *dp++; | ||
309 | t = unw_decode_uleb128 (&dp); | ||
310 | switch (r) | ||
311 | { | ||
312 | case 1: UNW_DEC_REG_SPREL(P8, UNW_REG_RP, t, arg); break; | ||
313 | case 2: UNW_DEC_REG_SPREL(P8, UNW_REG_PFS, t, arg); break; | ||
314 | case 3: UNW_DEC_REG_SPREL(P8, UNW_REG_PR, t, arg); break; | ||
315 | case 4: UNW_DEC_REG_SPREL(P8, UNW_REG_LC, t, arg); break; | ||
316 | case 5: UNW_DEC_REG_SPREL(P8, UNW_REG_UNAT, t, arg); break; | ||
317 | case 6: UNW_DEC_REG_SPREL(P8, UNW_REG_FPSR, t, arg); break; | ||
318 | case 7: UNW_DEC_REG_WHEN(P8, UNW_REG_BSP, t, arg); break; | ||
319 | case 8: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSP, t, arg); break; | ||
320 | case 9: UNW_DEC_REG_SPREL(P8, UNW_REG_BSP, t, arg); break; | ||
321 | case 10: UNW_DEC_REG_WHEN(P8, UNW_REG_BSPSTORE, t, arg); break; | ||
322 | case 11: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSPSTORE, t, arg); break; | ||
323 | case 12: UNW_DEC_REG_SPREL(P8, UNW_REG_BSPSTORE, t, arg); break; | ||
324 | case 13: UNW_DEC_REG_WHEN(P8, UNW_REG_RNAT, t, arg); break; | ||
325 | case 14: UNW_DEC_REG_PSPREL(P8, UNW_REG_RNAT, t, arg); break; | ||
326 | case 15: UNW_DEC_REG_SPREL(P8, UNW_REG_RNAT, t, arg); break; | ||
327 | case 16: UNW_DEC_PRIUNAT_WHEN_GR(P8, t, arg); break; | ||
328 | case 17: UNW_DEC_PRIUNAT_PSPREL(P8, t, arg); break; | ||
329 | case 18: UNW_DEC_PRIUNAT_SPREL(P8, t, arg); break; | ||
330 | case 19: UNW_DEC_PRIUNAT_WHEN_MEM(P8, t, arg); break; | ||
331 | default: UNW_DEC_BAD_CODE(r); break; | ||
332 | } | ||
333 | } | ||
334 | break; | ||
335 | |||
336 | case 0x1: | ||
337 | byte1 = *dp++; byte2 = *dp++; | ||
338 | UNW_DEC_GR_GR(P9, (byte1 & 0xf), (byte2 & 0x7f), arg); | ||
339 | break; | ||
340 | |||
341 | case 0xf: /* p10 */ | ||
342 | byte1 = *dp++; byte2 = *dp++; | ||
343 | UNW_DEC_ABI(P10, byte1, byte2, arg); | ||
344 | break; | ||
345 | |||
346 | case 0x9: | ||
347 | return unw_decode_x1 (dp, code, arg); | ||
348 | |||
349 | case 0xa: | ||
350 | return unw_decode_x2 (dp, code, arg); | ||
351 | |||
352 | case 0xb: | ||
353 | return unw_decode_x3 (dp, code, arg); | ||
354 | |||
355 | case 0xc: | ||
356 | return unw_decode_x4 (dp, code, arg); | ||
357 | |||
358 | default: | ||
359 | UNW_DEC_BAD_CODE(code); | ||
360 | break; | ||
361 | } | ||
362 | } | ||
363 | return dp; | ||
364 | } | ||
365 | |||
366 | static unsigned char * | ||
367 | unw_decode_b1 (unsigned char *dp, unsigned char code, void *arg) | ||
368 | { | ||
369 | unw_word label = (code & 0x1f); | ||
370 | |||
371 | if ((code & 0x20) != 0) | ||
372 | UNW_DEC_COPY_STATE(B1, label, arg); | ||
373 | else | ||
374 | UNW_DEC_LABEL_STATE(B1, label, arg); | ||
375 | return dp; | ||
376 | } | ||
377 | |||
378 | static unsigned char * | ||
379 | unw_decode_b2 (unsigned char *dp, unsigned char code, void *arg) | ||
380 | { | ||
381 | unw_word t; | ||
382 | |||
383 | t = unw_decode_uleb128 (&dp); | ||
384 | UNW_DEC_EPILOGUE(B2, t, (code & 0x1f), arg); | ||
385 | return dp; | ||
386 | } | ||
387 | |||
388 | static unsigned char * | ||
389 | unw_decode_b3_x4 (unsigned char *dp, unsigned char code, void *arg) | ||
390 | { | ||
391 | unw_word t, ecount, label; | ||
392 | |||
393 | if ((code & 0x10) == 0) | ||
394 | { | ||
395 | t = unw_decode_uleb128 (&dp); | ||
396 | ecount = unw_decode_uleb128 (&dp); | ||
397 | UNW_DEC_EPILOGUE(B3, t, ecount, arg); | ||
398 | } | ||
399 | else if ((code & 0x07) == 0) | ||
400 | { | ||
401 | label = unw_decode_uleb128 (&dp); | ||
402 | if ((code & 0x08) != 0) | ||
403 | UNW_DEC_COPY_STATE(B4, label, arg); | ||
404 | else | ||
405 | UNW_DEC_LABEL_STATE(B4, label, arg); | ||
406 | } | ||
407 | else | ||
408 | switch (code & 0x7) | ||
409 | { | ||
410 | case 1: return unw_decode_x1 (dp, code, arg); | ||
411 | case 2: return unw_decode_x2 (dp, code, arg); | ||
412 | case 3: return unw_decode_x3 (dp, code, arg); | ||
413 | case 4: return unw_decode_x4 (dp, code, arg); | ||
414 | default: UNW_DEC_BAD_CODE(code); break; | ||
415 | } | ||
416 | return dp; | ||
417 | } | ||
418 | |||
419 | typedef unsigned char *(*unw_decoder) (unsigned char *, unsigned char, void *); | ||
420 | |||
421 | static unw_decoder unw_decode_table[2][8] = | ||
422 | { | ||
423 | /* prologue table: */ | ||
424 | { | ||
425 | unw_decode_r1, /* 0 */ | ||
426 | unw_decode_r1, | ||
427 | unw_decode_r2, | ||
428 | unw_decode_r3, | ||
429 | unw_decode_p1, /* 4 */ | ||
430 | unw_decode_p2_p5, | ||
431 | unw_decode_p6, | ||
432 | unw_decode_p7_p10 | ||
433 | }, | ||
434 | { | ||
435 | unw_decode_r1, /* 0 */ | ||
436 | unw_decode_r1, | ||
437 | unw_decode_r2, | ||
438 | unw_decode_r3, | ||
439 | unw_decode_b1, /* 4 */ | ||
440 | unw_decode_b1, | ||
441 | unw_decode_b2, | ||
442 | unw_decode_b3_x4 | ||
443 | } | ||
444 | }; | ||
445 | |||
446 | /* | ||
447 | * Decode one descriptor and return address of next descriptor. | ||
448 | */ | ||
449 | static inline unsigned char * | ||
450 | unw_decode (unsigned char *dp, int inside_body, void *arg) | ||
451 | { | ||
452 | unw_decoder decoder; | ||
453 | unsigned char code; | ||
454 | |||
455 | code = *dp++; | ||
456 | decoder = unw_decode_table[inside_body][code >> 5]; | ||
457 | dp = (*decoder) (dp, code, arg); | ||
458 | return dp; | ||
459 | } | ||
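Dispatch is two-level: the top three bits of the first descriptor byte index a handler table, and prologue vs. body regions use different tables (the two rows of unw_decode_table above). Below is a self-contained miniature of that dispatch; the handlers here just print, where the real table rows point at unw_decode_r1() and friends:

    #include <stdio.h>

    typedef unsigned char *(*decoder)(unsigned char *dp, unsigned char code, void *arg);

    static unsigned char *dec_r1(unsigned char *dp, unsigned char code, void *arg)
    {
        printf("R1: %s region, rlen=%u\n",
               (code & 0x20) ? "body" : "prologue", code & 0x1f);
        return dp;                 /* R1 is a single-byte descriptor */
    }

    static unsigned char *dec_other(unsigned char *dp, unsigned char code, void *arg)
    {
        printf("format group %u\n", code >> 5);
        return dp;
    }

    static decoder table[2][8] = {
        { dec_r1, dec_r1, dec_other, dec_other, dec_other, dec_other, dec_other, dec_other },
        { dec_r1, dec_r1, dec_other, dec_other, dec_other, dec_other, dec_other, dec_other },
    };

    int main(void)
    {
        unsigned char buf[] = { 0x05, 0x25 };   /* prologue rlen=5; body rlen=5 */
        unsigned char *dp = buf, *end = buf + sizeof(buf);
        int in_body = 0;

        while (dp < end) {
            unsigned char code = *dp++;
            dp = table[in_body][code >> 5](dp, code, NULL);
        }
        return 0;
    }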
diff --git a/arch/ia64/kernel/unwind_i.h b/arch/ia64/kernel/unwind_i.h new file mode 100644 index 000000000000..96693a6ae370 --- /dev/null +++ b/arch/ia64/kernel/unwind_i.h | |||
@@ -0,0 +1,164 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co | ||
3 | * David Mosberger-Tang <davidm@hpl.hp.com> | ||
4 | * | ||
5 | * Kernel unwind support. | ||
6 | */ | ||
7 | |||
8 | #define UNW_VER(x) ((x) >> 48) | ||
9 | #define UNW_FLAG_MASK 0x0000ffff00000000 | ||
10 | #define UNW_FLAG_OSMASK 0x0000f00000000000 | ||
11 | #define UNW_FLAG_EHANDLER(x) ((x) & 0x0000000100000000L) | ||
12 | #define UNW_FLAG_UHANDLER(x) ((x) & 0x0000000200000000L) | ||
13 | #define UNW_LENGTH(x) ((x) & 0x00000000ffffffffL) | ||
14 | |||
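These macros carve up the 8-byte header word that leads each unwind-info block (struct unw_info_block below): format version in bits 63:48, flags in 47:32, and the length of the descriptor area, in 8-byte words, in 31:0. A standalone check with a hypothetical header value:

    #include <assert.h>

    #define UNW_VER(x)              ((x) >> 48)
    #define UNW_FLAG_EHANDLER(x)    ((x) & 0x0000000100000000L)
    #define UNW_LENGTH(x)           ((x) & 0x00000000ffffffffL)

    int main(void)
    {
        unsigned long header = 0x0001000100000003UL;  /* v1, ehandler, 3 words */

        assert(UNW_VER(header) == 1);
        assert(UNW_FLAG_EHANDLER(header));
        assert(UNW_LENGTH(header) == 3);
        return 0;
    }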
15 | enum unw_register_index { | ||
16 | /* primary unat: */ | ||
17 | UNW_REG_PRI_UNAT_GR, | ||
18 | UNW_REG_PRI_UNAT_MEM, | ||
19 | |||
20 | /* register stack */ | ||
21 | UNW_REG_BSP, /* register stack pointer */ | ||
22 | UNW_REG_BSPSTORE, | ||
23 | UNW_REG_PFS, /* previous function state */ | ||
24 | UNW_REG_RNAT, | ||
25 | /* memory stack */ | ||
26 | UNW_REG_PSP, /* previous memory stack pointer */ | ||
27 | /* return pointer: */ | ||
28 | UNW_REG_RP, | ||
29 | |||
30 | /* preserved registers: */ | ||
31 | UNW_REG_R4, UNW_REG_R5, UNW_REG_R6, UNW_REG_R7, | ||
32 | UNW_REG_UNAT, UNW_REG_PR, UNW_REG_LC, UNW_REG_FPSR, | ||
33 | UNW_REG_B1, UNW_REG_B2, UNW_REG_B3, UNW_REG_B4, UNW_REG_B5, | ||
34 | UNW_REG_F2, UNW_REG_F3, UNW_REG_F4, UNW_REG_F5, | ||
35 | UNW_REG_F16, UNW_REG_F17, UNW_REG_F18, UNW_REG_F19, | ||
36 | UNW_REG_F20, UNW_REG_F21, UNW_REG_F22, UNW_REG_F23, | ||
37 | UNW_REG_F24, UNW_REG_F25, UNW_REG_F26, UNW_REG_F27, | ||
38 | UNW_REG_F28, UNW_REG_F29, UNW_REG_F30, UNW_REG_F31, | ||
39 | UNW_NUM_REGS | ||
40 | }; | ||
41 | |||
42 | struct unw_info_block { | ||
43 | u64 header; | ||
44 | u64 desc[0]; /* unwind descriptors */ | ||
45 | /* personality routine and language-specific data follow behind descriptors */ | ||
46 | }; | ||
47 | |||
48 | struct unw_table { | ||
49 | struct unw_table *next; /* must be first member! */ | ||
50 | const char *name; | ||
51 | unsigned long gp; /* global pointer for this load-module */ | ||
52 | unsigned long segment_base; /* base for offsets in the unwind table entries */ | ||
53 | unsigned long start; | ||
54 | unsigned long end; | ||
55 | const struct unw_table_entry *array; | ||
56 | unsigned long length; | ||
57 | }; | ||
58 | |||
59 | enum unw_where { | ||
60 | UNW_WHERE_NONE, /* register isn't saved at all */ | ||
61 | UNW_WHERE_GR, /* register is saved in a general register */ | ||
62 | UNW_WHERE_FR, /* register is saved in a floating-point register */ | ||
63 | UNW_WHERE_BR, /* register is saved in a branch register */ | ||
64 | UNW_WHERE_SPREL, /* register is saved on memstack (sp-relative) */ | ||
65 | UNW_WHERE_PSPREL, /* register is saved on memstack (psp-relative) */ | ||
66 | /* | ||
67 | * At the end of each prologue these locations get resolved to | ||
68 | * UNW_WHERE_PSPREL and UNW_WHERE_GR, respectively: | ||
69 | */ | ||
70 | UNW_WHERE_SPILL_HOME, /* register is saved in its spill home */ | ||
71 | UNW_WHERE_GR_SAVE /* register is saved in next general register */ | ||
72 | }; | ||
73 | |||
74 | #define UNW_WHEN_NEVER 0x7fffffff | ||
75 | |||
76 | struct unw_reg_info { | ||
77 | unsigned long val; /* save location: register number or offset */ | ||
78 | enum unw_where where; /* where the register gets saved */ | ||
79 | int when; /* when the register gets saved */ | ||
80 | }; | ||
81 | |||
82 | struct unw_reg_state { | ||
83 | struct unw_reg_state *next; /* next (outer) element on state stack */ | ||
84 | struct unw_reg_info reg[UNW_NUM_REGS]; /* register save locations */ | ||
85 | }; | ||
86 | |||
87 | struct unw_labeled_state { | ||
88 | struct unw_labeled_state *next; /* next labeled state (or NULL) */ | ||
89 | unsigned long label; /* label for this state */ | ||
90 | struct unw_reg_state saved_state; | ||
91 | }; | ||
92 | |||
93 | struct unw_state_record { | ||
94 | unsigned int first_region : 1; /* is this the first region? */ | ||
95 | unsigned int done : 1; /* are we done scanning descriptors? */ | ||
96 | unsigned int any_spills : 1; /* got any register spills? */ | ||
97 | unsigned int in_body : 1; /* are we inside a body (as opposed to a prologue)? */ | ||
98 | unsigned long flags; /* see UNW_FLAG_* in unwind.h */ | ||
99 | |||
100 | u8 *imask; /* imask of spill_mask record or NULL */ | ||
101 | unsigned long pr_val; /* predicate values */ | ||
102 | unsigned long pr_mask; /* predicate mask */ | ||
103 | long spill_offset; /* psp-relative offset for spill base */ | ||
104 | int region_start; | ||
105 | int region_len; | ||
106 | int epilogue_start; | ||
107 | int epilogue_count; | ||
108 | int when_target; | ||
109 | |||
110 | u8 gr_save_loc; /* next general register to use for saving a register */ | ||
111 | u8 return_link_reg; /* branch register in which the return link is passed */ | ||
112 | |||
113 | struct unw_labeled_state *labeled_states; /* list of all labeled states */ | ||
114 | struct unw_reg_state curr; /* current state */ | ||
115 | }; | ||
116 | |||
117 | enum unw_nat_type { | ||
118 | UNW_NAT_NONE, /* NaT not represented */ | ||
119 | UNW_NAT_VAL, /* NaT represented by NaT value (fp reg) */ | ||
120 | UNW_NAT_MEMSTK, /* NaT value is in unat word at offset OFF */ | ||
121 | UNW_NAT_REGSTK /* NaT is in rnat */ | ||
122 | }; | ||
123 | |||
124 | enum unw_insn_opcode { | ||
125 | UNW_INSN_ADD, /* s[dst] += val */ | ||
126 | UNW_INSN_ADD_PSP, /* s[dst] = (s.psp + val) */ | ||
127 | UNW_INSN_ADD_SP, /* s[dst] = (s.sp + val) */ | ||
128 | UNW_INSN_MOVE, /* s[dst] = s[val] */ | ||
129 | UNW_INSN_MOVE2, /* s[dst] = s[val]; s[dst+1] = s[val+1] */ | ||
130 | UNW_INSN_MOVE_STACKED, /* s[dst] = ia64_rse_skip(*s.bsp, val) */ | ||
131 | UNW_INSN_SETNAT_MEMSTK, /* s[dst+1].nat.type = MEMSTK; | ||
132 | s[dst+1].nat.off = *s.pri_unat - s[dst] */ | ||
133 | UNW_INSN_SETNAT_TYPE, /* s[dst+1].nat.type = val */ | ||
134 | UNW_INSN_LOAD, /* s[dst] = *s[val] */ | ||
135 | UNW_INSN_MOVE_SCRATCH, /* s[dst] = scratch reg "val" */ | ||
136 | UNW_INSN_MOVE_CONST, /* s[dst] = constant reg "val" */ | ||
137 | }; | ||
138 | |||
139 | struct unw_insn { | ||
140 | unsigned int opc : 4; | ||
141 | unsigned int dst : 9; | ||
142 | signed int val : 19; | ||
143 | }; | ||
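The three fields pack to exactly 32 bits (4 + 9 + 19), so on the ABIs this code targets each script instruction costs one word and a full script stays small. A quick standalone check:

    #include <assert.h>

    struct unw_insn {
        unsigned int opc : 4;
        unsigned int dst : 9;
        signed int val : 19;
    };

    int main(void)
    {
        struct unw_insn i = { .opc = 1, .dst = 5, .val = -16 };

        assert(sizeof(struct unw_insn) == 4);   /* one 32-bit word per instruction */
        assert(i.val == -16);                   /* val is signed: 19 bits, two's complement */
        return 0;
    }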
144 | |||
145 | /* | ||
146 | * Preserved general static registers (r4-r7) give rise to two script | ||
147 | * instructions; everything else yields at most one instruction; at | ||
148 | * the end of the script, the psp gets popped, accounting for one more | ||
149 | * instruction. | ||
150 | */ | ||
151 | #define UNW_MAX_SCRIPT_LEN (UNW_NUM_REGS + 5) | ||
152 | |||
153 | struct unw_script { | ||
154 | unsigned long ip; /* ip this script is for */ | ||
155 | unsigned long pr_mask; /* mask of predicates script depends on */ | ||
156 | unsigned long pr_val; /* predicate values this script is for */ | ||
157 | rwlock_t lock; | ||
158 | unsigned int flags; /* see UNW_FLAG_* in unwind.h */ | ||
159 | unsigned short lru_chain; /* used for least-recently-used chain */ | ||
160 | unsigned short coll_chain; /* used for hash collisions */ | ||
161 | unsigned short hint; /* hint for next script to try (or -1) */ | ||
162 | unsigned short count; /* number of instructions in script */ | ||
163 | struct unw_insn insn[UNW_MAX_SCRIPT_LEN]; | ||
164 | }; | ||
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S new file mode 100644 index 000000000000..b9f0db4c1b04 --- /dev/null +++ b/arch/ia64/kernel/vmlinux.lds.S | |||
@@ -0,0 +1,251 @@ | |||
1 | #include <linux/config.h> | ||
2 | |||
3 | #include <asm/cache.h> | ||
4 | #include <asm/ptrace.h> | ||
5 | #include <asm/system.h> | ||
6 | #include <asm/pgtable.h> | ||
7 | |||
8 | #define LOAD_OFFSET (KERNEL_START - KERNEL_TR_PAGE_SIZE) | ||
9 | #include <asm-generic/vmlinux.lds.h> | ||
10 | |||
11 | OUTPUT_FORMAT("elf64-ia64-little") | ||
12 | OUTPUT_ARCH(ia64) | ||
13 | ENTRY(phys_start) | ||
14 | jiffies = jiffies_64; | ||
15 | PHDRS { | ||
16 | code PT_LOAD; | ||
17 | percpu PT_LOAD; | ||
18 | data PT_LOAD; | ||
19 | } | ||
20 | SECTIONS | ||
21 | { | ||
22 | /* Sections to be discarded */ | ||
23 | /DISCARD/ : { | ||
24 | *(.exit.text) | ||
25 | *(.exit.data) | ||
26 | *(.exitcall.exit) | ||
27 | *(.IA_64.unwind.exit.text) | ||
28 | *(.IA_64.unwind_info.exit.text) | ||
29 | } | ||
30 | |||
31 | v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ | ||
32 | phys_start = _start - LOAD_OFFSET; | ||
33 | |||
34 | code : { } :code | ||
35 | . = KERNEL_START; | ||
36 | |||
37 | _text = .; | ||
38 | _stext = .; | ||
39 | |||
40 | .text : AT(ADDR(.text) - LOAD_OFFSET) | ||
41 | { | ||
42 | *(.text.ivt) | ||
43 | *(.text) | ||
44 | SCHED_TEXT | ||
45 | LOCK_TEXT | ||
46 | *(.gnu.linkonce.t*) | ||
47 | } | ||
48 | .text2 : AT(ADDR(.text2) - LOAD_OFFSET) | ||
49 | { *(.text2) } | ||
50 | #ifdef CONFIG_SMP | ||
51 | .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) | ||
52 | { *(.text.lock) } | ||
53 | #endif | ||
54 | _etext = .; | ||
55 | |||
56 | /* Read-only data */ | ||
57 | |||
58 | /* Exception table */ | ||
59 | . = ALIGN(16); | ||
60 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) | ||
61 | { | ||
62 | __start___ex_table = .; | ||
63 | *(__ex_table) | ||
64 | __stop___ex_table = .; | ||
65 | } | ||
66 | |||
67 | .data.patch.vtop : AT(ADDR(.data.patch.vtop) - LOAD_OFFSET) | ||
68 | { | ||
69 | __start___vtop_patchlist = .; | ||
70 | *(.data.patch.vtop) | ||
71 | __end___vtop_patchlist = .; | ||
72 | } | ||
73 | |||
74 | .data.patch.mckinley_e9 : AT(ADDR(.data.patch.mckinley_e9) - LOAD_OFFSET) | ||
75 | { | ||
76 | __start___mckinley_e9_bundles = .; | ||
77 | *(.data.patch.mckinley_e9) | ||
78 | __end___mckinley_e9_bundles = .; | ||
79 | } | ||
80 | |||
81 | /* Global data */ | ||
82 | _data = .; | ||
83 | |||
84 | #if defined(CONFIG_IA64_GENERIC) | ||
85 | /* Machine Vector */ | ||
86 | . = ALIGN(16); | ||
87 | .machvec : AT(ADDR(.machvec) - LOAD_OFFSET) | ||
88 | { | ||
89 | machvec_start = .; | ||
90 | *(.machvec) | ||
91 | machvec_end = .; | ||
92 | } | ||
93 | #endif | ||
94 | |||
95 | /* Unwind info & table: */ | ||
96 | . = ALIGN(8); | ||
97 | .IA_64.unwind_info : AT(ADDR(.IA_64.unwind_info) - LOAD_OFFSET) | ||
98 | { *(.IA_64.unwind_info*) } | ||
99 | .IA_64.unwind : AT(ADDR(.IA_64.unwind) - LOAD_OFFSET) | ||
100 | { | ||
101 | __start_unwind = .; | ||
102 | *(.IA_64.unwind*) | ||
103 | __end_unwind = .; | ||
104 | } | ||
105 | |||
106 | RODATA | ||
107 | |||
108 | .opd : AT(ADDR(.opd) - LOAD_OFFSET) | ||
109 | { *(.opd) } | ||
110 | |||
111 | /* Initialization code and data: */ | ||
112 | |||
113 | . = ALIGN(PAGE_SIZE); | ||
114 | __init_begin = .; | ||
115 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) | ||
116 | { | ||
117 | _sinittext = .; | ||
118 | *(.init.text) | ||
119 | _einittext = .; | ||
120 | } | ||
121 | |||
122 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) | ||
123 | { *(.init.data) } | ||
124 | |||
125 | .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) | ||
126 | { | ||
127 | __initramfs_start = .; | ||
128 | *(.init.ramfs) | ||
129 | __initramfs_end = .; | ||
130 | } | ||
131 | |||
132 | . = ALIGN(16); | ||
133 | .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) | ||
134 | { | ||
135 | __setup_start = .; | ||
136 | *(.init.setup) | ||
137 | __setup_end = .; | ||
138 | } | ||
139 | .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) | ||
140 | { | ||
141 | __initcall_start = .; | ||
142 | *(.initcall1.init) | ||
143 | *(.initcall2.init) | ||
144 | *(.initcall3.init) | ||
145 | *(.initcall4.init) | ||
146 | *(.initcall5.init) | ||
147 | *(.initcall6.init) | ||
148 | *(.initcall7.init) | ||
149 | __initcall_end = .; | ||
150 | } | ||
151 | __con_initcall_start = .; | ||
152 | .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) | ||
153 | { *(.con_initcall.init) } | ||
154 | __con_initcall_end = .; | ||
155 | __security_initcall_start = .; | ||
156 | .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) | ||
157 | { *(.security_initcall.init) } | ||
158 | __security_initcall_end = .; | ||
159 | . = ALIGN(PAGE_SIZE); | ||
160 | __init_end = .; | ||
161 | |||
162 | /* The initial task and kernel stack */ | ||
163 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) | ||
164 | { *(.data.init_task) } | ||
165 | |||
166 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) | ||
167 | { *(__special_page_section) | ||
168 | __start_gate_section = .; | ||
169 | *(.data.gate) | ||
170 | __stop_gate_section = .; | ||
171 | } | ||
172 | . = ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose kernel data */ | ||
173 | |||
174 | .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) | ||
175 | { *(.data.cacheline_aligned) } | ||
176 | |||
177 | /* Per-cpu data: */ | ||
178 | percpu : { } :percpu | ||
179 | . = ALIGN(PERCPU_PAGE_SIZE); | ||
180 | __phys_per_cpu_start = .; | ||
181 | .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET) | ||
182 | { | ||
183 | __per_cpu_start = .; | ||
184 | *(.data.percpu) | ||
185 | __per_cpu_end = .; | ||
186 | } | ||
187 | . = __phys_per_cpu_start + PERCPU_PAGE_SIZE; /* ensure percpu data fits into percpu page size */ | ||
188 | |||
189 | data : { } :data | ||
190 | .data : AT(ADDR(.data) - LOAD_OFFSET) | ||
191 | { *(.data) *(.data1) *(.gnu.linkonce.d*) CONSTRUCTORS } | ||
192 | |||
193 | . = ALIGN(16); /* gp must be 16-byte aligned for exc. table */ | ||
194 | .got : AT(ADDR(.got) - LOAD_OFFSET) | ||
195 | { *(.got.plt) *(.got) } | ||
196 | __gp = ADDR(.got) + 0x200000; | ||
197 | /* We want the small data sections together, so single-instruction offsets | ||
198 | can access them all, and initialized data all before uninitialized, so | ||
199 | we can shorten the on-disk segment size. */ | ||
200 | .sdata : AT(ADDR(.sdata) - LOAD_OFFSET) | ||
201 | { *(.sdata) *(.sdata1) *(.srdata) } | ||
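The 0x200000 bias above exists because ia64 gp-relative addressing (the 22-bit signed immediate of addl) reaches gp plus or minus 2MB, so parking __gp 2MB past the start of .got centres a single-instruction 4MB window over the GOT and the small-data sections that follow. A standalone arithmetic check of that window:

    #include <assert.h>

    int main(void)
    {
        long imm_min = -(1L << 21), imm_max = (1L << 21) - 1;  /* 22-bit signed */
        unsigned long got = 0;                  /* window is relative to .got */
        unsigned long gp = got + 0x200000;

        assert(gp + imm_min == got);            /* lowest reachable byte: start of .got */
        assert(gp + imm_max == got + 0x3fffff); /* ~4MB of reachable small data */
        return 0;
    }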
202 | _edata = .; | ||
203 | _bss = .; | ||
204 | .sbss : AT(ADDR(.sbss) - LOAD_OFFSET) | ||
205 | { *(.sbss) *(.scommon) } | ||
206 | .bss : AT(ADDR(.bss) - LOAD_OFFSET) | ||
207 | { *(.bss) *(COMMON) } | ||
208 | |||
209 | _end = .; | ||
210 | |||
211 | code : { } :code | ||
212 | /* Stabs debugging sections. */ | ||
213 | .stab 0 : { *(.stab) } | ||
214 | .stabstr 0 : { *(.stabstr) } | ||
215 | .stab.excl 0 : { *(.stab.excl) } | ||
216 | .stab.exclstr 0 : { *(.stab.exclstr) } | ||
217 | .stab.index 0 : { *(.stab.index) } | ||
218 | .stab.indexstr 0 : { *(.stab.indexstr) } | ||
219 | /* DWARF debug sections. | ||
220 | Symbols in the DWARF debugging sections are relative to the beginning | ||
221 | of the section so we begin them at 0. */ | ||
222 | /* DWARF 1 */ | ||
223 | .debug 0 : { *(.debug) } | ||
224 | .line 0 : { *(.line) } | ||
225 | /* GNU DWARF 1 extensions */ | ||
226 | .debug_srcinfo 0 : { *(.debug_srcinfo) } | ||
227 | .debug_sfnames 0 : { *(.debug_sfnames) } | ||
228 | /* DWARF 1.1 and DWARF 2 */ | ||
229 | .debug_aranges 0 : { *(.debug_aranges) } | ||
230 | .debug_pubnames 0 : { *(.debug_pubnames) } | ||
231 | /* DWARF 2 */ | ||
232 | .debug_info 0 : { *(.debug_info) } | ||
233 | .debug_abbrev 0 : { *(.debug_abbrev) } | ||
234 | .debug_line 0 : { *(.debug_line) } | ||
235 | .debug_frame 0 : { *(.debug_frame) } | ||
236 | .debug_str 0 : { *(.debug_str) } | ||
237 | .debug_loc 0 : { *(.debug_loc) } | ||
238 | .debug_macinfo 0 : { *(.debug_macinfo) } | ||
239 | /* SGI/MIPS DWARF 2 extensions */ | ||
240 | .debug_weaknames 0 : { *(.debug_weaknames) } | ||
241 | .debug_funcnames 0 : { *(.debug_funcnames) } | ||
242 | .debug_typenames 0 : { *(.debug_typenames) } | ||
243 | .debug_varnames 0 : { *(.debug_varnames) } | ||
244 | /* These must appear regardless of . */ | ||
245 | /* Discard them for now since Intel SoftSDV cannot handle them. | ||
246 | .comment 0 : { *(.comment) } | ||
247 | .note 0 : { *(.note) } | ||
248 | */ | ||
249 | /DISCARD/ : { *(.comment) } | ||
250 | /DISCARD/ : { *(.note) } | ||
251 | } | ||