9 files changed, 2592 insertions, 0 deletions
diff --git a/arch/ia64/mm/Makefile b/arch/ia64/mm/Makefile
new file mode 100644
index 000000000000..7078f67887ec
--- /dev/null
+++ b/arch/ia64/mm/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the ia64-specific parts of the memory manager.
+#
+# Objects that are always built.
+obj-y := init.o fault.o tlb.o extable.o
+# Objects that are built only when the matching config option is enabled.
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_NUMA)         += numa.o
+obj-$(CONFIG_DISCONTIGMEM) += discontig.o
+# contig.o is the alternative to discontig.o: it is built only when
+# CONFIG_DISCONTIGMEM is not defined.
+ifndef CONFIG_DISCONTIGMEM
+obj-y += contig.o
+endif
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
new file mode 100644
index 000000000000..6daf15ac8940
--- /dev/null
+++ b/arch/ia64/mm/contig.c
@@ -0,0 +1,299 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998-2003 Hewlett-Packard Co
+ *      David Mosberger-Tang <davidm@hpl.hp.com>
+ *      Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com>
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 2003 Silicon Graphics, Inc. All rights reserved.
+ *
+ * Routines used by ia64 machines with contiguous (or virtually contiguous)
+ * memory.
+ */
+#include <linux/config.h>
+#include <linux/bootmem.h>
+#include <linux/efi.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <asm/meminit.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <asm/mca.h>
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+/*
+ * Number of physical pages below MAX_DMA_ADDRESS; reset and recounted by
+ * paging_init() through the count_dma_pages() callback.
+ */
+static unsigned long num_dma_physpages;
+#endif
+/**
+ * show_mem - print a summary of memory usage
+ *
+ * Prints the free-area and swap statistics, then scans every valid page
+ * frame below max_mapnr and reports how many of those pages are reserved,
+ * shared or in the swap cache.
+ */
+void
+show_mem (void)
+{
+        int pfn;
+        int total = 0, reserved = 0, shared = 0, cached = 0;
+        struct page *page;
+        printk("Mem-info:\n");
+        show_free_areas();
+        printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+        /* walk mem_map from the highest page frame down to frame 0 */
+        for (pfn = max_mapnr; pfn-- > 0; ) {
+                if (!pfn_valid(pfn))
+                        continue;
+                page = mem_map + pfn;
+                total++;
+                if (PageReserved(page)) {
+                        reserved++;
+                        continue;
+                }
+                if (PageSwapCache(page)) {
+                        cached++;
+                        continue;
+                }
+                /* every reference beyond the first one counts as a share */
+                if (page_count(page))
+                        shared += page_count(page) - 1;
+        }
+        printk("%d pages of RAM\n", total);
+        printk("%d reserved pages\n", reserved);
+        printk("%d pages shared\n", shared);
+        printk("%d pages swap cached\n", cached);
+        printk("%ld pages in page table cache\n", pgtable_cache_size);
+}
+/*
+ * physical address where the bootmem map is located
+ *
+ * find_memory() presets this to ~0UL as a "not found" marker before walking
+ * the memory map; find_bootmap_location() overwrites it with the address of
+ * the area it selects.
+ */
+unsigned long bootmap_start;
+/**
+ * find_max_pfn - memory map callback that tracks the highest page frame
+ * @start: start of range (not used)
+ * @end: end of range
+ * @arg: pointer to the unsigned long holding the running maximum pfn
+ *
+ * Handed to efi_memmap_walk() so that, once every range has been visited,
+ * the variable behind @arg holds the highest page frame number seen.
+ * Always returns 0.
+ */
+int
+find_max_pfn (unsigned long start, unsigned long end, void *arg)
+{
+        unsigned long *highest = arg;
+        unsigned long last_pfn = (PAGE_ALIGN(end - 1) - PAGE_OFFSET) >> PAGE_SHIFT;
+        if (*highest < last_pfn)
+                *highest = last_pfn;
+        return 0;
+}
+/**
+ * find_bootmap_location - callback to find a memory area for the bootmap
+ * @start: start of region
+ * @end: end of region
+ * @arg: pointer to the number of bytes needed for the bootmap
+ *
+ * Find a place to put the bootmap and store its starting (physical) address
+ * in bootmap_start.  This address must be page-aligned.
+ *
+ * The candidates are the gaps between consecutive rsvd_region[] entries,
+ * clipped to [@start, @end); the first gap of at least the requested size
+ * is taken.
+ *
+ * Returns -1 once a location has been recorded in bootmap_start, and 0 when
+ * this region cannot hold the bootmap.
+ */
+int
+find_bootmap_location (unsigned long start, unsigned long end, void *arg)
+{
+        unsigned long needed = *(unsigned long *)arg;
+        unsigned long range_start, range_end, free_start;
+        int i;
+#if IGNORE_PFN0
+        /* leave out the first page of a region that begins at PAGE_OFFSET */
+        if (start == PAGE_OFFSET) {
+                start += PAGE_SIZE;
+                if (start >= end)
+                        return 0;
+        }
+#endif
+        /* free_start is where the gap before rsvd_region[i] begins */
+        free_start = PAGE_OFFSET;
+        for (i = 0; i < num_rsvd_regions; i++) {
+                range_start = max(start, free_start);
+                range_end   = min(end, rsvd_region[i].start & PAGE_MASK);
+                /*
+                 * Move past this reserved region now, before any "continue",
+                 * so the next iteration starts its gap after it.
+                 */
+                free_start = PAGE_ALIGN(rsvd_region[i].end);
+                if (range_end <= range_start)
+                        continue; /* skip over empty range */
+                if (range_end - range_start >= needed) {
+                        bootmap_start = __pa(range_start);
+                        return -1;      /* done */
+                }
+                /* nothing more available in this segment */
+                if (range_end == end)
+                        return 0;
+        }
+        return 0;
+}
+/**
+ * find_memory - setup memory map
+ *
+ * Walk the EFI memory map and find usable memory for the system, taking
+ * into account reserved areas.
+ *
+ * The steps are: determine max_pfn, size the bootmem bitmap for that many
+ * pages, find room for the bitmap, initialise the bootmem allocator, free
+ * the memory passed through filter_rsvd_memory() and finally reserve the
+ * bitmap itself.  Panics if no area large enough for the bitmap is found.
+ */
+void
+find_memory (void)
+{
+        unsigned long bootmap_size;
+        reserve_memory();
+        /* first find highest page frame number */
+        max_pfn = 0;
+        efi_memmap_walk(find_max_pfn, &max_pfn);
+        /* how many bytes to cover all the pages */
+        bootmap_size = bootmem_bootmap_pages(max_pfn) << PAGE_SHIFT;
+        /* look for a location to hold the bootmap */
+        bootmap_start = ~0UL;   /* "not found" marker checked below */
+        efi_memmap_walk(find_bootmap_location, &bootmap_size);
+        if (bootmap_start == ~0UL)
+                panic("Cannot find %ld bytes for bootmap\n", bootmap_size);
+        /* bootmap_size is replaced by the value init_bootmem() returns */
+        bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn);
+        /* Free all available memory, then mark bootmem-map as being in use. */
+        efi_memmap_walk(filter_rsvd_memory, free_bootmem);
+        reserve_bootmem(bootmap_start, bootmap_size);
+        find_initrd();
+}
+#ifdef CONFIG_SMP
+/**
+ * per_cpu_init - setup per-cpu variables
+ *
+ * Allocate and setup per-cpu data areas.
+ *
+ * Only the CPU whose smp_processor_id() is 0 performs the allocation and
+ * the copying; every caller gets back the address of its own per-cpu area,
+ * computed as __per_cpu_start plus its entry in __per_cpu_offset[].
+ */
+void *
+per_cpu_init (void)
+{
+        void *cpu_data;
+        int cpu;
+        /*
+         * get_free_pages() cannot be used before cpu_init() done.  BSP
+         * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls
+         * get_zeroed_page().
+         */
+        if (smp_processor_id() == 0) {
+                cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS,
+                                           PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+                for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                        /* give this cpu its own copy of the per-cpu template */
+                        memcpy(cpu_data, __phys_per_cpu_start, __per_cpu_end - __per_cpu_start);
+                        __per_cpu_offset[cpu] = (char *) cpu_data - __per_cpu_start;
+                        /* the areas are laid out PERCPU_PAGE_SIZE apart */
+                        cpu_data += PERCPU_PAGE_SIZE;
+                        per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
+                }
+        }
+        return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
+}
+#endif /* CONFIG_SMP */
+/*
+ * count_pages - memory map callback that adds up page counts
+ *
+ * Adds the number of whole pages in [start, end) to the unsigned long
+ * that @arg points to.  Always returns 0.
+ */
+static int
+count_pages (u64 start, u64 end, void *arg)
+{
+        unsigned long *total = arg;
+        u64 bytes = end - start;
+        *total += bytes >> PAGE_SHIFT;
+        return 0;
+}
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+/*
+ * count_dma_pages - memory map callback that adds up pages below the DMA limit
+ *
+ * Adds to the unsigned long behind @arg the number of whole pages of
+ * [start, end) that lie below MAX_DMA_ADDRESS.  Always returns 0.
+ */
+static int
+count_dma_pages (u64 start, u64 end, void *arg)
+{
+        unsigned long *total = arg;
+        /* ranges that begin at or above the limit contribute nothing */
+        if (start >= MAX_DMA_ADDRESS)
+                return 0;
+        *total += (min(end, MAX_DMA_ADDRESS) - start) >> PAGE_SHIFT;
+        return 0;
+}
+#endif
+/*
+ * Set up the page tables.
+ *
+ * Counts the physical pages reported by the EFI memory map, sizes ZONE_DMA
+ * and ZONE_NORMAL around the MAX_DMA_ADDRESS boundary and passes the result
+ * to the free-area initialisation.  With CONFIG_VIRTUAL_MEM_MAP the holes in
+ * each zone are computed as well, and a virtual mem_map is set up when the
+ * largest hole reported by find_largest_hole() is at least LARGE_GAP.
+ */
+void
+paging_init (void)
+{
+        unsigned long max_dma;
+        unsigned long zones_size[MAX_NR_ZONES];
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+        unsigned long zholes_size[MAX_NR_ZONES];
+        unsigned long max_gap;
+#endif
+        /* initialize mem_map[] */
+        memset(zones_size, 0, sizeof(zones_size));
+        num_physpages = 0;
+        efi_memmap_walk(count_pages, &num_physpages);
+        /* page frame number of the DMA boundary */
+        max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+        memset(zholes_size, 0, sizeof(zholes_size));
+        num_dma_physpages = 0;
+        efi_memmap_walk(count_dma_pages, &num_dma_physpages);
+        /*
+         * A zone's hole count is the span of the zone minus the number of
+         * pages actually counted in it.
+         */
+        if (max_low_pfn < max_dma) {
+                /* all memory is below the DMA boundary */
+                zones_size[ZONE_DMA] = max_low_pfn;
+                zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages;
+        } else {
+                zones_size[ZONE_DMA] = max_dma;
+                zholes_size[ZONE_DMA] = max_dma - num_dma_physpages;
+                /* ZONE_NORMAL is set up only if pages exist above the boundary */
+                if (num_physpages > num_dma_physpages) {
+                        zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
+                        zholes_size[ZONE_NORMAL] =
+                                ((max_low_pfn - max_dma) -
+                                 (num_physpages - num_dma_physpages));
+                }
+        }
+        max_gap = 0;
+        efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
+        if (max_gap < LARGE_GAP) {
+                /* holes are small: no virtual mem_map */
+                vmem_map = (struct page *) 0;
+                free_area_init_node(0, &contig_page_data, zones_size, 0,
+                                    zholes_size);
+        } else {
+                unsigned long map_size;
+                /* allocate virtual_mem_map */
+                map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+                /* take the map from the top of the vmalloc area */
+                vmalloc_end -= map_size;
+                vmem_map = (struct page *) vmalloc_end;
+                efi_memmap_walk(create_mem_map_page_table, NULL);
+                NODE_DATA(0)->node_mem_map = vmem_map;
+                free_area_init_node(0, &contig_page_data, zones_size,
+                                    0, zholes_size);
+                printk("Virtual mem_map starts at 0x%p\n", mem_map);
+        }
+#else /* !CONFIG_VIRTUAL_MEM_MAP */
+        /* without a virtual mem_map only the zone sizes are needed */
+        if (max_low_pfn < max_dma)
+                zones_size[ZONE_DMA] = max_low_pfn;
+        else {
+                zones_size[ZONE_DMA] = max_dma;
+                zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
+        }
+        free_area_init(zones_size);
+#endif /* !CONFIG_VIRTUAL_MEM_MAP */
+        zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
+}
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
new file mode 100644
index 000000000000..3456a9b6971e
--- /dev/null
+++ b/arch/ia64/mm/discontig.c
@@ -0,0 +1,737 @@
+/*
+ * Copyright (c) 2000, 2003 Silicon Graphics, Inc.  All rights reserved.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
+ * Copyright (c) 2002 NEC Corp.
+ * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
+ * Copyright (c) 2004 Silicon Graphics, Inc
+ *      Russ Anderson <rja@sgi.com>
+ *      Jesse Barnes <jbarnes@sgi.com>
+ *      Jack Steiner <steiner@sgi.com>
+ */
+/*
+ * Platform initialization for Discontig Memory
+ */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/bootmem.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+#include <linux/nodemask.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/meminit.h>
+#include <asm/numa.h>
+#include <asm/sections.h>
+/*
+ * Track per-node information needed to setup the boot memory allocator, the
+ * per-node areas, and the real VM.
+ *
+ * One entry exists per possible node in mem_data[], indexed by node id.
+ */
+struct early_node_data {
+        /* per-node ia64_node_data inside the pernode area (find_pernode_space) */
+        struct ia64_node_data *node_data;
+        /* this node's pg_data_t inside the pernode area (find_pernode_space) */
+        pg_data_t *pgdat;
+        /* start of the pernode area; 0 means it has not been set up yet */
+        unsigned long pernode_addr;
+        /* size in bytes of the pernode area */
+        unsigned long pernode_size;
+        /* span of the node's memory, filled in by build_node_maps() */
+        struct bootmem_data bootmem_data;
+        /* NOTE(review): the four fields below are not used in this part of
+         * the file; presumably page counts and pfn limits per node - confirm */
+        unsigned long num_physpages;
+        unsigned long num_dma_physpages;
+        unsigned long min_pfn;
+        unsigned long max_pfn;
+};
+static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
+/**
+ * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
+ *
+ * This function will move nodes with only CPUs (no memory)
+ * to a node with memory which is at the minimum numa_slit distance.
+ * Any reassignments will result in the compression of the nodes
+ * and renumbering the nid values where appropriate.
+ * The static declarations below are to avoid large stack size which
+ * makes the code not re-entrant.
+ *
+ * Updates node_memblk[], node_cpuid[], pxm_to_nid_map[], numa_slit[] and
+ * node_online_map in place.  Returns without changing anything when every
+ * online node has memory.
+ */
+static void __init reassign_cpu_only_nodes(void)
+{
+        struct node_memblk_s *p;
+        int i, j, k, nnode, nid, cpu, cpunid, pxm;
+        u8 cslit, slit;
+        static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
+        static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
+        static int node_flip[MAX_NUMNODES] __initdata;
+        static int old_nid_map[NR_CPUS] __initdata;
+        /* mark every nid that owns a memory block; nnode counts those nids */
+        for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
+                if (!test_bit(p->nid, (void *) nodes_with_mem)) {
+                        set_bit(p->nid, (void *) nodes_with_mem);
+                        nnode++;
+                }
+        /*
+         * All nids with memory.
+         */
+        if (nnode == num_online_nodes())
+                return;
+        /*
+         * Change nids and attempt to migrate CPU-only nodes
+         * to the best numa_slit (closest neighbor) possible.
+         * For reassigned CPU nodes a nid can't be arrived at
+         * until after this loop because the target nid's new
+         * identity might not have been established yet. So
+         * new nid values are fabricated above num_online_nodes() and
+         * mapped back later to their true value.
+         */
+        /* MCD - This code is a bit complicated, but may be unnecessary now.
+         * We can now handle much more interesting node-numbering.
+         * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
+         * and that there be no holes in the numbering 0..numnodes
+         * has become simply 0 <= nid <= MAX_NUMNODES.
+         */
+        /* nid is the next compacted id to hand to a node that has memory */
+        nid = 0;
+        for_each_online_node(i)  {
+                if (test_bit(i, (void *) nodes_with_mem)) {
+                        /*
+                         * Save original nid value for numa_slit
+                         * fixup and node_cpuid reassignments.
+                         */
+                        node_flip[nid] = i;
+                        /* the node keeps its id: nothing to renumber */
+                        if (i == nid) {
+                                nid++;
+                                continue;
+                        }
+                        for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
+                                if (p->nid == i)
+                                        p->nid = nid;
+                        cpunid = nid;
+                        nid++;
+                } else
+                        cpunid = MAX_NUMNODES;  /* marks node i as CPU-only */
+                for (cpu = 0; cpu < NR_CPUS; cpu++)
+                        if (node_cpuid[cpu].nid == i) {
+                                /*
+                                 * For nodes not being reassigned just
+                                 * fix the cpu's nid and reverse pxm map
+                                 */
+                                if (cpunid < MAX_NUMNODES) {
+                                        pxm = nid_to_pxm_map[i];
+                                        pxm_to_nid_map[pxm] =
+                                                  node_cpuid[cpu].nid = cpunid;
+                                        continue;
+                                }
+                                /*
+                                 * For nodes being reassigned, find best node by
+                                 * numa_slit information and then make a temporary
+                                 * nid value based on current nid and num_online_nodes().
+                                 */
+                                slit = 0xff;
+                                /* 2*num_online_nodes() stands for "no target found" */
+                                k = 2*num_online_nodes();
+                                for_each_online_node(j) {
+                                        if (i == j)
+                                                continue;
+                                        else if (test_bit(j, (void *) nodes_with_mem)) {
+                                                cslit = numa_slit[i * num_online_nodes() + j];
+                                                if (cslit < slit) {
+                                                        k = num_online_nodes() + j;
+                                                        slit = cslit;
+                                                }
+                                        }
+                                }
+                                /* save old nid map so we can update the pxm */
+                                old_nid_map[cpu] = node_cpuid[cpu].nid;
+                                node_cpuid[cpu].nid = k;
+                        }
+        }
+        /*
+         * Fixup temporary nid values for CPU-only nodes.
+         */
+        for (cpu = 0; cpu < NR_CPUS; cpu++)
+                if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
+                        /* no target was found: use the last node with memory */
+                        pxm = nid_to_pxm_map[old_nid_map[cpu]];
+                        pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
+                } else {
+                        /* translate the target's original nid into its new one */
+                        for (i = 0; i < nnode; i++) {
+                                if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
+                                        continue;
+                                pxm = nid_to_pxm_map[old_nid_map[cpu]];
+                                pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
+                                break;
+                        }
+                }
+        /*
+         * Fix numa_slit by compressing from larger
+         * nid array to reduced nid array.
+         */
+        for (i = 0; i < nnode; i++)
+                for (j = 0; j < nnode; j++)
+                        numa_slit_fix[i * nnode + j] =
+                                numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
+        memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
+        /* from here on only the compacted ids 0..nnode-1 are online */
+        nodes_clear(node_online_map);
+        for (i = 0; i < nnode; i++)
+                node_set_online(i);
+        return;
+}
+/*
+ * To prevent cache aliasing effects, align per-node structures so that they
+ * start at addresses that are strided by node number.
+ *
+ * The result is @addr rounded up to the next 1MB boundary plus an offset of
+ * @node times PERCPU_PAGE_SIZE.
+ */
+#define NODEDATA_ALIGN(addr, node)                                              \
+        ((((addr) + 1024*1024-1) & ~(1024*1024-1)) + (node)*PERCPU_PAGE_SIZE)
+/**
+ * build_node_maps - callback that records the memory span of each node
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * Every node is handled as one virtually contiguous block described by its
+ * struct bootmem_data.  The range is widened to %IA64_GRANULE_SIZE
+ * boundaries (start rounded down, end rounded up) and merged into the span
+ * already known for @node.  The global min_low_pfn and max_low_pfn are
+ * updated to cover the result.  Always returns 0.
+ */
+static int __init build_node_maps(unsigned long start, unsigned long len,
+                                  int node)
+{
+        struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+        unsigned long range_end = start + len;
+        unsigned long first_addr = GRANULEROUNDDOWN(start);
+        unsigned long last_pfn = GRANULEROUNDUP(range_end) >> PAGE_SHIFT;
+        if (bdp->node_low_pfn) {
+                /* widen the span already recorded for this node */
+                bdp->node_boot_start = min(first_addr, bdp->node_boot_start);
+                bdp->node_low_pfn = max(last_pfn, bdp->node_low_pfn);
+        } else {
+                /* first range seen for this node */
+                bdp->node_boot_start = first_addr;
+                bdp->node_low_pfn = last_pfn;
+        }
+        min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT);
+        max_low_pfn = max(max_low_pfn, bdp->node_low_pfn);
+        return 0;
+}
+/**
+ * early_nr_phys_cpus_node - count the physical cpus of a node
+ * @node: node to check
+ *
+ * Returns how many cpus that actually exist are assigned to @node.  The
+ * count is taken from node_cpuid[] because nr_cpus_node() cannot be used
+ * yet: acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't
+ * been called at this point.
+ */
+static int early_nr_phys_cpus_node(int node)
+{
+        int cpu, count = 0;
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                if (node != node_cpuid[cpu].nid)
+                        continue;
+                /* cpu 0 always counts; any other cpu needs a non-zero phys_id */
+                if ((cpu == 0) || node_cpuid[cpu].phys_id)
+                        count++;
+        }
+        return count;
+}
+/**
+ * early_nr_cpus_node - count the cpus of a node
+ * @node: node to check
+ *
+ * Returns how many node_cpuid[] entries are assigned to @node.
+ * nr_cpus_node() cannot be used yet because acpi_boot_init() (which builds
+ * the node_to_cpu_mask array) hasn't been called.  Note that node 0 will
+ * also count all non-existent cpus.
+ */
+static int early_nr_cpus_node(int node)
+{
+        int cpu, count = 0;
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                if (node != node_cpuid[cpu].nid)
+                        continue;
+                count++;
+        }
+        return count;
+}
+/**
+ * find_pernode_space - allocate memory for memory map and per-node structures
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * This routine reserves space for the per-cpu data struct, the list of
+ * pg_data_ts and the per-node data struct.  Each node will have something like
+ * the following in the first chunk of addr. space large enough to hold it.
+ *
+ *    ________________________
+ *   |                        |
+ *   |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
+ *   |    PERCPU_PAGE_SIZE *  |     start and length big enough
+ *   |    cpus_on_this_node   | Node 0 will also have entries for all non-existent cpus.
+ *   |------------------------|
+ *   |   local pg_data_t *    |
+ *   |------------------------|
+ *   |  local ia64_node_data  |
+ *   |------------------------|
+ *   |          ???           |
+ *   |________________________|
+ *
+ * Once this space has been set aside, the bootmem maps are initialized.  We
+ * could probably move the allocation of the per-cpu and ia64_node_data space
+ * outside of this function and use alloc_bootmem_node(), but doing it here
+ * is straightforward and we get the alignments we want so...
+ *
+ * Always returns 0.  A range that is outside the node's recorded span, or
+ * too small, is silently skipped, as is any range seen after the node's
+ * space has already been set up.
+ */
+static int __init find_pernode_space(unsigned long start, unsigned long len,
+                                     int node)
+{
+        unsigned long epfn, cpu, cpus, phys_cpus;
+        unsigned long pernodesize = 0, pernode, pages, mapsize;
+        void *cpu_data;
+        struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+        epfn = (start + len) >> PAGE_SHIFT;
+        /* size of the bootmem bitmap covering the node's whole span */
+        pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
+        mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
+        /*
+         * Make sure this memory falls within this node's usable memory
+         * since we may have thrown some away in build_node_maps().
+         */
+        if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn)
+                return 0;
+        /* Don't setup this node's local space twice... */
+        if (mem_data[node].pernode_addr)
+                return 0;
+        /*
+         * Calculate total size needed, incl. what's necessary
+         * for good alignment and alias prevention.
+         */
+        cpus = early_nr_cpus_node(node);
+        /* NOTE(review): phys_cpus is computed but not read in this function */
+        phys_cpus = early_nr_phys_cpus_node(node);
+        pernodesize += PERCPU_PAGE_SIZE * cpus;
+        pernodesize += node * L1_CACHE_BYTES;
+        pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
+        pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+        pernodesize = PAGE_ALIGN(pernodesize);
+        pernode = NODEDATA_ALIGN(start, node);
+        /* Is this range big enough for what we want to store here? */
+        if (start + len > (pernode + pernodesize + mapsize)) {
+                mem_data[node].pernode_addr = pernode;
+                mem_data[node].pernode_size = pernodesize;
+                memset(__va(pernode), 0, pernodesize);
+                /* the per-cpu areas come first in the pernode space */
+                cpu_data = (void *)pernode;
+                pernode += PERCPU_PAGE_SIZE * cpus;
+                pernode += node * L1_CACHE_BYTES;
+                /* then the pg_data_t ... */
+                mem_data[node].pgdat = __va(pernode);
+                pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+                /* ... followed by the ia64_node_data */
+                mem_data[node].node_data = __va(pernode);
+                pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+                mem_data[node].pgdat->bdata = bdp;
+                /* NOTE(review): pernode is not read again after this increment */
+                pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+                /*
+                 * Copy the static per-cpu data into the region we
+                 * just set aside and then setup __per_cpu_offset
+                 * for each CPU on this node.
+                 */
+                for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                        if (node == node_cpuid[cpu].nid) {
+                                memcpy(__va(cpu_data), __phys_per_cpu_start,
+                                       __per_cpu_end - __per_cpu_start);
+                                __per_cpu_offset[cpu] = (char*)__va(cpu_data) -
+                                        __per_cpu_start;
+                                cpu_data += PERCPU_PAGE_SIZE;
+                        }
+                }
+        }
+        return 0;
+}
+/**
+ * free_node_bootmem - hand a memory range to a node's bootmem allocator
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * Frees the given range in the bootmem allocator of @node's pg_data_t.
+ * After this function has been called for all the entries in the EFI
+ * memory map, the bootmem allocator will be ready to service allocation
+ * requests.  Always returns 0.
+ */
+static int __init free_node_bootmem(unsigned long start, unsigned long len,
+                                    int node)
+{
+        pg_data_t *pgdat = mem_data[node].pgdat;
+        free_bootmem_node(pgdat, start, len);
+        return 0;
+}
+/**
+ * reserve_pernode_space - reserve memory for per-node space
+ *
+ * For every online node, mark the bootmem bitmap and the per-node area as
+ * reserved in that node's boot allocator, so that when we actually create
+ * the real mem maps we don't use their memory.
+ */
+static void __init reserve_pernode_space(void)
+{
+        int node;
+        for_each_online_node(node) {
+                pg_data_t *pgdat = mem_data[node].pgdat;
+                struct bootmem_data *bdata = pgdat->bdata;
+                unsigned long span, map_bytes;
+                /* First the bootmem_map itself */
+                span = bdata->node_low_pfn - (bdata->node_boot_start>>PAGE_SHIFT);
+                map_bytes = bootmem_bootmap_pages(span) << PAGE_SHIFT;
+                reserve_bootmem_node(pgdat, __pa(bdata->node_bootmem_map),
+                                     map_bytes);
+                /* Now the per-node space */
+                reserve_bootmem_node(pgdat, __pa(mem_data[node].pernode_addr),
+                                     mem_data[node].pernode_size);
+        }
+}
+/**
+ * initialize_pernode_data - fixup per-cpu & per-node pointers
+ *
+ * Each node's per-node area has a copy of the global pg_data_t list, so
+ * we copy that to each node here, as well as setting the per-cpu pointer
+ * to the local node data structure.  The active_cpus field of the per-node
+ * structure gets setup by the platform_cpu_init() function later.
+ */
+static void __init initialize_pernode_data(void)
+{
+        int cpu, node;
+        pg_data_t *pgdat_list[MAX_NUMNODES];
+        /*
+         * Only the slots of online nodes are filled in below, but the whole
+         * array is copied to every node.  Clear it first so that the slots
+         * of offline nodes are NULL rather than leftover stack contents.
+         */
+        memset(pgdat_list, 0, sizeof(pgdat_list));
+        for_each_online_node(node)
+                pgdat_list[node] = mem_data[node].pgdat;
+        /* Copy the pg_data_t list to each node */
+        for_each_online_node(node) {
+                memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
+                       sizeof(pgdat_list));
+        }
+        /* Set the node_data pointer for each per-cpu struct */
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                node = node_cpuid[cpu].nid;
+                per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
+        }
+}
+/**
+ * find_memory - walk the EFI memory map and setup the bootmem allocator
+ *
+ * Called early in boot to setup the bootmem allocator, and to
+ * allocate the per-cpu and per-node structures.
+ *
+ * Panics if an online node ends up without per-node space.  On return
+ * max_pfn equals max_low_pfn.
+ */
+void __init find_memory(void)
+{
+        int node;
+        reserve_memory();
+        /* with no node information at all, fall back to a single node 0 */
+        if (num_online_nodes() == 0) {
+                printk(KERN_ERR "node info missing!\n");
+                node_set_online(0);
+        }
+        /* starting values for the min/max updates in build_node_maps() */
+        min_low_pfn = -1;
+        max_low_pfn = 0;
+        if (num_online_nodes() > 1)
+                reassign_cpu_only_nodes();
+        /* These actually end up getting called by call_pernode_memory() */
+        efi_memmap_walk(filter_rsvd_memory, build_node_maps);
+        efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
+        /*
+         * Initialize the boot memory maps in reverse order since that's
+         * what the bootmem allocator expects
+         */
+        for (node = MAX_NUMNODES - 1; node >= 0; node--) {
+                unsigned long pernode, pernodesize, map;
+                struct bootmem_data *bdp;
+                if (!node_online(node))
+                        continue;
+                bdp = &mem_data[node].bootmem_data;
+                pernode = mem_data[node].pernode_addr;
+                pernodesize = mem_data[node].pernode_size;
+                /* the bootmem bitmap is placed right after the pernode area */
+                map = pernode + pernodesize;
+                /* Sanity check... */
+                if (!pernode)
+                        panic("pernode space for node %d "
+                              "could not be allocated!", node);
+                init_bootmem_node(mem_data[node].pgdat,
+                                  map>>PAGE_SHIFT,
+                                  bdp->node_boot_start>>PAGE_SHIFT,
+                                  bdp->node_low_pfn);
+        }
+        /* free the usable memory, then take back the parts already in use */
+        efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
+        reserve_pernode_space();
+        initialize_pernode_data();
+        max_pfn = max_low_pfn;
+        find_initrd();
+}
+/**
+ * per_cpu_init - setup per-cpu variables
+ *
+ * The per-cpu areas themselves are set up by find_pernode_space(); all
+ * that is left to do here is to publish each cpu's offset through
+ * local_per_cpu_offset, which only the cpu with id 0 does.
+ *
+ * Returns the address of the calling cpu's per-cpu area.
+ */
+void *per_cpu_init(void)
+{
+        int cpu = 0;
+        if (smp_processor_id() == 0) {
+                while (cpu < NR_CPUS) {
+                        per_cpu(local_per_cpu_offset, cpu) =
+                                __per_cpu_offset[cpu];
+                        cpu++;
+                }
+        }
+        return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
+}
+/**
+ * show_mem - give short summary of memory stats
+ *
+ * Prints the free-area and swap statistics, then for every pgdat the
+ * number of present, reserved, shared and swap-cached pages, followed by
+ * the totals over all pgdats.
+ */
+void show_mem(void)
+{
+        int i;
+        int total_reserved = 0, total_shared = 0, total_cached = 0;
+        unsigned long total_present = 0;
+        pg_data_t *pgdat;
+        printk("Mem-info:\n");
+        show_free_areas();
+        printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+        for_each_pgdat(pgdat) {
+                unsigned long present = pgdat->node_present_pages;
+                int reserved = 0, shared = 0, cached = 0;
+                struct page *page;
+                printk("Node ID: %d\n", pgdat->node_id);
+                for (i = 0; i < pgdat->node_spanned_pages; i++) {
+                        /* skip page frames that are not backed by memory */
+                        if (!ia64_pfn_valid(pgdat->node_start_pfn+i))
+                                continue;
+                        page = pgdat->node_mem_map + i;
+                        if (PageReserved(page)) {
+                                reserved++;
+                                continue;
+                        }
+                        if (PageSwapCache(page)) {
+                                cached++;
+                                continue;
+                        }
+                        /* every reference beyond the first counts as a share */
+                        if (page_count(page))
+                                shared += page_count(page)-1;
+                }
+                printk("\t%ld pages of RAM\n", present);
+                printk("\t%d reserved pages\n", reserved);
+                printk("\t%d pages shared\n", shared);
+                printk("\t%d pages swap cached\n", cached);
+                /* fold this node's numbers into the system-wide totals */
+                total_present += present;
+                total_reserved += reserved;
+                total_shared += shared;
+                total_cached += cached;
+        }
+        printk("%ld pages of RAM\n", total_present);
+        printk("%d reserved pages\n", total_reserved);
+        printk("%d pages shared\n", total_shared);
+        printk("%d pages swap cached\n", total_cached);
+        printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
+        printk("%d free buffer pages\n", nr_free_buffer_pages());
+}
+/**
+ * call_pernode_memory - invoke a callback with node information per range
+ * @start: physical start of range
+ * @len: length of range
+ * @arg: callback, invoked as func(start, length, node) for each piece
+ *
+ * efi_memmap_walk() knows nothing about how memory is laid out across
+ * nodes.  The range is first shrunk to whole pages (start rounded up, end
+ * rounded down) and then intersected with every node_memblk[] entry; the
+ * callback runs once for each non-empty intersection, so memory that
+ * matches no entry is ignored and a range spanning several nodes is split.
+ * With no node_memblk[] entries at all, everything is attributed to node 0.
+ */
+void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
+{
+        void (*func)(unsigned long, unsigned long, int) = arg;
+        unsigned long end = (start + len) & PAGE_MASK;
+        unsigned long rs, re;
+        int i;
+        start = PAGE_ALIGN(start);
+        if (start >= end)
+                return;
+        if (!num_node_memblks) {
+                /* No SRAT table, so assume one node (node 0) */
+                (*func)(start, end - start, 0);
+                return;
+        }
+        for (i = 0; i < num_node_memblks; i++) {
+                unsigned long blk_start = node_memblk[i].start_paddr;
+                unsigned long blk_end = blk_start + node_memblk[i].size;
+                rs = max(start, blk_start);
+                re = min(end, blk_end);
+                if (rs < re)
+                        (*func)(rs, re - rs, node_memblk[i].nid);
+                /* This block reaches the end of the range: nothing left. */
+                if (re == end)
+                        break;
+        }
+}
+/**
+ * count_node_pages - callback to build per-node memory info structures
+ * @start: physical start of range
+ * @len: length of range
+ * @node: node where this range resides
+ *
+ * Each node has its own count of physical pages and DMA-able pages, and its
+ * own lowest and highest page frame number.  This routine accumulates those
+ * values in mem_data[@node] for one piece of usable memory.  The pfn bounds
+ * are widened using the granule- and order-rounded range rather than the
+ * exact one.  Always returns 0.
+ */
+static __init int count_node_pages(unsigned long start, unsigned long len, int node)
+{
+        unsigned long end = start + len;
+        unsigned long dma_limit = __pa(MAX_DMA_ADDRESS);
+        unsigned long lo, hi;
+        mem_data[node].num_physpages += len >> PAGE_SHIFT;
+        /* Only the part of the range below the DMA limit counts as DMA-able. */
+        if (start <= dma_limit)
+                mem_data[node].num_dma_physpages +=
+                        (min(end, dma_limit) - start) >> PAGE_SHIFT;
+        lo = GRANULEROUNDDOWN(start);
+        lo = ORDERROUNDDOWN(lo);
+        hi = GRANULEROUNDUP(end);
+        mem_data[node].max_pfn = max(mem_data[node].max_pfn,
+                                     hi >> PAGE_SHIFT);
+        mem_data[node].min_pfn = min(mem_data[node].min_pfn,
+                                     lo >> PAGE_SHIFT);
+        return 0;
+}
+/**
+ * paging_init - set up the zones of every online node
+ *
+ * Gathers per-node page counts via count_node_pages(), then for each online
+ * node works out the size of ZONE_DMA and ZONE_NORMAL and of the holes in
+ * each, and passes them to free_area_init_node().  While handling node 0,
+ * vmalloc_end is lowered to make room for the virtual mem_map, which is
+ * then populated through create_mem_map_page_table().
+ */
+void __init paging_init(void)
+{
+        unsigned long zones_size[MAX_NR_ZONES];
+        unsigned long zholes_size[MAX_NR_ZONES];
+        unsigned long dma_limit_pfn, pfn_offset = 0;
+        int node;
+        dma_limit_pfn = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+        /* Start min_pfn at the largest value so min() in count_node_pages works. */
+        for_each_online_node(node)
+                mem_data[node].min_pfn = ~0UL;
+        efi_memmap_walk(filter_rsvd_memory, count_node_pages);
+        for_each_online_node(node) {
+                unsigned long lo, hi, span, phys, dma_phys;
+                memset(zones_size, 0, sizeof(zones_size));
+                memset(zholes_size, 0, sizeof(zholes_size));
+                lo = mem_data[node].min_pfn;
+                hi = mem_data[node].max_pfn;
+                phys = mem_data[node].num_physpages;
+                dma_phys = mem_data[node].num_dma_physpages;
+                span = hi - lo;
+                num_physpages += phys;
+                if (lo >= dma_limit_pfn) {
+                        /* All of this node's memory is above ZONE_DMA */
+                        zones_size[ZONE_NORMAL] = span;
+                        zholes_size[ZONE_NORMAL] = span - phys;
+                } else if (hi < dma_limit_pfn) {
+                        /* All of this node's memory is in ZONE_DMA */
+                        zones_size[ZONE_DMA] = span;
+                        zholes_size[ZONE_DMA] = span - dma_phys;
+                } else {
+                        /* This node has memory in both zones */
+                        zones_size[ZONE_DMA] = dma_limit_pfn - lo;
+                        zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - dma_phys;
+                        zones_size[ZONE_NORMAL] = hi - dma_limit_pfn;
+                        zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] -
+                                (phys - dma_phys);
+                }
+                if (node == 0) {
+                        /* Reserve virtual space for vmem_map below vmalloc_end. */
+                        vmalloc_end -=
+                                PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+                        vmem_map = (struct page *) vmalloc_end;
+                        efi_memmap_walk(create_mem_map_page_table, NULL);
+                        printk("Virtual mem_map starts at 0x%p\n", vmem_map);
+                }
+                pfn_offset = lo;
+                NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
+                free_area_init_node(node, NODE_DATA(node), zones_size,
+                                    pfn_offset, zholes_size);
+        }
+        zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
+}
diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c
new file mode 100644
index 000000000000..6d259e34f359
--- /dev/null
+++ b/arch/ia64/mm/extable.c
@@ -0,0 +1,90 @@
+/*
+ * Kernel exception handling table support.  Derived from arch/alpha/mm/extable.c.
+ *
+ * Copyright (C) 1998, 1999, 2001-2002, 2004 Hewlett-Packard Co
+ *      David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <linux/config.h>
+#include <linux/sort.h>
+#include <asm/uaccess.h>
+#include <asm/module.h>
+/*
+ * sort() comparison callback: order two exception-table entries by the
+ * absolute address each one describes.  The addr member is an offset
+ * relative to its own location, so that location is added back in before
+ * comparing.  Returns 1, -1 or 0.
+ */
+static int cmp_ex(const void *a, const void *b)
+{
+        const struct exception_table_entry *lhs = a, *rhs = b;
+        u64 lhs_ip = (u64) &lhs->addr + lhs->addr;
+        u64 rhs_ip = (u64) &rhs->addr + rhs->addr;
+        /* compare rather than subtract, to avoid overflow */
+        if (lhs_ip == rhs_ip)
+                return 0;
+        return (lhs_ip > rhs_ip) ? 1 : -1;
+}
+/*
+ * sort() swap callback: exchange two exception-table entries in place.
+ * Because addr and cont are offsets relative to the entry's own location,
+ * each value is adjusted by the distance between the two slots as it moves.
+ * The size argument is not used.
+ */
+static void swap_ex(void *a, void *b, int size)
+{
+        struct exception_table_entry *lo = a, *hi = b, saved;
+        u64 delta = (u64) hi - (u64) lo;
+        saved = *hi;
+        hi->addr = lo->addr - delta;
+        hi->cont = lo->cont - delta;
+        lo->addr = saved.addr + delta;
+        lo->cont = saved.cont + delta;
+}
+/*
+ * Sort the exception table in [start, finish).  It's usually already
+ * sorted, but there may be unordered entries due to multiple text sections
+ * (such as the .init text section).  The entries hold location-relative
+ * addresses, so sorting needs care to avoid overflowing the offset members:
+ * for example, an entry must not simply be copied to the stack and used
+ * from there, because the stack may be more than 2GB away from the
+ * exception table.  The cmp_ex/swap_ex callbacks take care of this.
+ */
+void sort_extable (struct exception_table_entry *start,
+                   struct exception_table_entry *finish)
+{
+        size_t nr_entries = finish - start;
+        sort(start, nr_entries, sizeof(*start), cmp_ex, swap_ex);
+}
+/*
+ * Binary-search the sorted exception table [first, last] (both inclusive)
+ * for the entry whose absolute address equals ip.  Returns the matching
+ * entry, or NULL if there is none.
+ *
+ * Addresses are compared directly as unsigned values, the same ordering
+ * cmp_ex() sorts by.  Deciding from the sign of (mid_ip - ip) would wrap
+ * once the two addresses are 2^63 or more apart.
+ */
+const struct exception_table_entry *
+search_extable (const struct exception_table_entry *first,
+                const struct exception_table_entry *last,
+                unsigned long ip)
+{
+        const struct exception_table_entry *mid;
+        unsigned long mid_ip;
+        while (first <= last) {
+                mid = &first[(last - first)/2];
+                /* addr is relative to its own location; make it absolute */
+                mid_ip = (u64) &mid->addr + mid->addr;
+                if (mid_ip == ip)
+                        return mid;
+                if (mid_ip < ip)
+                        first = mid + 1;
+                else
+                        last = mid - 1;
+        }
+        return NULL;
+}
+/*
+ * Redirect execution to the fixup described by exception-table entry e.
+ * The continuation value (made absolute by adding the location of e->cont)
+ * packs three things: the bundle address in the bits above the low four,
+ * the slot number in the low two bits, and bit 2 as a flag requesting that
+ * r9 be cleared.  r8 is always set to -EFAULT.
+ */
+void
+ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e)
+{
+        long fixup = (u64) &e->cont + e->cont;
+        int clear_r9 = (fixup & 4) != 0;
+        regs->r8 = -EFAULT;
+        if (clear_r9)
+                regs->r9 = 0;
+        regs->cr_iip = fixup & ~0xf;
+        /* set continuation slot number */
+        ia64_psr(regs)->ri = fixup & 0x3;
+}
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
new file mode 100644
index 000000000000..da859125aaef
--- /dev/null
+++ b/arch/ia64/mm/fault.c
@@ -0,0 +1,261 @@
+/*
+ * MMU fault handling support.
+ *
+ * Copyright (C) 1998-2002 Hewlett-Packard Co
+ *      David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+extern void die (char *, struct pt_regs *, long);
+/*
+ * This routine is analogous to expand_stack() but instead grows the
+ * register backing store (which grows towards higher addresses).
+ * Since the register backing store is accessed sequentially, we
+ * disallow growing the RBS by more than a page at a time.  Note that
+ * the VM_GROWSUP flag can be set on any VM area but that's fine
+ * because the total process size is still limited by RLIMIT_STACK and
+ * RLIMIT_AS.
+ *
+ * Returns 0 after extending vma by one page, or -ENOMEM if either limit
+ * would be exceeded.
+ */
+static inline long
+expand_backing_store (struct vm_area_struct *vma, unsigned long address)
+{
+        struct mm_struct *mm = vma->vm_mm;
+        unsigned long npages = PAGE_SIZE >> PAGE_SHIFT;
+        if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur)
+                return -ENOMEM;
+        if (((mm->total_vm + npages) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur)
+                return -ENOMEM;
+        vma->vm_end += PAGE_SIZE;
+        /* Keep the mm-wide accounting in step with the larger vma. */
+        mm->total_vm += npages;
+        if (vma->vm_flags & VM_LOCKED)
+                mm->locked_vm += npages;
+        __vm_stat_account(mm, vma->vm_flags, vma->vm_file, npages);
+        return 0;
+}
+/*
+ * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
+ * (inside region 5, on ia64) and that page is present.  The kernel page
+ * table is walked one level at a time; a missing or bad entry at any level
+ * yields 0.
+ */
+static int
+mapped_kernel_page_is_present (unsigned long address)
+{
+        pgd_t *pgd_entry;
+        pud_t *pud_entry;
+        pmd_t *pmd_entry;
+        pte_t *pte_slot, pte_copy;
+        pgd_entry = pgd_offset_k(address);
+        if (pgd_none(*pgd_entry) || pgd_bad(*pgd_entry))
+                return 0;
+        pud_entry = pud_offset(pgd_entry, address);
+        if (pud_none(*pud_entry) || pud_bad(*pud_entry))
+                return 0;
+        pmd_entry = pmd_offset(pud_entry, address);
+        if (pmd_none(*pmd_entry) || pmd_bad(*pmd_entry))
+                return 0;
+        pte_slot = pte_offset_kernel(pmd_entry, address);
+        if (!pte_slot)
+                return 0;
+        /* Test a local copy of the entry rather than the live slot. */
+        pte_copy = *pte_slot;
+        return pte_present(pte_copy);
+}
+/*
+ * Handle a page fault.
+ *
+ * address: the faulting address.
+ * isr:     value tested against the IA64_ISR_* bits below; it supplies the
+ *          access type (read/write/execute) and the speculative/lfetch flags.
+ * regs:    register state of the faulting context.
+ *
+ * Outline:
+ *  - In atomic context, or with no mm, go straight to the kernel-fault path
+ *    (no_context).
+ *  - Otherwise look up the vma under mmap_sem, growing a VM_GROWSDOWN stack
+ *    or a VM_GROWSUP register backing store if the address lies just
+ *    outside one, check the access against vm_flags, and call
+ *    handle_mm_fault().
+ *  - On failure: speculative loads and lfetch faults are deferred by setting
+ *    psr.ed; user-mode faults get a signal (SIGSEGV, or SIGBUS for
+ *    VM_FAULT_SIGBUS); kernel faults try the exception table and finally
+ *    oops.
+ */
+void
+ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs)
+{
+        int signal = SIGSEGV, code = SEGV_MAPERR;
+        struct vm_area_struct *vma, *prev_vma;
+        struct mm_struct *mm = current->mm;
+        struct siginfo si;
+        unsigned long mask;
+        /*
+         * If we're in an interrupt or have no user context, we must not take the fault..
+         */
+        if (in_atomic() || !mm)
+                goto no_context;
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+        /*
+         * If fault is in region 5 and we are in the kernel, we may already
+         * have the mmap_sem (pfn_valid macro is called during mmap). There
+         * is no vma for region 5 addr's anyway, so skip getting the semaphore
+         * and go directly to the exception handling code.
+         */
+        if ((REGION_NUMBER(address) == 5) && !user_mode(regs))
+                goto bad_area_no_up;
+#endif
+        down_read(&mm->mmap_sem);
+        vma = find_vma_prev(mm, address, &prev_vma);
+        if (!vma)
+                goto bad_area;
+        /* find_vma_prev() returns vma such that address < vma->vm_end or NULL */
+        if (address < vma->vm_start)
+                goto check_expansion;
+  good_area:
+        /* From here on a failure is a permission problem, not a missing mapping. */
+        code = SEGV_ACCERR;
+        /* OK, we've got a good vm_area for this memory area.  Check the access permissions: */
+#       define VM_READ_BIT      0
+#       define VM_WRITE_BIT     1
+#       define VM_EXEC_BIT      2
+#       if (((1 << VM_READ_BIT) != VM_READ || (1 << VM_WRITE_BIT) != VM_WRITE) \
+            || (1 << VM_EXEC_BIT) != VM_EXEC)
+#               error File is out of sync with <linux/mm.h>.  Please update.
+#       endif
+        /* Translate the ISR access-type bits into the equivalent VM_* flag bits. */
+        mask = (  (((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
+                | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)
+                | (((isr >> IA64_ISR_R_BIT) & 1UL) << VM_READ_BIT));
+        /* Every kind of access that was attempted must be permitted by the vma. */
+        if ((vma->vm_flags & mask) != mask)
+                goto bad_area;
+  survive:
+        /*
+         * If for any reason at all we couldn't handle the fault, make
+         * sure we exit gracefully rather than endlessly redo the
+         * fault.
+         */
+        switch (handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0)) {
+              case VM_FAULT_MINOR:
+                ++current->min_flt;
+                break;
+              case VM_FAULT_MAJOR:
+                ++current->maj_flt;
+                break;
+              case VM_FAULT_SIGBUS:
+                /*
+                 * We ran out of memory, or some other thing happened
+                 * to us that made us unable to handle the page fault
+                 * gracefully.
+                 */
+                signal = SIGBUS;
+                goto bad_area;
+              case VM_FAULT_OOM:
+                goto out_of_memory;
+              default:
+                BUG();
+        }
+        up_read(&mm->mmap_sem);
+        return;
+  check_expansion:
+        /*
+         * The address lies in the gap below vma.  If the previous vma grows
+         * upwards and ends exactly at the address, extend it as a register
+         * backing store; otherwise try to grow vma downwards as a stack.
+         */
+        if (!(prev_vma && (prev_vma->vm_flags & VM_GROWSUP) && (address == prev_vma->vm_end))) {
+                if (!(vma->vm_flags & VM_GROWSDOWN))
+                        goto bad_area;
+                if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+                    || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+                        goto bad_area;
+                if (expand_stack(vma, address))
+                        goto bad_area;
+        } else {
+                vma = prev_vma;
+                if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
+                    || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
+                        goto bad_area;
+                if (expand_backing_store(vma, address))
+                        goto bad_area;
+        }
+        goto good_area;
+  bad_area:
+        up_read(&mm->mmap_sem);
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+  bad_area_no_up:
+#endif
+        if ((isr & IA64_ISR_SP)
+            || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
+        {
+                /*
+                 * This fault was due to a speculative load or lfetch.fault, set the "ed"
+                 * bit in the psr to ensure forward progress.  (Target register will get a
+                 * NaT for ld.s, lfetch will be canceled.)
+                 */
+                ia64_psr(regs)->ed = 1;
+                return;
+        }
+        if (user_mode(regs)) {
+                /* User-mode fault: deliver the signal chosen above and return. */
+                si.si_signo = signal;
+                si.si_errno = 0;
+                si.si_code = code;
+                si.si_addr = (void __user *) address;
+                si.si_isr = isr;
+                si.si_flags = __ISR_VALID;
+                force_sig_info(signal, &si, current);
+                return;
+        }
+  no_context:
+        if (isr & IA64_ISR_SP) {
+                /*
+                 * This fault was due to a speculative load set the "ed" bit in the psr to
+                 * ensure forward progress (target register will get a NaT).
+                 */
+                ia64_psr(regs)->ed = 1;
+                return;
+        }
+        /* Kernel-mode fault: see whether the exception handling code can fix it up. */
+        if (ia64_done_with_exception(regs))
+                return;
+        /*
+         * Since we have no vma's for region 5, we might get here even if the address is
+         * valid, due to the VHPT walker inserting a non present translation that becomes
+         * stale. If that happens, the non present fault handler already purged the stale
+         * translation, which fixed the problem. So, we check to see if the translation is
+         * valid, and return if it is.
+         */
+        if (REGION_NUMBER(address) == 5 && mapped_kernel_page_is_present(address))
+                return;
+        /*
+         * Oops. The kernel tried to access some bad page. We'll have to terminate things
+         * with extreme prejudice.
+         */
+        bust_spinlocks(1);
+        if (address < PAGE_SIZE)
+                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address);
+        else
+                printk(KERN_ALERT "Unable to handle kernel paging request at "
+                       "virtual address %016lx\n", address);
+        die("Oops", regs, isr);
+        bust_spinlocks(0);
+        do_exit(SIGKILL);
+        return;
+  out_of_memory:
+        up_read(&mm->mmap_sem);
+        /* The task with pid 1 is never killed here: yield, then retry the fault. */
+        if (current->pid == 1) {
+                yield();
+                down_read(&mm->mmap_sem);
+                goto survive;
+        }
+        printk(KERN_CRIT "VM: killing process %s\n", current->comm);
+        if (user_mode(regs))
+                do_exit(SIGKILL);
+        goto no_context;
+}
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
new file mode 100644
index 000000000000..40ad8328ffd5
--- /dev/null
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -0,0 +1,357 @@
+/*
+ * IA-64 Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2002-2004 Rohit Seth <rohit.seth@intel.com>
+ * Copyright (C) 2003-2004 Ken Chen <kenneth.w.chen@intel.com>
+ *
+ * Sep, 2003: add numa support
+ * Feb, 2004: dynamic hugetlb page size via boot parameter
+ */
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;
+/*
+ * Return the pte slot that maps the huge page at addr in mm, allocating any
+ * missing pud, pmd or pte level on the way down.  The walk uses the address
+ * produced by htlbpage_to_page(addr).  Returns NULL if an allocation fails.
+ */
+static pte_t *
+huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
+{
+        unsigned long taddr = htlbpage_to_page(addr);
+        pud_t *pud;
+        pmd_t *pmd;
+        pud = pud_alloc(mm, pgd_offset(mm, taddr), taddr);
+        if (!pud)
+                return NULL;
+        pmd = pmd_alloc(mm, pud, taddr);
+        if (!pmd)
+                return NULL;
+        return pte_alloc_map(mm, pmd, taddr);
+}
+/*
+ * Look up, without allocating, the pte slot that maps the huge page at addr
+ * in mm.  The walk uses the address produced by htlbpage_to_page(addr).
+ * Returns NULL as soon as a pgd, pud or pmd entry is not present.
+ */
+static pte_t *
+huge_pte_offset (struct mm_struct *mm, unsigned long addr)
+{
+        unsigned long taddr = htlbpage_to_page(addr);
+        pgd_t *pgd;
+        pud_t *pud;
+        pmd_t *pmd;
+        pgd = pgd_offset(mm, taddr);
+        if (!pgd_present(*pgd))
+                return NULL;
+        pud = pud_offset(pgd, taddr);
+        if (!pud_present(*pud))
+                return NULL;
+        pmd = pmd_offset(pud, taddr);
+        if (!pmd_present(*pmd))
+                return NULL;
+        return pte_offset_map(pmd, taddr);
+}
+#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
+static void
+set_huge_pte (struct mm_struct *mm, struct vm_area_struct *vma,
+              struct page *page, pte_t * page_table, int write_access)
+{
+        pte_t entry;
+        add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+        if (write_access) {
+                entry =
+                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+        } else
+                entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+        entry = pte_mkyoung(entry);
+        mk_pte_huge(entry);
+        set_pte(page_table, entry);
+        return;
+}
+/*
+ * Check that addr and len describe a valid huge page range: both must be
+ * multiples of the huge page size and addr must lie in the huge page
+ * region.  Returns 0 if so, -EINVAL otherwise.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+        if ((len & ~HPAGE_MASK) ||
+            (addr & ~HPAGE_MASK) ||
+            REGION_NUMBER(addr) != REGION_HPAGE)
+                return -EINVAL;
+        return 0;
+}
+/*
+ * Duplicate the huge page mappings of vma from src into dst.  Each huge
+ * page in the range gets an extra reference, its pte is copied verbatim
+ * into dst, and dst's rss is charged for it.  Returns 0 on success or
+ * -ENOMEM if a page table for dst cannot be allocated; entries copied
+ * before the failure are left in place.
+ *
+ * NOTE(review): the source pte pointer is dereferenced without a NULL
+ * check, so this presumably relies on every huge page of the vma already
+ * being mapped in src - confirm against the callers.
+ */
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+                        struct vm_area_struct *vma)
+{
+        unsigned long end = vma->vm_end;
+        unsigned long addr;
+        for (addr = vma->vm_start; addr < end; addr += HPAGE_SIZE) {
+                pte_t *dst_pte, *src_pte, entry;
+                struct page *ptepage;
+                dst_pte = huge_pte_alloc(dst, addr);
+                if (!dst_pte)
+                        return -ENOMEM;
+                src_pte = huge_pte_offset(src, addr);
+                entry = *src_pte;
+                ptepage = pte_page(entry);
+                get_page(ptepage);
+                set_pte(dst_pte, entry);
+                add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+        }
+        return 0;
+}
+/*
+ * Collect the base pages backing a huge page mapping.
+ *
+ * Starting at *st, walk forward one PAGE_SIZE step at a time until either
+ * *length pages have been handled or the end of vma is reached.  For each
+ * step, if pages is non-NULL the corresponding struct page gets a reference
+ * and is stored in pages[i]; if vmas is non-NULL, vma is stored in vmas[i].
+ * On return *st and *length hold the updated address and remaining count,
+ * and the updated index i is returned.
+ *
+ * NOTE(review): the pte pointer from huge_pte_offset() is dereferenced
+ * without a NULL check - presumably the range is always fully mapped;
+ * confirm against the callers.
+ */
+int
+follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                    struct page **pages, struct vm_area_struct **vmas,
+                    unsigned long *st, int *length, int i)
+{
+        pte_t *ptep, pte;
+        unsigned long start = *st;
+        unsigned long pstart;
+        int len = *length;
+        struct page *page;
+        do {
+                /* Look the pte up once per huge page... */
+                pstart = start & HPAGE_MASK;
+                ptep = huge_pte_offset(mm, start);
+                pte = *ptep;
+back1:
+                /* ...and reuse it for every base page inside that huge page. */
+                page = pte_page(pte);
+                if (pages) {
+                        /* Step to the base page within the huge page. */
+                        page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
+                        get_page(page);
+                        pages[i] = page;
+                }
+                if (vmas)
+                        vmas[i] = vma;
+                i++;
+                len--;
+                start += PAGE_SIZE;
+                /* Still inside the same huge page with work left: skip the lookup. */
+                if (((start & HPAGE_MASK) == pstart) && len &&
+                                (start < vma->vm_end))
+                        goto back1;
+        } while (len && start < vma->vm_end);
+        *length = len;
+        *st = start;
+        return i;
+}
+/*
+ * Return the base page that backs addr in mm's huge page region.
+ * Returns ERR_PTR(-EINVAL) if addr is outside that region and NULL if no
+ * huge page is mapped there.  The write argument is not used.
+ */
+struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
+{
+        struct page *page = NULL;
+        pte_t *ptep;
+        if (REGION_NUMBER(addr) != REGION_HPAGE)
+                return ERR_PTR(-EINVAL);
+        ptep = huge_pte_offset(mm, addr);
+        if (ptep && !pte_none(*ptep)) {
+                page = pte_page(*ptep);
+                /* Advance to the base page inside the huge page. */
+                page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
+        }
+        return page;
+}
+/*
+ * Report whether pmd maps a huge page.  This implementation always
+ * answers no (0), whatever pmd contains.
+ */
+int pmd_huge(pmd_t pmd)
+{
+        return 0;
+}
+/*
+ * Stub counterpart to pmd_huge(): there is never a page to report, so this
+ * always returns NULL and ignores all of its arguments.
+ */
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write)
+{
+        return NULL;
+}
+/*
+ * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
+ * are hugetlb region specific.
+ *
+ * tlb:   mmu_gather whose mm owns the page tables.
+ * prev:  vma preceding the range being freed, or NULL to start from the
+ *        first vma of the mm.
+ * start: start of the range.
+ * end:   end of the range.
+ *
+ * The range is widened to HUGETLB_PGDIR boundaries and then clipped so it
+ * does not overlap the neighbouring vmas before being handed to
+ * clear_page_range().
+ */
+void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
+        unsigned long start, unsigned long end)
+{
+        /* Widen: round start down, and push end up by almost one pgdir entry. */
+        unsigned long first = start & HUGETLB_PGDIR_MASK;
+        unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
+        struct mm_struct *mm = tlb->mm;
+        if (!prev) {
+                prev = mm->mmap;
+                if (!prev)
+                        goto no_mmaps;
+                /* The first vma already lies beyond start: it only limits 'last'. */
+                if (prev->vm_end > start) {
+                        if (last > prev->vm_start)
+                                last = prev->vm_start;
+                        goto no_mmaps;
+                }
+        }
+        for (;;) {
+                struct vm_area_struct *next = prev->vm_next;
+                if (next) {
+                        /* Advance until 'next' is the first vma at or after start. */
+                        if (next->vm_start < start) {
+                                prev = next;
+                                continue;
+                        }
+                        if (last > next->vm_start)
+                                last = next->vm_start;
+                }
+                /* Do not clear below the end of the preceding vma. */
+                if (prev->vm_end > first)
+                        first = prev->vm_end;
+                break;
+        }
+no_mmaps:
+        if (last < first)       /* for arches with discontiguous pgd indices */
+                return;
+        clear_page_range(tlb, first, last);
+}
+/*
+ * Tear down the huge page mappings of vma in [start, end).  Both bounds
+ * must be huge page aligned.  Every mapped huge page has its reference
+ * dropped, its pte cleared and its base pages subtracted from the mm's
+ * rss; finally the TLB is flushed for the whole range.
+ *
+ * Slots with no page table behind them (huge_pte_offset() returns NULL) or
+ * with an empty pte are skipped.  This happens when the range was never
+ * fully populated, e.g. because hugetlb_prefault() failed part-way
+ * through.  rss is adjusted per page actually unmapped so that such holes
+ * are not subtracted without ever having been added.
+ */
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+        struct mm_struct *mm = vma->vm_mm;
+        unsigned long address;
+        pte_t *pte;
+        struct page *page;
+        BUG_ON(start & (HPAGE_SIZE - 1));
+        BUG_ON(end & (HPAGE_SIZE - 1));
+        for (address = start; address < end; address += HPAGE_SIZE) {
+                pte = huge_pte_offset(mm, address);
+                if (!pte || pte_none(*pte))
+                        continue;
+                page = pte_page(*pte);
+                put_page(page);
+                pte_clear(mm, address, pte);
+                /* Mirror the per-page charge made when the pte was set up. */
+                add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE));
+        }
+        flush_tlb_range(vma, start, end);
+}
+/*
+ * Populate every huge page of vma up front.
+ *
+ * mapping: address space (page cache) the huge pages belong to.
+ * vma:     huge page aligned area to populate; pages are mapped into
+ *          current->mm.
+ *
+ * For each huge page sized slot that is still empty, the page is taken from
+ * the page cache or, if it is not there yet, allocated (after charging the
+ * filesystem quota) and added to the page cache, and then mapped with
+ * set_huge_pte().  The whole walk runs under mm->page_table_lock.
+ *
+ * Returns 0 on success.  On failure returns -ENOMEM, or the error from
+ * add_to_page_cache(); slots populated before the failure stay mapped.
+ */
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+        struct mm_struct *mm = current->mm;
+        unsigned long addr;
+        int ret = 0;
+        BUG_ON(vma->vm_start & ~HPAGE_MASK);
+        BUG_ON(vma->vm_end & ~HPAGE_MASK);
+        spin_lock(&mm->page_table_lock);
+        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+                unsigned long idx;
+                pte_t *pte = huge_pte_alloc(mm, addr);
+                struct page *page;
+                if (!pte) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                /* Already mapped: nothing to do for this slot. */
+                if (!pte_none(*pte))
+                        continue;
+                /* Page cache index, in units of huge pages. */
+                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+                page = find_get_page(mapping, idx);
+                if (!page) {
+                        /* charge the fs quota first */
+                        if (hugetlb_get_quota(mapping)) {
+                                ret = -ENOMEM;
+                                goto out;
+                        }
+                        page = alloc_huge_page();
+                        if (!page) {
+                                /* Give back the quota charged just above. */
+                                hugetlb_put_quota(mapping);
+                                ret = -ENOMEM;
+                                goto out;
+                        }
+                        ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+                        if (! ret) {
+                                unlock_page(page);
+                        } else {
+                                /* Undo both the quota charge and the allocation. */
+                                hugetlb_put_quota(mapping);
+                                page_cache_release(page);
+                                goto out;
+                        }
+                }
+                set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+        }
+out:
+        spin_unlock(&mm->page_table_lock);
+        return ret;
+}
+/*
+ * Find a free, huge page aligned address range of len bytes in the huge
+ * page region of the current process.  The search starts at addr if that
+ * is an aligned address inside the region, and at the base of the region
+ * otherwise.  Returns the address found, -EINVAL if len is not a multiple
+ * of the huge page size, or -ENOMEM if the request cannot fit.  The file,
+ * pgoff and flags arguments are not used.
+ */
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+                unsigned long pgoff, unsigned long flags)
+{
+        struct vm_area_struct *vmm;
+        if (len > RGN_MAP_LIMIT)
+                return -ENOMEM;
+        if (len & ~HPAGE_MASK)
+                return -EINVAL;
+        /* This code assumes that REGION_HPAGE != 0. */
+        if ((REGION_NUMBER(addr) == REGION_HPAGE) && !(addr & (HPAGE_SIZE - 1)))
+                addr = ALIGN(addr, HPAGE_SIZE);
+        else
+                addr = HPAGE_REGION_BASE;
+        vmm = find_vma(current->mm, addr);
+        for (;;) {
+                /* At this point:  (!vmm || addr < vmm->vm_end). */
+                if (REGION_OFFSET(addr) + len > RGN_MAP_LIMIT)
+                        return -ENOMEM;
+                /* Free up to the next vma (or no vma left): take it. */
+                if (!vmm || (addr + len) <= vmm->vm_start)
+                        return addr;
+                /* Otherwise retry just past this vma. */
+                addr = ALIGN(vmm->vm_end, HPAGE_SIZE);
+                vmm = vmm->vm_next;
+        }
+}
+/*
+ * Handler for the "hugepagesz=" boot parameter.  The value is parsed with
+ * memparse() and accepted only if nothing trails it, it is a power of two
+ * whose bit is set in the page size mask, and it is larger than PAGE_SIZE
+ * and smaller than PAGE_SIZE << MAX_ORDER.  On success hpage_shift is
+ * updated and the huge page region register is reprogrammed; otherwise a
+ * warning is printed.  Always returns 1.
+ */
+static int __init hugetlb_setup_sz(char *str)
+{
+        u64 tr_pages;
+        unsigned long long size;
+        /* Shouldn't fail, but fall back to a built-in size mask if it does. */
+        if (ia64_pal_vm_page_size(&tr_pages, NULL) != 0)
+                tr_pages = 0x15557000UL;
+        size = memparse(str, &str);
+        if (*str)
+                goto invalid;
+        if (size & (size-1))
+                goto invalid;
+        if (!(tr_pages & size))
+                goto invalid;
+        if (size <= PAGE_SIZE)
+                goto invalid;
+        if (size >= (1UL << PAGE_SHIFT << MAX_ORDER))
+                goto invalid;
+        hpage_shift = __ffs(size);
+        /*
+         * boot cpu already executed ia64_mmu_init, and has HPAGE_SHIFT_DEFAULT
+         * override here with new page shift.
+         */
+        ia64_set_rr(HPAGE_REGION_BASE, hpage_shift << 2);
+        return 1;
+invalid:
+        printk(KERN_WARNING "Invalid huge page size specified\n");
+        return 1;
+}
+__setup("hugepagesz=", hugetlb_setup_sz);
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
new file mode 100644
index 000000000000..65cf839573ea
--- /dev/null
+++ b/arch/ia64/mm/init.c
@@ -0,0 +1,597 @@
+/*
+ * Initialize MMU support.
+ *
+ * Copyright (C) 1998-2003 Hewlett-Packard Co
+ *      David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/efi.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/personality.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/bitops.h>
+#include <asm/a.out.h>
+#include <asm/dma.h>
+#include <asm/ia32.h>
+#include <asm/io.h>
+#include <asm/machvec.h>
+#include <asm/numa.h>
+#include <asm/patch.h>
+#include <asm/pgalloc.h>
+#include <asm/sal.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/tlb.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/mca.h>
+/* One struct mmu_gather per CPU. */
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+extern void ia64_tlb_init (void);
+/* Defaults to PAGE_OFFSET + 0x100000000 (4GB). */
+unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+unsigned long vmalloc_end = VMALLOC_END_INIT;
+EXPORT_SYMBOL(vmalloc_end);
+/* Base of the virtual struct page array; while NULL, memmap_init() uses memmap_init_zone() directly. */
+struct page *vmem_map;
+EXPORT_SYMBOL(vmem_map);
+#endif
+/* { low, high } water marks read by check_pgt_cache(); mem_init() may raise the high mark. */
+static int pgt_cache_water[2] = { 25, 50 };
+struct page *zero_page_memmap_ptr;              /* map entry for zero page */
+EXPORT_SYMBOL(zero_page_memmap_ptr);
+/*
+ * Trim the page-table quicklists.  Nothing happens unless pgtable_cache_size
+ * exceeds the high water mark (pgt_cache_water[1]); once it does, pages are
+ * taken off the pgd and pmd quicklists and freed until the size is no longer
+ * above the low water mark (pgt_cache_water[0]).  Runs with preemption disabled.
+ */
+void
+check_pgt_cache (void)
+{
+        int low_mark = pgt_cache_water[0];
+        int high_mark = pgt_cache_water[1];
+        int trimming;
+        preempt_disable();
+        trimming = pgtable_cache_size > (u64) high_mark;
+        while (trimming) {
+                if (pgd_quicklist)
+                        free_page((unsigned long)pgd_alloc_one_fast(NULL));
+                if (pmd_quicklist)
+                        free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
+                trimming = pgtable_cache_size > (u64) low_mark;
+        }
+        preempt_enable();
+}
+/*
+ * For an executable pte, flush the i-cache over the kernel address of the
+ * backing page unless PG_arch_1 is already set in the page flags; afterwards
+ * set PG_arch_1 so the flush is not repeated.  Non-executable ptes are ignored.
+ */
+void
+lazy_mmu_prot_update (pte_t pte)
+{
+        struct page *pg;
+        unsigned long kaddr;
+        if (pte_exec(pte)) {
+                pg = pte_page(pte);
+                kaddr = (unsigned long) page_address(pg);
+                /* PG_arch_1 set means the i-cache is already coherent with the d-cache */
+                if (!test_bit(PG_arch_1, &pg->flags)) {
+                        flush_icache_range(kaddr, kaddr + PAGE_SIZE);
+                        set_bit(PG_arch_1, &pg->flags); /* mark page as clean */
+                }
+        }
+}
+/*
+ * Set current->thread.rbs_bot to STACK_TOP minus the hard RLIMIT_STACK limit,
+ * where the limit is first rounded down to a multiple of 16 and then capped at
+ * MAX_USER_STACK_SIZE.
+ */
+inline void
+ia64_set_rbs_bot (void)
+{
+        unsigned long hard_limit = current->signal->rlim[RLIMIT_STACK].rlim_max & -16;
+        unsigned long span = hard_limit;
+        if (hard_limit > MAX_USER_STACK_SIZE)
+                span = MAX_USER_STACK_SIZE;
+        current->thread.rbs_bot = STACK_TOP - span;
+}
+/*
+ * Platform-dependent address space initialization for the current process.
+ * On IA-64 this sets up the VM area for the register backing store (which
+ * grows upwards) and, unless the personality has MMAP_PAGE_ZERO set, a
+ * read-only NaT page at address zero.
+ */
+void
+ia64_init_addr_space (void)
+{
+        struct mm_struct *mm;
+        struct vm_area_struct *vma;
+        int err;
+        ia64_set_rbs_bot();
+        mm = current->mm;
+        /*
+         * An allocation failure is deliberately ignored: the process will simply get a
+         * SEGFAULT the first time it writes to the register backing store.
+         */
+        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+        if (vma) {
+                memset(vma, 0, sizeof(*vma));
+                vma->vm_mm = mm;
+                vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
+                vma->vm_end = vma->vm_start + PAGE_SIZE;
+                vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
+                vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
+                down_write(&mm->mmap_sem);
+                err = insert_vm_struct(mm, vma);
+                up_write(&mm->mmap_sem);
+                if (err) {
+                        /* insertion failed: release the vma and skip the NaT page too */
+                        kmem_cache_free(vm_area_cachep, vma);
+                        return;
+                }
+        }
+        /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
+        if (current->personality & MMAP_PAGE_ZERO)
+                return;
+        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+        if (!vma)
+                return;
+        memset(vma, 0, sizeof(*vma));           /* leaves vm_start at 0 */
+        vma->vm_mm = mm;
+        vma->vm_end = PAGE_SIZE;
+        vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
+        vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
+        down_write(&mm->mmap_sem);
+        err = insert_vm_struct(mm, vma);
+        up_write(&mm->mmap_sem);
+        if (err)
+                kmem_cache_free(vm_area_cachep, vma);
+}
+/*
+ * Give the pages between __init_begin and __init_end (addressed through
+ * ia64_imva()) back to the page allocator: each page has its reserved flag
+ * cleared and its count set to 1 before being freed, and totalram_pages is
+ * bumped once per page.  Logs the amount freed.
+ */
+void
+free_initmem (void)
+{
+        unsigned long va = (unsigned long) ia64_imva(__init_begin);
+        unsigned long va_end = (unsigned long) ia64_imva(__init_end);
+        struct page *pg;
+        for (; va < va_end; va += PAGE_SIZE) {
+                pg = virt_to_page(va);
+                ClearPageReserved(pg);
+                set_page_count(pg, 1);
+                free_page(va);
+                ++totalram_pages;
+        }
+        printk(KERN_INFO "Freeing unused kernel memory: %ldkB freed\n",
+               (__init_end - __init_begin) >> 10);
+}
+/*
+ * Free the whole kernel pages that lie entirely inside [start, end).
+ */
+void
+free_initrd_mem (unsigned long start, unsigned long end)
+{
+        struct page *pg;
+        unsigned long va;
+        /*
+         * EFI works with 4KB pages while the kernel page size may be 4KB or
+         * larger, so the initrd can share a kernel-sized page with the end of
+         * the kernel image (given the current setup).
+         *
+         * To avoid freeing/using such a shared page we:
+         *      - round the start of the initrd up to a kernel page boundary
+         *      - round the end of the initrd down to a kernel page boundary
+         *
+         *  |             |
+         *  |=============| a000
+         *  |             |
+         *  |             |
+         *  |             | 9000
+         *  |/////////////|
+         *  |/////////////|
+         *  |=============| 8000
+         *  |///INITRD////|
+         *  |/////////////|
+         *  |/////////////| 7000
+         *  |             |
+         *  |KKKKKKKKKKKKK|
+         *  |=============| 6000
+         *  |KKKKKKKKKKKKK|
+         *  |KKKKKKKKKKKKK|
+         *  K=kernel using 8KB pages
+         *
+         * In this example only page 8000 may be freed: the start is rounded up
+         * and the end, being already page aligned, stays where it is.
+         */
+        start = PAGE_ALIGN(start);
+        end &= PAGE_MASK;
+        if (start < end)
+                printk(KERN_INFO "Freeing initrd memory: %ldkB freed\n", (end - start) >> 10);
+        va = start;
+        while (va < end) {
+                /* addresses that fail virt_addr_valid() are skipped */
+                if (virt_addr_valid(va)) {
+                        pg = virt_to_page(va);
+                        ClearPageReserved(pg);
+                        set_page_count(pg, 1);
+                        free_page(va);
+                        ++totalram_pages;
+                }
+                va += PAGE_SIZE;
+        }
+}
+/*
+ * Install "page" at kernel virtual "address" with protection "pgprot" in the
+ * init_mm page table.  A complaint is logged if the page is not reserved.
+ * Nothing is installed when a page-table level cannot be allocated or when a
+ * pte is already present at that address.  Always returns "page".
+ */
+struct page *
+put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
+{
+        pgd_t *pgd;
+        pud_t *pud;
+        pmd_t *pmd = NULL;
+        pte_t *pte = NULL;
+        if (!PageReserved(page))
+                printk(KERN_ERR "put_kernel_page: page at 0x%p not in reserved memory\n",
+                       page_address(page));
+        pgd = pgd_offset_k(address);            /* note: this is NOT pgd_offset()! */
+        spin_lock(&init_mm.page_table_lock);
+        /* walk down one level at a time; a failed allocation ends the walk */
+        pud = pud_alloc(&init_mm, pgd, address);
+        if (pud)
+                pmd = pmd_alloc(&init_mm, pud, address);
+        if (pmd)
+                pte = pte_alloc_map(&init_mm, pmd, address);
+        if (pte) {
+                /* only fill an empty slot; an existing mapping is left alone */
+                if (pte_none(*pte))
+                        set_pte(pte, mk_pte(page, pgprot));
+                pte_unmap(pte);
+        }
+        spin_unlock(&init_mm.page_table_lock);
+        /* no need for flush_tlb */
+        return page;
+}
+/*
+ * Map the gate section into the kernel page table and then call
+ * ia64_patch_gate().  The gate page is mapped twice: read-only at GATE_ADDR to
+ * export the ELF headers etc., and with PAGE_GATE protection (execute-only, to
+ * enable privilege promotion via "epc") at a second address that depends on
+ * HAVE_BUGGY_SEGREL.
+ */
+static void
+setup_gate (void)
+{
+        struct page *gate_pg = virt_to_page(ia64_imva(__start_gate_section));
+        put_kernel_page(gate_pg, GATE_ADDR, PAGE_READONLY);
+#ifdef HAVE_BUGGY_SEGREL
+        /* here the PAGE_GATE mapping uses the page following the start of the gate section */
+        gate_pg = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
+        put_kernel_page(gate_pg, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
+#else
+        /* same physical page, mapped a second time PERCPU_PAGE_SIZE above GATE_ADDR */
+        put_kernel_page(gate_pg, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
+#endif
+        ia64_patch_gate();
+}
+/*
+ * Per-CPU MMU setup.  In order, this function:
+ *   - inserts a translation register entry (IA64_TR_PERCPU_DATA) mapping
+ *     PERCPU_ADDR to the physical page of "my_cpu_data";
+ *   - derives the number of implemented virtual address bits from
+ *     local_cpu_data->unimpl_va_mask and panics if it is outside 51..61;
+ *   - places the virtually mapped linear page table (VMLPT) at the top of the
+ *     region (2^61 - 2^vmlpt_bits) and panics if the page-table mapped space
+ *     would reach into it;
+ *   - programs the PTA register and calls ia64_tlb_init();
+ *   - with CONFIG_HUGETLB_PAGE, sets the region register for
+ *     HPAGE_REGION_BASE to HPAGE_SHIFT.
+ */
+void __devinit
+ia64_mmu_init (void *my_cpu_data)
+{
+        unsigned long psr, pta, impl_va_bits;
+        extern void __devinit tlb_init (void);
+#ifdef CONFIG_DISABLE_VHPT
+#       define VHPT_ENABLE_BIT  0
+#else
+#       define VHPT_ENABLE_BIT  1
+#endif
+        /* Pin mapping for percpu area into TLB */
+        /* ia64_clear_ic() hands back the psr value that ia64_set_psr() restores below */
+        psr = ia64_clear_ic();
+        ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
+                 pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
+                 PERCPU_PAGE_SHIFT);
+        ia64_set_psr(psr);
+        ia64_srlz_i();
+        /*
+         * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
+         * address space.  The IA-64 architecture guarantees that at least 50 bits of
+         * virtual address space are implemented but if we pick a large enough page size
+         * (e.g., 64KB), the mapped address space is big enough that it will overlap with
+         * VMLPT.  I assume that once we run on machines big enough to warrant 64KB pages,
+         * IMPL_VA_MSB will be significantly bigger, so this is unlikely to become a
+         * problem in practice.  Alternatively, we could truncate the top of the mapped
+         * address space to not permit mappings that would overlap with the VMLPT.
+         * --davidm 00/12/06
+         */
+        /* pte_bits: log2 of the size of one page table entry (8 bytes) */
+#       define pte_bits                 3
+        /* three table levels of (PAGE_SHIFT - pte_bits) index bits each, plus the page offset */
+#       define mapped_space_bits        (3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT)
+        /*
+         * The virtual page table has to cover the entire implemented address space within
+         * a region even though not all of this space may be mappable.  The reason for
+         * this is that the Access bit and Dirty bit fault handlers perform
+         * non-speculative accesses to the virtual page table, so the address range of the
+         * virtual page table itself needs to be covered by virtual page table.
+         */
+#       define vmlpt_bits               (impl_va_bits - PAGE_SHIFT + pte_bits)
+#       define POW2(n)                  (1ULL << (n))
+        /* index of the lowest bit set in unimpl_va_mask, with bits 61-63 forced on as an upper bound */
+        impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));
+        if (impl_va_bits < 51 || impl_va_bits > 61)
+                panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);
+        /* place the VMLPT at the end of each page-table mapped region: */
+        pta = POW2(61) - POW2(vmlpt_bits);
+        if (POW2(mapped_space_bits) >= pta)
+                panic("mm/init: overlap between virtually mapped linear page table and "
+                      "mapped kernel space!");
+        /*
+         * Set the (virtually mapped linear) page table address.  Bit
+         * 8 selects between the short and long format, bits 2-7 the
+         * size of the table, and bit 0 whether the VHPT walker is
+         * enabled.
+         */
+        ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);
+        ia64_tlb_init();
+#ifdef  CONFIG_HUGETLB_PAGE
+        /* hugetlb_setup_sz() may later override this value on the boot cpu */
+        ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
+        ia64_srlz_d();
+#endif
+}
+#ifdef CONFIG_VIRTUAL_MEM_MAP
+/*
+ * Memory-walk callback: make sure the part of vmem_map that describes the
+ * physical range [start, end) is backed by kernel page tables and pages.
+ * Every missing pgd/pud/pmd/pte on the way is filled from bootmem allocated on
+ * the node returned by paddr_to_nid() for "start".  "arg" is unused.
+ * Always returns 0.
+ */
+int
+create_mem_map_page_table (u64 start, u64 end, void *arg)
+{
+        struct page *first_entry, *end_entry;
+        unsigned long va, va_end;
+        int nid;
+        pgd_t *pgd;
+        pud_t *pud;
+        pmd_t *pmd;
+        pte_t *pte;
+        first_entry = vmem_map + (__pa(start) >> PAGE_SHIFT);
+        end_entry   = vmem_map + (__pa(end) >> PAGE_SHIFT);
+        /* widen to whole pages of the struct page array */
+        va = (unsigned long) first_entry & PAGE_MASK;
+        va_end = PAGE_ALIGN((unsigned long) end_entry);
+        nid = paddr_to_nid(__pa(start));
+        while (va < va_end) {
+                pgd = pgd_offset_k(va);
+                if (pgd_none(*pgd))
+                        pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(nid), PAGE_SIZE));
+                pud = pud_offset(pgd, va);
+                if (pud_none(*pud))
+                        pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(nid), PAGE_SIZE));
+                pmd = pmd_offset(pud, va);
+                if (pmd_none(*pmd))
+                        pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(nid), PAGE_SIZE));
+                pte = pte_offset_kernel(pmd, va);
+                if (pte_none(*pte))
+                        set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(nid), PAGE_SIZE)) >> PAGE_SHIFT,
+                                             PAGE_KERNEL));
+                va += PAGE_SIZE;
+        }
+        return 0;
+}
+/* Argument block that memmap_init() passes to virtual_memmap_init() via efi_memmap_walk(). */
+struct memmap_init_callback_data {
+        struct page *start;     /* first struct page of the range being initialized */
+        struct page *end;       /* one past the last struct page (start + size) */
+        int nid;                /* forwarded unchanged to memmap_init_zone() */
+        unsigned long zone;     /* forwarded unchanged to memmap_init_zone() */
+};
+/*
+ * Memory-walk callback used by memmap_init(): initialize the vmem_map entries
+ * that describe [start, end), clamped to the [args->start, args->end) window
+ * supplied through "arg".  Always returns 0.
+ */
+static int
+virtual_memmap_init (u64 start, u64 end, void *arg)
+{
+        struct memmap_init_callback_data *cb = arg;
+        struct page *first, *last;
+        unsigned long lead_bytes, tail_bytes;
+        first = vmem_map + (__pa(start) >> PAGE_SHIFT);
+        last  = vmem_map + (__pa(end) >> PAGE_SHIFT);
+        if (first < cb->start)
+                first = cb->start;
+        if (last > cb->end)
+                last = cb->end;
+        /*
+         * "Out of bounds" struct page elements that fit completely on the same pages
+         * as the "in bounds" ones must be initialized as well, because they may be
+         * referenced later (and found to be "reserved").  So grow the range on both
+         * sides by the number of whole elements up to the page boundary.
+         */
+        lead_bytes = (unsigned long) first & (PAGE_SIZE - 1);
+        first -= lead_bytes / sizeof(struct page);
+        tail_bytes = PAGE_ALIGN((unsigned long) last) - (unsigned long) last;
+        last += tail_bytes / sizeof(struct page);
+        if (first < last)
+                memmap_init_zone((unsigned long)(last - first),
+                                 cb->nid, cb->zone, page_to_pfn(first));
+        return 0;
+}
+/*
+ * Initialize the struct pages for "size" pages starting at "start_pfn".
+ * Without a virtual mem_map (vmem_map == NULL) this is a plain
+ * memmap_init_zone() call; otherwise the EFI memory map is walked and
+ * virtual_memmap_init() handles each range it reports.
+ */
+void
+memmap_init (unsigned long size, int nid, unsigned long zone,
+             unsigned long start_pfn)
+{
+        struct memmap_init_callback_data cb;
+        if (!vmem_map) {
+                memmap_init_zone(size, nid, zone, start_pfn);
+                return;
+        }
+        cb.start = pfn_to_page(start_pfn);
+        cb.end = cb.start + size;
+        cb.nid = nid;
+        cb.zone = zone;
+        efi_memmap_walk(virtual_memmap_init, &cb);
+}
+/*
+ * Report whether the struct page for "pfn" can be read: returns 1 if
+ * __get_user() succeeds on its first byte and, when the structure crosses a
+ * page boundary, on its last byte too; returns 0 otherwise.
+ */
+int
+ia64_pfn_valid (unsigned long pfn)
+{
+        char byte;
+        struct page *pg = pfn_to_page(pfn);
+        if (__get_user(byte, (char __user *) pg) != 0)
+                return 0;
+        /* first and last byte on the same page: one probe is enough */
+        if (((u64)pg & PAGE_MASK) == (((u64)(pg + 1) - 1) & PAGE_MASK))
+                return 1;
+        return __get_user(byte, (char __user *) (pg + 1) - 1) == 0;
+}
+EXPORT_SYMBOL(ia64_pfn_valid);
+/*
+ * Memory-walk callback that tracks the largest gap between consecutive ranges.
+ * "arg" points to a u64 holding the largest gap seen so far.  The end of the
+ * previous range is kept in a static variable that starts at PAGE_OFFSET, so
+ * the result is only meaningful for a single, ordered walk.  Always returns 0.
+ */
+int
+find_largest_hole (u64 start, u64 end, void *arg)
+{
+        static u64 last_end = PAGE_OFFSET;
+        u64 *max_gap = arg;
+        u64 gap = start - last_end;
+        /* NOTE: this algorithm assumes efi memmap table is ordered */
+        if (gap > *max_gap)
+                *max_gap = gap;
+        last_end = end;
+        return 0;
+}
+#endif /* CONFIG_VIRTUAL_MEM_MAP */
+/*
+ * Memory-walk callback: count the pages in [start, end) whose struct page has
+ * the reserved flag set and add that number to the unsigned long that "arg"
+ * points to.  Always returns 0.
+ */
+static int
+count_reserved_pages (u64 start, u64 end, void *arg)
+{
+        unsigned long *total = arg;
+        unsigned long found = 0;
+        u64 va = start;
+        while (va < end) {
+                if (PageReserved(virt_to_page(va)))
+                        ++found;
+                va += PAGE_SIZE;
+        }
+        *total += found;
+        return 0;
+}
+/*
+ * Boot command-line option "nolwsys" can be used to disable the use of any light-weight
+ * system call handler.  When this option is in effect, all fsyscalls will end up bubbling
+ * down into the kernel and calling the normal (heavy-weight) syscall handler.  This is
+ * useful for performance testing, but conceivably could also come in handy for debugging
+ * purposes.
+ */
+/* Non-zero once "nolwsys" was seen on the command line; read by mem_init(). */
+static int nolwsys;
+/* __setup handler for "nolwsys": the argument string is ignored; always returns 1. */
+static int __init
+nolwsys_setup (char *s)
+{
+        nolwsys = 1;
+        return 1;
+}
+__setup("nolwsys", nolwsys_setup);
+/*
+ * Final memory initialization.  In order, this function:
+ *   - calls platform_dma_init() (CONFIG_PCI only);
+ *   - sets max_mapnr (non-DISCONTIGMEM only) and high_memory from max_low_pfn;
+ *   - registers the /proc/kcore regions;
+ *   - releases all bootmem to the page allocator, adding to totalram_pages;
+ *   - counts reserved pages and prints the "Memory:" summary line;
+ *   - raises the page-table cache high water mark if the computed size is larger;
+ *   - fills empty fsyscall_table slots (or all slots with "nolwsys") with the
+ *     heavy-weight handler tagged by bit 0;
+ *   - maps the gate page and, with CONFIG_IA32_SUPPORT, calls ia32_mem_init().
+ */
+void
+mem_init (void)
+{
+        /*
+         * These are printed with %lu below, and count_reserved_pages() updates
+         * reserved_pages through an unsigned long pointer, so keep them unsigned.
+         */
+        unsigned long reserved_pages, codesize, datasize, initsize;
+        unsigned long num_pgt_pages;
+        pg_data_t *pgdat;
+        int i;
+        static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
+#ifdef CONFIG_PCI
+        /*
+         * This needs to be called _after_ the command line has been parsed but _before_
+         * any drivers that may need the PCI DMA interface are initialized or bootmem has
+         * been freed.
+         */
+        platform_dma_init();
+#endif
+#ifndef CONFIG_DISCONTIGMEM
+        if (!mem_map)
+                BUG();
+        max_mapnr = max_low_pfn;
+#endif
+        high_memory = __va(max_low_pfn * PAGE_SIZE);
+        kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE);
+        kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
+        kclist_add(&kcore_kernel, _stext, _end - _stext);
+        for_each_pgdat(pgdat)
+                totalram_pages += free_all_bootmem_node(pgdat);
+        reserved_pages = 0;
+        efi_memmap_walk(count_reserved_pages, &reserved_pages);
+        codesize =  (unsigned long) _etext - (unsigned long) _stext;
+        datasize =  (unsigned long) _edata - (unsigned long) _etext;
+        initsize =  (unsigned long) __init_end - (unsigned long) __init_begin;
+        printk(KERN_INFO "Memory: %luk/%luk available (%luk code, %luk reserved, "
+               "%luk data, %luk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT - 10),
+               num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
+               reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);
+        /*
+         * Allow for enough (cached) page table pages so that we can map the entire memory
+         * at least once.  Each task also needs a couple of page tables pages, so add in a
+         * fudge factor for that (don't use "threads-max" here; that would be wrong!).
+         * Don't allow the cache to be more than 10% of total memory, though.
+         */
+#       define NUM_TASKS        500     /* typical number of tasks */
+        num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
+        if (num_pgt_pages > nr_free_pages() / 10)
+                num_pgt_pages = nr_free_pages() / 10;
+        /* only ever raise the high water mark, never lower it */
+        if (num_pgt_pages > (u64) pgt_cache_water[1])
+                pgt_cache_water[1] = num_pgt_pages;
+        /*
+         * For fsyscall entrypoints with no light-weight handler, use the ordinary
+         * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry
+         * code can tell them apart.
+         */
+        for (i = 0; i < NR_syscalls; ++i) {
+                extern unsigned long fsyscall_table[NR_syscalls];
+                extern unsigned long sys_call_table[NR_syscalls];
+                if (!fsyscall_table[i] || nolwsys)
+                        fsyscall_table[i] = sys_call_table[i] | 1;
+        }
+        setup_gate();
+#ifdef CONFIG_IA32_SUPPORT
+        ia32_mem_init();
+#endif
+}
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
new file mode 100644
index 000000000000..77118bbf3d8b
--- /dev/null
+++ b/arch/ia64/mm/numa.c
@@ -0,0 +1,49 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * This file contains NUMA specific variables and functions which can
+ * be split away from DISCONTIGMEM and are used on NUMA machines with
+ * contiguous memory.
+ * 
+ *                         2002/08/07 Erich Focht <efocht@ess.nec.de>
+ */
+#include <linux/config.h>
+#include <linux/cpu.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/node.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/mmzone.h>
+#include <asm/numa.h>
+/*
+ * The following structures are usually initialized by ACPI or
+ * similar mechanisms and describe the NUMA characteristics of the machine.
+ */
+/* Number of valid entries in node_memblk[]; bounds the search in paddr_to_nid(). */
+int num_node_memblks;
+/* Physical memory blocks (start_paddr, size) and the node id (nid) each belongs to. */
+struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
+/* One entry per possible CPU (NR_CPUS). */
+struct node_cpuid_s node_cpuid[NR_CPUS];
+/*
+ * This is a matrix with "distances" between nodes, they should be
+ * proportional to the memory access latency ratios.
+ * It is stored flat, MAX_NUMNODES * MAX_NUMNODES bytes.
+ */
+u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];
+/*
+ * Identify which cnode a physical address resides on.
+ *
+ * Returns the nid of the first node_memblk[] entry whose
+ * [start_paddr, start_paddr + size) range contains "paddr".  If no entry
+ * matches, returns -1 when memory blocks are registered and 0 when the table
+ * is empty.
+ */
+int
+paddr_to_nid(unsigned long paddr)
+{
+        int     idx;
+        for (idx = 0; idx < num_node_memblks; idx++) {
+                if (paddr < node_memblk[idx].start_paddr)
+                        continue;
+                if (paddr < node_memblk[idx].start_paddr + node_memblk[idx].size)
+                        return node_memblk[idx].nid;
+        }
+        return num_node_memblks ? -1 : 0;
+}
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
new file mode 100644
index 000000000000..464557e4ed82
--- /dev/null
+++ b/arch/ia64/mm/tlb.c
@@ -0,0 +1,190 @@
+/*
+ * TLB support routines.
+ *
+ * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
+ *      David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 08/02/00 A. Mallick <asit.k.mallick@intel.com>
+ *              Modified RID allocation for SMP
+ *          Goutham Rao <goutham.rao@intel.com>
+ *              IPI based ptc implementation and A-step IPI implementation.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <asm/delay.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+#include <asm/pal.h>
+#include <asm/tlbflush.h>
+/* TLB purge capabilities; filled in by ia64_tlb_init() and used by flush_tlb_range(). */
+static struct {
+        unsigned long mask;     /* mask of supported purge page-sizes */
+        unsigned long max_bits; /* log2() of largest supported purge page-size */
+} purge;
+/*
+ * Global state for handing out mm context numbers.  wrap_mmu_context()
+ * recomputes .next and .limit, bounded by .max_ctx.
+ */
+struct ia64_ctx ia64_ctx = {
+        .lock =         SPIN_LOCK_UNLOCKED,
+        .next =         1,
+        .limit =        (1 << 15) - 1,          /* start out with the safe (architected) limit */
+        .max_ctx =      ~0U
+};
+/* Per-CPU flag; wrap_mmu_context() sets it to 1 on every other online CPU. */
+DEFINE_PER_CPU(u8, ia64_need_tlb_flush);
+/*
+ * Acquire the ia64_ctx.lock before calling this function!
+ *
+ * Recompute ia64_ctx.next and ia64_ctx.limit so that no process's
+ * mm->context falls inside [next, limit), then request a TLB flush on every
+ * other online CPU (via the per-CPU ia64_need_tlb_flush flag) and flush the
+ * local TLB.  The "mm" argument is not referenced in the body.
+ */
+void
+wrap_mmu_context (struct mm_struct *mm)
+{
+        unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx;
+        struct task_struct *tsk;
+        int i;
+        if (ia64_ctx.next > max_ctx)
+                ia64_ctx.next = 300;    /* skip daemons */
+        ia64_ctx.limit = max_ctx + 1;
+        /*
+         * Scan all the task's mm->context and set proper safe range
+         */
+        read_lock(&tasklist_lock);
+  repeat:
+        for_each_process(tsk) {
+                if (!tsk->mm)
+                        continue;
+                tsk_context = tsk->mm->context;
+                /* candidate is in use: advance past it */
+                if (tsk_context == ia64_ctx.next) {
+                        if (++ia64_ctx.next >= ia64_ctx.limit) {
+                                /* empty range: reset the range limit and start over */
+                                if (ia64_ctx.next > max_ctx)
+                                        ia64_ctx.next = 300;
+                                ia64_ctx.limit = max_ctx + 1;
+                                goto repeat;
+                        }
+                }
+                /* lower the limit to the smallest in-use context above "next" */
+                if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit))
+                        ia64_ctx.limit = tsk_context;
+        }
+        read_unlock(&tasklist_lock);
+        /* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */
+        {
+                int cpu = get_cpu(); /* prevent preemption/migration */
+                for (i = 0; i < NR_CPUS; ++i)
+                        if (cpu_online(i) && (i != cpu))
+                                per_cpu(ia64_need_tlb_flush, i) = 1;
+                put_cpu();
+        }
+        local_flush_tlb_all();
+}
+/*
+ * Issue global purges (ia64_ptcga) over [start, end) in steps of 2^nbits
+ * bytes; at least one purge is always issued.  The whole sequence runs under
+ * a function-local spinlock because the hardware requires global
+ * serialization of ptc.ga.
+ */
+void
+ia64_global_tlb_purge (unsigned long start, unsigned long end, unsigned long nbits)
+{
+        static DEFINE_SPINLOCK(ptcg_lock);
+        unsigned long stride = 1UL << nbits;
+        unsigned long va = start;
+        /* HW requires global serialization of ptc.ga.  */
+        spin_lock(&ptcg_lock);
+        do {
+                /* Flush ALAT entries also. */
+                ia64_ptcga(va, (nbits<<2));
+                ia64_srlz_i();
+                va += stride;
+        } while (va < end);
+        spin_unlock(&ptcg_lock);
+}
+/*
+ * Flush the entire TLB of the current CPU by issuing ia64_ptce() over the
+ * two-level base/count/stride pattern recorded in local_cpu_data by
+ * ia64_tlb_init().  Interrupts are disabled for the duration of the loop.
+ */
+void
+local_flush_tlb_all (void)
+{
+        unsigned long outer, inner, flags;
+        unsigned long va = local_cpu_data->ptce_base;
+        unsigned long outer_count = local_cpu_data->ptce_count[0];
+        unsigned long inner_count = local_cpu_data->ptce_count[1];
+        unsigned long outer_stride = local_cpu_data->ptce_stride[0];
+        unsigned long inner_stride = local_cpu_data->ptce_stride[1];
+        local_irq_save(flags);
+        outer = 0;
+        while (outer < outer_count) {
+                inner = 0;
+                while (inner < inner_count) {
+                        ia64_ptce(va);
+                        va += inner_stride;
+                        ++inner;
+                }
+                va += outer_stride;
+                ++outer;
+        }
+        local_irq_restore(flags);
+        ia64_srlz_i();                  /* srlz.i implies srlz.d */
+}
+/*
+ * Purge the TLB entries for [start, end) of the address space that "vma"
+ * belongs to.  If that mm is not the current active_mm, the range is not
+ * purged piecewise: SMP kernels flush everything and UP kernels reset
+ * mm->context to 0.  Otherwise a purge size of 2^nbits is chosen from the
+ * sizes in purge.mask (capped at purge.max_bits), "start" is aligned down to
+ * it, and the range is purged globally (SMP) or locally (UP).
+ */
+void
+flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+        struct mm_struct *target_mm = vma->vm_mm;
+        unsigned long span = end - start;
+        unsigned long nbits;
+        if (target_mm != current->active_mm) {
+                /* this does happen, but perhaps it's not worth optimizing for? */
+#ifdef CONFIG_SMP
+                flush_tlb_all();
+#else
+                target_mm->context = 0;
+#endif
+                return;
+        }
+        /* begin at ia64_fls() of the padded span and walk up to a supported purge size */
+        nbits = ia64_fls(span + 0xfff);
+        for (;;) {
+                if (!unlikely (((1UL << nbits) & purge.mask) == 0))
+                        break;
+                if (nbits >= purge.max_bits)
+                        break;
+                ++nbits;
+        }
+        if (nbits > purge.max_bits)
+                nbits = purge.max_bits;
+        start &= ~((1UL << nbits) - 1);
+# ifdef CONFIG_SMP
+        platform_global_tlb_purge(start, end, nbits);
+# else
+        do {
+                ia64_ptcl(start, (nbits<<2));
+                start += (1UL << nbits);
+        } while (start < end);
+# endif
+        ia64_srlz_i();                  /* srlz.i implies srlz.d */
+}
+EXPORT_SYMBOL(flush_tlb_range);
+/*
+ * Per-CPU TLB initialization: query the supported purge page sizes (falling
+ * back to a fixed mask if the PAL call fails), record the ptc.e loop
+ * parameters in local_cpu_data for local_flush_tlb_all(), and then flush the
+ * local TLB.
+ */
+void __devinit
+ia64_tlb_init (void)
+{
+        ia64_ptce_info_t ptce_info;
+        unsigned long tr_pgbits;        /* required output of the PAL call; not used afterwards */
+        long status;
+        if ((status = ia64_pal_vm_page_size(&tr_pgbits, &purge.mask)) != 0) {
+                /* the two literals are concatenated, so the first must end with a space */
+                printk(KERN_ERR "PAL_VM_PAGE_SIZE failed with status=%ld; "
+                       "defaulting to architected purge page-sizes.\n", status);
+                purge.mask = 0x115557000UL;
+        }
+        purge.max_bits = ia64_fls(purge.mask);
+        ia64_get_ptce(&ptce_info);
+        local_cpu_data->ptce_base = ptce_info.base;
+        local_cpu_data->ptce_count[0] = ptce_info.count[0];
+        local_cpu_data->ptce_count[1] = ptce_info.count[1];
+        local_cpu_data->ptce_stride[0] = ptce_info.stride[0];
+        local_cpu_data->ptce_stride[1] = ptce_info.stride[1];
+        local_flush_tlb_all();          /* nuke left overs from bootstrapping... */
+}