Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile           |  11
-rw-r--r--  arch/x86/mm/discontig_32.c     | 288
-rw-r--r--  arch/x86/mm/dump_pagetables.c  |  12
-rw-r--r--  arch/x86/mm/fault.c            | 123
-rw-r--r--  arch/x86/mm/gup.c              | 298
-rw-r--r--  arch/x86/mm/hugetlbpage.c      |  78
-rw-r--r--  arch/x86/mm/init_32.c          | 600
-rw-r--r--  arch/x86/mm/init_64.c          | 817
-rw-r--r--  arch/x86/mm/ioremap.c          | 103
-rw-r--r--  arch/x86/mm/k8topology_64.c    |  21
-rw-r--r--  arch/x86/mm/kmmio.c            | 510
-rw-r--r--  arch/x86/mm/memtest.c          | 123
-rw-r--r--  arch/x86/mm/mmio-mod.c         | 517
-rw-r--r--  arch/x86/mm/numa_64.c          | 109
-rw-r--r--  arch/x86/mm/pageattr-test.c    |  27
-rw-r--r--  arch/x86/mm/pageattr.c         | 541
-rw-r--r--  arch/x86/mm/pat.c              | 612
-rw-r--r--  arch/x86/mm/pf_in.c            | 489
-rw-r--r--  arch/x86/mm/pf_in.h            |  39
-rw-r--r--  arch/x86/mm/pgtable.c          | 199
-rw-r--r--  arch/x86/mm/pgtable_32.c       | 104
-rw-r--r--  arch/x86/mm/srat_32.c          | 283
-rw-r--r--  arch/x86/mm/srat_64.c          |  21
-rw-r--r--  arch/x86/mm/testmmiotrace.c    |  71
24 files changed, 4612 insertions, 1384 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..dfb932dcf136 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
 obj-y    := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-            pat.o pgtable.o
+            pat.o pgtable.o gup.o
 
 obj-$(CONFIG_X86_32)            += pgtable_32.o
 
@@ -8,10 +8,17 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)           += highmem_32.o
 
+obj-$(CONFIG_MMIOTRACE_HOOKS)   += kmmio.o
+obj-$(CONFIG_MMIOTRACE)         += mmiotrace.o
+mmiotrace-y                     := pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST)    += testmmiotrace.o
+
 ifeq ($(CONFIG_X86_32),y)
 obj-$(CONFIG_NUMA)              += discontig_32.o
 else
 obj-$(CONFIG_NUMA)              += numa_64.o
 obj-$(CONFIG_K8_NUMA)           += k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA)         += srat_64.o
 endif
+obj-$(CONFIG_ACPI_NUMA)         += srat_$(BITS).o
+
+obj-$(CONFIG_MEMTEST)           += memtest.o
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..847c164725f4 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,10 +38,10 @@
 #include <asm/setup.h>
 #include <asm/mmzone.h>
 #include <asm/bios_ebda.h>
+#include <asm/proto.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
-static bootmem_data_t node0_bdata;
 
 /*
  * numa interface - we expect the numa architecture specific code to have
@@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 /*
  * 4) physnode_map     - the mapping between a pfn and owning node
  * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id.  so,
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id.  so,
  * if the first gig is on node 0, and the second gig is on node 1
  * physnode_map will contain:
  *
- *     physnode_map[0-3] = 0;
- *     physnode_map[4-7] = 1;
- *     physnode_map[8- ] = -1;
+ *     physnode_map[0-15] = 0;
+ *     physnode_map[16-31] = 1;
+ *     physnode_map[32- ] = -1;
  */
 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
@@ -75,15 +75,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 {
     unsigned long pfn;
 
-    printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
+    printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
            nid, start, end);
     printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
     printk(KERN_DEBUG "  ");
     for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
         physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-        printk("%ld ", pfn);
+        printk(KERN_CONT "%lx ", pfn);
     }
-    printk("\n");
+    printk(KERN_CONT "\n");
 }
 
 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +99,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 #endif
 
 extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
 extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +116,13 @@ static unsigned long kva_pages;
  */
 int __init get_memcfg_numa_flat(void)
 {
-    printk("NUMA - single node, flat memory mode\n");
+    printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
 
-    /* Run the memory configuration and find the top of memory. */
-    propagate_e820_map();
     node_start_pfn[0] = 0;
     node_end_pfn[0] = max_pfn;
+    e820_register_active_regions(0, 0, max_pfn);
     memory_present(0, 0, max_pfn);
+    node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
     /* Indicate there is one node available. */
     nodes_clear(node_online_map);
@@ -156,24 +155,32 @@ static void __init propagate_e820_map_node(int nid)
  */
 static void __init allocate_pgdat(int nid)
 {
-    if (nid && node_has_online_mem(nid))
+    char buf[16];
+
+    if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
         NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
     else {
-        NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-        min_low_pfn += PFN_UP(sizeof(pg_data_t));
+        unsigned long pgdat_phys;
+        pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+                     max_pfn_mapped<<PAGE_SHIFT,
+                     sizeof(pg_data_t),
+                     PAGE_SIZE);
+        NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+        memset(buf, 0, sizeof(buf));
+        sprintf(buf, "NODE_DATA %d", nid);
+        reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
     }
+    printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+        nid, (unsigned long)NODE_DATA(nid));
 }
 
-#ifdef CONFIG_DISCONTIGMEM
 /*
- * In the discontig memory model, a portion of the kernel virtual area (KVA)
- * is reserved and portions of nodes are mapped using it. This is to allow
- * node-local memory to be allocated for structures that would normally require
- * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
- * should be prepared to allocate from the bootmem allocator instead. This KVA
- * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
- * layout of memory that are broken if alloc_remap() succeeds for some of the
- * map and fails for others
+ * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
+ * virtual address space (KVA) is reserved and portions of nodes are mapped
+ * using it. This is to allow node-local memory to be allocated for
+ * structures that would normally require ZONE_NORMAL. The memory is
+ * allocated with alloc_remap() and callers should be prepared to allocate
+ * from the bootmem allocator instead.
  */
 static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +202,19 @@ void *alloc_remap(int nid, unsigned long size)
     return allocation;
 }
 
-void __init remap_numa_kva(void)
+static void __init remap_numa_kva(void)
 {
     void *vaddr;
     unsigned long pfn;
     int node;
 
     for_each_online_node(node) {
+        printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
         for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
             vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+            printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
+                (unsigned long)vaddr,
+                node_remap_start_pfn[node] + pfn);
             set_pmd_pfn((ulong) vaddr,
                 node_remap_start_pfn[node] + pfn,
                 PAGE_KERNEL_LARGE);
@@ -215,17 +226,21 @@ static unsigned long calculate_numa_remap_pages(void)
 {
     int nid;
     unsigned long size, reserve_pages = 0;
-    unsigned long pfn;
 
     for_each_online_node(nid) {
-        unsigned old_end_pfn = node_end_pfn[nid];
+        u64 node_kva_target;
+        u64 node_kva_final;
 
         /*
          * The acpi/srat node info can show hot-add memroy zones
          * where memory could be added but not currently present.
          */
+        printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+            nid, node_start_pfn[nid], node_end_pfn[nid]);
         if (node_start_pfn[nid] > max_pfn)
             continue;
+        if (!node_end_pfn[nid])
+            continue;
         if (node_end_pfn[nid] > max_pfn)
             node_end_pfn[nid] = max_pfn;
 
@@ -237,41 +252,48 @@ static unsigned long calculate_numa_remap_pages(void)
         /* now the roundup is correct, convert to PAGE_SIZE pages */
         size = size * PTRS_PER_PTE;
 
-        /*
-         * Validate the region we are allocating only contains valid
-         * pages.
-         */
-        for (pfn = node_end_pfn[nid] - size;
-             pfn < node_end_pfn[nid]; pfn++)
-            if (!page_is_ram(pfn))
-                break;
-
-        if (pfn != node_end_pfn[nid])
-            size = 0;
+        node_kva_target = round_down(node_end_pfn[nid] - size,
+                        PTRS_PER_PTE);
+        node_kva_target <<= PAGE_SHIFT;
+        do {
+            node_kva_final = find_e820_area(node_kva_target,
+                    ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+                    ((u64)size)<<PAGE_SHIFT,
+                    LARGE_PAGE_BYTES);
+            node_kva_target -= LARGE_PAGE_BYTES;
+        } while (node_kva_final == -1ULL &&
+             (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+        if (node_kva_final == -1ULL)
+            panic("Can not get kva ram\n");
 
-        printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-                size, nid);
         node_remap_size[nid] = size;
         node_remap_offset[nid] = reserve_pages;
         reserve_pages += size;
-        printk("Shrinking node %d from %ld pages to %ld pages\n",
-            nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
-        if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-            /*
-             * Align node_end_pfn[] and node_remap_start_pfn[] to
-             * pmd boundary. remap_numa_kva will barf otherwise.
-             */
-            printk("Shrinking node %d further by %ld pages for proper alignment\n",
-                nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-            size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
-        }
+        printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
+                  " node %d at %llx\n",
+                size, nid, node_kva_final>>PAGE_SHIFT);
+
+        /*
+         *  prevent kva address below max_low_pfn want it on system
+         *  with less memory later.
+         *  layout will be: KVA address , KVA RAM
+         *
+         *  we are supposed to only record the one less then max_low_pfn
+         *  but we could have some hole in high memory, and it will only
+         *  check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
+         *  to use it as free.
+         *  So reserve_early here, hope we don't run out of that array
+         */
+        reserve_early(node_kva_final,
+                  node_kva_final+(((u64)size)<<PAGE_SHIFT),
+                  "KVA RAM");
 
-        node_end_pfn[nid] -= size;
-        node_remap_start_pfn[nid] = node_end_pfn[nid];
-        shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+        node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+        remove_active_range(nid, node_remap_start_pfn[nid],
+                     node_remap_start_pfn[nid] + size);
     }
-    printk("Reserving total of %ld pages for numa KVA remap\n",
+    printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
             reserve_pages);
     return reserve_pages;
 }
@@ -285,37 +307,16 @@ static void init_remap_allocator(int nid)
     node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
         ALIGN(sizeof(pg_data_t), PAGE_SIZE);
 
-    printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+    printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
         (ulong) node_remap_start_vaddr[nid],
-        (ulong) pfn_to_kaddr(highstart_pfn
-           + node_remap_offset[nid] + node_remap_size[nid]));
-}
-#else
-void *alloc_remap(int nid, unsigned long size)
-{
-    return NULL;
-}
-
-static unsigned long calculate_numa_remap_pages(void)
-{
-    return 0;
-}
-
-static void init_remap_allocator(int nid)
-{
-}
-
-void __init remap_numa_kva(void)
-{
+        (ulong) node_remap_end_vaddr[nid]);
 }
-#endif /* CONFIG_DISCONTIGMEM */
 
-extern void setup_bootmem_allocator(void);
-unsigned long __init setup_memory(void)
+void __init initmem_init(unsigned long start_pfn,
+                  unsigned long end_pfn)
 {
     int nid;
-    unsigned long system_start_pfn, system_max_low_pfn;
-    unsigned long wasted_pages;
+    long kva_target_pfn;
 
     /*
      * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +325,77 @@ unsigned long __init setup_memory(void)
      * this space and use it to adjust the boundary between ZONE_NORMAL
      * and ZONE_HIGHMEM.
      */
-    get_memcfg_numa();
 
-    kva_pages = calculate_numa_remap_pages();
+    get_memcfg_numa();
 
-    /* partially used pages are not usable - thus round upwards */
-    system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
+    kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
-    kva_start_pfn = find_max_low_pfn() - kva_pages;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-    /* Numa kva area is below the initrd */
-    if (initrd_start)
-        kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
-            - kva_pages;
-#endif
-
-    /*
-     * We waste pages past at the end of the KVA for no good reason other
-     * than how it is located. This is bad.
-     */
-    wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
-    kva_start_pfn -= wasted_pages;
-    kva_pages += wasted_pages;
-
-    system_max_low_pfn = max_low_pfn = find_max_low_pfn();
-    printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
+    kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+    do {
+        kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+                    max_low_pfn<<PAGE_SHIFT,
+                    kva_pages<<PAGE_SHIFT,
+                    PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+        kva_target_pfn -= PTRS_PER_PTE;
+    } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
+
+    if (kva_start_pfn == -1UL)
+        panic("Can not get kva space\n");
+
+    printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
         kva_start_pfn, max_low_pfn);
-    printk("max_pfn = %ld\n", max_pfn);
+    printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
+
+    /* avoid clash with initrd */
+    reserve_early(kva_start_pfn<<PAGE_SHIFT,
+              (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+             "KVA PG");
 #ifdef CONFIG_HIGHMEM
     highstart_pfn = highend_pfn = max_pfn;
-    if (max_pfn > system_max_low_pfn)
-        highstart_pfn = system_max_low_pfn;
+    if (max_pfn > max_low_pfn)
+        highstart_pfn = max_low_pfn;
     printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
            pages_to_mb(highend_pfn - highstart_pfn));
     num_physpages = highend_pfn;
     high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-    num_physpages = system_max_low_pfn;
-    high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
+    num_physpages = max_low_pfn;
+    high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
     printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
-            pages_to_mb(system_max_low_pfn));
-    printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
-            min_low_pfn, max_low_pfn, highstart_pfn);
+            pages_to_mb(max_low_pfn));
+    printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
+            max_low_pfn, highstart_pfn);
 
-    printk("Low memory ends at vaddr %08lx\n",
+    printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
             (ulong) pfn_to_kaddr(max_low_pfn));
     for_each_online_node(nid) {
         init_remap_allocator(nid);
 
         allocate_pgdat(nid);
     }
-    printk("High memory starts at vaddr %08lx\n",
+    remap_numa_kva();
+
+    printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
             (ulong) pfn_to_kaddr(highstart_pfn));
     for_each_online_node(nid)
         propagate_e820_map_node(nid);
 
-    memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
-    NODE_DATA(0)->bdata = &node0_bdata;
-    setup_bootmem_allocator();
-    return max_low_pfn;
-}
-
-void __init numa_kva_reserve(void)
-{
-    if (kva_pages)
-        reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-                BOOTMEM_DEFAULT);
-}
-
-void __init zone_sizes_init(void)
-{
-    int nid;
-    unsigned long max_zone_pfns[MAX_NR_ZONES];
-    memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-    max_zone_pfns[ZONE_DMA] =
-        virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-    max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-    max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
-    /* If SRAT has not registered memory, register it now */
-    if (find_max_pfn_with_active_regions() == 0) {
-        for_each_online_node(nid) {
-            if (node_has_online_mem(nid))
-                add_active_range(nid, node_start_pfn[nid],
-                        node_end_pfn[nid]);
-        }
-    }
+    for_each_online_node(nid)
+        memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-    free_area_init_nodes(max_zone_pfns);
-    return;
+    NODE_DATA(0)->bdata = &bootmem_node_data[0];
+    setup_bootmem_allocator();
 }
 
-void __init set_highmem_pages_init(int bad_ppro)
+void __init set_highmem_pages_init(void)
 {
 #ifdef CONFIG_HIGHMEM
     struct zone *zone;
-    struct page *page;
+    int nid;
 
     for_each_zone(zone) {
-        unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+        unsigned long zone_start_pfn, zone_end_pfn;
 
         if (!is_highmem(zone))
             continue;
@@ -434,16 +403,12 @@ void __init set_highmem_pages_init(int bad_ppro)
         zone_start_pfn = zone->zone_start_pfn;
         zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
-        printk("Initializing %s for node %d (%08lx:%08lx)\n",
-                zone->name, zone_to_nid(zone),
-                zone_start_pfn, zone_end_pfn);
+        nid = zone_to_nid(zone);
+        printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+                zone->name, nid, zone_start_pfn, zone_end_pfn);
 
-        for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-            if (!pfn_valid(node_pfn))
-                continue;
-            page = pfn_to_page(node_pfn);
-            add_one_highpage_init(page, node_pfn, bad_ppro);
-        }
+        add_highpages_with_active_regions(nid, zone_start_pfn,
+                         zone_end_pfn);
     }
     totalram_pages += totalhigh_pages;
 #endif
@@ -476,3 +441,4 @@ int memory_add_physaddr_to_nid(u64 addr)
 
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
+
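A note on the placement pattern above: both calculate_numa_remap_pages() and the new initmem_init() place their remap areas the same way, starting from a target just below the region they must stay under, asking find_e820_area() for a free block of the required size and alignment, and stepping the target down one large page at a time until the lookup succeeds (panicking if it never does). A minimal stand-alone sketch of that downward search follows; region_is_free() and every constant are hypothetical stand-ins for the e820 lookup, not kernel APIs.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define LARGE_PAGE_BYTES (4UL << 20)   /* assumed 4 MB step, mirroring the PMD-sized stride */

/* Hypothetical stand-in for find_e820_area(): is [start, start+size) free? */
static bool region_is_free(uint64_t start, uint64_t size)
{
    /* pretend everything below 0x30000000 is already reserved */
    return start >= 0x30000000ULL;
}

/* Walk downward from just below 'end' until a free, aligned block of 'size' is found. */
static uint64_t find_block_below(uint64_t end, uint64_t bottom, uint64_t size)
{
    uint64_t target = (end - size) & ~(LARGE_PAGE_BYTES - 1);

    while (target > bottom) {
        if (region_is_free(target, size))
            return target;               /* same role as find_e820_area() succeeding */
        target -= LARGE_PAGE_BYTES;      /* step down one large page and retry */
    }
    return (uint64_t)-1;                 /* mirrors the patch's -1ULL "not found" result */
}

int main(void)
{
    uint64_t kva = find_block_below(0x38000000ULL, 0x10000000ULL, 16UL << 20);
    printf("placed KVA remap block at %#llx\n", (unsigned long long)kva);
    return 0;
}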
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c66..e7277cbcfb40 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
 	{ 0, "User Space" },
 #ifdef CONFIG_X86_64
 	{ 0x8000000000000000UL, "Kernel Space" },
-	{ 0xffff810000000000UL, "Low Kernel Mapping" },
+	{ PAGE_OFFSET,          "Low Kernel Mapping" },
 	{ VMALLOC_START,        "vmalloc() Area" },
 	{ VMEMMAP_START,        "Vmemmap" },
 	{ __START_KERNEL_map,   "High Kernel Mapping" },
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 	 * we have now. "break" is either changing perms, levels or
 	 * address space marker.
 	 */
-	prot = pgprot_val(new_prot) & ~(PTE_MASK);
-	cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
+	prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+	cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
 
 	if (!st->level) {
 		/* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
 	for (i = 0; i < PTRS_PER_PMD; i++) {
 		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
 		if (!pmd_none(*start)) {
-			pgprotval_t prot = pmd_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
 
 			if (pmd_large(*start) || !pmd_present(*start))
 				note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 	for (i = 0; i < PTRS_PER_PUD; i++) {
 		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
 		if (!pud_none(*start)) {
-			pgprotval_t prot = pud_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
 
 			if (pud_large(*start) || !pud_present(*start))
 				note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
 	for (i = 0; i < PTRS_PER_PGD; i++) {
 		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
 		if (!pgd_none(*start)) {
-			pgprotval_t prot = pgd_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
 
 			if (pgd_large(*start) || !pgd_present(*start))
 				note_page(m, &st, __pgprot(prot), 1);
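The dump_pagetables.c change above only affects how the attribute bits of an entry are isolated: instead of masking with the complement of PTE_MASK, the walker now uses PTE_FLAGS_MASK directly. A rough illustration of the frame/flags split is below; the constants are simplified assumptions for the sketch, not the kernel's real definitions.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT     12
#define PTE_PFN_MASK   (~((1UL << PAGE_SHIFT) - 1))  /* assumed: bits holding the page frame */
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)               /* assumed: everything else is attribute flags */

int main(void)
{
    unsigned long pte = 0x1234a067UL;  /* example entry: pfn 0x1234a, flags 0x067 */

    printf("pfn: %#lx  flags: %#lx\n",
           (pte & PTE_PFN_MASK) >> PAGE_SHIFT, pte & PTE_FLAGS_MASK);
    return 0;
}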
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f40ccb6..a742d753d5b0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ptrace.h>
+#include <linux/mmiotrace.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -34,6 +35,7 @@
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
 #include <asm-generic/sections.h>
+#include <asm/traps.h>
 
 /*
  * Page fault error code bits
@@ -49,17 +51,23 @@
 #define PF_RSVD (1<<3)
 #define PF_INSTR (1<<4)
 
+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+#ifdef CONFIG_MMIOTRACE_HOOKS
+    if (unlikely(is_kmmio_active()))
+        if (kmmio_handler(regs, addr) == 1)
+            return -1;
+#endif
+    return 0;
+}
+
 static inline int notify_page_fault(struct pt_regs *regs)
 {
 #ifdef CONFIG_KPROBES
     int ret = 0;
 
     /* kprobe_running() needs smp_processor_id() */
-#ifdef CONFIG_X86_32
     if (!user_mode_vm(regs)) {
-#else
-    if (!user_mode(regs)) {
-#endif
         preempt_disable();
         if (kprobe_running() && kprobe_fault_handler(regs, 14))
             ret = 1;
@@ -350,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
     return 0;
 }
 
-void do_invalid_op(struct pt_regs *, unsigned long);
-
 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 {
 #ifdef CONFIG_X86_F00F_BUG
@@ -396,11 +402,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
         printk(KERN_CONT "NULL pointer dereference");
     else
         printk(KERN_CONT "paging request");
-#ifdef CONFIG_X86_32
-    printk(KERN_CONT " at %08lx\n", address);
-#else
-    printk(KERN_CONT " at %016lx\n", address);
-#endif
+    printk(KERN_CONT " at %p\n", (void *) address);
     printk(KERN_ALERT "IP:");
     printk_address(regs->ip, 1);
     dump_pagetable(address);
@@ -606,6 +608,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
     if (notify_page_fault(regs))
         return;
+    if (unlikely(kmmio_fault(regs, address)))
+        return;
 
     /*
      * We fault-in kernel-space virtual memory on-demand. The
@@ -800,14 +804,10 @@ bad_area_nosemaphore:
     if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
         printk_ratelimit()) {
         printk(
-#ifdef CONFIG_X86_32
-        "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
-#else
-        "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
-#endif
+        "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
         task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-        tsk->comm, task_pid_nr(tsk), address, regs->ip,
-        regs->sp, error_code);
+        tsk->comm, task_pid_nr(tsk), address,
+        (void *) regs->ip, (void *) regs->sp, error_code);
         print_vma_addr(" in ", regs->ip);
         printk("\n");
     }
@@ -914,72 +914,45 @@ LIST_HEAD(pgd_list);
 
 void vmalloc_sync_all(void)
 {
-#ifdef CONFIG_X86_32
-    /*
-     * Note that races in the updates of insync and start aren't
-     * problematic: insync can only get set bits added, and updates to
-     * start are only improving performance (without affecting correctness
-     * if undone).
-     */
-    static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-    static unsigned long start = TASK_SIZE;
     unsigned long address;
 
+#ifdef CONFIG_X86_32
     if (SHARED_KERNEL_PMD)
         return;
 
-    BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-    for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
-        if (!test_bit(pgd_index(address), insync)) {
-            unsigned long flags;
-            struct page *page;
-
-            spin_lock_irqsave(&pgd_lock, flags);
-            list_for_each_entry(page, &pgd_list, lru) {
-                if (!vmalloc_sync_one(page_address(page),
-                              address))
-                    break;
-            }
-            spin_unlock_irqrestore(&pgd_lock, flags);
-            if (!page)
-                set_bit(pgd_index(address), insync);
+    for (address = VMALLOC_START & PMD_MASK;
+         address >= TASK_SIZE && address < FIXADDR_TOP;
+         address += PMD_SIZE) {
+        unsigned long flags;
+        struct page *page;
+
+        spin_lock_irqsave(&pgd_lock, flags);
+        list_for_each_entry(page, &pgd_list, lru) {
+            if (!vmalloc_sync_one(page_address(page),
+                          address))
+                break;
         }
-        if (address == start && test_bit(pgd_index(address), insync))
-            start = address + PGDIR_SIZE;
+        spin_unlock_irqrestore(&pgd_lock, flags);
     }
 #else /* CONFIG_X86_64 */
-    /*
-     * Note that races in the updates of insync and start aren't
-     * problematic: insync can only get set bits added, and updates to
-     * start are only improving performance (without affecting correctness
-     * if undone).
-     */
-    static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-    static unsigned long start = VMALLOC_START & PGDIR_MASK;
-    unsigned long address;
-
-    for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
-        if (!test_bit(pgd_index(address), insync)) {
-            const pgd_t *pgd_ref = pgd_offset_k(address);
-            unsigned long flags;
-            struct page *page;
-
-            if (pgd_none(*pgd_ref))
-                continue;
-            spin_lock_irqsave(&pgd_lock, flags);
-            list_for_each_entry(page, &pgd_list, lru) {
-                pgd_t *pgd;
-                pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                if (pgd_none(*pgd))
-                    set_pgd(pgd, *pgd_ref);
-                else
-                    BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-            }
-            spin_unlock_irqrestore(&pgd_lock, flags);
-            set_bit(pgd_index(address), insync);
+    for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+         address += PGDIR_SIZE) {
+        const pgd_t *pgd_ref = pgd_offset_k(address);
+        unsigned long flags;
+        struct page *page;
+
+        if (pgd_none(*pgd_ref))
+            continue;
+        spin_lock_irqsave(&pgd_lock, flags);
+        list_for_each_entry(page, &pgd_list, lru) {
+            pgd_t *pgd;
+            pgd = (pgd_t *)page_address(page) + pgd_index(address);
+            if (pgd_none(*pgd))
+                set_pgd(pgd, *pgd_ref);
+            else
+                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
         }
-        if (address == start)
-            start = address + PGDIR_SIZE;
+        spin_unlock_irqrestore(&pgd_lock, flags);
     }
 #endif
 }
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..007bb06c7504
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+    return *ptep;
+#else
+    /*
+     * With get_user_pages_fast, we walk down the pagetables without taking
+     * any locks. For this we would like to load the pointers atoimcally,
+     * but that is not possible (without expensive cmpxchg8b) on PAE. What
+     * we do have is the guarantee that a pte will only either go from not
+     * present to present, or present to not present or both -- it will not
+     * switch to a completely different present page without a TLB flush in
+     * between; something that we are blocking by holding interrupts off.
+     *
+     * Setting ptes from not present to present goes:
+     * ptep->pte_high = h;
+     * smp_wmb();
+     * ptep->pte_low = l;
+     *
+     * And present to not present goes:
+     * ptep->pte_low = 0;
+     * smp_wmb();
+     * ptep->pte_high = 0;
+     *
+     * We must ensure here that the load of pte_low sees l iff pte_high
+     * sees h. We load pte_high *after* loading pte_low, which ensures we
+     * don't see an older value of pte_high. *Then* we recheck pte_low,
+     * which ensures that we haven't picked up a changed pte high. We might
+     * have got rubbish values from pte_low and pte_high, but we are
+     * guaranteed that pte_low will not have the present bit set *unless*
+     * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+     * we're safe.
+     *
+     * gup_get_pte should not be used or copied outside gup.c without being
+     * very careful -- it does not atomically load the pte or anything that
+     * is likely to be useful for you.
+     */
+    pte_t pte;
+
+retry:
+    pte.pte_low = ptep->pte_low;
+    smp_rmb();
+    pte.pte_high = ptep->pte_high;
+    smp_rmb();
+    if (unlikely(pte.pte_low != ptep->pte_low))
+        goto retry;
+
+    return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+        unsigned long end, int write, struct page **pages, int *nr)
+{
+    unsigned long mask;
+    pte_t *ptep;
+
+    mask = _PAGE_PRESENT|_PAGE_USER;
+    if (write)
+        mask |= _PAGE_RW;
+
+    ptep = pte_offset_map(&pmd, addr);
+    do {
+        pte_t pte = gup_get_pte(ptep);
+        struct page *page;
+
+        if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+            pte_unmap(ptep);
+            return 0;
+        }
+        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+        page = pte_page(pte);
+        get_page(page);
+        pages[*nr] = page;
+        (*nr)++;
+
+    } while (ptep++, addr += PAGE_SIZE, addr != end);
+    pte_unmap(ptep - 1);
+
+    return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+    VM_BUG_ON(page != compound_head(page));
+    VM_BUG_ON(page_count(page) == 0);
+    atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+        unsigned long end, int write, struct page **pages, int *nr)
+{
+    unsigned long mask;
+    pte_t pte = *(pte_t *)&pmd;
+    struct page *head, *page;
+    int refs;
+
+    mask = _PAGE_PRESENT|_PAGE_USER;
+    if (write)
+        mask |= _PAGE_RW;
+    if ((pte_val(pte) & mask) != mask)
+        return 0;
+    /* hugepages are never "special" */
+    VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+    VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+    refs = 0;
+    head = pte_page(pte);
+    page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+    do {
+        VM_BUG_ON(compound_head(page) != head);
+        pages[*nr] = page;
+        (*nr)++;
+        page++;
+        refs++;
+    } while (addr += PAGE_SIZE, addr != end);
+    get_head_page_multiple(head, refs);
+
+    return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+        int write, struct page **pages, int *nr)
+{
+    unsigned long next;
+    pmd_t *pmdp;
+
+    pmdp = pmd_offset(&pud, addr);
+    do {
+        pmd_t pmd = *pmdp;
+
+        next = pmd_addr_end(addr, end);
+        if (pmd_none(pmd))
+            return 0;
+        if (unlikely(pmd_large(pmd))) {
+            if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+                return 0;
+        } else {
+            if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+                return 0;
+        }
+    } while (pmdp++, addr = next, addr != end);
+
+    return 1;
+}
+
+static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
+        unsigned long end, int write, struct page **pages, int *nr)
+{
+    unsigned long mask;
+    pte_t pte = *(pte_t *)&pud;
+    struct page *head, *page;
+    int refs;
+
+    mask = _PAGE_PRESENT|_PAGE_USER;
+    if (write)
+        mask |= _PAGE_RW;
+    if ((pte_val(pte) & mask) != mask)
+        return 0;
+    /* hugepages are never "special" */
+    VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+    VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+    refs = 0;
+    head = pte_page(pte);
+    page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+    do {
+        VM_BUG_ON(compound_head(page) != head);
+        pages[*nr] = page;
+        (*nr)++;
+        page++;
+        refs++;
+    } while (addr += PAGE_SIZE, addr != end);
+    get_head_page_multiple(head, refs);
+
+    return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+            int write, struct page **pages, int *nr)
+{
+    unsigned long next;
+    pud_t *pudp;
+
+    pudp = pud_offset(&pgd, addr);
+    do {
+        pud_t pud = *pudp;
+
+        next = pud_addr_end(addr, end);
+        if (pud_none(pud))
+            return 0;
+        if (unlikely(pud_large(pud))) {
+            if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+                return 0;
+        } else {
+            if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+                return 0;
+        }
+    } while (pudp++, addr = next, addr != end);
+
+    return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+            struct page **pages)
+{
+    struct mm_struct *mm = current->mm;
+    unsigned long addr, len, end;
+    unsigned long next;
+    pgd_t *pgdp;
+    int nr = 0;
+
+    start &= PAGE_MASK;
+    addr = start;
+    len = (unsigned long) nr_pages << PAGE_SHIFT;
+    end = start + len;
+    if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+                    start, len)))
+        goto slow_irqon;
+
+    /*
+     * XXX: batch / limit 'nr', to avoid large irq off latency
+     * needs some instrumenting to determine the common sizes used by
+     * important workloads (eg. DB2), and whether limiting the batch size
+     * will decrease performance.
+     *
+     * It seems like we're in the clear for the moment. Direct-IO is
+     * the main guy that batches up lots of get_user_pages, and even
+     * they are limited to 64-at-a-time which is not so many.
+     */
+    /*
+     * This doesn't prevent pagetable teardown, but does prevent
+     * the pagetables and pages from being freed on x86.
+     *
+     * So long as we atomically load page table pointers versus teardown
+     * (which we do on x86, with the above PAE exception), we can follow the
+     * address down to the the page and take a ref on it.
+     */
+    local_irq_disable();
+    pgdp = pgd_offset(mm, addr);
+    do {
+        pgd_t pgd = *pgdp;
+
+        next = pgd_addr_end(addr, end);
+        if (pgd_none(pgd))
+            goto slow;
+        if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+            goto slow;
+    } while (pgdp++, addr = next, addr != end);
+    local_irq_enable();
+
+    VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+    return nr;
+
+    {
+        int ret;
+
+slow:
+        local_irq_enable();
+slow_irqon:
+        /* Try to get the remaining pages with get_user_pages */
+        start += nr << PAGE_SHIFT;
+        pages += nr;
+
+        down_read(&mm->mmap_sem);
+        ret = get_user_pages(current, mm, start,
+            (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+        up_read(&mm->mmap_sem);
+
+        /* Have to be a bit careful with return values */
+        if (nr > 0) {
+            if (ret < 0)
+                ret = nr;
+            else
+                ret += nr;
+        }
+
+        return ret;
+    }
+}
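For callers, the useful part of the new get_user_pages_fast() is its contract: it returns the number of pages actually pinned (possibly after falling back internally to the slow get_user_pages() path), and every returned page carries an extra reference that the caller must drop with put_page(). A hedged sketch of a typical caller follows; pin_user_buffer() and its policy of bailing out on a partial pin are made up for illustration, while get_user_pages_fast() and put_page() are real kernel interfaces.

#include <linux/mm.h>
#include <linux/errno.h>

/* Illustration only: pin the pages backing a user buffer before using it. */
static int pin_user_buffer(unsigned long uaddr, int nr_pages, struct page **pages)
{
    int i, got;

    got = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
    if (got < 0)
        return got;              /* nothing was pinned */
    if (got < nr_pages) {
        /* partial pin: drop what we got and let the caller decide */
        for (i = 0; i < got; i++)
            put_page(pages[i]);
        return -EFAULT;
    }
    return 0;                    /* caller must put_page() each page when done */
}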
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
     return 1;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+            unsigned long addr, unsigned long sz)
 {
     pgd_t *pgd;
     pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
     pgd = pgd_offset(mm, addr);
     pud = pud_alloc(mm, pgd, addr);
     if (pud) {
-        if (pud_none(*pud))
-            huge_pmd_share(mm, addr, pud);
-        pte = (pte_t *) pmd_alloc(mm, pud, addr);
+        if (sz == PUD_SIZE) {
+            pte = (pte_t *)pud;
+        } else {
+            BUG_ON(sz != PMD_SIZE);
+            if (pud_none(*pud))
+                huge_pmd_share(mm, addr, pud);
+            pte = (pte_t *) pmd_alloc(mm, pud, addr);
+        }
     }
     BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
     pgd = pgd_offset(mm, addr);
     if (pgd_present(*pgd)) {
         pud = pud_offset(pgd, addr);
-        if (pud_present(*pud))
+        if (pud_present(*pud)) {
+            if (pud_large(*pud))
+                return (pte_t *)pud;
             pmd = pmd_offset(pud, addr);
+        }
     }
     return (pte_t *) pmd;
 }
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
     return 0;
 }
 
+int pud_huge(pud_t pud)
+{
+    return 0;
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
         pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
     return !!(pmd_val(pmd) & _PAGE_PSE);
 }
 
+int pud_huge(pud_t pud)
+{
+    return !!(pud_val(pud) & _PAGE_PSE);
+}
+
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
         pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 
     page = pte_page(*(pte_t *)pmd);
     if (page)
-        page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
+        page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+    return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+    pud_t *pud, int write)
+{
+    struct page *page;
+
+    page = pte_page(*(pte_t *)pud);
+    if (page)
+        page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
     return page;
 }
+
 #endif
 
 /* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
         unsigned long addr, unsigned long len,
         unsigned long pgoff, unsigned long flags)
 {
+    struct hstate *h = hstate_file(file);
     struct mm_struct *mm = current->mm;
     struct vm_area_struct *vma;
     unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
     }
 
 full_search:
-    addr = ALIGN(start_addr, HPAGE_SIZE);
+    addr = ALIGN(start_addr, huge_page_size(h));
 
     for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
         /* At this point:  (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
         }
         if (addr + mm->cached_hole_size < vma->vm_start)
             mm->cached_hole_size = vma->vm_start - addr;
-        addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+        addr = ALIGN(vma->vm_end, huge_page_size(h));
     }
 }
 
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
         unsigned long addr0, unsigned long len,
         unsigned long pgoff, unsigned long flags)
 {
+    struct hstate *h = hstate_file(file);
     struct mm_struct *mm = current->mm;
     struct vm_area_struct *vma, *prev_vma;
     unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
     goto fail;
 
     /* either no address requested or cant fit in requested address hole */
-    addr = (mm->free_area_cache - len) & HPAGE_MASK;
+    addr = (mm->free_area_cache - len) & huge_page_mask(h);
     do {
         /*
          * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
             largest_hole = vma->vm_start - addr;
 
         /* try just below the current vma->vm_start */
-        addr = (vma->vm_start - len) & HPAGE_MASK;
+        addr = (vma->vm_start - len) & huge_page_mask(h);
     } while (len <= vma->vm_start);
 
 fail:
@@ -359,22 +393,23 @@ unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
         unsigned long len, unsigned long pgoff, unsigned long flags)
 {
+    struct hstate *h = hstate_file(file);
     struct mm_struct *mm = current->mm;
     struct vm_area_struct *vma;
 
-    if (len & ~HPAGE_MASK)
+    if (len & ~huge_page_mask(h))
         return -EINVAL;
     if (len > TASK_SIZE)
         return -ENOMEM;
 
     if (flags & MAP_FIXED) {
-        if (prepare_hugepage_range(addr, len))
+        if (prepare_hugepage_range(file, addr, len))
             return -EINVAL;
         return addr;
     }
 
     if (addr) {
-        addr = ALIGN(addr, HPAGE_SIZE);
+        addr = ALIGN(addr, huge_page_size(h));
         vma = find_vma(mm, addr);
         if (TASK_SIZE - len >= addr &&
             (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
 
+#ifdef CONFIG_X86_64
+static __init int setup_hugepagesz(char *opt)
+{
+    unsigned long ps = memparse(opt, &opt);
+    if (ps == PMD_SIZE) {
+        hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+    } else if (ps == PUD_SIZE && cpu_has_gbpages) {
+        hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+    } else {
+        printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
+            ps >> 20);
+        return 0;
+    }
+    return 1;
+}
+__setup("hugepagesz=", setup_hugepagesz);
+#endif
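The setup_hugepagesz() handler added at the end of hugetlbpage.c is what makes the hugepagesz= boot parameter work on x86_64: 2 MB (PMD_SIZE) pages are always accepted, while 1 GB (PUD_SIZE) pages are registered only when the CPU advertises gbpages. As an illustrative kernel command line (the sizes and counts are made up for the example):

    hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512

Each hugepagesz= selects the huge page pool that the following hugepages= count applies to; an unsupported size is rejected with the KERN_ERR message shown in the hunk.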
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ec30d10154b6..bbe044dbe014 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
 #include <linux/cpumask.h>
 
 #include <asm/asm.h>
+#include <asm/bios_ebda.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -47,9 +48,11 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/smp.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
+unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -57,6 +60,27 @@ unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
+
+static unsigned long __initdata table_start;
+static unsigned long __meminitdata table_end;
+static unsigned long __meminitdata table_top;
+
+static int __initdata after_init_bootmem;
+
+static __init void *alloc_low_page(unsigned long *phys)
+{
+    unsigned long pfn = table_end++;
+    void *adr;
+
+    if (pfn >= table_top)
+        panic("alloc_low_page: ran out of memory");
+
+    adr = __va(pfn * PAGE_SIZE);
+    memset(adr, 0, PAGE_SIZE);
+    *phys = pfn * PAGE_SIZE;
+    return adr;
+}
+
 /*
  * Creates a middle page table and puts a pointer to it in the
  * given global directory entry. This only returns the gd entry
@@ -68,9 +92,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
     pmd_t *pmd_table;
 
 #ifdef CONFIG_X86_PAE
+    unsigned long phys;
     if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-        pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-
+        if (after_init_bootmem)
+            pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+        else
+            pmd_table = (pmd_t *)alloc_low_page(&phys);
         paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
         set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
         pud = pud_offset(pgd, 0);
@@ -92,12 +119,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
     if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
         pte_t *page_table = NULL;
 
+        if (after_init_bootmem) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
         page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
-        if (!page_table) {
-            page_table =
-                (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+            if (!page_table)
+                page_table =
+                    (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+        } else {
+            unsigned long phys;
+            page_table = (pte_t *)alloc_low_page(&phys);
         }
 
         paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@ -155,40 +186,72 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
+                        unsigned long start_pfn,
+                        unsigned long end_pfn,
+                        int use_pse)
 {
     int pgd_idx, pmd_idx, pte_ofs;
     unsigned long pfn;
     pgd_t *pgd;
     pmd_t *pmd;
     pte_t *pte;
+    unsigned pages_2m, pages_4k;
+    int mapping_iter;
 
-    pgd_idx = pgd_index(PAGE_OFFSET);
-    pgd = pgd_base + pgd_idx;
-    pfn = 0;
+    /*
+     * First iteration will setup identity mapping using large/small pages
+     * based on use_pse, with other attributes same as set by
+     * the early code in head_32.S
+     *
+     * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+     * as desired for the kernel identity mapping.
+     *
+     * This two pass mechanism conforms to the TLB app note which says:
+     *
+     *     "Software should not write to a paging-structure entry in a way
+     *      that would change, for any linear address, both the page size
+     *      and either the page frame or attributes."
+     */
+    mapping_iter = 1;
 
+    if (!cpu_has_pse)
+        use_pse = 0;
+
+repeat:
+    pages_2m = pages_4k = 0;
+    pfn = start_pfn;
+    pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+    pgd = pgd_base + pgd_idx;
     for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
         pmd = one_md_table_init(pgd);
-        if (pfn >= max_low_pfn)
-            continue;
 
-        for (pmd_idx = 0;
-             pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+        if (pfn >= end_pfn)
+            continue;
+#ifdef CONFIG_X86_PAE
+        pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+        pmd += pmd_idx;
+#else
+        pmd_idx = 0;
+#endif
+        for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
              pmd++, pmd_idx++) {
             unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
 
             /*
              * Map with big pages if possible, otherwise
              * create normal page tables:
-             *
-             * Don't use a large page for the first 2/4MB of memory
-             * because there are often fixed size MTRRs in there
-             * and overlapping MTRRs into large pages can cause
-             * slowdowns.
              */
-            if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
+            if (use_pse) {
                 unsigned int addr2;
                 pgprot_t prot = PAGE_KERNEL_LARGE;
+                /*
+                 * first pass will use the same initial
+                 * identity mapping attribute + _PAGE_PSE.
+                 */
+                pgprot_t init_prot =
+                    __pgprot(PTE_IDENT_ATTR |
+                         _PAGE_PSE);
 
                 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
                     PAGE_OFFSET + PAGE_SIZE-1;
@@ -197,34 +260,59 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
197 is_kernel_text(addr2)) 260 is_kernel_text(addr2))
198 prot = PAGE_KERNEL_LARGE_EXEC; 261 prot = PAGE_KERNEL_LARGE_EXEC;
199 262
200 set_pmd(pmd, pfn_pmd(pfn, prot)); 263 pages_2m++;
264 if (mapping_iter == 1)
265 set_pmd(pmd, pfn_pmd(pfn, init_prot));
266 else
267 set_pmd(pmd, pfn_pmd(pfn, prot));
201 268
202 pfn += PTRS_PER_PTE; 269 pfn += PTRS_PER_PTE;
203 max_pfn_mapped = pfn;
204 continue; 270 continue;
205 } 271 }
206 pte = one_page_table_init(pmd); 272 pte = one_page_table_init(pmd);
207 273
208 for (pte_ofs = 0; 274 pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
209 pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; 275 pte += pte_ofs;
276 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
210 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 277 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
211 pgprot_t prot = PAGE_KERNEL; 278 pgprot_t prot = PAGE_KERNEL;
279 /*
280 * first pass will use the same initial
281 * identity mapping attribute.
282 */
283 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
212 284
213 if (is_kernel_text(addr)) 285 if (is_kernel_text(addr))
214 prot = PAGE_KERNEL_EXEC; 286 prot = PAGE_KERNEL_EXEC;
215 287
216 set_pte(pte, pfn_pte(pfn, prot)); 288 pages_4k++;
289 if (mapping_iter == 1)
290 set_pte(pte, pfn_pte(pfn, init_prot));
291 else
292 set_pte(pte, pfn_pte(pfn, prot));
217 } 293 }
218 max_pfn_mapped = pfn;
219 } 294 }
220 } 295 }
221} 296 if (mapping_iter == 1) {
297 /*
298 * update direct mapping page count only in the first
299 * iteration.
300 */
301 update_page_count(PG_LEVEL_2M, pages_2m);
302 update_page_count(PG_LEVEL_4K, pages_4k);
222 303
223static inline int page_kills_ppro(unsigned long pagenr) 304 /*
224{ 305 * global TLB flush on this CPU, which flushes the previous
225 if (pagenr >= 0x70000 && pagenr <= 0x7003F) 306 * mappings present in both the small and the large page TLBs.
226 return 1; 307 */
227 return 0; 308 __flush_tlb_all();
309
310 /*
311 * Second iteration will set the actual desired PTE attributes.
312 */
313 mapping_iter = 2;
314 goto repeat;
315 }
228} 316}
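
For illustration only, a minimal stand-alone sketch of the two-pass idea described in the comment above (plain user-space C; the attribute values and names are assumptions, not the kernel's): pass one writes every entry with the early attributes, everything is flushed once, then pass two rewrites the same entries with the final attributes, so no single store changes both the page size and the frame/attributes of a live translation.

/* Stand-alone sketch of the two-pass attribute update (hypothetical values). */
#include <stdio.h>

#define NENTRIES   4
#define EARLY_ATTR 0x063UL	/* assumed head_32.S-style early bits */
#define FINAL_ATTR 0x163UL	/* assumed final bits (e.g. GLOBAL added) */

static unsigned long table[NENTRIES];

static void flush_all(void)
{
	/* stands in for __flush_tlb_all() between the two passes */
	printf("-- flush --\n");
}

int main(void)
{
	int pass, i;

	for (pass = 1; pass <= 2; pass++) {
		unsigned long attr = (pass == 1) ? EARLY_ATTR : FINAL_ATTR;

		for (i = 0; i < NENTRIES; i++)
			table[i] = ((unsigned long)i << 12) | attr;
		printf("pass %d: entry0=%#lx\n", pass, table[0]);

		if (pass == 1)
			flush_all();	/* old translations gone before attributes change */
	}
	return 0;
}
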
229 317
230/* 318/*
@@ -287,29 +375,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
287 pkmap_page_table = pte; 375 pkmap_page_table = pte;
288} 376}
289 377
290void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) 378static void __init add_one_highpage_init(struct page *page, int pfn)
291{ 379{
292 if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { 380 ClearPageReserved(page);
293 ClearPageReserved(page); 381 init_page_count(page);
294 init_page_count(page); 382 __free_page(page);
295 __free_page(page); 383 totalhigh_pages++;
296 totalhigh_pages++;
297 } else
298 SetPageReserved(page);
299} 384}
300 385
301#ifndef CONFIG_NUMA 386struct add_highpages_data {
302static void __init set_highmem_pages_init(int bad_ppro) 387 unsigned long start_pfn;
388 unsigned long end_pfn;
389};
390
391static int __init add_highpages_work_fn(unsigned long start_pfn,
392 unsigned long end_pfn, void *datax)
303{ 393{
304 int pfn; 394 int node_pfn;
395 struct page *page;
396 unsigned long final_start_pfn, final_end_pfn;
397 struct add_highpages_data *data;
305 398
306 for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { 399 data = (struct add_highpages_data *)datax;
307 /* 400
308 * Holes under sparsemem might not have no mem_map[]: 401 final_start_pfn = max(start_pfn, data->start_pfn);
309 */ 402 final_end_pfn = min(end_pfn, data->end_pfn);
310 if (pfn_valid(pfn)) 403 if (final_start_pfn >= final_end_pfn)
311 add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); 404 return 0;
405
406 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
407 node_pfn++) {
408 if (!pfn_valid(node_pfn))
409 continue;
410 page = pfn_to_page(node_pfn);
411 add_one_highpage_init(page, node_pfn);
312 } 412 }
413
414 return 0;
415
416}
417
418void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
419 unsigned long end_pfn)
420{
421 struct add_highpages_data data;
422
423 data.start_pfn = start_pfn;
424 data.end_pfn = end_pfn;
425
426 work_with_active_regions(nid, add_highpages_work_fn, &data);
427}
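
The work function above only touches the part of each active region that overlaps the requested pfn window. A minimal sketch of that clamping pattern (user-space C, hypothetical names):

/* Sketch of clamping an incoming range to a requested pfn window (hypothetical). */
#include <stdio.h>

struct window {
	unsigned long start_pfn;
	unsigned long end_pfn;
};

static int visit_range(unsigned long start, unsigned long end, void *datax)
{
	struct window *w = datax;
	unsigned long s = start > w->start_pfn ? start : w->start_pfn;
	unsigned long e = end < w->end_pfn ? end : w->end_pfn;

	if (s >= e)
		return 0;		/* no overlap with the window */
	printf("would init pfns %lu-%lu\n", s, e);
	return 0;
}

int main(void)
{
	struct window w = { 100, 200 };

	visit_range(50, 150, &w);	/* clipped to 100-150 */
	visit_range(300, 400, &w);	/* skipped entirely */
	return 0;
}
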
428
429#ifndef CONFIG_NUMA
430static void __init set_highmem_pages_init(void)
431{
432 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
433
313 totalram_pages += totalhigh_pages; 434 totalram_pages += totalhigh_pages;
314} 435}
315#endif /* !CONFIG_NUMA */ 436#endif /* !CONFIG_NUMA */
@@ -317,14 +438,9 @@ static void __init set_highmem_pages_init(int bad_ppro)
317#else 438#else
318# define kmap_init() do { } while (0) 439# define kmap_init() do { } while (0)
319# define permanent_kmaps_init(pgd_base) do { } while (0) 440# define permanent_kmaps_init(pgd_base) do { } while (0)
320# define set_highmem_pages_init(bad_ppro) do { } while (0) 441# define set_highmem_pages_init() do { } while (0)
321#endif /* CONFIG_HIGHMEM */ 442#endif /* CONFIG_HIGHMEM */
322 443
323pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
324EXPORT_SYMBOL(__PAGE_KERNEL);
325
326pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
327
328void __init native_pagetable_setup_start(pgd_t *base) 444void __init native_pagetable_setup_start(pgd_t *base)
329{ 445{
330 unsigned long pfn, va; 446 unsigned long pfn, va;
@@ -380,27 +496,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
380 * be partially populated, and so it avoids stomping on any existing 496 * be partially populated, and so it avoids stomping on any existing
381 * mappings. 497 * mappings.
382 */ 498 */
383static void __init pagetable_init(void) 499static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
384{ 500{
385 pgd_t *pgd_base = swapper_pg_dir;
386 unsigned long vaddr, end; 501 unsigned long vaddr, end;
387 502
388 paravirt_pagetable_setup_start(pgd_base);
389
390 /* Enable PSE if available */
391 if (cpu_has_pse)
392 set_in_cr4(X86_CR4_PSE);
393
394 /* Enable PGE if available */
395 if (cpu_has_pge) {
396 set_in_cr4(X86_CR4_PGE);
397 __PAGE_KERNEL |= _PAGE_GLOBAL;
398 __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
399 }
400
401 kernel_physical_mapping_init(pgd_base);
402 remap_numa_kva();
403
404 /* 503 /*
405 * Fixed mappings, only the page table structure has to be 504 * Fixed mappings, only the page table structure has to be
406 * created - mappings will be set by set_fixmap(): 505 * created - mappings will be set by set_fixmap():
@@ -410,10 +509,13 @@ static void __init pagetable_init(void)
410 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; 509 end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
411 page_table_range_init(vaddr, end, pgd_base); 510 page_table_range_init(vaddr, end, pgd_base);
412 early_ioremap_reset(); 511 early_ioremap_reset();
512}
413 513
414 permanent_kmaps_init(pgd_base); 514static void __init pagetable_init(void)
515{
516 pgd_t *pgd_base = swapper_pg_dir;
415 517
416 paravirt_pagetable_setup_done(pgd_base); 518 permanent_kmaps_init(pgd_base);
417} 519}
418 520
419#ifdef CONFIG_ACPI_SLEEP 521#ifdef CONFIG_ACPI_SLEEP
@@ -456,7 +558,7 @@ void zap_low_mappings(void)
456 558
457int nx_enabled; 559int nx_enabled;
458 560
459pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; 561pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
460EXPORT_SYMBOL_GPL(__supported_pte_mask); 562EXPORT_SYMBOL_GPL(__supported_pte_mask);
461 563
462#ifdef CONFIG_X86_PAE 564#ifdef CONFIG_X86_PAE
@@ -509,27 +611,329 @@ static void __init set_nx(void)
509} 611}
510#endif 612#endif
511 613
614/* user-defined highmem size */
615static unsigned int highmem_pages = -1;
616
512/* 617/*
513 * paging_init() sets up the page tables - note that the first 8MB are 618 * highmem=size forces highmem to be exactly 'size' bytes.
514 * already mapped by head.S. 619 * This works even on boxes that have no highmem otherwise.
515 * 620 * This also works to reduce highmem size on bigger boxes.
516 * This routines also unmaps the page at virtual kernel address 0, so
517 * that we can trap those pesky NULL-reference errors in the kernel.
518 */ 621 */
519void __init paging_init(void) 622static int __init parse_highmem(char *arg)
623{
624 if (!arg)
625 return -EINVAL;
626
627 highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
628 return 0;
629}
630early_param("highmem", parse_highmem);
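
For illustration, a user-space sketch of how a highmem=512M style argument becomes a page count; the suffix handling below is an assumption standing in for memparse():

/* User-space sketch of memparse-style size parsing (assumed behaviour). */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

static unsigned long parse_size(const char *s)
{
	char *end;
	unsigned long val = strtoul(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': val <<= 30; break;
	case 'M': case 'm': val <<= 20; break;
	case 'K': case 'k': val <<= 10; break;
	}
	return val;
}

int main(void)
{
	unsigned long highmem_pages = parse_size("512M") >> PAGE_SHIFT;

	printf("highmem=512M -> %lu pages\n", highmem_pages);	/* 131072 */
	return 0;
}
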
631
632/*
633 * Determine low and high memory ranges:
634 */
635void __init find_low_pfn_range(void)
520{ 636{
637 /* it could update max_pfn */
638
639 /* max_low_pfn is 0, we already have early_res support */
640
641 max_low_pfn = max_pfn;
642 if (max_low_pfn > MAXMEM_PFN) {
643 if (highmem_pages == -1)
644 highmem_pages = max_pfn - MAXMEM_PFN;
645 if (highmem_pages + MAXMEM_PFN < max_pfn)
646 max_pfn = MAXMEM_PFN + highmem_pages;
647 if (highmem_pages + MAXMEM_PFN > max_pfn) {
648 printk(KERN_WARNING "only %luMB highmem pages "
649 "available, ignoring highmem size of %uMB.\n",
650 pages_to_mb(max_pfn - MAXMEM_PFN),
651 pages_to_mb(highmem_pages));
652 highmem_pages = 0;
653 }
654 max_low_pfn = MAXMEM_PFN;
655#ifndef CONFIG_HIGHMEM
656 /* Maximum memory usable is what is directly addressable */
657 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
658 MAXMEM>>20);
659 if (max_pfn > MAX_NONPAE_PFN)
660 printk(KERN_WARNING
661 "Use a HIGHMEM64G enabled kernel.\n");
662 else
663 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
664 max_pfn = MAXMEM_PFN;
665#else /* !CONFIG_HIGHMEM */
666#ifndef CONFIG_HIGHMEM64G
667 if (max_pfn > MAX_NONPAE_PFN) {
668 max_pfn = MAX_NONPAE_PFN;
669 printk(KERN_WARNING "Warning only 4GB will be used."
670 "Use a HIGHMEM64G enabled kernel.\n");
671 }
672#endif /* !CONFIG_HIGHMEM64G */
673#endif /* !CONFIG_HIGHMEM */
674 } else {
675 if (highmem_pages == -1)
676 highmem_pages = 0;
677#ifdef CONFIG_HIGHMEM
678 if (highmem_pages >= max_pfn) {
679 printk(KERN_ERR "highmem size specified (%uMB) is "
680 "bigger than pages available (%luMB)!.\n",
681 pages_to_mb(highmem_pages),
682 pages_to_mb(max_pfn));
683 highmem_pages = 0;
684 }
685 if (highmem_pages) {
686 if (max_low_pfn - highmem_pages <
687 64*1024*1024/PAGE_SIZE){
688 printk(KERN_ERR "highmem size %uMB results in "
689 "smaller than 64MB lowmem, ignoring it.\n"
690 , pages_to_mb(highmem_pages));
691 highmem_pages = 0;
692 }
693 max_low_pfn -= highmem_pages;
694 }
695#else
696 if (highmem_pages)
697 printk(KERN_ERR "ignoring highmem size on non-highmem"
698 " kernel!\n");
699#endif
700 }
701}
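
A rough stand-alone sketch of the split computed above, with MAXMEM_PFN assumed to correspond to about 896 MB of lowmem (the real value depends on the configuration):

/* Sketch of the lowmem/highmem split (assumed constants, not kernel values). */
#include <stdio.h>

#define PAGE_SHIFT 12
#define MAXMEM_PFN (896UL << (20 - PAGE_SHIFT))	/* assume ~896 MB of lowmem */

int main(void)
{
	unsigned long max_pfn = 2048UL << (20 - PAGE_SHIFT);	/* a 2 GB box */
	unsigned long highmem_pages = (unsigned long)-1;	/* "not given on the command line" */
	unsigned long max_low_pfn = max_pfn;

	if (max_low_pfn > MAXMEM_PFN) {
		if (highmem_pages == (unsigned long)-1)
			highmem_pages = max_pfn - MAXMEM_PFN;
		max_low_pfn = MAXMEM_PFN;
	}
	printf("lowmem %lu MB, highmem %lu MB\n",
	       max_low_pfn >> (20 - PAGE_SHIFT),
	       highmem_pages >> (20 - PAGE_SHIFT));	/* 896 MB / 1152 MB */
	return 0;
}
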
702
703#ifndef CONFIG_NEED_MULTIPLE_NODES
704void __init initmem_init(unsigned long start_pfn,
705 unsigned long end_pfn)
706{
707#ifdef CONFIG_HIGHMEM
708 highstart_pfn = highend_pfn = max_pfn;
709 if (max_pfn > max_low_pfn)
710 highstart_pfn = max_low_pfn;
711 memory_present(0, 0, highend_pfn);
712 e820_register_active_regions(0, 0, highend_pfn);
713 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
714 pages_to_mb(highend_pfn - highstart_pfn));
715 num_physpages = highend_pfn;
716 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
717#else
718 memory_present(0, 0, max_low_pfn);
719 e820_register_active_regions(0, 0, max_low_pfn);
720 num_physpages = max_low_pfn;
721 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
722#endif
723#ifdef CONFIG_FLATMEM
724 max_mapnr = num_physpages;
725#endif
726 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
727 pages_to_mb(max_low_pfn));
728
729 setup_bootmem_allocator();
730}
731#endif /* !CONFIG_NEED_MULTIPLE_NODES */
732
733static void __init zone_sizes_init(void)
734{
735 unsigned long max_zone_pfns[MAX_NR_ZONES];
736 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
737 max_zone_pfns[ZONE_DMA] =
738 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
739 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
740#ifdef CONFIG_HIGHMEM
741 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
742#endif
743
744 free_area_init_nodes(max_zone_pfns);
745}
746
747void __init setup_bootmem_allocator(void)
748{
749 int i;
750 unsigned long bootmap_size, bootmap;
751 /*
752 * Initialize the boot-time allocator (with low memory only):
753 */
754 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
755 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
756 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
757 PAGE_SIZE);
758 if (bootmap == -1L)
759 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
760 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
761
762 /* don't touch min_low_pfn */
763 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
764 min_low_pfn, max_low_pfn);
765 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
766 max_pfn_mapped<<PAGE_SHIFT);
767 printk(KERN_INFO " low ram: %08lx - %08lx\n",
768 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
769 printk(KERN_INFO " bootmap %08lx - %08lx\n",
770 bootmap, bootmap + bootmap_size);
771 for_each_online_node(i)
772 free_bootmem_with_active_regions(i, max_low_pfn);
773 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
774
775 after_init_bootmem = 1;
776}
777
778static void __init find_early_table_space(unsigned long end, int use_pse)
779{
780 unsigned long puds, pmds, ptes, tables, start;
781
782 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
783 tables = PAGE_ALIGN(puds * sizeof(pud_t));
784
785 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
786 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
787
788 if (use_pse) {
789 unsigned long extra;
790
791 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
792 extra += PMD_SIZE;
793 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
794 } else
795 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
796
797 tables += PAGE_ALIGN(ptes * sizeof(pte_t));
798
799 /* for fixmap */
800 tables += PAGE_SIZE * 2;
801
802 /*
803 * RED-PEN putting page tables only on node 0 could
804 * cause a hotspot and fill up ZONE_DMA. The page tables
805 * need roughly 0.5KB per GB.
806 */
807 start = 0x7000;
808 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
809 tables, PAGE_SIZE);
810 if (table_start == -1UL)
811 panic("Cannot find space for the kernel page tables");
812
813 table_start >>= PAGE_SHIFT;
814 table_end = table_start;
815 table_top = table_start + (tables>>PAGE_SHIFT);
816
817 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
818 end, table_start << PAGE_SHIFT,
819 (table_start << PAGE_SHIFT) + tables);
820}
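
The estimate above can be reproduced with a short sketch; the entry size and page sizes below are assumptions chosen only to show the arithmetic:

/* Sketch of the early page-table space estimate (assumed 4 KB pages, 8-byte entries). */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PMD_SIZE  (2UL << 20)			/* assume 2 MB per pmd entry */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long end = 768UL << 20;	/* map the first 768 MB */
	int use_pse = 0;			/* worst case: 4 KB mappings only */
	unsigned long pmds, ptes, tables;

	pmds = (end + PMD_SIZE - 1) / PMD_SIZE;
	ptes = use_pse ? (PMD_SIZE / PAGE_SIZE)	/* roughly: only the unaligned tail */
		       : (end + PAGE_SIZE - 1) / PAGE_SIZE;

	tables  = ALIGN_UP(pmds * 8, PAGE_SIZE);	/* pmd entries */
	tables += ALIGN_UP(ptes * 8, PAGE_SIZE);	/* pte entries */
	tables += PAGE_SIZE * 2;			/* fixmap, as above */

	printf("need about %lu KB of early page tables\n", tables >> 10);
	return 0;
}
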
821
822unsigned long __init_refok init_memory_mapping(unsigned long start,
823 unsigned long end)
824{
825 pgd_t *pgd_base = swapper_pg_dir;
826 unsigned long start_pfn, end_pfn;
827 unsigned long big_page_start;
828#ifdef CONFIG_DEBUG_PAGEALLOC
829 /*
830 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
831 * This will simplify cpa(), which otherwise needs to support splitting
832 * large pages into small in interrupt context, etc.
833 */
834 int use_pse = 0;
835#else
836 int use_pse = cpu_has_pse;
837#endif
838
839 /*
840 * Find space for the kernel direct mapping tables.
841 */
842 if (!after_init_bootmem)
843 find_early_table_space(end, use_pse);
844
521#ifdef CONFIG_X86_PAE 845#ifdef CONFIG_X86_PAE
522 set_nx(); 846 set_nx();
523 if (nx_enabled) 847 if (nx_enabled)
524 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 848 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
525#endif 849#endif
526 pagetable_init(); 850
851 /* Enable PSE if available */
852 if (cpu_has_pse)
853 set_in_cr4(X86_CR4_PSE);
854
855 /* Enable PGE if available */
856 if (cpu_has_pge) {
857 set_in_cr4(X86_CR4_PGE);
858 __supported_pte_mask |= _PAGE_GLOBAL;
859 }
860
861 /*
862 * Don't use a large page for the first 2/4MB of memory
863 * because there are often fixed size MTRRs in there
864 * and overlapping MTRRs into large pages can cause
865 * slowdowns.
866 */
867 big_page_start = PMD_SIZE;
868
869 if (start < big_page_start) {
870 start_pfn = start >> PAGE_SHIFT;
871 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
872 } else {
 873 /* head is not big-page aligned? */
874 start_pfn = start >> PAGE_SHIFT;
875 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
876 << (PMD_SHIFT - PAGE_SHIFT);
877 }
878 if (start_pfn < end_pfn)
879 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
880
881 /* big page range */
882 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
883 << (PMD_SHIFT - PAGE_SHIFT);
884 if (start_pfn < (big_page_start >> PAGE_SHIFT))
885 start_pfn = big_page_start >> PAGE_SHIFT;
886 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
887 if (start_pfn < end_pfn)
888 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
889 use_pse);
890
 891 /* tail is not big-page aligned? */
892 start_pfn = end_pfn;
893 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
894 end_pfn = end >> PAGE_SHIFT;
895 if (start_pfn < end_pfn)
896 kernel_physical_mapping_init(pgd_base, start_pfn,
897 end_pfn, 0);
898 }
899
900 early_ioremap_page_table_range_init(pgd_base);
527 901
528 load_cr3(swapper_pg_dir); 902 load_cr3(swapper_pg_dir);
529 903
530 __flush_tlb_all(); 904 __flush_tlb_all();
531 905
906 if (!after_init_bootmem)
907 reserve_early(table_start << PAGE_SHIFT,
908 table_end << PAGE_SHIFT, "PGTABLE");
909
910 if (!after_init_bootmem)
911 early_memtest(start, end);
912
913 return end >> PAGE_SHIFT;
914}
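
The carve-up performed above (4 KB pages up to the first large-page boundary, large pages for the aligned middle, 4 KB pages again for the unaligned tail) can be shown with a tiny sketch, assuming 2 MB large pages and hypothetical addresses:

/* Sketch of splitting [start, end) into head / large-page middle / tail. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21				/* assume 2 MB large pages */
#define PMD_SIZE   (1UL << PMD_SHIFT)

static void map(unsigned long s_pfn, unsigned long e_pfn, int big)
{
	if (s_pfn < e_pfn)
		printf("%s pages: pfn %lu - %lu\n", big ? "2M" : "4k", s_pfn, e_pfn);
}

int main(void)
{
	unsigned long start = 0x00100000UL;	/* 1 MB, not 2 MB aligned */
	unsigned long end   = 0x1fff0000UL;	/* ~512 MB, unaligned tail */
	unsigned long head_end, big_end;

	head_end = ((start + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	big_end  = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);

	map(start >> PAGE_SHIFT, head_end, 0);	/* head, small pages */
	map(head_end, big_end, 1);		/* aligned middle, large pages */
	map(big_end, end >> PAGE_SHIFT, 0);	/* tail, small pages */
	return 0;
}
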
915
916
917/*
918 * paging_init() sets up the page tables - note that the first 8MB are
919 * already mapped by head.S.
920 *
 921 * This routine also unmaps the page at virtual kernel address 0, so
922 * that we can trap those pesky NULL-reference errors in the kernel.
923 */
924void __init paging_init(void)
925{
926 pagetable_init();
927
928 __flush_tlb_all();
929
532 kmap_init(); 930 kmap_init();
931
932 /*
933 * NOTE: at this point the bootmem allocator is fully available.
934 */
935 sparse_init();
936 zone_sizes_init();
533} 937}
534 938
535/* 939/*
@@ -564,24 +968,13 @@ static struct kcore_list kcore_mem, kcore_vmalloc;
564void __init mem_init(void) 968void __init mem_init(void)
565{ 969{
566 int codesize, reservedpages, datasize, initsize; 970 int codesize, reservedpages, datasize, initsize;
567 int tmp, bad_ppro; 971 int tmp;
972
973 start_periodic_check_for_corruption();
568 974
569#ifdef CONFIG_FLATMEM 975#ifdef CONFIG_FLATMEM
570 BUG_ON(!mem_map); 976 BUG_ON(!mem_map);
571#endif 977#endif
572 bad_ppro = ppro_with_ram_bug();
573
574#ifdef CONFIG_HIGHMEM
575 /* check that fixmap and pkmap do not overlap */
576 if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
577 printk(KERN_ERR
578 "fixmap and kmap areas overlap - this will crash\n");
579 printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
580 PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
581 FIXADDR_START);
582 BUG();
583 }
584#endif
585 /* this will put all low memory onto the freelists */ 978 /* this will put all low memory onto the freelists */
586 totalram_pages += free_all_bootmem(); 979 totalram_pages += free_all_bootmem();
587 980
@@ -593,7 +986,7 @@ void __init mem_init(void)
593 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 986 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
594 reservedpages++; 987 reservedpages++;
595 988
596 set_highmem_pages_init(bad_ppro); 989 set_highmem_pages_init();
597 990
598 codesize = (unsigned long) &_etext - (unsigned long) &_text; 991 codesize = (unsigned long) &_etext - (unsigned long) &_text;
599 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 992 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
@@ -614,7 +1007,6 @@ void __init mem_init(void)
614 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 1007 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
615 ); 1008 );
616 1009
617#if 1 /* double-sanity-check paranoia */
618 printk(KERN_INFO "virtual kernel memory layout:\n" 1010 printk(KERN_INFO "virtual kernel memory layout:\n"
619 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 1011 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
620#ifdef CONFIG_HIGHMEM 1012#ifdef CONFIG_HIGHMEM
@@ -655,12 +1047,10 @@ void __init mem_init(void)
655#endif 1047#endif
656 BUG_ON(VMALLOC_START > VMALLOC_END); 1048 BUG_ON(VMALLOC_START > VMALLOC_END);
657 BUG_ON((unsigned long)high_memory > VMALLOC_START); 1049 BUG_ON((unsigned long)high_memory > VMALLOC_START);
658#endif /* double-sanity-check paranoia */
659 1050
660 if (boot_cpu_data.wp_works_ok < 0) 1051 if (boot_cpu_data.wp_works_ok < 0)
661 test_wp_bit(); 1052 test_wp_bit();
662 1053
663 cpa_init();
664 save_pg_dir(); 1054 save_pg_dir();
665 zap_low_mappings(); 1055 zap_low_mappings();
666} 1056}
@@ -710,6 +1100,8 @@ void mark_rodata_ro(void)
710 unsigned long start = PFN_ALIGN(_text); 1100 unsigned long start = PFN_ALIGN(_text);
711 unsigned long size = PFN_ALIGN(_etext) - start; 1101 unsigned long size = PFN_ALIGN(_etext) - start;
712 1102
1103#ifndef CONFIG_DYNAMIC_FTRACE
1104 /* Dynamic tracing modifies the kernel text section */
713 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1105 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
714 printk(KERN_INFO "Write protecting the kernel text: %luk\n", 1106 printk(KERN_INFO "Write protecting the kernel text: %luk\n",
715 size >> 10); 1107 size >> 10);
@@ -722,6 +1114,8 @@ void mark_rodata_ro(void)
722 printk(KERN_INFO "Testing CPA: write protecting again\n"); 1114 printk(KERN_INFO "Testing CPA: write protecting again\n");
723 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); 1115 set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
724#endif 1116#endif
1117#endif /* CONFIG_DYNAMIC_FTRACE */
1118
725 start += size; 1119 start += size;
726 size = (unsigned long)__end_rodata - start; 1120 size = (unsigned long)__end_rodata - start;
727 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 1121 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
@@ -784,3 +1178,9 @@ void free_initrd_mem(unsigned long start, unsigned long end)
784 free_init_pages("initrd memory", start, end); 1178 free_init_pages("initrd memory", start, end);
785} 1179}
786#endif 1180#endif
1181
1182int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1183 int flags)
1184{
1185 return reserve_bootmem(phys, len, flags);
1186}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 156e6d7b0e32..3e10054c5731 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -18,6 +18,7 @@
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/initrd.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/bootmem.h> 23#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 24#include <linux/proc_fs.h>
@@ -30,6 +31,7 @@
30#include <linux/nmi.h> 31#include <linux/nmi.h>
31 32
32#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/bios_ebda.h>
33#include <asm/system.h> 35#include <asm/system.h>
34#include <asm/uaccess.h> 36#include <asm/uaccess.h>
35#include <asm/pgtable.h> 37#include <asm/pgtable.h>
@@ -47,11 +49,19 @@
47#include <asm/numa.h> 49#include <asm/numa.h>
48#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
49 51
52/*
53 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
54 * The direct mapping extends to max_pfn_mapped, so that we can directly access
55 * apertures, ACPI and other tables without having to play with fixmaps.
56 */
57unsigned long max_low_pfn_mapped;
58unsigned long max_pfn_mapped;
59
50static unsigned long dma_reserve __initdata; 60static unsigned long dma_reserve __initdata;
51 61
52DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
53 63
54int direct_gbpages __meminitdata 64int direct_gbpages
55#ifdef CONFIG_DIRECT_GBPAGES 65#ifdef CONFIG_DIRECT_GBPAGES
56 = 1 66 = 1
57#endif 67#endif
@@ -77,46 +87,69 @@ early_param("gbpages", parse_direct_gbpages_on);
77 * around without checking the pgd every time. 87 * around without checking the pgd every time.
78 */ 88 */
79 89
80void show_mem(void) 90int after_bootmem;
81{
82 long i, total = 0, reserved = 0;
83 long shared = 0, cached = 0;
84 struct page *page;
85 pg_data_t *pgdat;
86 91
87 printk(KERN_INFO "Mem-info:\n"); 92unsigned long __supported_pte_mask __read_mostly = ~0UL;
88 show_free_areas(); 93EXPORT_SYMBOL_GPL(__supported_pte_mask);
89 for_each_online_pgdat(pgdat) {
90 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
91 /*
92 * This loop can take a while with 256 GB and
93 * 4k pages so defer the NMI watchdog:
94 */
95 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
96 touch_nmi_watchdog();
97 94
98 if (!pfn_valid(pgdat->node_start_pfn + i)) 95static int do_not_nx __cpuinitdata;
99 continue;
100 96
101 page = pfn_to_page(pgdat->node_start_pfn + i); 97/*
102 total++; 98 * noexec=on|off
103 if (PageReserved(page)) 99 * Control non-executable mappings for 64-bit processes.
104 reserved++; 100 *
105 else if (PageSwapCache(page)) 101 * on Enable (default)
106 cached++; 102 * off Disable
107 else if (page_count(page)) 103 */
108 shared += page_count(page) - 1; 104static int __init nonx_setup(char *str)
109 } 105{
106 if (!str)
107 return -EINVAL;
108 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0;
111 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX;
110 } 114 }
111 printk(KERN_INFO "%lu pages of RAM\n", total); 115 return 0;
112 printk(KERN_INFO "%lu reserved pages\n", reserved);
113 printk(KERN_INFO "%lu pages shared\n", shared);
114 printk(KERN_INFO "%lu pages swap cached\n", cached);
115} 116}
117early_param("noexec", nonx_setup);
116 118
117int after_bootmem; 119void __cpuinit check_efer(void)
120{
121 unsigned long efer;
122
123 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx)
125 __supported_pte_mask &= ~_PAGE_NX;
126}
118 127
119static __init void *spp_getpage(void) 128int force_personality32;
129
130/*
131 * noexec32=on|off
 132 * Control non-executable heap for 32-bit processes.
133 * To control the stack too use noexec=off
134 *
135 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
136 * off PROT_READ implies PROT_EXEC
137 */
138static int __init nonx32_setup(char *str)
139{
140 if (!strcmp(str, "on"))
141 force_personality32 &= ~READ_IMPLIES_EXEC;
142 else if (!strcmp(str, "off"))
143 force_personality32 |= READ_IMPLIES_EXEC;
144 return 1;
145}
146__setup("noexec32=", nonx32_setup);
147
148/*
149 * NOTE: This function is marked __ref because it calls __init function
150 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
151 */
152static __ref void *spp_getpage(void)
120{ 153{
121 void *ptr; 154 void *ptr;
122 155
@@ -135,26 +168,17 @@ static __init void *spp_getpage(void)
135 return ptr; 168 return ptr;
136} 169}
137 170
138static void 171void
139set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) 172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
140{ 173{
141 pgd_t *pgd;
142 pud_t *pud; 174 pud_t *pud;
143 pmd_t *pmd; 175 pmd_t *pmd;
144 pte_t *pte, new_pte; 176 pte_t *pte;
145
146 pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
147 177
148 pgd = pgd_offset_k(vaddr); 178 pud = pud_page + pud_index(vaddr);
149 if (pgd_none(*pgd)) {
150 printk(KERN_ERR
151 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
152 return;
153 }
154 pud = pud_offset(pgd, vaddr);
155 if (pud_none(*pud)) { 179 if (pud_none(*pud)) {
156 pmd = (pmd_t *) spp_getpage(); 180 pmd = (pmd_t *) spp_getpage();
157 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); 181 pud_populate(&init_mm, pud, pmd);
158 if (pmd != pmd_offset(pud, 0)) { 182 if (pmd != pmd_offset(pud, 0)) {
159 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
160 pmd, pmd_offset(pud, 0)); 184 pmd, pmd_offset(pud, 0));
@@ -164,13 +188,12 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
164 pmd = pmd_offset(pud, vaddr); 188 pmd = pmd_offset(pud, vaddr);
165 if (pmd_none(*pmd)) { 189 if (pmd_none(*pmd)) {
166 pte = (pte_t *) spp_getpage(); 190 pte = (pte_t *) spp_getpage();
167 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); 191 pmd_populate_kernel(&init_mm, pmd, pte);
168 if (pte != pte_offset_kernel(pmd, 0)) { 192 if (pte != pte_offset_kernel(pmd, 0)) {
169 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 193 printk(KERN_ERR "PAGETABLE BUG #02!\n");
170 return; 194 return;
171 } 195 }
172 } 196 }
173 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
174 197
175 pte = pte_offset_kernel(pmd, vaddr); 198 pte = pte_offset_kernel(pmd, vaddr);
176 if (!pte_none(*pte) && pte_val(new_pte) && 199 if (!pte_none(*pte) && pte_val(new_pte) &&
@@ -185,6 +208,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
185 __flush_tlb_one(vaddr); 208 __flush_tlb_one(vaddr);
186} 209}
187 210
211void
212set_pte_vaddr(unsigned long vaddr, pte_t pteval)
213{
214 pgd_t *pgd;
215 pud_t *pud_page;
216
217 pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
218
219 pgd = pgd_offset_k(vaddr);
220 if (pgd_none(*pgd)) {
221 printk(KERN_ERR
222 "PGD FIXMAP MISSING, it should be setup in head.S!\n");
223 return;
224 }
225 pud_page = (pud_t*)pgd_page_vaddr(*pgd);
226 set_pte_vaddr_pud(pud_page, vaddr, pteval);
227}
228
229/*
230 * Create large page table mappings for a range of physical addresses.
231 */
232static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
233 pgprot_t prot)
234{
235 pgd_t *pgd;
236 pud_t *pud;
237 pmd_t *pmd;
238
239 BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
240 for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
241 pgd = pgd_offset_k((unsigned long)__va(phys));
242 if (pgd_none(*pgd)) {
243 pud = (pud_t *) spp_getpage();
244 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
245 _PAGE_USER));
246 }
247 pud = pud_offset(pgd, (unsigned long)__va(phys));
248 if (pud_none(*pud)) {
249 pmd = (pmd_t *) spp_getpage();
250 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
251 _PAGE_USER));
252 }
253 pmd = pmd_offset(pud, phys);
254 BUG_ON(!pmd_none(*pmd));
255 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
256 }
257}
258
259void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
260{
261 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
262}
263
264void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
265{
266 __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
267}
268
188/* 269/*
189 * The head.S code sets up the kernel high mapping: 270 * The head.S code sets up the kernel high mapping:
190 * 271 *
@@ -201,7 +282,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
201void __init cleanup_highmap(void) 282void __init cleanup_highmap(void)
202{ 283{
203 unsigned long vaddr = __START_KERNEL_map; 284 unsigned long vaddr = __START_KERNEL_map;
204 unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; 285 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
205 pmd_t *pmd = level2_kernel_pgt; 286 pmd_t *pmd = level2_kernel_pgt;
206 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 287 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
207 288
@@ -213,22 +294,11 @@ void __init cleanup_highmap(void)
213 } 294 }
214} 295}
215 296
216/* NOTE: this is meant to be run only at boot */
217void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
218{
219 unsigned long address = __fix_to_virt(idx);
220
221 if (idx >= __end_of_fixed_addresses) {
222 printk(KERN_ERR "Invalid __set_fixmap\n");
223 return;
224 }
225 set_pte_phys(address, phys, prot);
226}
227
228static unsigned long __initdata table_start; 297static unsigned long __initdata table_start;
229static unsigned long __meminitdata table_end; 298static unsigned long __meminitdata table_end;
299static unsigned long __meminitdata table_top;
230 300
231static __meminit void *alloc_low_page(unsigned long *phys) 301static __ref void *alloc_low_page(unsigned long *phys)
232{ 302{
233 unsigned long pfn = table_end++; 303 unsigned long pfn = table_end++;
234 void *adr; 304 void *adr;
@@ -240,7 +310,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
240 return adr; 310 return adr;
241 } 311 }
242 312
243 if (pfn >= end_pfn) 313 if (pfn >= table_top)
244 panic("alloc_low_page: ran out of memory"); 314 panic("alloc_low_page: ran out of memory");
245 315
246 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 316 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -249,7 +319,7 @@ static __meminit void *alloc_low_page(unsigned long *phys)
249 return adr; 319 return adr;
250} 320}
251 321
252static __meminit void unmap_low_page(void *adr) 322static __ref void unmap_low_page(void *adr)
253{ 323{
254 if (after_bootmem) 324 if (after_bootmem)
255 return; 325 return;
@@ -257,65 +327,71 @@ static __meminit void unmap_low_page(void *adr)
257 early_iounmap(adr, PAGE_SIZE); 327 early_iounmap(adr, PAGE_SIZE);
258} 328}
259 329
260/* Must run before zap_low_mappings */ 330static unsigned long __meminit
261__meminit void *early_ioremap(unsigned long addr, unsigned long size) 331phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
332 pgprot_t prot)
262{ 333{
263 pmd_t *pmd, *last_pmd; 334 unsigned pages = 0;
264 unsigned long vaddr; 335 unsigned long last_map_addr = end;
265 int i, pmds; 336 int i;
337
338 pte_t *pte = pte_page + pte_index(addr);
266 339
267 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; 340 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
268 vaddr = __START_KERNEL_map;
269 pmd = level2_kernel_pgt;
270 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
271 341
272 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { 342 if (addr >= end) {
273 for (i = 0; i < pmds; i++) { 343 if (!after_bootmem) {
274 if (pmd_present(pmd[i])) 344 for(; i < PTRS_PER_PTE; i++, pte++)
275 goto continue_outer_loop; 345 set_pte(pte, __pte(0));
346 }
347 break;
276 } 348 }
277 vaddr += addr & ~PMD_MASK;
278 addr &= PMD_MASK;
279 349
280 for (i = 0; i < pmds; i++, addr += PMD_SIZE) 350 /*
281 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); 351 * We will re-use the existing mapping.
282 __flush_tlb_all(); 352 * Xen for example has some special requirements, like mapping
353 * pagetable pages as RO. So assume someone who pre-setup
354 * these mappings are more intelligent.
355 */
356 if (pte_val(*pte))
357 continue;
283 358
284 return (void *)vaddr; 359 if (0)
285continue_outer_loop: 360 printk(" pte=%p addr=%lx pte=%016lx\n",
286 ; 361 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
362 pages++;
363 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
364 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
287 } 365 }
288 printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
289 366
290 return NULL; 367 update_page_count(PG_LEVEL_4K, pages);
368
369 return last_map_addr;
291} 370}
292 371
293/* 372static unsigned long __meminit
294 * To avoid virtual aliases later: 373phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
295 */ 374 pgprot_t prot)
296__meminit void early_iounmap(void *addr, unsigned long size)
297{ 375{
298 unsigned long vaddr; 376 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
299 pmd_t *pmd;
300 int i, pmds;
301 377
302 vaddr = (unsigned long)addr; 378 return phys_pte_init(pte, address, end, prot);
303 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
304 pmd = level2_kernel_pgt + pmd_index(vaddr);
305
306 for (i = 0; i < pmds; i++)
307 pmd_clear(pmd + i);
308
309 __flush_tlb_all();
310} 379}
311 380
312static unsigned long __meminit 381static unsigned long __meminit
313phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) 382phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
383 unsigned long page_size_mask, pgprot_t prot)
314{ 384{
385 unsigned long pages = 0;
386 unsigned long last_map_addr = end;
387
315 int i = pmd_index(address); 388 int i = pmd_index(address);
316 389
317 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { 390 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
391 unsigned long pte_phys;
318 pmd_t *pmd = pmd_page + pmd_index(address); 392 pmd_t *pmd = pmd_page + pmd_index(address);
393 pte_t *pte;
394 pgprot_t new_prot = prot;
319 395
320 if (address >= end) { 396 if (address >= end) {
321 if (!after_bootmem) { 397 if (!after_bootmem) {
@@ -325,31 +401,71 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
325 break; 401 break;
326 } 402 }
327 403
328 if (pmd_val(*pmd)) 404 if (pmd_val(*pmd)) {
405 if (!pmd_large(*pmd)) {
406 spin_lock(&init_mm.page_table_lock);
407 last_map_addr = phys_pte_update(pmd, address,
408 end, prot);
409 spin_unlock(&init_mm.page_table_lock);
410 continue;
411 }
412 /*
413 * If we are ok with PG_LEVEL_2M mapping, then we will
414 * use the existing mapping,
415 *
416 * Otherwise, we will split the large page mapping but
417 * use the same existing protection bits except for
418 * large page, so that we don't violate Intel's TLB
419 * Application note (317080) which says, while changing
420 * the page sizes, new and old translations should
421 * not differ with respect to page frame and
422 * attributes.
423 */
424 if (page_size_mask & (1 << PG_LEVEL_2M))
425 continue;
426 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
427 }
428
429 if (page_size_mask & (1<<PG_LEVEL_2M)) {
430 pages++;
431 spin_lock(&init_mm.page_table_lock);
432 set_pte((pte_t *)pmd,
433 pfn_pte(address >> PAGE_SHIFT,
434 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
435 spin_unlock(&init_mm.page_table_lock);
436 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
329 continue; 437 continue;
438 }
330 439
331 set_pte((pte_t *)pmd, 440 pte = alloc_low_page(&pte_phys);
332 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 441 last_map_addr = phys_pte_init(pte, address, end, new_prot);
442 unmap_low_page(pte);
443
444 spin_lock(&init_mm.page_table_lock);
445 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
446 spin_unlock(&init_mm.page_table_lock);
333 } 447 }
334 return address; 448 update_page_count(PG_LEVEL_2M, pages);
449 return last_map_addr;
335} 450}
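
When an existing large entry is found, the code above either keeps it (if 2 MB mappings are acceptable for this range) or re-maps the range with 4 KB pages that inherit the old attributes minus the PSE bit. A bit-level sketch of that decision, using made-up bit values rather than the real pgprot layout:

/* Sketch of reuse-vs-split for an existing large mapping (made-up bits). */
#include <stdio.h>

#define PG_LEVEL_2M	2
#define PSE_BIT		0x080UL		/* assumed "large page" bit */
#define ATTR_MASK	0xfffUL		/* assumed low attribute bits */

int main(void)
{
	unsigned long existing = 0x40000000UL | 0x163UL | PSE_BIT;	/* a 2M entry */
	unsigned long page_size_mask = 0;	/* caller does NOT allow 2M here */

	if (page_size_mask & (1 << PG_LEVEL_2M)) {
		printf("keep the existing 2M mapping as is\n");
	} else {
		/* split: new 4k entries reuse the old attributes minus PSE */
		unsigned long new_prot = (existing & ATTR_MASK) & ~PSE_BIT;

		printf("re-map with 4k pages, prot=%#lx\n", new_prot);
	}
	return 0;
}
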
336 451
337static unsigned long __meminit 452static unsigned long __meminit
338phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) 453phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
454 unsigned long page_size_mask, pgprot_t prot)
339{ 455{
340 pmd_t *pmd = pmd_offset(pud, 0); 456 pmd_t *pmd = pmd_offset(pud, 0);
341 unsigned long last_map_addr; 457 unsigned long last_map_addr;
342 458
343 spin_lock(&init_mm.page_table_lock); 459 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
344 last_map_addr = phys_pmd_init(pmd, address, end);
345 spin_unlock(&init_mm.page_table_lock);
346 __flush_tlb_all(); 460 __flush_tlb_all();
347 return last_map_addr; 461 return last_map_addr;
348} 462}
349 463
350static unsigned long __meminit 464static unsigned long __meminit
351phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) 465phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
466 unsigned long page_size_mask)
352{ 467{
468 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 469 unsigned long last_map_addr = end;
354 int i = pud_index(addr); 470 int i = pud_index(addr);
355 471
@@ -357,6 +473,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
357 unsigned long pmd_phys; 473 unsigned long pmd_phys;
358 pud_t *pud = pud_page + pud_index(addr); 474 pud_t *pud = pud_page + pud_index(addr);
359 pmd_t *pmd; 475 pmd_t *pmd;
476 pgprot_t prot = PAGE_KERNEL;
360 477
361 if (addr >= end) 478 if (addr >= end)
362 break; 479 break;
@@ -368,42 +485,87 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
368 } 485 }
369 486
370 if (pud_val(*pud)) { 487 if (pud_val(*pud)) {
371 if (!pud_large(*pud)) 488 if (!pud_large(*pud)) {
372 last_map_addr = phys_pmd_update(pud, addr, end); 489 last_map_addr = phys_pmd_update(pud, addr, end,
373 continue; 490 page_size_mask, prot);
491 continue;
492 }
493 /*
494 * If we are ok with PG_LEVEL_1G mapping, then we will
495 * use the existing mapping.
496 *
497 * Otherwise, we will split the gbpage mapping but use
498 * the same existing protection bits except for large
499 * page, so that we don't violate Intel's TLB
500 * Application note (317080) which says, while changing
501 * the page sizes, new and old translations should
502 * not differ with respect to page frame and
503 * attributes.
504 */
505 if (page_size_mask & (1 << PG_LEVEL_1G))
506 continue;
507 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
374 } 508 }
375 509
376 if (direct_gbpages) { 510 if (page_size_mask & (1<<PG_LEVEL_1G)) {
511 pages++;
512 spin_lock(&init_mm.page_table_lock);
377 set_pte((pte_t *)pud, 513 set_pte((pte_t *)pud,
378 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 514 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
515 spin_unlock(&init_mm.page_table_lock);
379 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 516 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
380 continue; 517 continue;
381 } 518 }
382 519
383 pmd = alloc_low_page(&pmd_phys); 520 pmd = alloc_low_page(&pmd_phys);
521 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
522 prot);
523 unmap_low_page(pmd);
384 524
385 spin_lock(&init_mm.page_table_lock); 525 spin_lock(&init_mm.page_table_lock);
386 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 526 pud_populate(&init_mm, pud, __va(pmd_phys));
387 last_map_addr = phys_pmd_init(pmd, addr, end);
388 spin_unlock(&init_mm.page_table_lock); 527 spin_unlock(&init_mm.page_table_lock);
389
390 unmap_low_page(pmd);
391 } 528 }
392 __flush_tlb_all(); 529 __flush_tlb_all();
393 530
394 return last_map_addr >> PAGE_SHIFT; 531 update_page_count(PG_LEVEL_1G, pages);
532
533 return last_map_addr;
395} 534}
396 535
397static void __init find_early_table_space(unsigned long end) 536static unsigned long __meminit
537phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
538 unsigned long page_size_mask)
398{ 539{
399 unsigned long puds, pmds, tables, start; 540 pud_t *pud;
541
542 pud = (pud_t *)pgd_page_vaddr(*pgd);
543
544 return phys_pud_init(pud, addr, end, page_size_mask);
545}
546
547static void __init find_early_table_space(unsigned long end, int use_pse,
548 int use_gbpages)
549{
550 unsigned long puds, pmds, ptes, tables, start;
400 551
401 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 552 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
402 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 553 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
403 if (!direct_gbpages) { 554 if (use_gbpages) {
555 unsigned long extra;
556 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
557 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
558 } else
404 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 559 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
405 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 560 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
406 } 561
562 if (use_pse) {
563 unsigned long extra;
564 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
565 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
566 } else
567 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
568 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
407 569
408 /* 570 /*
409 * RED-PEN putting page tables only on node 0 could 571 * RED-PEN putting page tables only on node 0 could
@@ -417,10 +579,10 @@ static void __init find_early_table_space(unsigned long end)
417 579
418 table_start >>= PAGE_SHIFT; 580 table_start >>= PAGE_SHIFT;
419 table_end = table_start; 581 table_end = table_start;
582 table_top = table_start + (tables >> PAGE_SHIFT);
420 583
421 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", 584 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
422 end, table_start << PAGE_SHIFT, 585 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
423 (table_start << PAGE_SHIFT) + tables);
424} 586}
425 587
426static void __init init_gbpages(void) 588static void __init init_gbpages(void)
@@ -431,125 +593,85 @@ static void __init init_gbpages(void)
431 direct_gbpages = 0; 593 direct_gbpages = 0;
432} 594}
433 595
434#ifdef CONFIG_MEMTEST_BOOTPARAM 596static unsigned long __init kernel_physical_mapping_init(unsigned long start,
435 597 unsigned long end,
436static void __init memtest(unsigned long start_phys, unsigned long size, 598 unsigned long page_size_mask)
437 unsigned pattern) 599{
438{
439 unsigned long i;
440 unsigned long *start;
441 unsigned long start_bad;
442 unsigned long last_bad;
443 unsigned long val;
444 unsigned long start_phys_aligned;
445 unsigned long count;
446 unsigned long incr;
447
448 switch (pattern) {
449 case 0:
450 val = 0UL;
451 break;
452 case 1:
453 val = -1UL;
454 break;
455 case 2:
456 val = 0x5555555555555555UL;
457 break;
458 case 3:
459 val = 0xaaaaaaaaaaaaaaaaUL;
460 break;
461 default:
462 return;
463 }
464 600
465 incr = sizeof(unsigned long); 601 unsigned long next, last_map_addr = end;
466 start_phys_aligned = ALIGN(start_phys, incr);
467 count = (size - (start_phys_aligned - start_phys))/incr;
468 start = __va(start_phys_aligned);
469 start_bad = 0;
470 last_bad = 0;
471
472 for (i = 0; i < count; i++)
473 start[i] = val;
474 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
475 if (*start != val) {
476 if (start_phys_aligned == last_bad + incr) {
477 last_bad += incr;
478 } else {
479 if (start_bad) {
480 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
481 val, start_bad, last_bad + incr);
482 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
483 }
484 start_bad = last_bad = start_phys_aligned;
485 }
486 }
487 }
488 if (start_bad) {
489 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
490 val, start_bad, last_bad + incr);
491 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
492 }
493 602
494} 603 start = (unsigned long)__va(start);
604 end = (unsigned long)__va(end);
495 605
496static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; 606 for (; start < end; start = next) {
607 pgd_t *pgd = pgd_offset_k(start);
608 unsigned long pud_phys;
609 pud_t *pud;
497 610
498static int __init parse_memtest(char *arg) 611 next = (start + PGDIR_SIZE) & PGDIR_MASK;
499{ 612 if (next > end)
500 if (arg) 613 next = end;
501 memtest_pattern = simple_strtoul(arg, NULL, 0);
502 return 0;
503}
504 614
505early_param("memtest", parse_memtest); 615 if (pgd_val(*pgd)) {
616 last_map_addr = phys_pud_update(pgd, __pa(start),
617 __pa(end), page_size_mask);
618 continue;
619 }
506 620
507static void __init early_memtest(unsigned long start, unsigned long end) 621 pud = alloc_low_page(&pud_phys);
508{ 622 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
509 u64 t_start, t_size; 623 page_size_mask);
510 unsigned pattern; 624 unmap_low_page(pud);
511 625
512 if (!memtest_pattern) 626 spin_lock(&init_mm.page_table_lock);
513 return; 627 pgd_populate(&init_mm, pgd, __va(pud_phys));
628 spin_unlock(&init_mm.page_table_lock);
629 }
630 __flush_tlb_all();
514 631
515 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); 632 return last_map_addr;
516 for (pattern = 0; pattern < memtest_pattern; pattern++) { 633}
517 t_start = start;
518 t_size = 0;
519 while (t_start < end) {
520 t_start = find_e820_area_size(t_start, &t_size, 1);
521 634
522 /* done ? */ 635struct map_range {
523 if (t_start >= end) 636 unsigned long start;
524 break; 637 unsigned long end;
525 if (t_start + t_size > end) 638 unsigned page_size_mask;
526 t_size = end - t_start; 639};
527 640
528 printk(KERN_CONT "\n %016llx - %016llx pattern %d", 641#define NR_RANGE_MR 5
529 t_start, t_start + t_size, pattern);
530 642
531 memtest(t_start, t_size, pattern); 643static int save_mr(struct map_range *mr, int nr_range,
644 unsigned long start_pfn, unsigned long end_pfn,
645 unsigned long page_size_mask)
646{
532 647
533 t_start += t_size; 648 if (start_pfn < end_pfn) {
534 } 649 if (nr_range >= NR_RANGE_MR)
650 panic("run out of range for init_memory_mapping\n");
651 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
652 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
653 mr[nr_range].page_size_mask = page_size_mask;
654 nr_range++;
535 } 655 }
536 printk(KERN_CONT "\n"); 656
537} 657 return nr_range;
538#else
539static void __init early_memtest(unsigned long start, unsigned long end)
540{
541} 658}
542#endif
543 659
544/* 660/*
545 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 661 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
546 * This runs before bootmem is initialized and gets pages directly from 662 * This runs before bootmem is initialized and gets pages directly from
547 * the physical memory. To access them they are temporarily mapped. 663 * the physical memory. To access them they are temporarily mapped.
548 */ 664 */
549unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) 665unsigned long __init_refok init_memory_mapping(unsigned long start,
666 unsigned long end)
550{ 667{
551 unsigned long next, last_map_addr = end; 668 unsigned long last_map_addr = 0;
552 unsigned long start_phys = start, end_phys = end; 669 unsigned long page_size_mask = 0;
670 unsigned long start_pfn, end_pfn;
671
672 struct map_range mr[NR_RANGE_MR];
673 int nr_range, i;
674 int use_pse, use_gbpages;
553 675
554 printk(KERN_INFO "init_memory_mapping\n"); 676 printk(KERN_INFO "init_memory_mapping\n");
555 677
@@ -560,48 +682,127 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon
560 * memory mapped. Unfortunately this is done currently before the 682 * memory mapped. Unfortunately this is done currently before the
561 * nodes are discovered. 683 * nodes are discovered.
562 */ 684 */
563 if (!after_bootmem) { 685 if (!after_bootmem)
564 init_gbpages(); 686 init_gbpages();
565 find_early_table_space(end);
566 }
567 687
568 start = (unsigned long)__va(start); 688#ifdef CONFIG_DEBUG_PAGEALLOC
569 end = (unsigned long)__va(end); 689 /*
690 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
691 * This will simplify cpa(), which otherwise needs to support splitting
692 * large pages into small in interrupt context, etc.
693 */
694 use_pse = use_gbpages = 0;
695#else
696 use_pse = cpu_has_pse;
697 use_gbpages = direct_gbpages;
698#endif
570 699
571 for (; start < end; start = next) { 700 if (use_gbpages)
572 pgd_t *pgd = pgd_offset_k(start); 701 page_size_mask |= 1 << PG_LEVEL_1G;
573 unsigned long pud_phys; 702 if (use_pse)
574 pud_t *pud; 703 page_size_mask |= 1 << PG_LEVEL_2M;
704
705 memset(mr, 0, sizeof(mr));
706 nr_range = 0;
707
 708 /* head is not big-page aligned? */
709 start_pfn = start >> PAGE_SHIFT;
710 end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
711 << (PMD_SHIFT - PAGE_SHIFT);
712 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
713
 714 /* big page (2M) range */
715 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
716 << (PMD_SHIFT - PAGE_SHIFT);
717 end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
718 << (PUD_SHIFT - PAGE_SHIFT);
719 if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
720 end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
721 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
722 page_size_mask & (1<<PG_LEVEL_2M));
723
724 /* big page (1G) range */
725 start_pfn = end_pfn;
726 end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
727 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
728 page_size_mask &
729 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
730
 731 /* tail is not big-page (1G) aligned */
732 start_pfn = end_pfn;
733 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
734 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
735 page_size_mask & (1<<PG_LEVEL_2M));
736
 737 /* tail is not big-page (2M) aligned */
738 start_pfn = end_pfn;
739 end_pfn = end>>PAGE_SHIFT;
740 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
741
 742 /* try to merge adjacent ranges with the same page size */
743 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
744 unsigned long old_start;
745 if (mr[i].end != mr[i+1].start ||
746 mr[i].page_size_mask != mr[i+1].page_size_mask)
747 continue;
748 /* move it */
749 old_start = mr[i].start;
750 memmove(&mr[i], &mr[i+1],
751 (nr_range - 1 - i) * sizeof (struct map_range));
752 mr[i].start = old_start;
753 nr_range--;
754 }
575 755
576 if (after_bootmem) 756 for (i = 0; i < nr_range; i++)
577 pud = pud_offset(pgd, start & PGDIR_MASK); 757 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
578 else 758 mr[i].start, mr[i].end,
579 pud = alloc_low_page(&pud_phys); 759 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
760 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
580 761
581 next = start + PGDIR_SIZE; 762 if (!after_bootmem)
582 if (next > end) 763 find_early_table_space(end, use_pse, use_gbpages);
583 next = end; 764
584 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); 765 for (i = 0; i < nr_range; i++)
585 if (!after_bootmem) 766 last_map_addr = kernel_physical_mapping_init(
586 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); 767 mr[i].start, mr[i].end,
587 unmap_low_page(pud); 768 mr[i].page_size_mask);
588 }
589 769
590 if (!after_bootmem) 770 if (!after_bootmem)
591 mmu_cr4_features = read_cr4(); 771 mmu_cr4_features = read_cr4();
592 __flush_tlb_all(); 772 __flush_tlb_all();
593 773
594 if (!after_bootmem) 774 if (!after_bootmem && table_end > table_start)
595 reserve_early(table_start << PAGE_SHIFT, 775 reserve_early(table_start << PAGE_SHIFT,
596 table_end << PAGE_SHIFT, "PGTABLE"); 776 table_end << PAGE_SHIFT, "PGTABLE");
597 777
778 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
779 last_map_addr, end);
780
598 if (!after_bootmem) 781 if (!after_bootmem)
599 early_memtest(start_phys, end_phys); 782 early_memtest(start, end);
600 783
601 return last_map_addr; 784 return last_map_addr >> PAGE_SHIFT;
602} 785}
603 786
604#ifndef CONFIG_NUMA 787#ifndef CONFIG_NUMA
788void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
789{
790 unsigned long bootmap_size, bootmap;
791
792 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
793 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
794 PAGE_SIZE);
795 if (bootmap == -1L)
796 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
797 /* don't touch min_low_pfn */
798 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
799 0, end_pfn);
800 e820_register_active_regions(0, start_pfn, end_pfn);
801 free_bootmem_with_active_regions(0, end_pfn);
802 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
803 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
804}
805
605void __init paging_init(void) 806void __init paging_init(void)
606{ 807{
607 unsigned long max_zone_pfns[MAX_NR_ZONES]; 808 unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -609,9 +810,9 @@ void __init paging_init(void)
609 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 810 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
610 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 811 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
611 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 812 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
612 max_zone_pfns[ZONE_NORMAL] = end_pfn; 813 max_zone_pfns[ZONE_NORMAL] = max_pfn;
613 814
614 memory_present(0, 0, end_pfn); 815 memory_present(0, 0, max_pfn);
615 sparse_init(); 816 sparse_init();
616 free_area_init_nodes(max_zone_pfns); 817 free_area_init_nodes(max_zone_pfns);
617} 818}
@@ -681,6 +882,8 @@ void __init mem_init(void)
681{ 882{
682 long codesize, reservedpages, datasize, initsize; 883 long codesize, reservedpages, datasize, initsize;
683 884
885 start_periodic_check_for_corruption();
886
684 pci_iommu_alloc(); 887 pci_iommu_alloc();
685 888
686 /* clear_bss() already clear the empty_zero_page */ 889 /* clear_bss() already clear the empty_zero_page */
@@ -693,8 +896,8 @@ void __init mem_init(void)
693#else 896#else
694 totalram_pages = free_all_bootmem(); 897 totalram_pages = free_all_bootmem();
695#endif 898#endif
696 reservedpages = end_pfn - totalram_pages - 899 reservedpages = max_pfn - totalram_pages -
697 absent_pages_in_range(0, end_pfn); 900 absent_pages_in_range(0, max_pfn);
698 after_bootmem = 1; 901 after_bootmem = 1;
699 902
700 codesize = (unsigned long) &_etext - (unsigned long) &_text; 903 codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -713,13 +916,11 @@ void __init mem_init(void)
713 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " 916 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
714 "%ldk reserved, %ldk data, %ldk init)\n", 917 "%ldk reserved, %ldk data, %ldk init)\n",
715 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), 918 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
716 end_pfn << (PAGE_SHIFT-10), 919 max_pfn << (PAGE_SHIFT-10),
717 codesize >> 10, 920 codesize >> 10,
718 reservedpages << (PAGE_SHIFT-10), 921 reservedpages << (PAGE_SHIFT-10),
719 datasize >> 10, 922 datasize >> 10,
720 initsize >> 10); 923 initsize >> 10);
721
722 cpa_init();
723} 924}
724 925
725void free_init_pages(char *what, unsigned long begin, unsigned long end) 926void free_init_pages(char *what, unsigned long begin, unsigned long end)
@@ -766,6 +967,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
766void mark_rodata_ro(void) 967void mark_rodata_ro(void)
767{ 968{
768 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 969 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
970 unsigned long rodata_start =
971 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
972
973#ifdef CONFIG_DYNAMIC_FTRACE
974 /* Dynamic tracing modifies the kernel text section */
975 start = rodata_start;
976#endif
769 977
770 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 978 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
771 (end - start) >> 10); 979 (end - start) >> 10);
@@ -775,8 +983,7 @@ void mark_rodata_ro(void)
775 * The rodata section (but not the kernel text!) should also be 983 * The rodata section (but not the kernel text!) should also be
776 * not-executable. 984 * not-executable.
777 */ 985 */
778 start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 986 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
779 set_memory_nx(start, (end - start) >> PAGE_SHIFT);
780 987
781 rodata_test(); 988 rodata_test();
782 989
@@ -798,24 +1005,26 @@ void free_initrd_mem(unsigned long start, unsigned long end)
798} 1005}
799#endif 1006#endif
800 1007
801void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 1008int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1009 int flags)
802{ 1010{
803#ifdef CONFIG_NUMA 1011#ifdef CONFIG_NUMA
804 int nid, next_nid; 1012 int nid, next_nid;
1013 int ret;
805#endif 1014#endif
806 unsigned long pfn = phys >> PAGE_SHIFT; 1015 unsigned long pfn = phys >> PAGE_SHIFT;
807 1016
808 if (pfn >= end_pfn) { 1017 if (pfn >= max_pfn) {
809 /* 1018 /*
810 * This can happen with kdump kernels when accessing 1019 * This can happen with kdump kernels when accessing
811 * firmware tables: 1020 * firmware tables:
812 */ 1021 */
813 if (pfn < max_pfn_mapped) 1022 if (pfn < max_pfn_mapped)
814 return; 1023 return -EFAULT;
815 1024
816 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", 1025 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
817 phys, len); 1026 phys, len);
818 return; 1027 return -EFAULT;
819 } 1028 }
820 1029
821 /* Should check here against the e820 map to avoid double free */ 1030 /* Should check here against the e820 map to avoid double free */
@@ -823,9 +1032,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
823 nid = phys_to_nid(phys); 1032 nid = phys_to_nid(phys);
824 next_nid = phys_to_nid(phys + len - 1); 1033 next_nid = phys_to_nid(phys + len - 1);
825 if (nid == next_nid) 1034 if (nid == next_nid)
826 reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); 1035 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
827 else 1036 else
828 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1037 ret = reserve_bootmem(phys, len, flags);
1038
1039 if (ret != 0)
1040 return ret;
1041
829#else 1042#else
830 reserve_bootmem(phys, len, BOOTMEM_DEFAULT); 1043 reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
831#endif 1044#endif
@@ -834,6 +1047,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
834 dma_reserve += len / PAGE_SIZE; 1047 dma_reserve += len / PAGE_SIZE;
835 set_dma_reserve(dma_reserve); 1048 set_dma_reserve(dma_reserve);
836 } 1049 }
1050
1051 return 0;
837} 1052}
838 1053
839int kern_addr_valid(unsigned long addr) 1054int kern_addr_valid(unsigned long addr)
@@ -938,7 +1153,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
938 pmd_t *pmd; 1153 pmd_t *pmd;
939 1154
940 for (; addr < end; addr = next) { 1155 for (; addr < end; addr = next) {
941 next = pmd_addr_end(addr, end); 1156 void *p = NULL;
942 1157
943 pgd = vmemmap_pgd_populate(addr, node); 1158 pgd = vmemmap_pgd_populate(addr, node);
944 if (!pgd) 1159 if (!pgd)
@@ -948,33 +1163,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
948 if (!pud) 1163 if (!pud)
949 return -ENOMEM; 1164 return -ENOMEM;
950 1165
951 pmd = pmd_offset(pud, addr); 1166 if (!cpu_has_pse) {
952 if (pmd_none(*pmd)) { 1167 next = (addr + PAGE_SIZE) & PAGE_MASK;
953 pte_t entry; 1168 pmd = vmemmap_pmd_populate(pud, addr, node);
954 void *p; 1169
1170 if (!pmd)
1171 return -ENOMEM;
1172
1173 p = vmemmap_pte_populate(pmd, addr, node);
955 1174
956 p = vmemmap_alloc_block(PMD_SIZE, node);
957 if (!p) 1175 if (!p)
958 return -ENOMEM; 1176 return -ENOMEM;
959 1177
960 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, 1178 addr_end = addr + PAGE_SIZE;
961 PAGE_KERNEL_LARGE); 1179 p_end = p + PAGE_SIZE;
962 set_pmd(pmd, __pmd(pte_val(entry)));
963
964 /* check to see if we have contiguous blocks */
965 if (p_end != p || node_start != node) {
966 if (p_start)
967 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
968 addr_start, addr_end-1, p_start, p_end-1, node_start);
969 addr_start = addr;
970 node_start = node;
971 p_start = p;
972 }
973 addr_end = addr + PMD_SIZE;
974 p_end = p + PMD_SIZE;
975 } else { 1180 } else {
976 vmemmap_verify((pte_t *)pmd, node, addr, next); 1181 next = pmd_addr_end(addr, end);
1182
1183 pmd = pmd_offset(pud, addr);
1184 if (pmd_none(*pmd)) {
1185 pte_t entry;
1186
1187 p = vmemmap_alloc_block(PMD_SIZE, node);
1188 if (!p)
1189 return -ENOMEM;
1190
1191 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1192 PAGE_KERNEL_LARGE);
1193 set_pmd(pmd, __pmd(pte_val(entry)));
1194
1195 /* check to see if we have contiguous blocks */
1196 if (p_end != p || node_start != node) {
1197 if (p_start)
1198 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1199 addr_start, addr_end-1, p_start, p_end-1, node_start);
1200 addr_start = addr;
1201 node_start = node;
1202 p_start = p;
1203 }
1204
1205 addr_end = addr + PMD_SIZE;
1206 p_end = p + PMD_SIZE;
1207 } else
1208 vmemmap_verify((pte_t *)pmd, node, addr, next);
977 } 1209 }
1210
978 } 1211 }
979 return 0; 1212 return 0;
980} 1213}
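The reserve_bootmem_generic() hunk above changes the function from returning void to returning an int and adds a flags argument. A minimal caller sketch under that new signature follows; the table address, length, helper name and warning text are hypothetical and only illustrate checking the new return value:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/bootmem.h>

int reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags);

/* Hypothetical early reservation of a firmware table during boot. */
static void __init reserve_fw_table_sketch(unsigned long table_phys,
					   unsigned long table_len)
{
	int ret;

	/* BOOTMEM_EXCLUSIVE turns an already-reserved overlap into an error. */
	ret = reserve_bootmem_generic(table_phys, table_len, BOOTMEM_EXCLUSIVE);
	if (ret)
		printk(KERN_WARNING "sketch: cannot reserve %lx+%lx: %d\n",
		       table_phys, table_len, ret);
}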
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 9dd3cb905971..8cbeda15cd29 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/mmiotrace.h>
15 16
16#include <asm/cacheflush.h> 17#include <asm/cacheflush.h>
17#include <asm/e820.h> 18#include <asm/e820.h>
@@ -101,6 +102,25 @@ int page_is_ram(unsigned long pagenr)
101 return 0; 102 return 0;
102} 103}
103 104
105int pagerange_is_ram(unsigned long start, unsigned long end)
106{
107 int ram_page = 0, not_rampage = 0;
108 unsigned long page_nr;
109
110 for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
111 ++page_nr) {
112 if (page_is_ram(page_nr))
113 ram_page = 1;
114 else
115 not_rampage = 1;
116
117 if (ram_page == not_rampage)
118 return -1;
119 }
120
121 return ram_page;
122}
123
104/* 124/*
105 * Fix up the linear direct mapping of the kernel to avoid cache attribute 125 * Fix up the linear direct mapping of the kernel to avoid cache attribute
106 * conflicts. 126 * conflicts.
@@ -141,10 +161,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
141{ 161{
142 unsigned long pfn, offset, vaddr; 162 unsigned long pfn, offset, vaddr;
143 resource_size_t last_addr; 163 resource_size_t last_addr;
164 const resource_size_t unaligned_phys_addr = phys_addr;
165 const unsigned long unaligned_size = size;
144 struct vm_struct *area; 166 struct vm_struct *area;
145 unsigned long new_prot_val; 167 unsigned long new_prot_val;
146 pgprot_t prot; 168 pgprot_t prot;
147 int retval; 169 int retval;
170 void __iomem *ret_addr;
148 171
149 /* Don't allow wraparound or zero size */ 172 /* Don't allow wraparound or zero size */
150 last_addr = phys_addr + size - 1; 173 last_addr = phys_addr + size - 1;
@@ -161,7 +184,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
161 /* 184 /*
162 * Don't remap the low PCI/ISA area, it's always mapped.. 185 * Don't remap the low PCI/ISA area, it's always mapped..
163 */ 186 */
164 if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) 187 if (is_ISA_range(phys_addr, last_addr))
165 return (__force void __iomem *)phys_to_virt(phys_addr); 188 return (__force void __iomem *)phys_to_virt(phys_addr);
166 189
167 /* 190 /*
@@ -185,7 +208,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
185 phys_addr &= PAGE_MASK; 208 phys_addr &= PAGE_MASK;
186 size = PAGE_ALIGN(last_addr+1) - phys_addr; 209 size = PAGE_ALIGN(last_addr+1) - phys_addr;
187 210
188 retval = reserve_memtype(phys_addr, phys_addr + size, 211 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
189 prot_val, &new_prot_val); 212 prot_val, &new_prot_val);
190 if (retval) { 213 if (retval) {
191 pr_debug("Warning: reserve_memtype returned %d\n", retval); 214 pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -252,7 +275,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
252 return NULL; 275 return NULL;
253 } 276 }
254 277
255 return (void __iomem *) (vaddr + offset); 278 ret_addr = (void __iomem *) (vaddr + offset);
279 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
280
281 return ret_addr;
256} 282}
257 283
258/** 284/**
@@ -280,7 +306,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
280{ 306{
281 /* 307 /*
282 * Ideally, this should be: 308 * Ideally, this should be:
283 * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; 309 * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
284 * 310 *
285 * Till we fix all X drivers to use ioremap_wc(), we will use 311 * Till we fix all X drivers to use ioremap_wc(), we will use
286 * UC MINUS. 312 * UC MINUS.
@@ -304,7 +330,7 @@ EXPORT_SYMBOL(ioremap_nocache);
304 */ 330 */
305void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 331void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
306{ 332{
307 if (pat_wc_enabled) 333 if (pat_enabled)
308 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 334 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
309 __builtin_return_address(0)); 335 __builtin_return_address(0));
310 else 336 else
@@ -319,6 +345,37 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
319} 345}
320EXPORT_SYMBOL(ioremap_cache); 346EXPORT_SYMBOL(ioremap_cache);
321 347
348static void __iomem *ioremap_default(resource_size_t phys_addr,
349 unsigned long size)
350{
351 unsigned long flags;
352 void *ret;
353 int err;
354
355 /*
356 * - WB for WB-able memory and no other conflicting mappings
357 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
 358 * - Inherit from conflicting mappings otherwise
359 */
360 err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
361 if (err < 0)
362 return NULL;
363
364 ret = (void *) __ioremap_caller(phys_addr, size, flags,
365 __builtin_return_address(0));
366
367 free_memtype(phys_addr, phys_addr + size);
368 return (void __iomem *)ret;
369}
370
371void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
372 unsigned long prot_val)
373{
374 return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
375 __builtin_return_address(0));
376}
377EXPORT_SYMBOL(ioremap_prot);
378
322/** 379/**
323 * iounmap - Free a IO remapping 380 * iounmap - Free a IO remapping
324 * @addr: virtual address from ioremap_* 381 * @addr: virtual address from ioremap_*
@@ -337,13 +394,15 @@ void iounmap(volatile void __iomem *addr)
337 * vm_area and by simply returning an address into the kernel mapping 394 * vm_area and by simply returning an address into the kernel mapping
338 * of ISA space. So handle that here. 395 * of ISA space. So handle that here.
339 */ 396 */
340 if (addr >= phys_to_virt(ISA_START_ADDRESS) && 397 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
341 addr < phys_to_virt(ISA_END_ADDRESS)) 398 (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
342 return; 399 return;
343 400
344 addr = (volatile void __iomem *) 401 addr = (volatile void __iomem *)
345 (PAGE_MASK & (unsigned long __force)addr); 402 (PAGE_MASK & (unsigned long __force)addr);
346 403
404 mmiotrace_iounmap(addr);
405
347 /* Use the vm area unlocked, assuming the caller 406 /* Use the vm area unlocked, assuming the caller
348 ensures there isn't another iounmap for the same address 407 ensures there isn't another iounmap for the same address
349 in parallel. Reuse of the virtual address is prevented by 408 in parallel. Reuse of the virtual address is prevented by
@@ -351,7 +410,7 @@ void iounmap(volatile void __iomem *addr)
351 cpa takes care of the direct mappings. */ 410 cpa takes care of the direct mappings. */
352 read_lock(&vmlist_lock); 411 read_lock(&vmlist_lock);
353 for (p = vmlist; p; p = p->next) { 412 for (p = vmlist; p; p = p->next) {
354 if (p->addr == addr) 413 if (p->addr == (void __force *)addr)
355 break; 414 break;
356 } 415 }
357 read_unlock(&vmlist_lock); 416 read_unlock(&vmlist_lock);
@@ -365,7 +424,7 @@ void iounmap(volatile void __iomem *addr)
365 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); 424 free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
366 425
367 /* Finally remove it */ 426 /* Finally remove it */
368 o = remove_vm_area((void *)addr); 427 o = remove_vm_area((void __force *)addr);
369 BUG_ON(p != o || o == NULL); 428 BUG_ON(p != o || o == NULL);
370 kfree(p); 429 kfree(p);
371} 430}
@@ -384,7 +443,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
384 if (page_is_ram(start >> PAGE_SHIFT)) 443 if (page_is_ram(start >> PAGE_SHIFT))
385 return __va(phys); 444 return __va(phys);
386 445
387 addr = (void *)ioremap(start, PAGE_SIZE); 446 addr = (void __force *)ioremap_default(start, PAGE_SIZE);
388 if (addr) 447 if (addr)
389 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 448 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
390 449
@@ -400,9 +459,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
400 return; 459 return;
401} 460}
402 461
403#ifdef CONFIG_X86_32 462static int __initdata early_ioremap_debug;
404
405int __initdata early_ioremap_debug;
406 463
407static int __init early_ioremap_debug_setup(char *str) 464static int __init early_ioremap_debug_setup(char *str)
408{ 465{
@@ -413,8 +470,7 @@ static int __init early_ioremap_debug_setup(char *str)
413early_param("early_ioremap_debug", early_ioremap_debug_setup); 470early_param("early_ioremap_debug", early_ioremap_debug_setup);
414 471
415static __initdata int after_paging_init; 472static __initdata int after_paging_init;
416static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] 473static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
417 __section(.bss.page_aligned);
418 474
419static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 475static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
420{ 476{
@@ -503,10 +559,11 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
503 return; 559 return;
504 } 560 }
505 pte = early_ioremap_pte(addr); 561 pte = early_ioremap_pte(addr);
562
506 if (pgprot_val(flags)) 563 if (pgprot_val(flags))
507 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); 564 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
508 else 565 else
509 pte_clear(NULL, addr, pte); 566 pte_clear(&init_mm, addr, pte);
510 __flush_tlb_one(addr); 567 __flush_tlb_one(addr);
511} 568}
512 569
@@ -528,19 +585,17 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
528} 585}
529 586
530 587
531int __initdata early_ioremap_nested; 588static int __initdata early_ioremap_nested;
532 589
533static int __init check_early_ioremap_leak(void) 590static int __init check_early_ioremap_leak(void)
534{ 591{
535 if (!early_ioremap_nested) 592 if (!early_ioremap_nested)
536 return 0; 593 return 0;
537 594 WARN(1, KERN_WARNING
538 printk(KERN_WARNING
539 "Debug warning: early ioremap leak of %d areas detected.\n", 595 "Debug warning: early ioremap leak of %d areas detected.\n",
540 early_ioremap_nested); 596 early_ioremap_nested);
541 printk(KERN_WARNING 597 printk(KERN_WARNING
542 "please boot with early_ioremap_debug and report the dmesg.\n"); 598 "please boot with early_ioremap_debug and report the dmesg.\n");
543 WARN_ON(1);
544 599
545 return 1; 600 return 1;
546} 601}
@@ -578,7 +633,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
578 */ 633 */
579 offset = phys_addr & ~PAGE_MASK; 634 offset = phys_addr & ~PAGE_MASK;
580 phys_addr &= PAGE_MASK; 635 phys_addr &= PAGE_MASK;
581 size = PAGE_ALIGN(last_addr) - phys_addr; 636 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
582 637
583 /* 638 /*
584 * Mappings have to fit in the FIX_BTMAP area. 639 * Mappings have to fit in the FIX_BTMAP area.
@@ -644,5 +699,3 @@ void __this_fixmap_does_not_exist(void)
644{ 699{
645 WARN_ON(1); 700 WARN_ON(1);
646} 701}
647
648#endif /* CONFIG_X86_32 */
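The ioremap.c changes above add pagerange_is_ram(), which returns 1 when every page in [start, end) is RAM, 0 when none is, and -1 for a mixed range. A minimal sketch of interpreting that tri-state result, assuming only the prototype shown in the hunk (the caller name is hypothetical):

#include <linux/errno.h>

int pagerange_is_ram(unsigned long start, unsigned long end);

/* Hypothetical: refuse to change attributes on a mixed RAM/non-RAM range. */
static int check_range_sketch(unsigned long start, unsigned long end)
{
	int is_ram = pagerange_is_ram(start, end);

	if (is_ram < 0)
		return -EINVAL;		/* mixed range: do not guess */

	return is_ram;			/* 1: all RAM, 0: no RAM at all */
}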
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 1f476e477844..41f1b5c00a1d 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,6 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h>
25 26
26static __init int find_northbridge(void) 27static __init int find_northbridge(void)
27{ 28{
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void)
56 /* 57 /*
57 * Find possible boot-time SMP configuration: 58 * Find possible boot-time SMP configuration:
58 */ 59 */
60#ifdef CONFIG_X86_MPPARSE
59 early_find_smp_config(); 61 early_find_smp_config();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 /* 64 /*
62 * Read APIC information from ACPI tables. 65 * Read APIC information from ACPI tables.
63 */ 66 */
64 early_acpi_boot_init(); 67 early_acpi_boot_init();
65#endif 68#endif
69#ifdef CONFIG_X86_MPPARSE
66 /* 70 /*
67 * get boot-time SMP configuration: 71 * get boot-time SMP configuration:
68 */ 72 */
69 if (smp_found_config) 73 if (smp_found_config)
70 early_get_smp_config(); 74 early_get_smp_config();
75#endif
71 early_init_lapic_mapping(); 76 early_init_lapic_mapping();
72} 77}
73 78
74int __init k8_scan_nodes(unsigned long start, unsigned long end) 79int __init k8_scan_nodes(unsigned long start, unsigned long end)
75{ 80{
81 unsigned numnodes, cores, bits, apicid_base;
76 unsigned long prevbase; 82 unsigned long prevbase;
77 struct bootnode nodes[8]; 83 struct bootnode nodes[8];
78 int nodeid, i, nb;
79 unsigned char nodeids[8]; 84 unsigned char nodeids[8];
80 int found = 0; 85 int i, j, nb, found = 0;
81 u32 reg; 86 u32 nodeid, reg;
82 unsigned numnodes;
83 unsigned cores;
84 unsigned bits;
85 int j;
86 unsigned apicid_base;
87 87
88 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
89 return -1; 89 return -1;
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
105 prevbase = 0; 105 prevbase = 0;
106 for (i = 0; i < 8; i++) { 106 for (i = 0; i < 8; i++) {
107 unsigned long base, limit; 107 unsigned long base, limit;
108 u32 nodeid;
109 108
110 base = read_pci_config(0, nb, 1, 0x40 + i*8); 109 base = read_pci_config(0, nb, 1, 0x40 + i*8);
111 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 110 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
144 limit |= (1<<24)-1; 143 limit |= (1<<24)-1;
145 limit++; 144 limit++;
146 145
147 if (limit > end_pfn << PAGE_SHIFT) 146 if (limit > max_pfn << PAGE_SHIFT)
148 limit = end_pfn << PAGE_SHIFT; 147 limit = max_pfn << PAGE_SHIFT;
149 if (limit <= base) 148 if (limit <= base)
150 continue; 149 continue;
151 150
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000000..93d82038af4b
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,510 @@
1/* Support for MMIO probes.
 2 * Benefits from much code borrowed from kprobes
3 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
4 * 2007 Alexander Eichner
5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */
7
8#include <linux/list.h>
9#include <linux/rculist.h>
10#include <linux/spinlock.h>
11#include <linux/hash.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/uaccess.h>
16#include <linux/ptrace.h>
17#include <linux/preempt.h>
18#include <linux/percpu.h>
19#include <linux/kdebug.h>
20#include <linux/mutex.h>
21#include <linux/io.h>
22#include <asm/cacheflush.h>
23#include <asm/tlbflush.h>
24#include <linux/errno.h>
25#include <asm/debugreg.h>
26#include <linux/mmiotrace.h>
27
28#define KMMIO_PAGE_HASH_BITS 4
29#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
30
31struct kmmio_fault_page {
32 struct list_head list;
33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */
35
36 /*
37 * Number of times this page has been registered as a part
38 * of a probe. If zero, page is disarmed and this may be freed.
39 * Used only by writers (RCU).
40 */
41 int count;
42};
43
44struct kmmio_delayed_release {
45 struct rcu_head rcu;
46 struct kmmio_fault_page *release_list;
47};
48
49struct kmmio_context {
50 struct kmmio_fault_page *fpage;
51 struct kmmio_probe *probe;
52 unsigned long saved_flags;
53 unsigned long addr;
54 int active;
55};
56
57static DEFINE_SPINLOCK(kmmio_lock);
58
59/* Protected by kmmio_lock */
60unsigned int kmmio_count;
61
62/* Read-protected by RCU, write-protected by kmmio_lock. */
63static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
64static LIST_HEAD(kmmio_probes);
65
66static struct list_head *kmmio_page_list(unsigned long page)
67{
68 return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
69}
70
71/* Accessed per-cpu */
72static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
73
74/*
75 * this is basically a dynamic stabbing problem:
76 * Could use the existing prio tree code or
77 * Possible better implementations:
78 * The Interval Skip List: A Data Structure for Finding All Intervals That
79 * Overlap a Point (might be simple)
80 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
81 */
82/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
83static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
84{
85 struct kmmio_probe *p;
86 list_for_each_entry_rcu(p, &kmmio_probes, list) {
87 if (addr >= p->addr && addr <= (p->addr + p->len))
88 return p;
89 }
90 return NULL;
91}
92
93/* You must be holding RCU read lock. */
94static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
95{
96 struct list_head *head;
97 struct kmmio_fault_page *p;
98
99 page &= PAGE_MASK;
100 head = kmmio_page_list(page);
101 list_for_each_entry_rcu(p, head, list) {
102 if (p->page == page)
103 return p;
104 }
105 return NULL;
106}
107
108static void set_page_present(unsigned long addr, bool present,
109 unsigned int *pglevel)
110{
111 pteval_t pteval;
112 pmdval_t pmdval;
113 unsigned int level;
114 pmd_t *pmd;
115 pte_t *pte = lookup_address(addr, &level);
116
117 if (!pte) {
118 pr_err("kmmio: no pte for page 0x%08lx\n", addr);
119 return;
120 }
121
122 if (pglevel)
123 *pglevel = level;
124
125 switch (level) {
126 case PG_LEVEL_2M:
127 pmd = (pmd_t *)pte;
128 pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
129 if (present)
130 pmdval |= _PAGE_PRESENT;
131 set_pmd(pmd, __pmd(pmdval));
132 break;
133
134 case PG_LEVEL_4K:
135 pteval = pte_val(*pte) & ~_PAGE_PRESENT;
136 if (present)
137 pteval |= _PAGE_PRESENT;
138 set_pte_atomic(pte, __pte(pteval));
139 break;
140
141 default:
142 pr_err("kmmio: unexpected page level 0x%x.\n", level);
143 return;
144 }
145
146 __flush_tlb_one(addr);
147}
148
149/** Mark the given page as not present. Access to it will trigger a fault. */
150static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
151{
152 set_page_present(page & PAGE_MASK, false, pglevel);
153}
154
155/** Mark the given page as present. */
156static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
157{
158 set_page_present(page & PAGE_MASK, true, pglevel);
159}
160
161/*
162 * This is being called from do_page_fault().
163 *
 164 * We may be in an interrupt or a critical section. Also prefetching may
 165 * trigger a page fault. We may be in the middle of a process switch.
 166 * We cannot take any locks, because we could already be executing
 167 * within a kmmio critical section.
168 *
169 * Local interrupts are disabled, so preemption cannot happen.
170 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
171 */
172/*
 173 * Interrupts are disabled on entry, as the page fault handler uses an
 174 * interrupt gate, and they remain disabled throughout this function.
175 */
176int kmmio_handler(struct pt_regs *regs, unsigned long addr)
177{
178 struct kmmio_context *ctx;
179 struct kmmio_fault_page *faultpage;
180 int ret = 0; /* default to fault not handled */
181
182 /*
183 * Preemption is now disabled to prevent process switch during
184 * single stepping. We can only handle one active kmmio trace
185 * per cpu, so ensure that we finish it before something else
186 * gets to run. We also hold the RCU read lock over single
187 * stepping to avoid looking up the probe and kmmio_fault_page
188 * again.
189 */
190 preempt_disable();
191 rcu_read_lock();
192
193 faultpage = get_kmmio_fault_page(addr);
194 if (!faultpage) {
195 /*
196 * Either this page fault is not caused by kmmio, or
197 * another CPU just pulled the kmmio probe from under
198 * our feet. The latter case should not be possible.
199 */
200 goto no_kmmio;
201 }
202
203 ctx = &get_cpu_var(kmmio_ctx);
204 if (ctx->active) {
205 disarm_kmmio_fault_page(faultpage->page, NULL);
206 if (addr == ctx->addr) {
207 /*
208 * On SMP we sometimes get recursive probe hits on the
209 * same address. Context is already saved, fall out.
210 */
211 pr_debug("kmmio: duplicate probe hit on CPU %d, for "
212 "address 0x%08lx.\n",
213 smp_processor_id(), addr);
214 ret = 1;
215 goto no_kmmio_ctx;
216 }
217 /*
218 * Prevent overwriting already in-flight context.
219 * This should not happen, let's hope disarming at least
220 * prevents a panic.
221 */
222 pr_emerg("kmmio: recursive probe hit on CPU %d, "
223 "for address 0x%08lx. Ignoring.\n",
224 smp_processor_id(), addr);
225 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
226 ctx->addr);
227 goto no_kmmio_ctx;
228 }
229 ctx->active++;
230
231 ctx->fpage = faultpage;
232 ctx->probe = get_kmmio_probe(addr);
233 ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
234 ctx->addr = addr;
235
236 if (ctx->probe && ctx->probe->pre_handler)
237 ctx->probe->pre_handler(ctx->probe, regs, addr);
238
239 /*
240 * Enable single-stepping and disable interrupts for the faulting
241 * context. Local interrupts must not get enabled during stepping.
242 */
243 regs->flags |= X86_EFLAGS_TF;
244 regs->flags &= ~X86_EFLAGS_IF;
245
246 /* Now we set present bit in PTE and single step. */
247 disarm_kmmio_fault_page(ctx->fpage->page, NULL);
248
249 /*
250 * If another cpu accesses the same page while we are stepping,
251 * the access will not be caught. It will simply succeed and the
252 * only downside is we lose the event. If this becomes a problem,
253 * the user should drop to single cpu before tracing.
254 */
255
256 put_cpu_var(kmmio_ctx);
257 return 1; /* fault handled */
258
259no_kmmio_ctx:
260 put_cpu_var(kmmio_ctx);
261no_kmmio:
262 rcu_read_unlock();
263 preempt_enable_no_resched();
264 return ret;
265}
266
267/*
268 * Interrupts are disabled on entry as trap1 is an interrupt gate
 269 * and they remain disabled throughout this function.
270 * This must always get called as the pair to kmmio_handler().
271 */
272static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
273{
274 int ret = 0;
275 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
276
277 if (!ctx->active) {
278 pr_debug("kmmio: spurious debug trap on CPU %d.\n",
279 smp_processor_id());
280 goto out;
281 }
282
283 if (ctx->probe && ctx->probe->post_handler)
284 ctx->probe->post_handler(ctx->probe, condition, regs);
285
286 arm_kmmio_fault_page(ctx->fpage->page, NULL);
287
288 regs->flags &= ~X86_EFLAGS_TF;
289 regs->flags |= ctx->saved_flags;
290
291 /* These were acquired in kmmio_handler(). */
292 ctx->active--;
293 BUG_ON(ctx->active);
294 rcu_read_unlock();
295 preempt_enable_no_resched();
296
297 /*
298 * if somebody else is singlestepping across a probe point, flags
299 * will have TF set, in which case, continue the remaining processing
300 * of do_debug, as if this is not a probe hit.
301 */
302 if (!(regs->flags & X86_EFLAGS_TF))
303 ret = 1;
304out:
305 put_cpu_var(kmmio_ctx);
306 return ret;
307}
308
309/* You must be holding kmmio_lock. */
310static int add_kmmio_fault_page(unsigned long page)
311{
312 struct kmmio_fault_page *f;
313
314 page &= PAGE_MASK;
315 f = get_kmmio_fault_page(page);
316 if (f) {
317 if (!f->count)
318 arm_kmmio_fault_page(f->page, NULL);
319 f->count++;
320 return 0;
321 }
322
323 f = kmalloc(sizeof(*f), GFP_ATOMIC);
324 if (!f)
325 return -1;
326
327 f->count = 1;
328 f->page = page;
329 list_add_rcu(&f->list, kmmio_page_list(f->page));
330
331 arm_kmmio_fault_page(f->page, NULL);
332
333 return 0;
334}
335
336/* You must be holding kmmio_lock. */
337static void release_kmmio_fault_page(unsigned long page,
338 struct kmmio_fault_page **release_list)
339{
340 struct kmmio_fault_page *f;
341
342 page &= PAGE_MASK;
343 f = get_kmmio_fault_page(page);
344 if (!f)
345 return;
346
347 f->count--;
348 BUG_ON(f->count < 0);
349 if (!f->count) {
350 disarm_kmmio_fault_page(f->page, NULL);
351 f->release_next = *release_list;
352 *release_list = f;
353 }
354}
355
356/*
357 * With page-unaligned ioremaps, one or two armed pages may contain
358 * addresses from outside the intended mapping. Events for these addresses
359 * are currently silently dropped. The events may result only from programming
360 * mistakes by accessing addresses before the beginning or past the end of a
361 * mapping.
362 */
363int register_kmmio_probe(struct kmmio_probe *p)
364{
365 unsigned long flags;
366 int ret = 0;
367 unsigned long size = 0;
368 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
369
370 spin_lock_irqsave(&kmmio_lock, flags);
371 if (get_kmmio_probe(p->addr)) {
372 ret = -EEXIST;
373 goto out;
374 }
375 kmmio_count++;
376 list_add_rcu(&p->list, &kmmio_probes);
377 while (size < size_lim) {
378 if (add_kmmio_fault_page(p->addr + size))
379 pr_err("kmmio: Unable to set page fault.\n");
380 size += PAGE_SIZE;
381 }
382out:
383 spin_unlock_irqrestore(&kmmio_lock, flags);
384 /*
385 * XXX: What should I do here?
386 * Here was a call to global_flush_tlb(), but it does not exist
387 * anymore. It seems it's not needed after all.
388 */
389 return ret;
390}
391EXPORT_SYMBOL(register_kmmio_probe);
392
393static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
394{
395 struct kmmio_delayed_release *dr = container_of(
396 head,
397 struct kmmio_delayed_release,
398 rcu);
399 struct kmmio_fault_page *p = dr->release_list;
400 while (p) {
401 struct kmmio_fault_page *next = p->release_next;
402 BUG_ON(p->count);
403 kfree(p);
404 p = next;
405 }
406 kfree(dr);
407}
408
409static void remove_kmmio_fault_pages(struct rcu_head *head)
410{
411 struct kmmio_delayed_release *dr = container_of(
412 head,
413 struct kmmio_delayed_release,
414 rcu);
415 struct kmmio_fault_page *p = dr->release_list;
416 struct kmmio_fault_page **prevp = &dr->release_list;
417 unsigned long flags;
418 spin_lock_irqsave(&kmmio_lock, flags);
419 while (p) {
420 if (!p->count)
421 list_del_rcu(&p->list);
422 else
423 *prevp = p->release_next;
424 prevp = &p->release_next;
425 p = p->release_next;
426 }
427 spin_unlock_irqrestore(&kmmio_lock, flags);
428 /* This is the real RCU destroy call. */
429 call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
430}
431
432/*
433 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
434 * sure that the callbacks will not be called anymore. Only after that
435 * you may actually release your struct kmmio_probe.
436 *
437 * Unregistering a kmmio fault page has three steps:
438 * 1. release_kmmio_fault_page()
439 * Disarm the page, wait a grace period to let all faults finish.
440 * 2. remove_kmmio_fault_pages()
441 * Remove the pages from kmmio_page_table.
442 * 3. rcu_free_kmmio_fault_pages()
 443 *    Actually free the kmmio_fault_page structs via RCU.
444 */
445void unregister_kmmio_probe(struct kmmio_probe *p)
446{
447 unsigned long flags;
448 unsigned long size = 0;
449 const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
450 struct kmmio_fault_page *release_list = NULL;
451 struct kmmio_delayed_release *drelease;
452
453 spin_lock_irqsave(&kmmio_lock, flags);
454 while (size < size_lim) {
455 release_kmmio_fault_page(p->addr + size, &release_list);
456 size += PAGE_SIZE;
457 }
458 list_del_rcu(&p->list);
459 kmmio_count--;
460 spin_unlock_irqrestore(&kmmio_lock, flags);
461
462 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
463 if (!drelease) {
464 pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
465 return;
466 }
467 drelease->release_list = release_list;
468
469 /*
470 * This is not really RCU here. We have just disarmed a set of
471 * pages so that they cannot trigger page faults anymore. However,
472 * we cannot remove the pages from kmmio_page_table,
473 * because a probe hit might be in flight on another CPU. The
474 * pages are collected into a list, and they will be removed from
475 * kmmio_page_table when it is certain that no probe hit related to
476 * these pages can be in flight. RCU grace period sounds like a
477 * good choice.
478 *
479 * If we removed the pages too early, kmmio page fault handler might
480 * not find the respective kmmio_fault_page and determine it's not
481 * a kmmio fault, when it actually is. This would lead to madness.
482 */
483 call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
484}
485EXPORT_SYMBOL(unregister_kmmio_probe);
486
487static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
488 void *args)
489{
490 struct die_args *arg = args;
491
492 if (val == DIE_DEBUG && (arg->err & DR_STEP))
493 if (post_kmmio_handler(arg->err, arg->regs) == 1)
494 return NOTIFY_STOP;
495
496 return NOTIFY_DONE;
497}
498
499static struct notifier_block nb_die = {
500 .notifier_call = kmmio_die_notifier
501};
502
503static int __init init_kmmio(void)
504{
505 int i;
506 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
507 INIT_LIST_HEAD(&kmmio_page_table[i]);
508 return register_die_notifier(&nb_die);
509}
510fs_initcall(init_kmmio); /* should be before device_initcall() */
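The new kmmio.c exposes register_kmmio_probe()/unregister_kmmio_probe(), and the comment block before unregister_kmmio_probe() requires a synchronize_rcu() before the struct kmmio_probe may be freed. A minimal, hypothetical client sketch of that protocol; the handler bodies, helper names and the traced region are placeholders:

#include <linux/mmiotrace.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

static void example_pre(struct kmmio_probe *p, struct pt_regs *regs,
			unsigned long addr)
{
	/* runs with the faulting page disarmed, before single-stepping */
}

static void example_post(struct kmmio_probe *p, unsigned long cond,
			 struct pt_regs *regs)
{
	/* runs after the faulting instruction has been single-stepped */
}

static struct kmmio_probe *example_trace_region(unsigned long mmio_addr,
						unsigned long len)
{
	struct kmmio_probe *p = kzalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return NULL;
	p->addr = mmio_addr;
	p->len = len;
	p->pre_handler = example_pre;
	p->post_handler = example_post;
	if (register_kmmio_probe(p)) {	/* -EEXIST if already probed */
		kfree(p);
		return NULL;
	}
	return p;
}

static void example_untrace(struct kmmio_probe *p)
{
	unregister_kmmio_probe(p);
	synchronize_rcu();	/* callbacks are guaranteed finished only now */
	kfree(p);
}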
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 000000000000..672e17f8262a
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,123 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
76 }
77
78}
79
80/* default is disabled */
81static int memtest_pattern __initdata;
82
83static int __init parse_memtest(char *arg)
84{
85 if (arg)
86 memtest_pattern = simple_strtoul(arg, NULL, 0);
87 return 0;
88}
89
90early_param("memtest", parse_memtest);
91
92void __init early_memtest(unsigned long start, unsigned long end)
93{
94 u64 t_start, t_size;
95 unsigned pattern;
96
97 if (!memtest_pattern)
98 return;
99
100 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
101 for (pattern = 0; pattern < memtest_pattern; pattern++) {
102 t_start = start;
103 t_size = 0;
104 while (t_start < end) {
105 t_start = find_e820_area_size(t_start, &t_size, 1);
106
107 /* done ? */
108 if (t_start >= end)
109 break;
110 if (t_start + t_size > end)
111 t_size = end - t_start;
112
113 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
114 (unsigned long long)t_start,
115 (unsigned long long)t_start + t_size, pattern);
116
117 memtest(t_start, t_size, pattern);
118
119 t_start += t_size;
120 }
121 }
122 printk(KERN_CONT "\n");
123}
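The new memtest.c is driven by the memtest=N early parameter: N selects how many of the four patterns (0x0, all ones, 0x5555..., 0xaaaa...) are written and verified over free e820 RAM, and bad ranges are reserved as "BAD RAM". The only caller-facing entry point is early_memtest(), invoked right after a physical range has been mapped (see the init_64.c hunk at the top of this diff). A trivial, hypothetical call-site sketch:

#include <linux/init.h>

void early_memtest(unsigned long start, unsigned long end);

/* Hypothetical: verify a freshly mapped physical range before bootmem is up. */
static void __init memtest_mapped_range_sketch(unsigned long start_phys,
					       unsigned long end_phys)
{
	/* No-op unless the user booted with memtest=N (N > 0 patterns). */
	early_memtest(start_phys, end_phys);
}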
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000000..635b50e85581
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,517 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2005
17 * Jeff Muizelaar, 2006, 2007
18 * Pekka Paalanen, 2008 <pq@iki.fi>
19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */
22#define DEBUG 1
23
24#include <linux/module.h>
25#include <linux/debugfs.h>
26#include <linux/uaccess.h>
27#include <linux/io.h>
28#include <linux/version.h>
29#include <linux/kallsyms.h>
30#include <asm/pgtable.h>
31#include <linux/mmiotrace.h>
32#include <asm/e820.h> /* for ISA_START_ADDRESS */
33#include <asm/atomic.h>
34#include <linux/percpu.h>
35#include <linux/cpu.h>
36
37#include "pf_in.h"
38
39#define NAME "mmiotrace: "
40
41struct trap_reason {
42 unsigned long addr;
43 unsigned long ip;
44 enum reason_type type;
45 int active_traces;
46};
47
48struct remap_trace {
49 struct list_head list;
50 struct kmmio_probe probe;
51 resource_size_t phys;
52 unsigned long id;
53};
54
55/* Accessed per-cpu. */
56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58
59#if 0 /* XXX: no way to gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled;
69static LIST_HEAD(trace_list); /* struct remap_trace */
70
71/*
72 * Locking in this file:
73 * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
74 * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
75 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */
81
82/* module parameters */
83static unsigned long filter_offset;
84static int nommiotrace;
85static int trace_pc;
86
87module_param(filter_offset, ulong, 0);
88module_param(nommiotrace, bool, 0);
89module_param(trace_pc, bool, 0);
90
91MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
92MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
93MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
94
95static bool is_enabled(void)
96{
97 return atomic_read(&mmiotrace_enabled);
98}
99
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address)
139{
140 unsigned int level;
141 pte_t *pte = lookup_address(address, &level);
142
143 if (!pte) {
144 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
145 __func__, address);
146 return;
147 }
148
149 if (level == PG_LEVEL_2M) {
 150 pr_emerg(NAME "2M/4M pages are not currently supported: "
151 "0x%08lx\n", address);
152 BUG();
153 }
154 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
155 (unsigned long long)pte_val(*pte),
156 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
157}
158
159/*
160 * For some reason the pre/post pairs have been called in an
161 * unmatched order. Report and die.
162 */
163static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
164{
165 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
166 pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
167 "last fault for address: 0x%08lx\n",
168 addr, my_reason->addr);
169 print_pte(addr);
170 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
171 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
172#ifdef __i386__
173 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
174 regs->ax, regs->bx, regs->cx, regs->dx);
175 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
176 regs->si, regs->di, regs->bp, regs->sp);
177#else
178 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
179 regs->ax, regs->cx, regs->dx);
180 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
181 regs->si, regs->di, regs->bp, regs->sp);
182#endif
183 put_cpu_var(pf_reason);
184 BUG();
185}
186
187static void pre(struct kmmio_probe *p, struct pt_regs *regs,
188 unsigned long addr)
189{
190 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
191 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
192 const unsigned long instptr = instruction_pointer(regs);
193 const enum reason_type type = get_ins_type(instptr);
194 struct remap_trace *trace = p->private;
195
196 /* it doesn't make sense to have more than one active trace per cpu */
197 if (my_reason->active_traces)
198 die_kmmio_nesting_error(regs, addr);
199 else
200 my_reason->active_traces++;
201
202 my_reason->type = type;
203 my_reason->addr = addr;
204 my_reason->ip = instptr;
205
206 my_trace->phys = addr - trace->probe.addr + trace->phys;
207 my_trace->map_id = trace->id;
208
209 /*
210 * Only record the program counter when requested.
211 * It may taint clean-room reverse engineering.
212 */
213 if (trace_pc)
214 my_trace->pc = instptr;
215 else
216 my_trace->pc = 0;
217
218 /*
219 * XXX: the timestamp recorded will be *after* the tracing has been
220 * done, not at the time we hit the instruction. SMP implications
221 * on event ordering?
222 */
223
224 switch (type) {
225 case REG_READ:
226 my_trace->opcode = MMIO_READ;
227 my_trace->width = get_ins_mem_width(instptr);
228 break;
229 case REG_WRITE:
230 my_trace->opcode = MMIO_WRITE;
231 my_trace->width = get_ins_mem_width(instptr);
232 my_trace->value = get_ins_reg_val(instptr, regs);
233 break;
234 case IMM_WRITE:
235 my_trace->opcode = MMIO_WRITE;
236 my_trace->width = get_ins_mem_width(instptr);
237 my_trace->value = get_ins_imm_val(instptr);
238 break;
239 default:
240 {
241 unsigned char *ip = (unsigned char *)instptr;
242 my_trace->opcode = MMIO_UNKNOWN_OP;
243 my_trace->width = 0;
244 my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
245 *(ip + 2);
246 }
247 }
248 put_cpu_var(cpu_trace);
249 put_cpu_var(pf_reason);
250}
251
252static void post(struct kmmio_probe *p, unsigned long condition,
253 struct pt_regs *regs)
254{
255 struct trap_reason *my_reason = &get_cpu_var(pf_reason);
256 struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
257
258 /* this should always return the active_trace count to 0 */
259 my_reason->active_traces--;
260 if (my_reason->active_traces) {
261 pr_emerg(NAME "unexpected post handler");
262 BUG();
263 }
264
265 switch (my_reason->type) {
266 case REG_READ:
267 my_trace->value = get_ins_reg_val(my_reason->ip, regs);
268 break;
269 default:
270 break;
271 }
272
273 mmio_trace_rw(my_trace);
274 put_cpu_var(cpu_trace);
275 put_cpu_var(pf_reason);
276}
277
278static void ioremap_trace_core(resource_size_t offset, unsigned long size,
279 void __iomem *addr)
280{
281 static atomic_t next_id;
282 struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
283 /* These are page-unaligned. */
284 struct mmiotrace_map map = {
285 .phys = offset,
286 .virt = (unsigned long)addr,
287 .len = size,
288 .opcode = MMIO_PROBE
289 };
290
291 if (!trace) {
292 pr_err(NAME "kmalloc failed in ioremap\n");
293 return;
294 }
295
296 *trace = (struct remap_trace) {
297 .probe = {
298 .addr = (unsigned long)addr,
299 .len = size,
300 .pre_handler = pre,
301 .post_handler = post,
302 .private = trace
303 },
304 .phys = offset,
305 .id = atomic_inc_return(&next_id)
306 };
307 map.map_id = trace->id;
308
309 spin_lock_irq(&trace_lock);
310 if (!is_enabled())
311 goto not_enabled;
312
313 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list);
315 if (!nommiotrace)
316 register_kmmio_probe(&trace->probe);
317
318not_enabled:
319 spin_unlock_irq(&trace_lock);
320}
321
322void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
323 void __iomem *addr)
324{
325 if (!is_enabled()) /* recheck and proper locking in *_core() */
326 return;
327
328 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
329 (unsigned long long)offset, size, addr);
330 if ((filter_offset) && (offset != filter_offset))
331 return;
332 ioremap_trace_core(offset, size, addr);
333}
334
335static void iounmap_trace_core(volatile void __iomem *addr)
336{
337 struct mmiotrace_map map = {
338 .phys = 0,
339 .virt = (unsigned long)addr,
340 .len = 0,
341 .opcode = MMIO_UNPROBE
342 };
343 struct remap_trace *trace;
344 struct remap_trace *tmp;
345 struct remap_trace *found_trace = NULL;
346
347 pr_debug(NAME "Unmapping %p.\n", addr);
348
349 spin_lock_irq(&trace_lock);
350 if (!is_enabled())
351 goto not_enabled;
352
353 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
354 if ((unsigned long)addr == trace->probe.addr) {
355 if (!nommiotrace)
356 unregister_kmmio_probe(&trace->probe);
357 list_del(&trace->list);
358 found_trace = trace;
359 break;
360 }
361 }
362 map.map_id = (found_trace) ? found_trace->id : -1;
363 mmio_trace_mapping(&map);
364
365not_enabled:
366 spin_unlock_irq(&trace_lock);
367 if (found_trace) {
368 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
369 kfree(found_trace);
370 }
371}
372
373void mmiotrace_iounmap(volatile void __iomem *addr)
374{
375 might_sleep();
376 if (is_enabled()) /* recheck and proper locking in *_core() */
377 iounmap_trace_core(addr);
378}
379
380static void clear_trace_list(void)
381{
382 struct remap_trace *trace;
383 struct remap_trace *tmp;
384
385 /*
386 * No locking required, because the caller ensures we are in a
387 * critical section via mutex, and is_enabled() is false,
388 * i.e. nothing can traverse or modify this list.
389 * Caller also ensures is_enabled() cannot change.
390 */
391 list_for_each_entry(trace, &trace_list, list) {
392 pr_notice(NAME "purging non-iounmapped "
393 "trace @0x%08lx, size 0x%lx.\n",
394 trace->probe.addr, trace->probe.len);
395 if (!nommiotrace)
396 unregister_kmmio_probe(&trace->probe);
397 }
398 synchronize_rcu(); /* unregister_kmmio_probe() requirement */
399
400 list_for_each_entry_safe(trace, tmp, &trace_list, list) {
401 list_del(&trace->list);
402 kfree(trace);
403 }
404}
405
406#ifdef CONFIG_HOTPLUG_CPU
407static cpumask_t downed_cpus;
408
409static void enter_uniprocessor(void)
410{
411 int cpu;
412 int err;
413
414 get_online_cpus();
415 downed_cpus = cpu_online_map;
416 cpu_clear(first_cpu(cpu_online_map), downed_cpus);
417 if (num_online_cpus() > 1)
418 pr_notice(NAME "Disabling non-boot CPUs...\n");
419 put_online_cpus();
420
421 for_each_cpu_mask(cpu, downed_cpus) {
422 err = cpu_down(cpu);
423 if (!err)
424 pr_info(NAME "CPU%d is down.\n", cpu);
425 else
426 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
427 }
428 if (num_online_cpus() > 1)
429 pr_warning(NAME "multiple CPUs still online, "
430 "may miss events.\n");
431}
432
433/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
 434 but this whole function is under #ifdef CONFIG_HOTPLUG_CPU */
435static void __ref leave_uniprocessor(void)
436{
437 int cpu;
438 int err;
439
440 if (cpus_weight(downed_cpus) == 0)
441 return;
442 pr_notice(NAME "Re-enabling CPUs...\n");
443 for_each_cpu_mask(cpu, downed_cpus) {
444 err = cpu_up(cpu);
445 if (!err)
446 pr_info(NAME "enabled CPU%d.\n", cpu);
447 else
448 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
449 }
450}
451
452#else /* !CONFIG_HOTPLUG_CPU */
453static void enter_uniprocessor(void)
454{
455 if (num_online_cpus() > 1)
456 pr_warning(NAME "multiple CPUs are online, may miss events. "
457 "Suggest booting with maxcpus=1 kernel argument.\n");
458}
459
460static void leave_uniprocessor(void)
461{
462}
463#endif
464
465#if 0 /* XXX: out of order */
466static struct file_operations fops_marker = {
467 .owner = THIS_MODULE,
468 .write = write_marker
469};
470#endif
471
472void enable_mmiotrace(void)
473{
474 mutex_lock(&mmiotrace_mutex);
475 if (is_enabled())
476 goto out;
477
478#if 0 /* XXX: tracing does not support text entries */
479 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
480 &fops_marker);
481 if (!marker_file)
482 pr_err(NAME "marker file creation failed.\n");
483#endif
484
485 if (nommiotrace)
486 pr_info(NAME "MMIO tracing disabled.\n");
487 enter_uniprocessor();
488 spin_lock_irq(&trace_lock);
489 atomic_inc(&mmiotrace_enabled);
490 spin_unlock_irq(&trace_lock);
491 pr_info(NAME "enabled.\n");
492out:
493 mutex_unlock(&mmiotrace_mutex);
494}
495
496void disable_mmiotrace(void)
497{
498 mutex_lock(&mmiotrace_mutex);
499 if (!is_enabled())
500 goto out;
501
502 spin_lock_irq(&trace_lock);
503 atomic_dec(&mmiotrace_enabled);
504 BUG_ON(is_enabled());
505 spin_unlock_irq(&trace_lock);
506
507 clear_trace_list(); /* guarantees: no more kmmio callbacks */
508 leave_uniprocessor();
509 if (marker_file) {
510 debugfs_remove(marker_file);
511 marker_file = NULL;
512 }
513
514 pr_info(NAME "disabled.\n");
515out:
516 mutex_unlock(&mmiotrace_mutex);
517}
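The "Locking in this file" comment near the top of mmio-mod.c prescribes the pattern that ioremap_trace_core() and iounmap_trace_core() follow: take trace_lock, re-check is_enabled() under the lock, and only then emit trace data or touch trace_list. A condensed sketch of that pattern, written as a hypothetical new routine inside this same file (trace_lock, is_enabled() and mmio_trace_mapping() are the existing statics and APIs shown above):

/* Hypothetical addition to mmio-mod.c: emit one mapping record safely. */
static void trace_one_mapping_sketch(struct mmiotrace_map *map)
{
	spin_lock_irq(&trace_lock);
	if (!is_enabled())	/* may have raced with disable_mmiotrace() */
		goto out;
	mmio_trace_mapping(map);	/* safe: enabled, trace_lock held */
out:
	spin_unlock_irq(&trace_lock);
}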
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index c5066d519e5d..cebcbf152d46 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,37 +20,18 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifndef Dprintk
24#define Dprintk(x...)
25#endif
26
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
29 25
30bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31
32struct memnode memnode; 26struct memnode memnode;
33 27
34#ifdef CONFIG_SMP
35int x86_cpu_to_node_map_init[NR_CPUS] = {
36 [0 ... NR_CPUS-1] = NUMA_NO_NODE
37};
38void *x86_cpu_to_node_map_early_ptr;
39EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
40#endif
41DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
42EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
45 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 29 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46}; 30};
47 31
48cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
49EXPORT_SYMBOL(node_to_cpumask_map);
50
51int numa_off __initdata; 32int numa_off __initdata;
52unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
53unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
54 35
55/* 36/*
56 * Given a shift value, try to populate memnodemap[] 37 * Given a shift value, try to populate memnodemap[]
@@ -98,8 +79,8 @@ static int __init allocate_cachealigned_memnodemap(void)
98 return 0; 79 return 0;
99 80
100 addr = 0x8000; 81 addr = 0x8000;
101 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 82 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
102 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, 83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
103 nodemap_size, L1_CACHE_BYTES); 84 nodemap_size, L1_CACHE_BYTES);
104 if (nodemap_addr == -1UL) { 85 if (nodemap_addr == -1UL) {
105 printk(KERN_ERR 86 printk(KERN_ERR
@@ -192,19 +173,19 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
192void __init setup_node_bootmem(int nodeid, unsigned long start, 173void __init setup_node_bootmem(int nodeid, unsigned long start,
193 unsigned long end) 174 unsigned long end)
194{ 175{
195 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; 176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
196 unsigned long bootmap_start, nodedata_phys; 177 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 178 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 179 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
199 int nid; 180 int nid;
200 181
201 start = round_up(start, ZONE_ALIGN); 182 start = roundup(start, ZONE_ALIGN);
202 183
203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
204 start, end); 185 start, end);
205 186
206 start_pfn = start >> PAGE_SHIFT; 187 start_pfn = start >> PAGE_SHIFT;
207 end_pfn = end >> PAGE_SHIFT; 188 last_pfn = end >> PAGE_SHIFT;
208 189
209 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 190 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
210 SMP_CACHE_BYTES); 191 SMP_CACHE_BYTES);
@@ -215,9 +196,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
215 nodedata_phys + pgdat_size - 1); 196 nodedata_phys + pgdat_size - 1);
216 197
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 201 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 202
222 /* 203 /*
223 * Find a place for the bootmem map 204 * Find a place for the bootmem map
@@ -226,14 +207,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
226 * early_node_mem will get that with find_e820_area instead 207 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range 208 * of alloc_bootmem, that could clash with reserved range
228 */ 209 */
229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 211 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid) 212 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else 214 else
234 bootmap_start = round_up(start, PAGE_SIZE); 215 bootmap_start = roundup(start, PAGE_SIZE);
235 /* 216 /*
236 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like 217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 218 * to use that to align to PAGE_SIZE
238 */ 219 */
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 220 bootmap = early_node_mem(nodeid, bootmap_start, end,
@@ -248,7 +229,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
248 229
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 230 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 231 bootmap_start >> PAGE_SHIFT,
251 start_pfn, end_pfn); 232 start_pfn, last_pfn);
252 233
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 234 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 235 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +290,7 @@ void __init numa_init_array(void)
309 290
310#ifdef CONFIG_NUMA_EMU 291#ifdef CONFIG_NUMA_EMU
311/* Numa emulation */ 292/* Numa emulation */
312char *cmdline __initdata; 293static char *cmdline __initdata;
313 294
314/* 295/*
315 * Sets up nid to range from addr to addr + size. If the end 296
@@ -413,15 +394,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413} 394}
414 395
415/* 396/*
416 * Sets up the system RAM area from start_pfn to end_pfn according to the 397 * Sets up the system RAM area from start_pfn to last_pfn according to the
417 * numa=fake command-line option. 398 * numa=fake command-line option.
418 */ 399 */
419static struct bootnode nodes[MAX_NUMNODES] __initdata; 400static struct bootnode nodes[MAX_NUMNODES] __initdata;
420 401
421static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 402static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
422{ 403{
423 u64 size, addr = start_pfn << PAGE_SHIFT; 404 u64 size, addr = start_pfn << PAGE_SHIFT;
424 u64 max_addr = end_pfn << PAGE_SHIFT; 405 u64 max_addr = last_pfn << PAGE_SHIFT;
425 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 406 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
426 407
427 memset(&nodes, 0, sizeof(nodes)); 408 memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +508,7 @@ out:
527} 508}
528#endif /* CONFIG_NUMA_EMU */ 509#endif /* CONFIG_NUMA_EMU */
529 510
530void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 511void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
531{ 512{
532 int i; 513 int i;
533 514
@@ -535,7 +516,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
535 nodes_clear(node_online_map); 516 nodes_clear(node_online_map);
536 517
537#ifdef CONFIG_NUMA_EMU 518#ifdef CONFIG_NUMA_EMU
538 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 519 if (cmdline && !numa_emulation(start_pfn, last_pfn))
539 return; 520 return;
540 nodes_clear(node_possible_map); 521 nodes_clear(node_possible_map);
541 nodes_clear(node_online_map); 522 nodes_clear(node_online_map);
@@ -543,7 +524,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
543 524
544#ifdef CONFIG_ACPI_NUMA 525#ifdef CONFIG_ACPI_NUMA
545 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 526 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
546 end_pfn << PAGE_SHIFT)) 527 last_pfn << PAGE_SHIFT))
547 return; 528 return;
548 nodes_clear(node_possible_map); 529 nodes_clear(node_possible_map);
549 nodes_clear(node_online_map); 530 nodes_clear(node_online_map);
@@ -551,7 +532,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
551 532
552#ifdef CONFIG_K8_NUMA 533#ifdef CONFIG_K8_NUMA
553 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 534 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
554 end_pfn<<PAGE_SHIFT)) 535 last_pfn<<PAGE_SHIFT))
555 return; 536 return;
556 nodes_clear(node_possible_map); 537 nodes_clear(node_possible_map);
557 nodes_clear(node_online_map); 538 nodes_clear(node_online_map);
@@ -561,7 +542,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
561 542
562 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 543 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
563 start_pfn << PAGE_SHIFT, 544 start_pfn << PAGE_SHIFT,
564 end_pfn << PAGE_SHIFT); 545 last_pfn << PAGE_SHIFT);
565 /* setup dummy node covering all memory */ 546 /* setup dummy node covering all memory */
566 memnode_shift = 63; 547 memnode_shift = 63;
567 memnodemap = memnode.embedded_map; 548 memnodemap = memnode.embedded_map;
@@ -570,29 +551,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
570 node_set(0, node_possible_map); 551 node_set(0, node_possible_map);
571 for (i = 0; i < NR_CPUS; i++) 552 for (i = 0; i < NR_CPUS; i++)
572 numa_set_node(i, 0); 553 numa_set_node(i, 0);
573 /* cpumask_of_cpu() may not be available during early startup */ 554 e820_register_active_regions(0, start_pfn, last_pfn);
574 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); 555 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
575 cpu_set(0, node_to_cpumask_map[0]);
576 e820_register_active_regions(0, start_pfn, end_pfn);
577 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
578}
579
580__cpuinit void numa_add_cpu(int cpu)
581{
582 set_bit(cpu,
583 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
584}
585
586void __cpuinit numa_set_node(int cpu, int node)
587{
588 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
589
590 if(cpu_to_node_map)
591 cpu_to_node_map[cpu] = node;
592 else if(per_cpu_offset(cpu))
593 per_cpu(x86_cpu_to_node_map, cpu) = node;
594 else
595 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
596} 556}
597 557
598unsigned long __init numa_free_all_bootmem(void) 558unsigned long __init numa_free_all_bootmem(void)
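
The fallback path above relies on a small arithmetic trick: with memnode_shift set to 63, every sane physical address shifts down to index 0, so the single embedded map entry (node 0) answers all lookups. A tiny sketch of just that arithmetic, nothing kernel-specific:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Sketch: with a shift of 63, every physical address below 2^63
         * indexes slot 0 of the map, i.e. the single fake node. */
        uint64_t addrs[] = { 0x1000ULL, 0xffffffffULL, 0x123456789abcULL };

        for (int i = 0; i < 3; i++)
                printf("%#llx >> 63 = %llu\n",
                       (unsigned long long)addrs[i],
                       (unsigned long long)(addrs[i] >> 63));
        return 0;
}
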
@@ -613,7 +573,7 @@ void __init paging_init(void)
613 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 573 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
614 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 574 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
615 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 575 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
616 max_zone_pfns[ZONE_NORMAL] = end_pfn; 576 max_zone_pfns[ZONE_NORMAL] = max_pfn;
617 577
618 sparse_memory_present_with_active_regions(MAX_NUMNODES); 578 sparse_memory_present_with_active_regions(MAX_NUMNODES);
619 sparse_init(); 579 sparse_init();
@@ -641,6 +601,7 @@ static __init int numa_setup(char *opt)
641} 601}
642early_param("numa", numa_setup); 602early_param("numa", numa_setup);
643 603
604#ifdef CONFIG_NUMA
644/* 605/*
645 * Setup early cpu_to_node. 606 * Setup early cpu_to_node.
646 * 607 *
@@ -652,14 +613,19 @@ early_param("numa", numa_setup);
652 * is already initialized in a round robin manner at numa_init_array, 613 * is already initialized in a round robin manner at numa_init_array,
653 * prior to this call, and this initialization is good enough 614 * prior to this call, and this initialization is good enough
654 * for the fake NUMA cases. 615 * for the fake NUMA cases.
616 *
617 * Called before the per_cpu areas are set up.
655 */ 618 */
656void __init init_cpu_to_node(void) 619void __init init_cpu_to_node(void)
657{ 620{
658 int i; 621 int cpu;
622 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
659 623
660 for (i = 0; i < NR_CPUS; i++) { 624 BUG_ON(cpu_to_apicid == NULL);
625
626 for_each_possible_cpu(cpu) {
661 int node; 627 int node;
662 u16 apicid = x86_cpu_to_apicid_init[i]; 628 u16 apicid = cpu_to_apicid[cpu];
663 629
664 if (apicid == BAD_APICID) 630 if (apicid == BAD_APICID)
665 continue; 631 continue;
@@ -668,8 +634,9 @@ void __init init_cpu_to_node(void)
668 continue; 634 continue;
669 if (!node_online(node)) 635 if (!node_online(node))
670 continue; 636 continue;
671 numa_set_node(i, node); 637 numa_set_node(cpu, node);
672 } 638 }
673} 639}
640#endif
674 641
675 642
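
The new init_cpu_to_node() walks cpu -> apicid -> node and skips entries that are not yet known. A self-contained sketch of that walk with invented table sizes and contents (the constants below are illustrative, not the kernel's):

#include <stdio.h>
#include <stdint.h>

#define SK_NR_CPUS      4
#define SK_MAX_APIC     8
#define SK_BAD_APICID   0xffffU
#define SK_NO_NODE      (-1)

int main(void)
{
        uint16_t cpu_to_apicid[SK_NR_CPUS] = { 0, 2, 4, SK_BAD_APICID };
        int16_t  apicid_to_node[SK_MAX_APIC] =
                { 0, SK_NO_NODE, 0, SK_NO_NODE, 1, SK_NO_NODE, 1, SK_NO_NODE };

        for (int cpu = 0; cpu < SK_NR_CPUS; cpu++) {
                uint16_t apicid = cpu_to_apicid[cpu];

                if (apicid == SK_BAD_APICID)
                        continue;               /* cpu not present yet */
                if (apicid_to_node[apicid] == SK_NO_NODE)
                        continue;               /* no node info for it */
                printf("cpu %d -> node %d\n", cpu, apicid_to_node[apicid]);
        }
        return 0;
}
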
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b109aae8..e1d106909218 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * self test for change_page_attr. 2 * self test for change_page_attr.
3 * 3 *
4 * Clears the global bit on random pages in the direct mapping, then reverts 4 * Clears a test pte bit on random pages in the direct mapping,
5 * and compares page tables forwards and afterwards. 5 * then reverts and compares page tables forwards and afterwards.
6 */ 6 */
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/kthread.h> 8#include <linux/kthread.h>
@@ -32,6 +32,13 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
36
37static int pte_testbit(pte_t pte)
38{
39 return pte_flags(pte) & _PAGE_UNUSED1;
40}
41
35struct split_state { 42struct split_state {
36 long lpg, gpg, spg, exec; 43 long lpg, gpg, spg, exec;
37 long min_exec, max_exec; 44 long min_exec, max_exec;
@@ -111,6 +118,7 @@ static int pageattr_test(void)
111 unsigned int level; 118 unsigned int level;
112 int i, k; 119 int i, k;
113 int err; 120 int err;
121 unsigned long test_addr;
114 122
115 if (print) 123 if (print)
116 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
@@ -165,15 +173,15 @@ static int pageattr_test(void)
165 continue; 173 continue;
166 } 174 }
167 175
168 err = change_page_attr_clear(addr[i], len[i], 176 test_addr = addr[i];
169 __pgprot(_PAGE_GLOBAL)); 177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
170 if (err < 0) { 178 if (err < 0) {
171 printk(KERN_ERR "CPA %d failed %d\n", i, err); 179 printk(KERN_ERR "CPA %d failed %d\n", i, err);
172 failed++; 180 failed++;
173 } 181 }
174 182
175 pte = lookup_address(addr[i], &level); 183 pte = lookup_address(addr[i], &level);
176 if (!pte || pte_global(*pte) || pte_huge(*pte)) { 184 if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
177 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], 185 printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
178 pte ? (u64)pte_val(*pte) : 0ULL); 186 pte ? (u64)pte_val(*pte) : 0ULL);
179 failed++; 187 failed++;
@@ -198,14 +206,14 @@ static int pageattr_test(void)
198 failed++; 206 failed++;
199 continue; 207 continue;
200 } 208 }
201 err = change_page_attr_set(addr[i], len[i], 209 test_addr = addr[i];
202 __pgprot(_PAGE_GLOBAL)); 210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
203 if (err < 0) { 211 if (err < 0) {
204 printk(KERN_ERR "CPA reverting failed: %d\n", err); 212 printk(KERN_ERR "CPA reverting failed: %d\n", err);
205 failed++; 213 failed++;
206 } 214 }
207 pte = lookup_address(addr[i], &level); 215 pte = lookup_address(addr[i], &level);
208 if (!pte || !pte_global(*pte)) { 216 if (!pte || pte_testbit(*pte)) {
209 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", 217 printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
210 addr[i], pte ? (u64)pte_val(*pte) : 0ULL); 218 addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
211 failed++; 219 failed++;
@@ -216,8 +224,7 @@ static int pageattr_test(void)
216 failed += print_split(&sc); 224 failed += print_split(&sc);
217 225
218 if (failed) { 226 if (failed) {
219 printk(KERN_ERR "NOT PASSED. Please report.\n"); 227 WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
220 WARN_ON(1);
221 return -EINVAL; 228 return -EINVAL;
222 } else { 229 } else {
223 if (print) 230 if (print)
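
The reworked self-test follows a set/verify/revert/verify cycle on a spare pte bit. A simplified model of that cycle, using a plain flag word in place of a pte and bit 9 standing in for _PAGE_UNUSED1:

#include <stdio.h>
#include <stdint.h>

#define TEST_BIT (1u << 9)
#define NPAGES   8

int main(void)
{
        uint32_t pte[NPAGES] = { 0 };
        int failed = 0;

        for (int i = 0; i < NPAGES; i++) {
                pte[i] |= TEST_BIT;             /* "change_page_attr_set" */
                if (!(pte[i] & TEST_BIT))
                        failed++;               /* bad pte after set */

                pte[i] &= ~TEST_BIT;            /* "change_page_attr_clear" */
                if (pte[i] & TEST_BIT)
                        failed++;               /* bad pte after revert */
        }
        printf(failed ? "NOT PASSED\n" : "ok\n");
        return 0;
}
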
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 60bcb5b6a37e..a9ec89c3fbca 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,68 @@
25 * The current flushing context - we pass it instead of 5 arguments: 25 * The current flushing context - we pass it instead of 5 arguments:
26 */ 26 */
27struct cpa_data { 27struct cpa_data {
28 unsigned long vaddr; 28 unsigned long *vaddr;
29 pgprot_t mask_set; 29 pgprot_t mask_set;
30 pgprot_t mask_clr; 30 pgprot_t mask_clr;
31 int numpages; 31 int numpages;
32 int flushtlb; 32 int flags;
33 unsigned long pfn; 33 unsigned long pfn;
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35 int curpage;
35}; 36};
36 37
38/*
39 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
40 * using cpa_lock, so that we don't allow any other cpu with stale large tlb
41 * entries to change the page attribute in parallel while some other cpu is
42 * splitting a large page entry and changing the attribute.
43 */
44static DEFINE_SPINLOCK(cpa_lock);
45
46#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2
48
49#ifdef CONFIG_PROC_FS
50static unsigned long direct_pages_count[PG_LEVEL_NUM];
51
52void update_page_count(int level, unsigned long pages)
53{
54 unsigned long flags;
55
56 /* Protect against CPA */
57 spin_lock_irqsave(&pgd_lock, flags);
58 direct_pages_count[level] += pages;
59 spin_unlock_irqrestore(&pgd_lock, flags);
60}
61
62static void split_page_count(int level)
63{
64 direct_pages_count[level]--;
65 direct_pages_count[level - 1] += PTRS_PER_PTE;
66}
67
68int arch_report_meminfo(char *page)
69{
70 int n = sprintf(page, "DirectMap4k: %8lu kB\n",
71 direct_pages_count[PG_LEVEL_4K] << 2);
72#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
73 n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
74 direct_pages_count[PG_LEVEL_2M] << 11);
75#else
76 n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
77 direct_pages_count[PG_LEVEL_2M] << 12);
78#endif
79#ifdef CONFIG_X86_64
80 if (direct_gbpages)
81 n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
82 direct_pages_count[PG_LEVEL_1G] << 20);
83#endif
84 return n;
85}
86#else
87static inline void split_page_count(int level) { }
88#endif
89
37#ifdef CONFIG_X86_64 90#ifdef CONFIG_X86_64
38 91
39static inline unsigned long highmap_start_pfn(void) 92static inline unsigned long highmap_start_pfn(void)
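
The shifts in arch_report_meminfo() convert page counts into kB for the given mapping size: 4k pages << 2, 2M pages << 11 (4M pages << 12 on non-PAE 32-bit), 1G pages << 20. A quick standalone check of that arithmetic with made-up counts:

#include <stdio.h>

int main(void)
{
        unsigned long n4k = 1000, n2m = 12, n1g = 1;

        /* pages -> kB: 4kB = 1<<2 kB, 2MB = 1<<11 kB, 1GB = 1<<20 kB */
        printf("DirectMap4k: %8lu kB\n", n4k << 2);
        printf("DirectMap2M: %8lu kB\n", n2m << 11);
        printf("DirectMap1G: %8lu kB\n", n1g << 20);
        return 0;
}
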
@@ -43,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void)
43 96
44static inline unsigned long highmap_end_pfn(void) 97static inline unsigned long highmap_end_pfn(void)
45{ 98{
46 return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 99 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
47} 100}
48 101
49#endif 102#endif
@@ -106,7 +159,7 @@ static void cpa_flush_all(unsigned long cache)
106{ 159{
107 BUG_ON(irqs_disabled()); 160 BUG_ON(irqs_disabled());
108 161
109 on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); 162 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
110} 163}
111 164
112static void __cpa_flush_range(void *arg) 165static void __cpa_flush_range(void *arg)
@@ -127,7 +180,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
127 BUG_ON(irqs_disabled()); 180 BUG_ON(irqs_disabled());
128 WARN_ON(PAGE_ALIGN(start) != start); 181 WARN_ON(PAGE_ALIGN(start) != start);
129 182
130 on_each_cpu(__cpa_flush_range, NULL, 1, 1); 183 on_each_cpu(__cpa_flush_range, NULL, 1);
131 184
132 if (!cache) 185 if (!cache)
133 return; 186 return;
@@ -149,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
149 } 202 }
150} 203}
151 204
205static void cpa_flush_array(unsigned long *start, int numpages, int cache)
206{
207 unsigned int i, level;
208 unsigned long *addr;
209
210 BUG_ON(irqs_disabled());
211
212 on_each_cpu(__cpa_flush_range, NULL, 1);
213
214 if (!cache)
215 return;
216
217 /* 4M threshold */
218 if (numpages >= 1024) {
219 if (boot_cpu_data.x86_model >= 4)
220 wbinvd();
221 return;
222 }
223 /*
224 * We only need to flush on one CPU,
225 * clflush is a MESI-coherent instruction that
226 * will cause all other CPUs to flush the same
227 * cachelines:
228 */
229 for (i = 0, addr = start; i < numpages; i++, addr++) {
230 pte_t *pte = lookup_address(*addr, &level);
231
232 /*
233 * Only flush present addresses:
234 */
235 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
236 clflush_cache_range((void *) *addr, PAGE_SIZE);
237 }
238}
239
152/* 240/*
153 * Certain areas of memory on x86 require very specific protection flags, 241 * Certain areas of memory on x86 require very specific protection flags,
154 * for example the BIOS area or kernel text. Callers don't always get this 242 * for example the BIOS area or kernel text. Callers don't always get this
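
cpa_flush_array() above picks between per-page clflush and a full wbinvd once the array reaches 1024 pages (4MB of 4k pages). A sketch of just that threshold decision, with the actual flush reduced to a label:

#include <stdio.h>

/* Per-line clflush for small ranges, full cache writeback once
 * the range reaches the 4M threshold (sketch only). */
static const char *flush_strategy(int numpages)
{
        return (numpages >= 1024) ? "wbinvd" : "clflush each page";
}

int main(void)
{
        int sizes[] = { 1, 256, 1023, 1024, 4096 };

        for (int i = 0; i < 5; i++)
                printf("%4d pages -> %s\n", sizes[i], flush_strategy(sizes[i]));
        return 0;
}
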
@@ -227,6 +315,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
227 315
228 return pte_offset_kernel(pmd, address); 316 return pte_offset_kernel(pmd, address);
229} 317}
318EXPORT_SYMBOL_GPL(lookup_address);
230 319
231/* 320/*
232 * Set the new pmd in all the pgds we know about: 321 * Set the new pmd in all the pgds we know about:
@@ -356,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
356 */ 445 */
357 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); 446 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
358 __set_pmd_pte(kpte, address, new_pte); 447 __set_pmd_pte(kpte, address, new_pte);
359 cpa->flushtlb = 1; 448 cpa->flags |= CPA_FLUSHTLB;
360 do_split = 0; 449 do_split = 0;
361 } 450 }
362 451
@@ -366,84 +455,6 @@ out_unlock:
366 return do_split; 455 return do_split;
367} 456}
368 457
369static LIST_HEAD(page_pool);
370static unsigned long pool_size, pool_pages, pool_low;
371static unsigned long pool_used, pool_failed;
372
373static void cpa_fill_pool(struct page **ret)
374{
375 gfp_t gfp = GFP_KERNEL;
376 unsigned long flags;
377 struct page *p;
378
379 /*
380 * Avoid recursion (on debug-pagealloc) and also signal
381 * our priority to get to these pagetables:
382 */
383 if (current->flags & PF_MEMALLOC)
384 return;
385 current->flags |= PF_MEMALLOC;
386
387 /*
388 * Allocate atomically from atomic contexts:
389 */
390 if (in_atomic() || irqs_disabled() || debug_pagealloc)
391 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
392
393 while (pool_pages < pool_size || (ret && !*ret)) {
394 p = alloc_pages(gfp, 0);
395 if (!p) {
396 pool_failed++;
397 break;
398 }
399 /*
400 * If the call site needs a page right now, provide it:
401 */
402 if (ret && !*ret) {
403 *ret = p;
404 continue;
405 }
406 spin_lock_irqsave(&pgd_lock, flags);
407 list_add(&p->lru, &page_pool);
408 pool_pages++;
409 spin_unlock_irqrestore(&pgd_lock, flags);
410 }
411
412 current->flags &= ~PF_MEMALLOC;
413}
414
415#define SHIFT_MB (20 - PAGE_SHIFT)
416#define ROUND_MB_GB ((1 << 10) - 1)
417#define SHIFT_MB_GB 10
418#define POOL_PAGES_PER_GB 16
419
420void __init cpa_init(void)
421{
422 struct sysinfo si;
423 unsigned long gb;
424
425 si_meminfo(&si);
426 /*
427 * Calculate the number of pool pages:
428 *
429 * Convert totalram (nr of pages) to MiB and round to the next
430 * GiB. Shift MiB to Gib and multiply the result by
431 * POOL_PAGES_PER_GB:
432 */
433 if (debug_pagealloc) {
434 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
435 pool_size = POOL_PAGES_PER_GB * gb;
436 } else {
437 pool_size = 1;
438 }
439 pool_low = pool_size;
440
441 cpa_fill_pool(NULL);
442 printk(KERN_DEBUG
443 "CPA: page pool initialized %lu of %lu pages preallocated\n",
444 pool_pages, pool_size);
445}
446
447static int split_large_page(pte_t *kpte, unsigned long address) 458static int split_large_page(pte_t *kpte, unsigned long address)
448{ 459{
449 unsigned long flags, pfn, pfninc = 1; 460 unsigned long flags, pfn, pfninc = 1;
@@ -452,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
452 pgprot_t ref_prot; 463 pgprot_t ref_prot;
453 struct page *base; 464 struct page *base;
454 465
455 /* 466 if (!debug_pagealloc)
456 * Get a page from the pool. The pool list is protected by the 467 spin_unlock(&cpa_lock);
457 * pgd_lock, which we have to take anyway for the split 468 base = alloc_pages(GFP_KERNEL, 0);
458 * operation: 469 if (!debug_pagealloc)
459 */ 470 spin_lock(&cpa_lock);
460 spin_lock_irqsave(&pgd_lock, flags); 471 if (!base)
461 if (list_empty(&page_pool)) { 472 return -ENOMEM;
462 spin_unlock_irqrestore(&pgd_lock, flags);
463 base = NULL;
464 cpa_fill_pool(&base);
465 if (!base)
466 return -ENOMEM;
467 spin_lock_irqsave(&pgd_lock, flags);
468 } else {
469 base = list_first_entry(&page_pool, struct page, lru);
470 list_del(&base->lru);
471 pool_pages--;
472
473 if (pool_pages < pool_low)
474 pool_low = pool_pages;
475 }
476 473
474 spin_lock_irqsave(&pgd_lock, flags);
477 /* 475 /*
478 * Check for races, another CPU might have split this page 476 * Check for races, another CPU might have split this page
479 * up for us already: 477 * up for us already:
@@ -500,6 +498,16 @@ static int split_large_page(pte_t *kpte, unsigned long address)
500 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 498 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
501 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 499 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
502 500
501 if (address >= (unsigned long)__va(0) &&
502 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
503 split_page_count(level);
504
505#ifdef CONFIG_X86_64
506 if (address >= (unsigned long)__va(1UL<<32) &&
507 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
508 split_page_count(level);
509#endif
510
503 /* 511 /*
504 * Install the new, split up pagetable. Important details here: 512 * Install the new, split up pagetable. Important details here:
505 * 513 *
@@ -520,11 +528,8 @@ out_unlock:
520 * If we dropped out via the lookup_address check under 528 * If we dropped out via the lookup_address check under
521 * pgd_lock then stick the page back into the pool: 529 * pgd_lock then stick the page back into the pool:
522 */ 530 */
523 if (base) { 531 if (base)
524 list_add(&base->lru, &page_pool); 532 __free_page(base);
525 pool_pages++;
526 } else
527 pool_used++;
528 spin_unlock_irqrestore(&pgd_lock, flags); 533 spin_unlock_irqrestore(&pgd_lock, flags);
529 534
530 return 0; 535 return 0;
@@ -532,11 +537,16 @@ out_unlock:
532 537
533static int __change_page_attr(struct cpa_data *cpa, int primary) 538static int __change_page_attr(struct cpa_data *cpa, int primary)
534{ 539{
535 unsigned long address = cpa->vaddr; 540 unsigned long address;
536 int do_split, err; 541 int do_split, err;
537 unsigned int level; 542 unsigned int level;
538 pte_t *kpte, old_pte; 543 pte_t *kpte, old_pte;
539 544
545 if (cpa->flags & CPA_ARRAY)
546 address = cpa->vaddr[cpa->curpage];
547 else
548 address = *cpa->vaddr;
549
540repeat: 550repeat:
541 kpte = lookup_address(address, &level); 551 kpte = lookup_address(address, &level);
542 if (!kpte) 552 if (!kpte)
@@ -546,10 +556,9 @@ repeat:
546 if (!pte_val(old_pte)) { 556 if (!pte_val(old_pte)) {
547 if (!primary) 557 if (!primary)
548 return 0; 558 return 0;
549 printk(KERN_WARNING "CPA: called for zero pte. " 559 WARN(1, KERN_WARNING "CPA: called for zero pte. "
550 "vaddr = %lx cpa->vaddr = %lx\n", address, 560 "vaddr = %lx cpa->vaddr = %lx\n", address,
551 cpa->vaddr); 561 *cpa->vaddr);
552 WARN_ON(1);
553 return -EINVAL; 562 return -EINVAL;
554 } 563 }
555 564
@@ -575,7 +584,7 @@ repeat:
575 */ 584 */
576 if (pte_val(old_pte) != pte_val(new_pte)) { 585 if (pte_val(old_pte) != pte_val(new_pte)) {
577 set_pte_atomic(kpte, new_pte); 586 set_pte_atomic(kpte, new_pte);
578 cpa->flushtlb = 1; 587 cpa->flags |= CPA_FLUSHTLB;
579 } 588 }
580 cpa->numpages = 1; 589 cpa->numpages = 1;
581 return 0; 590 return 0;
@@ -599,7 +608,25 @@ repeat:
599 */ 608 */
600 err = split_large_page(kpte, address); 609 err = split_large_page(kpte, address);
601 if (!err) { 610 if (!err) {
602 cpa->flushtlb = 1; 611 /*
612 * Do a global flush tlb after splitting the large page
613 * and before we do the actual change page attribute in the PTE.
614 *
615 * Without this, we violate the TLB application note, which says
616 * "The TLBs may contain both ordinary and large-page
617 * translations for a 4-KByte range of linear addresses. This
618 * may occur if software modifies the paging structures so that
619 * the page size used for the address range changes. If the two
620 * translations differ with respect to page frame or attributes
621 * (e.g., permissions), processor behavior is undefined and may
622 * be implementation-specific."
623 *
624 * We do this global tlb flush inside the cpa_lock, so that we
625 * don't allow any other cpu with stale tlb entries to change the
626 * page attribute in parallel for an address that also falls into the
627 * just-split large page entry.
628 */
629 flush_tlb_all();
603 goto repeat; 630 goto repeat;
604 } 631 }
605 632
@@ -612,19 +639,37 @@ static int cpa_process_alias(struct cpa_data *cpa)
612{ 639{
613 struct cpa_data alias_cpa; 640 struct cpa_data alias_cpa;
614 int ret = 0; 641 int ret = 0;
642 unsigned long temp_cpa_vaddr, vaddr;
615 643
616 if (cpa->pfn > max_pfn_mapped) 644 if (cpa->pfn >= max_pfn_mapped)
617 return 0; 645 return 0;
618 646
647#ifdef CONFIG_X86_64
648 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
649 return 0;
650#endif
619 /* 651 /*
620 * No need to redo, when the primary call touched the direct 652 * No need to redo, when the primary call touched the direct
621 * mapping already: 653 * mapping already:
622 */ 654 */
623 if (!within(cpa->vaddr, PAGE_OFFSET, 655 if (cpa->flags & CPA_ARRAY)
624 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 656 vaddr = cpa->vaddr[cpa->curpage];
657 else
658 vaddr = *cpa->vaddr;
659
660 if (!(within(vaddr, PAGE_OFFSET,
661 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
662#ifdef CONFIG_X86_64
663 || within(vaddr, PAGE_OFFSET + (1UL<<32),
664 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
665#endif
666 )) {
625 667
626 alias_cpa = *cpa; 668 alias_cpa = *cpa;
627 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 669 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
670 alias_cpa.vaddr = &temp_cpa_vaddr;
671 alias_cpa.flags &= ~CPA_ARRAY;
672
628 673
629 ret = __change_page_attr_set_clr(&alias_cpa, 0); 674 ret = __change_page_attr_set_clr(&alias_cpa, 0);
630 } 675 }
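
The alias check above asks whether the primary vaddr already lies in the kernel direct mapping; on 64-bit there are two candidate windows, below max_low_pfn_mapped and above 4G up to max_pfn_mapped. A sketch of the within() test with invented window bounds (the constants are examples, not real layout values):

#include <stdio.h>
#include <stdint.h>

/* within() as used above: start inclusive, end exclusive. */
static int within(uint64_t addr, uint64_t start, uint64_t end)
{
        return addr >= start && addr < end;
}

int main(void)
{
        uint64_t page_offset = 0xffff880000000000ULL;   /* illustrative */
        uint64_t low_end  = page_offset + (1ULL << 30); /* say 1GB mapped low */
        uint64_t high_beg = page_offset + (1ULL << 32);
        uint64_t high_end = page_offset + (6ULL << 30); /* say 6GB total */

        uint64_t vaddr = page_offset + (5ULL << 30);
        int direct = within(vaddr, page_offset, low_end) ||
                     within(vaddr, high_beg, high_end);

        printf("vaddr in direct mapping: %s\n", direct ? "yes" : "no");
        return 0;
}
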
@@ -636,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
636 * No need to redo, when the primary call touched the high 681 * No need to redo, when the primary call touched the high
637 * mapping already: 682 * mapping already:
638 */ 683 */
639 if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) 684 if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
640 return 0; 685 return 0;
641 686
642 /* 687 /*
@@ -647,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
647 return 0; 692 return 0;
648 693
649 alias_cpa = *cpa; 694 alias_cpa = *cpa;
650 alias_cpa.vaddr = 695 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
651 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 696 alias_cpa.vaddr = &temp_cpa_vaddr;
697 alias_cpa.flags &= ~CPA_ARRAY;
652 698
653 /* 699 /*
654 * The high mapping range is imprecise, so ignore the return value. 700 * The high mapping range is imprecise, so ignore the return value.
@@ -668,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
668 * preservation check. 714 * preservation check.
669 */ 715 */
670 cpa->numpages = numpages; 716 cpa->numpages = numpages;
717 /* for array changes, we can't use large pages */
718 if (cpa->flags & CPA_ARRAY)
719 cpa->numpages = 1;
671 720
721 if (!debug_pagealloc)
722 spin_lock(&cpa_lock);
672 ret = __change_page_attr(cpa, checkalias); 723 ret = __change_page_attr(cpa, checkalias);
724 if (!debug_pagealloc)
725 spin_unlock(&cpa_lock);
673 if (ret) 726 if (ret)
674 return ret; 727 return ret;
675 728
@@ -686,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
686 */ 739 */
687 BUG_ON(cpa->numpages > numpages); 740 BUG_ON(cpa->numpages > numpages);
688 numpages -= cpa->numpages; 741 numpages -= cpa->numpages;
689 cpa->vaddr += cpa->numpages * PAGE_SIZE; 742 if (cpa->flags & CPA_ARRAY)
743 cpa->curpage++;
744 else
745 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
746
690 } 747 }
691 return 0; 748 return 0;
692} 749}
@@ -697,9 +754,9 @@ static inline int cache_attr(pgprot_t attr)
697 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); 754 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
698} 755}
699 756
700static int change_page_attr_set_clr(unsigned long addr, int numpages, 757static int change_page_attr_set_clr(unsigned long *addr, int numpages,
701 pgprot_t mask_set, pgprot_t mask_clr, 758 pgprot_t mask_set, pgprot_t mask_clr,
702 int force_split) 759 int force_split, int array)
703{ 760{
704 struct cpa_data cpa; 761 struct cpa_data cpa;
705 int ret, cache, checkalias; 762 int ret, cache, checkalias;
@@ -714,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
714 return 0; 771 return 0;
715 772
716 /* Ensure we are PAGE_SIZE aligned */ 773 /* Ensure we are PAGE_SIZE aligned */
717 if (addr & ~PAGE_MASK) { 774 if (!array) {
718 addr &= PAGE_MASK; 775 if (*addr & ~PAGE_MASK) {
719 /* 776 *addr &= PAGE_MASK;
720 * People should not be passing in unaligned addresses: 777 /*
721 */ 778 * People should not be passing in unaligned addresses:
722 WARN_ON_ONCE(1); 779 */
780 WARN_ON_ONCE(1);
781 }
782 } else {
783 int i;
784 for (i = 0; i < numpages; i++) {
785 if (addr[i] & ~PAGE_MASK) {
786 addr[i] &= PAGE_MASK;
787 WARN_ON_ONCE(1);
788 }
789 }
723 } 790 }
724 791
792 /* Must avoid aliasing mappings in the highmem code */
793 kmap_flush_unused();
794
725 cpa.vaddr = addr; 795 cpa.vaddr = addr;
726 cpa.numpages = numpages; 796 cpa.numpages = numpages;
727 cpa.mask_set = mask_set; 797 cpa.mask_set = mask_set;
728 cpa.mask_clr = mask_clr; 798 cpa.mask_clr = mask_clr;
729 cpa.flushtlb = 0; 799 cpa.flags = 0;
800 cpa.curpage = 0;
730 cpa.force_split = force_split; 801 cpa.force_split = force_split;
731 802
803 if (array)
804 cpa.flags |= CPA_ARRAY;
805
732 /* No alias checking for _NX bit modifications */ 806 /* No alias checking for _NX bit modifications */
733 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 807 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
734 808
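
Both the single-address and the array path above mask each address down to a page boundary and warn once about unaligned callers. The masking itself, sketched with a 4k page size:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long addr[] = { 0x1000, 0x1234, 0x7fff };

        for (int i = 0; i < 3; i++) {
                if (addr[i] & ~PAGE_MASK) {
                        printf("warn: unaligned %#lx -> %#lx\n",
                               addr[i], addr[i] & PAGE_MASK);
                        addr[i] &= PAGE_MASK;
                }
        }
        return 0;
}
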
@@ -737,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
737 /* 811 /*
738 * Check whether we really changed something: 812 * Check whether we really changed something:
739 */ 813 */
740 if (!cpa.flushtlb) 814 if (!(cpa.flags & CPA_FLUSHTLB))
741 goto out; 815 goto out;
742 816
743 /* 817 /*
@@ -752,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
752 * error case we fall back to cpa_flush_all (which uses 826 * error case we fall back to cpa_flush_all (which uses
753 * wbinvd): 827
754 */ 828 */
755 if (!ret && cpu_has_clflush) 829 if (!ret && cpu_has_clflush) {
756 cpa_flush_range(addr, numpages, cache); 830 if (cpa.flags & CPA_ARRAY)
757 else 831 cpa_flush_array(addr, numpages, cache);
832 else
833 cpa_flush_range(*addr, numpages, cache);
834 } else
758 cpa_flush_all(cache); 835 cpa_flush_all(cache);
759 836
760out: 837out:
761 cpa_fill_pool(NULL);
762
763 return ret; 838 return ret;
764} 839}
765 840
766static inline int change_page_attr_set(unsigned long addr, int numpages, 841static inline int change_page_attr_set(unsigned long *addr, int numpages,
767 pgprot_t mask) 842 pgprot_t mask, int array)
768{ 843{
769 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); 844 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
845 array);
770} 846}
771 847
772static inline int change_page_attr_clear(unsigned long addr, int numpages, 848static inline int change_page_attr_clear(unsigned long *addr, int numpages,
773 pgprot_t mask) 849 pgprot_t mask, int array)
774{ 850{
775 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); 851 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
852 array);
776} 853}
777 854
778int _set_memory_uc(unsigned long addr, int numpages) 855int _set_memory_uc(unsigned long addr, int numpages)
@@ -780,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
780 /* 857 /*
781 * for now UC MINUS. see comments in ioremap_nocache() 858 * for now UC MINUS. see comments in ioremap_nocache()
782 */ 859 */
783 return change_page_attr_set(addr, numpages, 860 return change_page_attr_set(&addr, numpages,
784 __pgprot(_PAGE_CACHE_UC_MINUS)); 861 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
785} 862}
786 863
787int set_memory_uc(unsigned long addr, int numpages) 864int set_memory_uc(unsigned long addr, int numpages)
@@ -789,7 +866,7 @@ int set_memory_uc(unsigned long addr, int numpages)
789 /* 866 /*
790 * for now UC MINUS. see comments in ioremap_nocache() 867 * for now UC MINUS. see comments in ioremap_nocache()
791 */ 868 */
792 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 869 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
793 _PAGE_CACHE_UC_MINUS, NULL)) 870 _PAGE_CACHE_UC_MINUS, NULL))
794 return -EINVAL; 871 return -EINVAL;
795 872
@@ -797,18 +874,56 @@ int set_memory_uc(unsigned long addr, int numpages)
797} 874}
798EXPORT_SYMBOL(set_memory_uc); 875EXPORT_SYMBOL(set_memory_uc);
799 876
877int set_memory_array_uc(unsigned long *addr, int addrinarray)
878{
879 unsigned long start;
880 unsigned long end;
881 int i;
882 /*
883 * for now UC MINUS. see comments in ioremap_nocache()
884 */
885 for (i = 0; i < addrinarray; i++) {
886 start = __pa(addr[i]);
887 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
888 if (end != __pa(addr[i + 1]))
889 break;
890 i++;
891 }
892 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
893 goto out;
894 }
895
896 return change_page_attr_set(addr, addrinarray,
897 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
898out:
899 for (i = 0; i < addrinarray; i++) {
900 unsigned long tmp = __pa(addr[i]);
901
902 if (tmp == start)
903 break;
904 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
905 if (end != __pa(addr[i + 1]))
906 break;
907 i++;
908 }
909 free_memtype(tmp, end);
910 }
911 return -EINVAL;
912}
913EXPORT_SYMBOL(set_memory_array_uc);
914
800int _set_memory_wc(unsigned long addr, int numpages) 915int _set_memory_wc(unsigned long addr, int numpages)
801{ 916{
802 return change_page_attr_set(addr, numpages, 917 return change_page_attr_set(&addr, numpages,
803 __pgprot(_PAGE_CACHE_WC)); 918 __pgprot(_PAGE_CACHE_WC), 0);
804} 919}
805 920
806int set_memory_wc(unsigned long addr, int numpages) 921int set_memory_wc(unsigned long addr, int numpages)
807{ 922{
808 if (!pat_wc_enabled) 923 if (!pat_enabled)
809 return set_memory_uc(addr, numpages); 924 return set_memory_uc(addr, numpages);
810 925
811 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 926 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
812 _PAGE_CACHE_WC, NULL)) 927 _PAGE_CACHE_WC, NULL))
813 return -EINVAL; 928 return -EINVAL;
814 929
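
set_memory_array_uc() above merges physically contiguous pages into one [start, end) range per reserve_memtype() call. The coalescing loop, sketched over plain physical addresses (the __pa() translation is assumed to have happened already):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
        /* three contiguous pages, then a gap, then one more page */
        uint64_t pa[] = { 0x10000, 0x11000, 0x12000, 0x40000 };
        int n = 4;

        for (int i = 0; i < n; i++) {
                uint64_t start = pa[i];
                uint64_t end = start + PAGE_SIZE;

                /* extend the range while the next page is adjacent */
                while (i < n - 1 && end == pa[i + 1]) {
                        end += PAGE_SIZE;
                        i++;
                }
                printf("reserve [%#llx, %#llx)\n",
                       (unsigned long long)start, (unsigned long long)end);
        }
        return 0;
}
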
@@ -818,49 +933,71 @@ EXPORT_SYMBOL(set_memory_wc);
818 933
819int _set_memory_wb(unsigned long addr, int numpages) 934int _set_memory_wb(unsigned long addr, int numpages)
820{ 935{
821 return change_page_attr_clear(addr, numpages, 936 return change_page_attr_clear(&addr, numpages,
822 __pgprot(_PAGE_CACHE_MASK)); 937 __pgprot(_PAGE_CACHE_MASK), 0);
823} 938}
824 939
825int set_memory_wb(unsigned long addr, int numpages) 940int set_memory_wb(unsigned long addr, int numpages)
826{ 941{
827 free_memtype(addr, addr + numpages * PAGE_SIZE); 942 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
828 943
829 return _set_memory_wb(addr, numpages); 944 return _set_memory_wb(addr, numpages);
830} 945}
831EXPORT_SYMBOL(set_memory_wb); 946EXPORT_SYMBOL(set_memory_wb);
832 947
948int set_memory_array_wb(unsigned long *addr, int addrinarray)
949{
950 int i;
951
952 for (i = 0; i < addrinarray; i++) {
953 unsigned long start = __pa(addr[i]);
954 unsigned long end;
955
956 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
957 if (end != __pa(addr[i + 1]))
958 break;
959 i++;
960 }
961 free_memtype(start, end);
962 }
963 return change_page_attr_clear(addr, addrinarray,
964 __pgprot(_PAGE_CACHE_MASK), 1);
965}
966EXPORT_SYMBOL(set_memory_array_wb);
967
833int set_memory_x(unsigned long addr, int numpages) 968int set_memory_x(unsigned long addr, int numpages)
834{ 969{
835 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); 970 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
836} 971}
837EXPORT_SYMBOL(set_memory_x); 972EXPORT_SYMBOL(set_memory_x);
838 973
839int set_memory_nx(unsigned long addr, int numpages) 974int set_memory_nx(unsigned long addr, int numpages)
840{ 975{
841 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); 976 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
842} 977}
843EXPORT_SYMBOL(set_memory_nx); 978EXPORT_SYMBOL(set_memory_nx);
844 979
845int set_memory_ro(unsigned long addr, int numpages) 980int set_memory_ro(unsigned long addr, int numpages)
846{ 981{
847 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); 982 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
848} 983}
984EXPORT_SYMBOL_GPL(set_memory_ro);
849 985
850int set_memory_rw(unsigned long addr, int numpages) 986int set_memory_rw(unsigned long addr, int numpages)
851{ 987{
852 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); 988 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
853} 989}
990EXPORT_SYMBOL_GPL(set_memory_rw);
854 991
855int set_memory_np(unsigned long addr, int numpages) 992int set_memory_np(unsigned long addr, int numpages)
856{ 993{
857 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); 994 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
858} 995}
859 996
860int set_memory_4k(unsigned long addr, int numpages) 997int set_memory_4k(unsigned long addr, int numpages)
861{ 998{
862 return change_page_attr_set_clr(addr, numpages, __pgprot(0), 999 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
863 __pgprot(0), 1); 1000 __pgprot(0), 1, 0);
864} 1001}
865 1002
866int set_pages_uc(struct page *page, int numpages) 1003int set_pages_uc(struct page *page, int numpages)
@@ -913,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages)
913 1050
914static int __set_pages_p(struct page *page, int numpages) 1051static int __set_pages_p(struct page *page, int numpages)
915{ 1052{
916 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1053 unsigned long tempaddr = (unsigned long) page_address(page);
1054 struct cpa_data cpa = { .vaddr = &tempaddr,
917 .numpages = numpages, 1055 .numpages = numpages,
918 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1056 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
919 .mask_clr = __pgprot(0)}; 1057 .mask_clr = __pgprot(0),
1058 .flags = 0};
920 1059
921 return __change_page_attr_set_clr(&cpa, 1); 1060 /*
1061 * No alias checking needed for setting the present flag. Otherwise,
1062 * we may need to break large pages for 64-bit kernel text
1063 * mappings (this adds to complexity if we want to do this from
1064 * atomic context especially). Let's keep it simple!
1065 */
1066 return __change_page_attr_set_clr(&cpa, 0);
922} 1067}
923 1068
924static int __set_pages_np(struct page *page, int numpages) 1069static int __set_pages_np(struct page *page, int numpages)
925{ 1070{
926 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1071 unsigned long tempaddr = (unsigned long) page_address(page);
1072 struct cpa_data cpa = { .vaddr = &tempaddr,
927 .numpages = numpages, 1073 .numpages = numpages,
928 .mask_set = __pgprot(0), 1074 .mask_set = __pgprot(0),
929 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; 1075 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1076 .flags = 0};
930 1077
931 return __change_page_attr_set_clr(&cpa, 1); 1078 /*
1079 * No alias checking needed for setting the not-present flag. Otherwise,
1080 * we may need to break large pages for 64-bit kernel text
1081 * mappings (this adds to complexity if we want to do this from
1082 * atomic context especially). Let's keep it simple!
1083 */
1084 return __change_page_attr_set_clr(&cpa, 0);
932} 1085}
933 1086
934void kernel_map_pages(struct page *page, int numpages, int enable) 1087void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -948,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
948 1101
949 /* 1102 /*
950 * The return value is ignored as the calls cannot fail. 1103 * The return value is ignored as the calls cannot fail.
951 * Large pages are kept enabled at boot time, and are 1104 * Large pages for identity mappings are not used at boot time
952 * split up quickly with DEBUG_PAGEALLOC. If a splitup 1105 * and hence no memory allocations during large page split.
953 * fails here (due to temporary memory shortage) no damage
954 * is done because we just keep the largepage intact up
955 * to the next attempt when it will likely be split up:
956 */ 1106 */
957 if (enable) 1107 if (enable)
958 __set_pages_p(page, numpages); 1108 __set_pages_p(page, numpages);
@@ -964,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
964 * but that can deadlock->flush only current cpu: 1114 * but that can deadlock->flush only current cpu:
965 */ 1115 */
966 __flush_tlb_all(); 1116 __flush_tlb_all();
967
968 /*
969 * Try to refill the page pool here. We can do this only after
970 * the tlb flush.
971 */
972 cpa_fill_pool(NULL);
973}
974
975#ifdef CONFIG_DEBUG_FS
976static int dpa_show(struct seq_file *m, void *v)
977{
978 seq_puts(m, "DEBUG_PAGEALLOC\n");
979 seq_printf(m, "pool_size : %lu\n", pool_size);
980 seq_printf(m, "pool_pages : %lu\n", pool_pages);
981 seq_printf(m, "pool_low : %lu\n", pool_low);
982 seq_printf(m, "pool_used : %lu\n", pool_used);
983 seq_printf(m, "pool_failed : %lu\n", pool_failed);
984
985 return 0;
986}
987
988static int dpa_open(struct inode *inode, struct file *filp)
989{
990 return single_open(filp, dpa_show, NULL);
991} 1117}
992 1118
993static const struct file_operations dpa_fops = {
994 .open = dpa_open,
995 .read = seq_read,
996 .llseek = seq_lseek,
997 .release = single_release,
998};
999
1000static int __init debug_pagealloc_proc_init(void)
1001{
1002 struct dentry *de;
1003
1004 de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1005 &dpa_fops);
1006 if (!de)
1007 return -ENOMEM;
1008
1009 return 0;
1010}
1011__initcall(debug_pagealloc_proc_init);
1012#endif
1013
1014#ifdef CONFIG_HIBERNATION 1119#ifdef CONFIG_HIBERNATION
1015 1120
1016bool kernel_page_present(struct page *page) 1121bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 06b7a1c90fb8..738fd0f24958 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,30 +7,32 @@
7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
8 */ 8 */
9 9
10#include <linux/mm.h> 10#include <linux/seq_file.h>
11#include <linux/bootmem.h>
12#include <linux/debugfs.h>
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/mm.h>
13#include <linux/fs.h> 16#include <linux/fs.h>
14#include <linux/bootmem.h>
15 17
16#include <asm/msr.h> 18#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
19#include <asm/page.h> 20#include <asm/tlbflush.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21#include <asm/pat.h>
22#include <asm/e820.h>
23#include <asm/cacheflush.h>
24#include <asm/fcntl.h> 22#include <asm/fcntl.h>
23#include <asm/e820.h>
25#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/page.h>
26#include <asm/msr.h>
27#include <asm/pat.h>
26#include <asm/io.h> 28#include <asm/io.h>
27 29
28#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
29int __read_mostly pat_wc_enabled = 1; 31int __read_mostly pat_enabled = 1;
30 32
31void __cpuinit pat_disable(char *reason) 33void __cpuinit pat_disable(char *reason)
32{ 34{
33 pat_wc_enabled = 0; 35 pat_enabled = 0;
34 printk(KERN_INFO "%s\n", reason); 36 printk(KERN_INFO "%s\n", reason);
35} 37}
36 38
@@ -42,6 +44,20 @@ static int __init nopat(char *str)
42early_param("nopat", nopat); 44early_param("nopat", nopat);
43#endif 45#endif
44 46
47
48static int debug_enable;
49
50static int __init pat_debug_setup(char *str)
51{
52 debug_enable = 1;
53 return 0;
54}
55__setup("debugpat", pat_debug_setup);
56
57#define dprintk(fmt, arg...) \
58 do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
59
60
45static u64 __read_mostly boot_pat_state; 61static u64 __read_mostly boot_pat_state;
46 62
47enum { 63enum {
@@ -53,24 +69,25 @@ enum {
53 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ 69
54}; 70};
55 71
56#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) 72#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
57 73
58void pat_init(void) 74void pat_init(void)
59{ 75{
60 u64 pat; 76 u64 pat;
61 77
62 if (!pat_wc_enabled) 78 if (!pat_enabled)
63 return; 79 return;
64 80
65 /* Paranoia check. */ 81 /* Paranoia check. */
66 if (!cpu_has_pat) { 82 if (!cpu_has_pat && boot_pat_state) {
67 printk(KERN_ERR "PAT enabled, but CPU feature cleared\n");
68 /* 83 /*
69 * Panic if this happens on the secondary CPU, and we 84 * If this happens we are on a secondary CPU, but
70 * switched to PAT on the boot CPU. We have no way to 85 * switched to PAT on the boot CPU. We have no way to
71 * undo PAT. 86 * undo PAT.
72 */ 87 */
73 BUG_ON(boot_pat_state); 88 printk(KERN_ERR "PAT enabled, "
89 "but not supported by secondary CPU\n");
90 BUG();
74 } 91 }
75 92
76 /* Set PWT to Write-Combining. All other bits stay the same */ 93 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -86,8 +103,8 @@ void pat_init(void)
86 * 011 UC _PAGE_CACHE_UC 103 * 011 UC _PAGE_CACHE_UC
87 * PAT bit unused 104 * PAT bit unused
88 */ 105 */
89 pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | 106 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
90 PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); 107 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
91 108
92 /* Boot CPU check */ 109 /* Boot CPU check */
93 if (!boot_pat_state) 110 if (!boot_pat_state)
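
The PAT(x, y) macro places an 8-bit encoding into slot x of the 64-bit PAT MSR, and the assignment above programs WB/WC/UC-/UC into slots 0-3 and again into slots 4-7. A standalone computation of the resulting value, reusing the encodings from the enum in the hunk above:

#include <stdio.h>
#include <stdint.h>

enum { PAT_UC = 0, PAT_WC = 1, PAT_WB = 6, PAT_UC_MINUS = 7 };

#define PAT(x, y) ((uint64_t)PAT_ ## y << ((x) * 8))

int main(void)
{
        uint64_t pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
                       PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);

        /* one byte per PAT entry, entry 0 in the low byte:
         * works out to 0x0007010600070106 */
        printf("PAT MSR value: %#018llx\n", (unsigned long long)pat);
        return 0;
}
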
@@ -103,11 +120,11 @@ void pat_init(void)
103static char *cattr_name(unsigned long flags) 120static char *cattr_name(unsigned long flags)
104{ 121{
105 switch (flags & _PAGE_CACHE_MASK) { 122 switch (flags & _PAGE_CACHE_MASK) {
106 case _PAGE_CACHE_UC: return "uncached"; 123 case _PAGE_CACHE_UC: return "uncached";
107 case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; 124 case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
108 case _PAGE_CACHE_WB: return "write-back"; 125 case _PAGE_CACHE_WB: return "write-back";
109 case _PAGE_CACHE_WC: return "write-combining"; 126 case _PAGE_CACHE_WC: return "write-combining";
110 default: return "broken"; 127 default: return "broken";
111 } 128 }
112} 129}
113 130
@@ -129,14 +146,14 @@ static char *cattr_name(unsigned long flags)
129 */ 146 */
130 147
131struct memtype { 148struct memtype {
132 u64 start; 149 u64 start;
133 u64 end; 150 u64 end;
134 unsigned long type; 151 unsigned long type;
135 struct list_head nd; 152 struct list_head nd;
136}; 153};
137 154
138static LIST_HEAD(memtype_list); 155static LIST_HEAD(memtype_list);
139static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 156static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
140 157
141/* 158/*
142 * Does intersection of PAT memory type and MTRR memory type and returns 159 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -145,47 +162,113 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
145 * The intersection is based on "Effective Memory Type" tables in IA-32 162 * The intersection is based on "Effective Memory Type" tables in IA-32
146 * SDM vol 3a 163 * SDM vol 3a
147 */ 164 */
148static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, 165static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
149 unsigned long *ret_prot)
150{ 166{
151 unsigned long pat_type;
152 u8 mtrr_type;
153
154 pat_type = prot & _PAGE_CACHE_MASK;
155 prot &= (~_PAGE_CACHE_MASK);
156
157 /*
158 * We return the PAT request directly for types where PAT takes
159 * precedence with respect to MTRR and for UC_MINUS.
160 * Consistency checks with other PAT requests is done later
161 * while going through memtype list.
162 */
163 if (pat_type == _PAGE_CACHE_WC) {
164 *ret_prot = prot | _PAGE_CACHE_WC;
165 return 0;
166 } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
167 *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
168 return 0;
169 } else if (pat_type == _PAGE_CACHE_UC) {
170 *ret_prot = prot | _PAGE_CACHE_UC;
171 return 0;
172 }
173
174 /* 167 /*
175 * Look for MTRR hint to get the effective type in case where PAT 168 * Look for MTRR hint to get the effective type in case where PAT
176 * request is for WB. 169 * request is for WB.
177 */ 170 */
178 mtrr_type = mtrr_type_lookup(start, end); 171 if (req_type == _PAGE_CACHE_WB) {
172 u8 mtrr_type;
173
174 mtrr_type = mtrr_type_lookup(start, end);
175 if (mtrr_type == MTRR_TYPE_UNCACHABLE)
176 return _PAGE_CACHE_UC;
177 if (mtrr_type == MTRR_TYPE_WRCOMB)
178 return _PAGE_CACHE_WC;
179 }
179 180
180 if (mtrr_type == MTRR_TYPE_UNCACHABLE) { 181 return req_type;
181 *ret_prot = prot | _PAGE_CACHE_UC; 182}
182 } else if (mtrr_type == MTRR_TYPE_WRCOMB) { 183
183 *ret_prot = prot | _PAGE_CACHE_WC; 184static int
184 } else { 185chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
185 *ret_prot = prot | _PAGE_CACHE_WB; 186{
187 if (new->type != entry->type) {
188 if (type) {
189 new->type = entry->type;
190 *type = entry->type;
191 } else
192 goto conflict;
186 } 193 }
187 194
195 /* check overlaps with more than one entry in the list */
196 list_for_each_entry_continue(entry, &memtype_list, nd) {
197 if (new->end <= entry->start)
198 break;
199 else if (new->type != entry->type)
200 goto conflict;
201 }
188 return 0; 202 return 0;
203
204 conflict:
205 printk(KERN_INFO "%s:%d conflicting memory types "
206 "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
207 new->end, cattr_name(new->type), cattr_name(entry->type));
208 return -EBUSY;
209}
210
211static struct memtype *cached_entry;
212static u64 cached_start;
213
214/*
215 * For RAM pages, mark the pages as non WB memory type using
216 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
217 * set_memory_wc() on a RAM page at a time before marking it as WB again.
218 * This is ok, because only one driver will be owning the page and
219 * doing set_memory_*() calls.
220 *
221 * For now, we use PageNonWB to track that the RAM page is being mapped
222 * as non WB. In future, we will have to use one more flag
223 * (or some other mechanism in page_struct) to distinguish between
224 * UC and WC mapping.
225 */
226static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
227 unsigned long *new_type)
228{
229 struct page *page;
230 u64 pfn, end_pfn;
231
232 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
233 page = pfn_to_page(pfn);
234 if (page_mapped(page) || PageNonWB(page))
235 goto out;
236
237 SetPageNonWB(page);
238 }
239 return 0;
240
241out:
242 end_pfn = pfn;
243 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
244 page = pfn_to_page(pfn);
245 ClearPageNonWB(page);
246 }
247
248 return -EINVAL;
249}
250
251static int free_ram_pages_type(u64 start, u64 end)
252{
253 struct page *page;
254 u64 pfn, end_pfn;
255
256 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
257 page = pfn_to_page(pfn);
258 if (page_mapped(page) || !PageNonWB(page))
259 goto out;
260
261 ClearPageNonWB(page);
262 }
263 return 0;
264
265out:
266 end_pfn = pfn;
267 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
268 page = pfn_to_page(pfn);
269 SetPageNonWB(page);
270 }
271 return -EINVAL;
189} 272}
190 273
191/* 274/*
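
The rewritten pat_x_mtrr_type() only consults the MTRRs when the request is WB, demoting it to UC or WC as the MTRR dictates, and passes every other request through. A sketch of that decision with stand-in constants (not the kernel's encodings):

#include <stdio.h>

enum cache { WB, WC, UC_MINUS, UC };
enum mtrr  { MTRR_WRBACK, MTRR_WRCOMB, MTRR_UNCACHABLE };

static const char *cname[] = { "WB", "WC", "UC-", "UC" };

static enum cache pat_x_mtrr_type(enum cache req, enum mtrr hint)
{
        if (req == WB) {
                if (hint == MTRR_UNCACHABLE)
                        return UC;
                if (hint == MTRR_WRCOMB)
                        return WC;
        }
        return req;     /* WC, UC- and UC requests take precedence over MTRR */
}

int main(void)
{
        printf("WB over UC MTRR -> %s\n", cname[pat_x_mtrr_type(WB, MTRR_UNCACHABLE)]);
        printf("WB over WC MTRR -> %s\n", cname[pat_x_mtrr_type(WB, MTRR_WRCOMB)]);
        printf("WC request      -> %s\n", cname[pat_x_mtrr_type(WC, MTRR_UNCACHABLE)]);
        return 0;
}
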
@@ -198,37 +281,37 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
198 * req_type will have a special case value '-1', when requester want to inherit 281 * req_type will have a special case value '-1', when requester want to inherit
199 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. 282 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
200 * 283 *
201 * If ret_type is NULL, function will return an error if it cannot reserve the 284 * If new_type is NULL, function will return an error if it cannot reserve the
202 * region with req_type. If ret_type is non-null, function will return 285 * region with req_type. If new_type is non-NULL, function will return
203 * available type in ret_type in case of no error. In case of any error 286 * available type in new_type in case of no error. In case of any error
204 * it will return a negative return value. 287 * it will return a negative return value.
205 */ 288 */
206int reserve_memtype(u64 start, u64 end, unsigned long req_type, 289int reserve_memtype(u64 start, u64 end, unsigned long req_type,
207 unsigned long *ret_type) 290 unsigned long *new_type)
208{ 291{
209 struct memtype *new_entry = NULL; 292 struct memtype *new, *entry;
210 struct memtype *parse;
211 unsigned long actual_type; 293 unsigned long actual_type;
294 struct list_head *where;
295 int is_range_ram;
212 int err = 0; 296 int err = 0;
213 297
214 /* Only track when pat_wc_enabled */ 298 BUG_ON(start >= end); /* end is exclusive */
215 if (!pat_wc_enabled) { 299
300 if (!pat_enabled) {
216 /* This is identical to page table setting without PAT */ 301 /* This is identical to page table setting without PAT */
217 if (ret_type) { 302 if (new_type) {
218 if (req_type == -1) { 303 if (req_type == -1)
219 *ret_type = _PAGE_CACHE_WB; 304 *new_type = _PAGE_CACHE_WB;
220 } else { 305 else
221 *ret_type = req_type; 306 *new_type = req_type & _PAGE_CACHE_MASK;
222 }
223 } 307 }
224 return 0; 308 return 0;
225 } 309 }
226 310
227 /* Low ISA region is always mapped WB in page table. No need to track */ 311 /* Low ISA region is always mapped WB in page table. No need to track */
228 if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { 312 if (is_ISA_range(start, end - 1)) {
229 if (ret_type) 313 if (new_type)
230 *ret_type = _PAGE_CACHE_WB; 314 *new_type = _PAGE_CACHE_WB;
231
232 return 0; 315 return 0;
233 } 316 }
234 317
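
Before touching the memtype list, reserve_memtype() takes two fast paths shown above: PAT disabled (hand back WB or the masked request) and the low ISA region (always WB, never tracked). A hedged sketch of those shortcuts with illustrative constants:

#include <stdio.h>
#include <stdint.h>

/* Illustrative constants only, not the kernel's */
#define SK_WB           0x0UL
#define SK_CACHE_MASK   0x18UL
#define SK_ISA_END      0xfffffULL             /* 1MB - 1 */

static int fast_path(int pat_enabled, uint64_t start, uint64_t end,
                     long req_type, unsigned long *new_type)
{
        if (!pat_enabled) {
                /* identical to page table setting without PAT */
                *new_type = (req_type == -1) ? SK_WB
                        : ((unsigned long)req_type & SK_CACHE_MASK);
                return 1;
        }
        /* low ISA region is always mapped WB, never tracked */
        if (start <= SK_ISA_END && end - 1 <= SK_ISA_END) {
                *new_type = SK_WB;
                return 1;
        }
        return 0;               /* fall through to the memtype list walk */
}

int main(void)
{
        unsigned long type;

        if (fast_path(1, 0xa0000, 0xc0000, -1, &type))
                printf("ISA range handled, type %lu (WB)\n", type);
        return 0;
}
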
@@ -241,206 +324,133 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
241 */ 324 */
242 u8 mtrr_type = mtrr_type_lookup(start, end); 325 u8 mtrr_type = mtrr_type_lookup(start, end);
243 326
244 if (mtrr_type == MTRR_TYPE_WRBACK) { 327 if (mtrr_type == MTRR_TYPE_WRBACK)
245 req_type = _PAGE_CACHE_WB;
246 actual_type = _PAGE_CACHE_WB; 328 actual_type = _PAGE_CACHE_WB;
247 } else { 329 else
248 req_type = _PAGE_CACHE_UC_MINUS;
249 actual_type = _PAGE_CACHE_UC_MINUS; 330 actual_type = _PAGE_CACHE_UC_MINUS;
250 }
251 } else { 331 } else {
252 req_type &= _PAGE_CACHE_MASK; 332 actual_type = pat_x_mtrr_type(start, end,
253 err = pat_x_mtrr_type(start, end, req_type, &actual_type); 333 req_type & _PAGE_CACHE_MASK);
254 } 334 }
255 335
256 if (err) { 336 is_range_ram = pagerange_is_ram(start, end);
257 if (ret_type) 337 if (is_range_ram == 1)
258 *ret_type = actual_type; 338 return reserve_ram_pages_type(start, end, req_type, new_type);
259 339 else if (is_range_ram < 0)
260 return -EINVAL; 340 return -EINVAL;
261 }
262 341
263 new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); 342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
264 if (!new_entry) 343 if (!new)
265 return -ENOMEM; 344 return -ENOMEM;
266 345
267 new_entry->start = start; 346 new->start = start;
268 new_entry->end = end; 347 new->end = end;
269 new_entry->type = actual_type; 348 new->type = actual_type;
270 349
271 if (ret_type) 350 if (new_type)
272 *ret_type = actual_type; 351 *new_type = actual_type;
273 352
274 spin_lock(&memtype_lock); 353 spin_lock(&memtype_lock);
275 354
276 /* Search for existing mapping that overlaps the current range */ 355 if (cached_entry && start >= cached_start)
277 list_for_each_entry(parse, &memtype_list, nd) { 356 entry = cached_entry;
278 struct memtype *saved_ptr; 357 else
358 entry = list_entry(&memtype_list, struct memtype, nd);
279 359
280 if (parse->start >= end) { 360 /* Search for existing mapping that overlaps the current range */
281 pr_debug("New Entry\n"); 361 where = NULL;
282 list_add(&new_entry->nd, parse->nd.prev); 362 list_for_each_entry_continue(entry, &memtype_list, nd) {
283 new_entry = NULL; 363 if (end <= entry->start) {
364 where = entry->nd.prev;
365 cached_entry = list_entry(where, struct memtype, nd);
284 break; 366 break;
285 } 367 } else if (start <= entry->start) { /* end > entry->start */
286 368 err = chk_conflict(new, entry, new_type);
287 if (start <= parse->start && end >= parse->start) { 369 if (!err) {
288 if (actual_type != parse->type && ret_type) { 370 dprintk("Overlap at 0x%Lx-0x%Lx\n",
289 actual_type = parse->type; 371 entry->start, entry->end);
290 *ret_type = actual_type; 372 where = entry->nd.prev;
291 new_entry->type = actual_type; 373 cached_entry = list_entry(where,
292 } 374 struct memtype, nd);
293
294 if (actual_type != parse->type) {
295 printk(
296 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
297 current->comm, current->pid,
298 start, end,
299 cattr_name(actual_type),
300 cattr_name(parse->type));
301 err = -EBUSY;
302 break;
303 } 375 }
304
305 saved_ptr = parse;
306 /*
307 * Check to see whether the request overlaps more
308 * than one entry in the list
309 */
310 list_for_each_entry_continue(parse, &memtype_list, nd) {
311 if (end <= parse->start) {
312 break;
313 }
314
315 if (actual_type != parse->type) {
316 printk(
317 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
318 current->comm, current->pid,
319 start, end,
320 cattr_name(actual_type),
321 cattr_name(parse->type));
322 err = -EBUSY;
323 break;
324 }
325 }
326
327 if (err) {
328 break;
329 }
330
331 pr_debug("Overlap at 0x%Lx-0x%Lx\n",
332 saved_ptr->start, saved_ptr->end);
333 /* No conflict. Go ahead and add this new entry */
334 list_add(&new_entry->nd, saved_ptr->nd.prev);
335 new_entry = NULL;
336 break; 376 break;
337 } 377 } else if (start < entry->end) { /* start > entry->start */
338 378 err = chk_conflict(new, entry, new_type);
339 if (start < parse->end) { 379 if (!err) {
340 if (actual_type != parse->type && ret_type) { 380 dprintk("Overlap at 0x%Lx-0x%Lx\n",
341 actual_type = parse->type; 381 entry->start, entry->end);
342 *ret_type = actual_type; 382 cached_entry = list_entry(entry->nd.prev,
343 new_entry->type = actual_type; 383 struct memtype, nd);
344 } 384
345 385 /*
346 if (actual_type != parse->type) { 386 * Move to right position in the linked
347 printk( 387 * list to add this new entry
348 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", 388 */
349 current->comm, current->pid, 389 list_for_each_entry_continue(entry,
350 start, end, 390 &memtype_list, nd) {
351 cattr_name(actual_type), 391 if (start <= entry->start) {
352 cattr_name(parse->type)); 392 where = entry->nd.prev;
353 err = -EBUSY; 393 break;
354 break; 394 }
355 }
356
357 saved_ptr = parse;
358 /*
359 * Check to see whether the request overlaps more
360 * than one entry in the list
361 */
362 list_for_each_entry_continue(parse, &memtype_list, nd) {
363 if (end <= parse->start) {
364 break;
365 }
366
367 if (actual_type != parse->type) {
368 printk(
369 KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
370 current->comm, current->pid,
371 start, end,
372 cattr_name(actual_type),
373 cattr_name(parse->type));
374 err = -EBUSY;
375 break;
376 } 395 }
377 } 396 }
378
379 if (err) {
380 break;
381 }
382
383 pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
384 saved_ptr->start, saved_ptr->end);
385 /* No conflict. Go ahead and add this new entry */
386 list_add(&new_entry->nd, &saved_ptr->nd);
387 new_entry = NULL;
388 break; 397 break;
389 } 398 }
390 } 399 }
391 400
392 if (err) { 401 if (err) {
393 printk(KERN_INFO 402 printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
394 "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", 403 "track %s, req %s\n",
395 start, end, cattr_name(new_entry->type), 404 start, end, cattr_name(new->type), cattr_name(req_type));
396 cattr_name(req_type)); 405 kfree(new);
397 kfree(new_entry);
398 spin_unlock(&memtype_lock); 406 spin_unlock(&memtype_lock);
407
399 return err; 408 return err;
400 } 409 }
401 410
402 if (new_entry) { 411 cached_start = start;
403 /* No conflict. Not yet added to the list. Add to the tail */
404 list_add_tail(&new_entry->nd, &memtype_list);
405 pr_debug("New Entry\n");
406 }
407 412
408 if (ret_type) { 413 if (where)
409 pr_debug( 414 list_add(&new->nd, where);
410 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", 415 else
411 start, end, cattr_name(actual_type), 416 list_add_tail(&new->nd, &memtype_list);
412 cattr_name(req_type), cattr_name(*ret_type));
413 } else {
414 pr_debug(
415 "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
416 start, end, cattr_name(actual_type),
417 cattr_name(req_type));
418 }
419 417
420 spin_unlock(&memtype_lock); 418 spin_unlock(&memtype_lock);
419
420 dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
421 start, end, cattr_name(new->type), cattr_name(req_type),
422 new_type ? cattr_name(*new_type) : "-");
423
421 return err; 424 return err;
422} 425}
423 426
424int free_memtype(u64 start, u64 end) 427int free_memtype(u64 start, u64 end)
425{ 428{
426 struct memtype *ml; 429 struct memtype *entry;
427 int err = -EINVAL; 430 int err = -EINVAL;
431 int is_range_ram;
428 432
429 /* Only track when pat_wc_enabled */ 433 if (!pat_enabled)
430 if (!pat_wc_enabled) {
431 return 0; 434 return 0;
432 }
433 435
434 /* Low ISA region is always mapped WB. No need to track */ 436 /* Low ISA region is always mapped WB. No need to track */
435 if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { 437 if (is_ISA_range(start, end - 1))
436 return 0; 438 return 0;
437 } 439
440 is_range_ram = pagerange_is_ram(start, end);
441 if (is_range_ram == 1)
442 return free_ram_pages_type(start, end);
443 else if (is_range_ram < 0)
444 return -EINVAL;
438 445
439 spin_lock(&memtype_lock); 446 spin_lock(&memtype_lock);
440 list_for_each_entry(ml, &memtype_list, nd) { 447 list_for_each_entry(entry, &memtype_list, nd) {
441 if (ml->start == start && ml->end == end) { 448 if (entry->start == start && entry->end == end) {
442 list_del(&ml->nd); 449 if (cached_entry == entry || cached_start == start)
443 kfree(ml); 450 cached_entry = NULL;
451
452 list_del(&entry->nd);
453 kfree(entry);
444 err = 0; 454 err = 0;
445 break; 455 break;
446 } 456 }
@@ -452,27 +462,20 @@ int free_memtype(u64 start, u64 end)
452 current->comm, current->pid, start, end); 462 current->comm, current->pid, start, end);
453 } 463 }
454 464
455 pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); 465 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
466
456 return err; 467 return err;
457} 468}
458 469
459 470
460/*
461 * /dev/mem mmap interface. The memtype used for mapping varies:
462 * - Use UC for mappings with O_SYNC flag
463 * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
464 * inherit the memtype from existing mapping.
465 * - Else use UC_MINUS memtype (for backward compatibility with existing
466 * X drivers.
467 */
468pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 471pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
469 unsigned long size, pgprot_t vma_prot) 472 unsigned long size, pgprot_t vma_prot)
470{ 473{
471 return vma_prot; 474 return vma_prot;
472} 475}
473 476
474#ifdef CONFIG_NONPROMISC_DEVMEM 477#ifdef CONFIG_STRICT_DEVMEM
475/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 478/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
476static inline int range_is_allowed(unsigned long pfn, unsigned long size) 479static inline int range_is_allowed(unsigned long pfn, unsigned long size)
477{ 480{
478 return 1; 481 return 1;
@@ -496,20 +499,20 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
496 } 499 }
497 return 1; 500 return 1;
498} 501}
499#endif /* CONFIG_NONPROMISC_DEVMEM */ 502#endif /* CONFIG_STRICT_DEVMEM */
500 503
501int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 504int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
502 unsigned long size, pgprot_t *vma_prot) 505 unsigned long size, pgprot_t *vma_prot)
503{ 506{
504 u64 offset = ((u64) pfn) << PAGE_SHIFT; 507 u64 offset = ((u64) pfn) << PAGE_SHIFT;
505 unsigned long flags = _PAGE_CACHE_UC_MINUS; 508 unsigned long flags = -1;
506 int retval; 509 int retval;
507 510
508 if (!range_is_allowed(pfn, size)) 511 if (!range_is_allowed(pfn, size))
509 return 0; 512 return 0;
510 513
511 if (file->f_flags & O_SYNC) { 514 if (file->f_flags & O_SYNC) {
512 flags = _PAGE_CACHE_UC; 515 flags = _PAGE_CACHE_UC_MINUS;
513 } 516 }
514 517
515#ifdef CONFIG_X86_32 518#ifdef CONFIG_X86_32
@@ -521,24 +524,25 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
521 * caching for the high addresses through the KEN pin, but 524 * caching for the high addresses through the KEN pin, but
522 * we maintain the tradition of paranoia in this code. 525 * we maintain the tradition of paranoia in this code.
523 */ 526 */
524 if (!pat_wc_enabled && 527 if (!pat_enabled &&
525 ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || 528 !(boot_cpu_has(X86_FEATURE_MTRR) ||
526 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || 529 boot_cpu_has(X86_FEATURE_K6_MTRR) ||
527 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || 530 boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
528 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && 531 boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
529 (pfn << PAGE_SHIFT) >= __pa(high_memory)) { 532 (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
530 flags = _PAGE_CACHE_UC; 533 flags = _PAGE_CACHE_UC;
531 } 534 }
532#endif 535#endif
533 536
534 /* 537 /*
535 * With O_SYNC, we can only take UC mapping. Fail if we cannot. 538 * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
539 *
536 * Without O_SYNC, we want to get 540 * Without O_SYNC, we want to get
537 * - WB for WB-able memory and no other conflicting mappings 541 * - WB for WB-able memory and no other conflicting mappings
538 * - UC_MINUS for non-WB-able memory with no other conflicting mappings 542 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
539 * - Inherit from conflicting mappings otherwise 543 * - Inherit from conflicting mappings otherwise
540 */ 544 */
541 if (flags != _PAGE_CACHE_UC_MINUS) { 545 if (flags != -1) {
542 retval = reserve_memtype(offset, offset + size, flags, NULL); 546 retval = reserve_memtype(offset, offset + size, flags, NULL);
543 } else { 547 } else {
544 retval = reserve_memtype(offset, offset + size, -1, &flags); 548 retval = reserve_memtype(offset, offset + size, -1, &flags);
@@ -547,8 +551,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
547 if (retval < 0) 551 if (retval < 0)
548 return 0; 552 return 0;
549 553
550 if (pfn <= max_pfn_mapped && 554 if (((pfn < max_low_pfn_mapped) ||
551 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { 555 (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
556 ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
552 free_memtype(offset, offset + size); 557 free_memtype(offset, offset + size);
553 printk(KERN_INFO 558 printk(KERN_INFO
554 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", 559 "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
@@ -565,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
565 570
566void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) 571void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
567{ 572{
573 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
568 u64 addr = (u64)pfn << PAGE_SHIFT; 574 u64 addr = (u64)pfn << PAGE_SHIFT;
569 unsigned long flags; 575 unsigned long flags;
570 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
571 576
572 reserve_memtype(addr, addr + size, want_flags, &flags); 577 reserve_memtype(addr, addr + size, want_flags, &flags);
573 if (flags != want_flags) { 578 if (flags != want_flags) {
@@ -587,3 +592,90 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
587 free_memtype(addr, addr + size); 592 free_memtype(addr, addr + size);
588} 593}
589 594
595#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
596
597/* get Nth element of the linked list */
598static struct memtype *memtype_get_idx(loff_t pos)
599{
600 struct memtype *list_node, *print_entry;
601 int i = 1;
602
603 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
604 if (!print_entry)
605 return NULL;
606
607 spin_lock(&memtype_lock);
608 list_for_each_entry(list_node, &memtype_list, nd) {
609 if (pos == i) {
610 *print_entry = *list_node;
611 spin_unlock(&memtype_lock);
612 return print_entry;
613 }
614 ++i;
615 }
616 spin_unlock(&memtype_lock);
617 kfree(print_entry);
618
619 return NULL;
620}
621
622static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
623{
624 if (*pos == 0) {
625 ++*pos;
626 seq_printf(seq, "PAT memtype list:\n");
627 }
628
629 return memtype_get_idx(*pos);
630}
631
632static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
633{
634 ++*pos;
635 return memtype_get_idx(*pos);
636}
637
638static void memtype_seq_stop(struct seq_file *seq, void *v)
639{
640}
641
642static int memtype_seq_show(struct seq_file *seq, void *v)
643{
644 struct memtype *print_entry = (struct memtype *)v;
645
646 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
647 print_entry->start, print_entry->end);
648 kfree(print_entry);
649
650 return 0;
651}
652
653static struct seq_operations memtype_seq_ops = {
654 .start = memtype_seq_start,
655 .next = memtype_seq_next,
656 .stop = memtype_seq_stop,
657 .show = memtype_seq_show,
658};
659
660static int memtype_seq_open(struct inode *inode, struct file *file)
661{
662 return seq_open(file, &memtype_seq_ops);
663}
664
665static const struct file_operations memtype_fops = {
666 .open = memtype_seq_open,
667 .read = seq_read,
668 .llseek = seq_lseek,
669 .release = seq_release,
670};
671
672static int __init pat_memtype_list_init(void)
673{
674 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
675 NULL, &memtype_fops);
676 return 0;
677}
678
679late_initcall(pat_memtype_list_init);
680
681#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
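The reworked reserve_memtype()/free_memtype() pair above is consumed by callers such as ioremap and the /dev/mem paths later in this file, and the entries it tracks can now be inspected through the new pat_memtype_list debugfs file (normally under the x86 directory of a mounted debugfs). A minimal, hypothetical caller is sketched below; only the tracker API and the _PAGE_CACHE_* values come from the patch, the function and its flow are invented for illustration.

/*
 * Hypothetical caller of the memtype tracker reworked above.  Only
 * reserve_memtype()/free_memtype() and the _PAGE_CACHE_* values are
 * from the patch; this function and its flow are invented.
 */
static int example_track_range(u64 start, u64 end)
{
	unsigned long want = _PAGE_CACHE_UC_MINUS;
	unsigned long got;		/* filled in via the new_type argument */
	int err;

	err = reserve_memtype(start, end, want, &got);
	if (err)
		return err;		/* an incompatible reservation already exists */

	/*
	 * 'got' may differ from 'want' (e.g. WB if the MTRRs already make
	 * the range write-back); the mapping must use 'got', not 'want'.
	 */

	/* ... set up and use the mapping with type 'got' here ... */

	free_memtype(start, end);
	return 0;
}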
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000000..efa1911e20ca
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,489 @@
1/*
2 * Fault Injection Test harness (FI)
3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
23 * Copyright by Intel Corp., 2002
24 * Louis Zhuang (louis.zhuang@intel.com)
25 *
26 * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
27 */
28
29#include <linux/module.h>
30#include <linux/ptrace.h> /* struct pt_regs */
31#include "pf_in.h"
32
33#ifdef __i386__
34/* IA32 Manual 3, 2-1 */
35static unsigned char prefix_codes[] = {
36 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
37 0x65, 0x2E, 0x3E, 0x66, 0x67
38};
39/* IA32 Manual 3, 3-432*/
40static unsigned int reg_rop[] = {
41 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
42};
43static unsigned int reg_wop[] = { 0x88, 0x89 };
44static unsigned int imm_wop[] = { 0xC6, 0xC7 };
45/* IA32 Manual 3, 3-432*/
46static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
47static unsigned int rw32[] = {
48 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
49};
50static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
51static unsigned int mw16[] = { 0xB70F, 0xBF0F };
52static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
53static unsigned int mw64[] = {};
54#else /* not __i386__ */
55static unsigned char prefix_codes[] = {
56 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
57 0xF0, 0xF3, 0xF2,
58 /* REX Prefixes */
59 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
60 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
61};
62/* AMD64 Manual 3, Appendix A*/
63static unsigned int reg_rop[] = {
64 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
65};
66static unsigned int reg_wop[] = { 0x88, 0x89 };
67static unsigned int imm_wop[] = { 0xC6, 0xC7 };
68static unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
69static unsigned int rw32[] = {
70 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
71};
72/* 8 bit only */
73static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
74/* 16 bit only */
75static unsigned int mw16[] = { 0xB70F, 0xBF0F };
76/* 16 or 32 bit */
77static unsigned int mw32[] = { 0xC7 };
78/* 16, 32 or 64 bit */
79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */
81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
83 int *rexr)
84{
85 int i;
86 unsigned char *p = addr;
87 *shorted = 0;
88 *enlarged = 0;
89 *rexr = 0;
90
91restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) {
94 if (*p == 0x66)
95 *shorted = 1;
96#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1;
99 if ((*p & 0xf4) == 0x44)
100 *rexr = 1;
101#endif
102 p++;
103 goto restart;
104 }
105 }
106
107 return (p - addr);
108}
109
110static int get_opcode(unsigned char *addr, unsigned int *opcode)
111{
112 int len;
113
114 if (*addr == 0x0F) {
115 /* 0x0F is extension instruction */
116 *opcode = *(unsigned short *)addr;
117 len = 2;
118 } else {
119 *opcode = *addr;
120 len = 1;
121 }
122
123 return len;
124}
125
126#define CHECK_OP_TYPE(opcode, array, type) \
127 for (i = 0; i < ARRAY_SIZE(array); i++) { \
128 if (array[i] == opcode) { \
129 rv = type; \
130 goto exit; \
131 } \
132 }
133
134enum reason_type get_ins_type(unsigned long ins_addr)
135{
136 unsigned int opcode;
137 unsigned char *p;
138 int shorted, enlarged, rexr;
139 int i;
140 enum reason_type rv = OTHERS;
141
142 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr);
144 p += get_opcode(p, &opcode);
145
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
147 CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
148 CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
149
150exit:
151 return rv;
152}
153#undef CHECK_OP_TYPE
154
155static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{
157 unsigned int opcode;
158 unsigned char *p;
159 int i, shorted, enlarged, rexr;
160
161 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr);
163 p += get_opcode(p, &opcode);
164
165 for (i = 0; i < ARRAY_SIZE(rw8); i++)
166 if (rw8[i] == opcode)
167 return 1;
168
169 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4));
172
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0;
175}
176
177unsigned int get_ins_mem_width(unsigned long ins_addr)
178{
179 unsigned int opcode;
180 unsigned char *p;
181 int i, shorted, enlarged, rexr;
182
183 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr);
185 p += get_opcode(p, &opcode);
186
187 for (i = 0; i < ARRAY_SIZE(mw8); i++)
188 if (mw8[i] == opcode)
189 return 1;
190
191 for (i = 0; i < ARRAY_SIZE(mw16); i++)
192 if (mw16[i] == opcode)
193 return 2;
194
195 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode)
197 return shorted ? 2 : 4;
198
199 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4);
202
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0;
205}
206
207/*
208 * Define register ident in mod/rm byte.
209 * Note: these are NOT the same as in ptrace-abi.h.
210 */
211enum {
212 arg_AL = 0,
213 arg_CL = 1,
214 arg_DL = 2,
215 arg_BL = 3,
216 arg_AH = 4,
217 arg_CH = 5,
218 arg_DH = 6,
219 arg_BH = 7,
220
221 arg_AX = 0,
222 arg_CX = 1,
223 arg_DX = 2,
224 arg_BX = 3,
225 arg_SP = 4,
226 arg_BP = 5,
227 arg_SI = 6,
228 arg_DI = 7,
229#ifdef __amd64__
230 arg_R8 = 8,
231 arg_R9 = 9,
232 arg_R10 = 10,
233 arg_R11 = 11,
234 arg_R12 = 12,
235 arg_R13 = 13,
236 arg_R14 = 14,
237 arg_R15 = 15
238#endif
239};
240
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
242{
243 unsigned char *rv = NULL;
244
245 switch (no) {
246 case arg_AL:
247 rv = (unsigned char *)&regs->ax;
248 break;
249 case arg_BL:
250 rv = (unsigned char *)&regs->bx;
251 break;
252 case arg_CL:
253 rv = (unsigned char *)&regs->cx;
254 break;
255 case arg_DL:
256 rv = (unsigned char *)&regs->dx;
257 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__
271 case arg_R8:
272 rv = (unsigned char *)&regs->r8;
273 break;
274 case arg_R9:
275 rv = (unsigned char *)&regs->r9;
276 break;
277 case arg_R10:
278 rv = (unsigned char *)&regs->r10;
279 break;
280 case arg_R11:
281 rv = (unsigned char *)&regs->r11;
282 break;
283 case arg_R12:
284 rv = (unsigned char *)&regs->r12;
285 break;
286 case arg_R13:
287 rv = (unsigned char *)&regs->r13;
288 break;
289 case arg_R14:
290 rv = (unsigned char *)&regs->r14;
291 break;
292 case arg_R15:
293 rv = (unsigned char *)&regs->r15;
294 break;
295#endif
296 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break;
299 }
300 return rv;
301}
302
303static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
304{
305 unsigned long *rv = NULL;
306
307 switch (no) {
308 case arg_AX:
309 rv = &regs->ax;
310 break;
311 case arg_BX:
312 rv = &regs->bx;
313 break;
314 case arg_CX:
315 rv = &regs->cx;
316 break;
317 case arg_DX:
318 rv = &regs->dx;
319 break;
320 case arg_SP:
321 rv = &regs->sp;
322 break;
323 case arg_BP:
324 rv = &regs->bp;
325 break;
326 case arg_SI:
327 rv = &regs->si;
328 break;
329 case arg_DI:
330 rv = &regs->di;
331 break;
332#ifdef __amd64__
333 case arg_R8:
334 rv = &regs->r8;
335 break;
336 case arg_R9:
337 rv = &regs->r9;
338 break;
339 case arg_R10:
340 rv = &regs->r10;
341 break;
342 case arg_R11:
343 rv = &regs->r11;
344 break;
345 case arg_R12:
346 rv = &regs->r12;
347 break;
348 case arg_R13:
349 rv = &regs->r13;
350 break;
351 case arg_R14:
352 rv = &regs->r14;
353 break;
354 case arg_R15:
355 rv = &regs->r15;
356 break;
357#endif
358 default:
359 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
360 }
361
362 return rv;
363}
364
365unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
366{
367 unsigned int opcode;
368 unsigned char mod_rm;
369 int reg;
370 unsigned char *p;
371 int i, shorted, enlarged, rexr;
372 unsigned long rv;
373
374 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr);
376 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) {
379 rv = REG_READ;
380 goto do_work;
381 }
382
383 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
384 if (reg_wop[i] == opcode) {
385 rv = REG_WRITE;
386 goto do_work;
387 }
388
389 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
390 "0x%02x\n", opcode);
391 goto err;
392
393do_work:
394 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) {
397 case 1:
398 return *get_reg_w8(reg, regs);
399
400 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs);
402
403 case 4:
404 return *(unsigned int *)get_reg_w32(reg, regs);
405
406#ifdef __amd64__
407 case 8:
408 return *(unsigned long *)get_reg_w32(reg, regs);
409#endif
410
411 default:
412 printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
413 }
414
415err:
416 return 0;
417}
418
419unsigned long get_ins_imm_val(unsigned long ins_addr)
420{
421 unsigned int opcode;
422 unsigned char mod_rm;
423 unsigned char mod;
424 unsigned char *p;
425 int i, shorted, enlarged, rexr;
426 unsigned long rv;
427
428 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr);
430 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) {
433 rv = IMM_WRITE;
434 goto do_work;
435 }
436
437 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
438 "0x%02x\n", opcode);
439 goto err;
440
441do_work:
442 mod_rm = *p;
443 mod = mod_rm >> 6;
444 p++;
445 switch (mod) {
446 case 0:
447 /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */
448 /* AMD64: XXX Check for address size prefix? */
449 if ((mod_rm & 0x7) == 0x5)
450 p += 4;
451 break;
452
453 case 1:
454 p += 1;
455 break;
456
457 case 2:
458 p += 4;
459 break;
460
461 case 3:
462 default:
463 printk(KERN_ERR "mmiotrace: not a memory access instruction "
464 "at 0x%lx, rm_mod=0x%02x\n",
465 ins_addr, mod_rm);
466 }
467
468 switch (get_ins_reg_width(ins_addr)) {
469 case 1:
470 return *(unsigned char *)p;
471
472 case 2:
473 return *(unsigned short *)p;
474
475 case 4:
476 return *(unsigned int *)p;
477
478#ifdef __amd64__
479 case 8:
480 return *(unsigned long *)p;
481#endif
482
483 default:
484 printk(KERN_ERR "mmiotrace: Error: width.\n");
485 }
486
487err:
488 return 0;
489}
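The decoder above boils down to three steps: skip legacy/REX prefixes, read a one- or two-byte opcode, then take the register or immediate out of the ModRM byte. The stand-alone user-space sketch below (not part of the patch) walks those steps for one hard-coded instruction to show where the reg field comes from.

/*
 * Stand-alone user-space illustration (not from the patch) of the steps
 * get_ins_reg_val() performs: skip prefixes, read the opcode, then take
 * the register number from the reg field of the ModRM byte.
 */
#include <stdio.h>

int main(void)
{
	/* mov %esi,(%rdi): opcode 0x89, ModRM 0x37 (mod=00 reg=110 rm=111) */
	unsigned char ins[] = { 0x89, 0x37 };
	unsigned char *p = ins;
	unsigned int opcode;
	int reg;

	/* no 0x66/REX/segment prefixes here, so skip_prefix() would skip 0 */
	if (*p == 0x0F) {		/* 0x0F-escaped two-byte opcode */
		opcode = *(unsigned short *)p;
		p += 2;
	} else {
		opcode = *p;
		p += 1;
	}

	reg = (*p >> 3) & 0x7;		/* reg field; REX.R would add bit 3 */

	printf("opcode 0x%02x moves register #%d (SI) to memory\n",
	       opcode, reg);
	return 0;
}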
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000000..e05341a51a27
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
1/*
2 * Fault Injection Test harness (FI)
3 * Copyright (C) Intel Corp.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18 * USA.
19 *
20 */
21
22#ifndef __PF_H_
23#define __PF_H_
24
25enum reason_type {
26 NOT_ME, /* page fault is not in regions */
27 NOTHING, /* access other points in regions */
28 REG_READ, /* read from addr to reg */
29 REG_WRITE, /* write from reg to addr */
30 IMM_WRITE, /* write from imm to addr */
31 OTHERS /* other instructions we cannot intercept */
32};
33
34enum reason_type get_ins_type(unsigned long ins_addr);
35unsigned int get_ins_mem_width(unsigned long ins_addr);
36unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
37unsigned long get_ins_imm_val(unsigned long ins_addr);
38
39#endif /* __PF_H_ */
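A fault handler that includes pf_in.h is expected to classify the faulting instruction first and only then decide how to obtain the value being transferred. The fragment below is a hedged sketch of such a consumer (the real user, mmio-mod.c, is not shown in this hunk); the handler context and the log messages are invented, only the four pf_in.h calls come from the header.

/*
 * Hedged sketch of a pf_in.h consumer; the handler context and messages
 * are invented, only the four pf_in.h calls are from the header above.
 */
static void example_log_access(unsigned long ip, struct pt_regs *regs)
{
	switch (get_ins_type(ip)) {
	case REG_WRITE:
		/* the written value sits in the CPU register named by ModRM */
		printk(KERN_INFO "write %u bytes, val 0x%lx\n",
		       get_ins_mem_width(ip), get_ins_reg_val(ip, regs));
		break;
	case IMM_WRITE:
		/* the written value is an immediate inside the instruction */
		printk(KERN_INFO "write %u bytes, imm 0x%lx\n",
		       get_ins_mem_width(ip), get_ins_imm_val(ip));
		break;
	case REG_READ:
		/* the read value only exists once the access is re-executed */
		printk(KERN_INFO "read %u bytes\n", get_ins_mem_width(ip));
		break;
	default:
		break;			/* NOT_ME/NOTHING/OTHERS: ignore */
	}
}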
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 50159764f694..86f2ffc43c3d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -2,6 +2,7 @@
2#include <asm/pgalloc.h> 2#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 3#include <asm/pgtable.h>
4#include <asm/tlb.h> 4#include <asm/tlb.h>
5#include <asm/fixmap.h>
5 6
6pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 7pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
7{ 8{
@@ -62,16 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
62#define UNSHARED_PTRS_PER_PGD \ 63#define UNSHARED_PTRS_PER_PGD \
63 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
64 65
65static void pgd_ctor(void *p) 66static void pgd_ctor(pgd_t *pgd)
66{ 67{
67 pgd_t *pgd = p;
68 unsigned long flags;
69
70 /* Clear usermode parts of PGD */
71 memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
72
73 spin_lock_irqsave(&pgd_lock, flags);
74
75 /* If the pgd points to a shared pagetable level (either the 68 /* If the pgd points to a shared pagetable level (either the
76 ptes in non-PAE, or shared PMD in PAE), then just copy the 69 ptes in non-PAE, or shared PMD in PAE), then just copy the
77 references from swapper_pg_dir. */ 70 references from swapper_pg_dir. */
@@ -90,11 +83,9 @@ static void pgd_ctor(void *p)
90 /* list required to sync kernel mapping updates */ 83 /* list required to sync kernel mapping updates */
91 if (!SHARED_KERNEL_PMD) 84 if (!SHARED_KERNEL_PMD)
92 pgd_list_add(pgd); 85 pgd_list_add(pgd);
93
94 spin_unlock_irqrestore(&pgd_lock, flags);
95} 86}
96 87
97static void pgd_dtor(void *pgd) 88static void pgd_dtor(pgd_t *pgd)
98{ 89{
99 unsigned long flags; /* can be called from interrupt context */ 90 unsigned long flags; /* can be called from interrupt context */
100 91
@@ -119,6 +110,72 @@ static void pgd_dtor(void *pgd)
119 110
120#ifdef CONFIG_X86_PAE 111#ifdef CONFIG_X86_PAE
121/* 112/*
113 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
114 * updating the top-level pagetable entries to guarantee the
115 * processor notices the update. Since this is expensive, and
116 * all 4 top-level entries are used almost immediately in a
117 * new process's life, we just pre-populate them here.
118 *
119 * Also, if we're in a paravirt environment where the kernel pmd is
120 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
121 * and initialize the kernel pmds here.
122 */
123#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
124
125void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
126{
127 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
128
129 /* Note: almost everything apart from _PAGE_PRESENT is
130 reserved at the pmd (PDPT) level. */
131 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
132
133 /*
134 * According to Intel App note "TLBs, Paging-Structure Caches,
135 * and Their Invalidation", April 2007, document 317080-001,
136 * section 8.1: in PAE mode we explicitly have to flush the
137 * TLB via cr3 if the top-level pgd is changed...
138 */
139 if (mm == current->active_mm)
140 write_cr3(read_cr3());
141}
142#else /* !CONFIG_X86_PAE */
143
144/* No need to prepopulate any pagetable entries in non-PAE modes. */
145#define PREALLOCATED_PMDS 0
146
147#endif /* CONFIG_X86_PAE */
148
149static void free_pmds(pmd_t *pmds[])
150{
151 int i;
152
153 for(i = 0; i < PREALLOCATED_PMDS; i++)
154 if (pmds[i])
155 free_page((unsigned long)pmds[i]);
156}
157
158static int preallocate_pmds(pmd_t *pmds[])
159{
160 int i;
161 bool failed = false;
162
163 for(i = 0; i < PREALLOCATED_PMDS; i++) {
164 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
165 if (pmd == NULL)
166 failed = true;
167 pmds[i] = pmd;
168 }
169
170 if (failed) {
171 free_pmds(pmds);
172 return -ENOMEM;
173 }
174
175 return 0;
176}
177
178/*
122 * Mop up any pmd pages which may still be attached to the pgd. 179 * Mop up any pmd pages which may still be attached to the pgd.
123 * Normally they will be freed by munmap/exit_mmap, but any pmd we 180 * Normally they will be freed by munmap/exit_mmap, but any pmd we
124 * preallocate which never got a corresponding vma will need to be 181 * preallocate which never got a corresponding vma will need to be
@@ -128,7 +185,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
128{ 185{
129 int i; 186 int i;
130 187
131 for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { 188 for(i = 0; i < PREALLOCATED_PMDS; i++) {
132 pgd_t pgd = pgdp[i]; 189 pgd_t pgd = pgdp[i];
133 190
134 if (pgd_val(pgd) != 0) { 191 if (pgd_val(pgd) != 0) {
@@ -142,32 +199,20 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
142 } 199 }
143} 200}
144 201
145/* 202static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
146 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
147 * updating the top-level pagetable entries to guarantee the
148 * processor notices the update. Since this is expensive, and
149 * all 4 top-level entries are used almost immediately in a
150 * new process's life, we just pre-populate them here.
151 *
152 * Also, if we're in a paravirt environment where the kernel pmd is
153 * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
154 * and initialize the kernel pmds here.
155 */
156static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
157{ 203{
158 pud_t *pud; 204 pud_t *pud;
159 unsigned long addr; 205 unsigned long addr;
160 int i; 206 int i;
161 207
208 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
209 return;
210
162 pud = pud_offset(pgd, 0); 211 pud = pud_offset(pgd, 0);
163 for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
164 i++, pud++, addr += PUD_SIZE) {
165 pmd_t *pmd = pmd_alloc_one(mm, addr);
166 212
167 if (!pmd) { 213 for (addr = i = 0; i < PREALLOCATED_PMDS;
168 pgd_mop_up_pmds(mm, pgd); 214 i++, pud++, addr += PUD_SIZE) {
169 return 0; 215 pmd_t *pmd = pmds[i];
170 }
171 216
172 if (i >= KERNEL_PGD_BOUNDARY) 217 if (i >= KERNEL_PGD_BOUNDARY)
173 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 218 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
@@ -175,61 +220,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
175 220
176 pud_populate(mm, pud, pmd); 221 pud_populate(mm, pud, pmd);
177 } 222 }
178
179 return 1;
180} 223}
181 224
182void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 225pgd_t *pgd_alloc(struct mm_struct *mm)
183{ 226{
184 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 227 pgd_t *pgd;
228 pmd_t *pmds[PREALLOCATED_PMDS];
229 unsigned long flags;
185 230
186 /* Note: almost everything apart from _PAGE_PRESENT is 231 pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
187 reserved at the pmd (PDPT) level. */
188 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
189 232
190 /* 233 if (pgd == NULL)
191 * According to Intel App note "TLBs, Paging-Structure Caches, 234 goto out;
192 * and Their Invalidation", April 2007, document 317080-001,
193 * section 8.1: in PAE mode we explicitly have to flush the
194 * TLB via cr3 if the top-level pgd is changed...
195 */
196 if (mm == current->active_mm)
197 write_cr3(read_cr3());
198}
199#else /* !CONFIG_X86_PAE */
200/* No need to prepopulate any pagetable entries in non-PAE modes. */
201static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
202{
203 return 1;
204}
205 235
206static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) 236 mm->pgd = pgd;
207{
208}
209#endif /* CONFIG_X86_PAE */
210 237
211pgd_t *pgd_alloc(struct mm_struct *mm) 238 if (preallocate_pmds(pmds) != 0)
212{ 239 goto out_free_pgd;
213 pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
214 240
215 /* so that alloc_pmd can use it */ 241 if (paravirt_pgd_alloc(mm) != 0)
216 mm->pgd = pgd; 242 goto out_free_pmds;
217 if (pgd)
218 pgd_ctor(pgd);
219 243
220 if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { 244 /*
221 pgd_dtor(pgd); 245 * Make sure that pre-populating the pmds is atomic with
222 free_page((unsigned long)pgd); 246 * respect to anything walking the pgd_list, so that they
223 pgd = NULL; 247 * never see a partially populated pgd.
224 } 248 */
249 spin_lock_irqsave(&pgd_lock, flags);
250
251 pgd_ctor(pgd);
252 pgd_prepopulate_pmd(mm, pgd, pmds);
253
254 spin_unlock_irqrestore(&pgd_lock, flags);
225 255
226 return pgd; 256 return pgd;
257
258out_free_pmds:
259 free_pmds(pmds);
260out_free_pgd:
261 free_page((unsigned long)pgd);
262out:
263 return NULL;
227} 264}
228 265
229void pgd_free(struct mm_struct *mm, pgd_t *pgd) 266void pgd_free(struct mm_struct *mm, pgd_t *pgd)
230{ 267{
231 pgd_mop_up_pmds(mm, pgd); 268 pgd_mop_up_pmds(mm, pgd);
232 pgd_dtor(pgd); 269 pgd_dtor(pgd);
270 paravirt_pgd_free(mm, pgd);
233 free_page((unsigned long)pgd); 271 free_page((unsigned long)pgd);
234} 272}
235 273
@@ -255,7 +293,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
255 293
256 if (pte_young(*ptep)) 294 if (pte_young(*ptep))
257 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 295 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
258 &ptep->pte); 296 (unsigned long *) &ptep->pte);
259 297
260 if (ret) 298 if (ret)
261 pte_update(vma->vm_mm, addr, ptep); 299 pte_update(vma->vm_mm, addr, ptep);
@@ -274,3 +312,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
274 312
275 return young; 313 return young;
276} 314}
315
316int fixmaps_set;
317
318void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
319{
320 unsigned long address = __fix_to_virt(idx);
321
322 if (idx >= __end_of_fixed_addresses) {
323 BUG();
324 return;
325 }
326 set_pte_vaddr(address, pte);
327 fixmaps_set++;
328}
329
330void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
331{
332 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
333}
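The rewritten pgd_alloc() above follows an allocate-then-commit shape: every allocation that can fail happens before pgd_lock is taken, the pgd is published in one short locked section, and failures unwind through reverse-order goto labels. The user-space toy below illustrates only that control-flow shape; none of it is kernel code.

/*
 * User-space toy (not kernel code) showing the allocate-then-commit
 * shape pgd_alloc() now has: allocate everything that can fail first,
 * publish under the lock, unwind with gotos in reverse order on error.
 */
#include <stdlib.h>

struct pair { void *a; void *b; };

static struct pair *example_alloc(void)
{
	struct pair *p = malloc(sizeof(*p));

	if (!p)
		goto out;
	p->a = malloc(64);
	if (!p->a)
		goto out_free_p;
	p->b = malloc(64);
	if (!p->b)
		goto out_free_a;

	/* lock(); make the fully built pair visible to others; unlock(); */
	return p;

out_free_a:
	free(p->a);
out_free_p:
	free(p);
out:
	return NULL;
}

int main(void)
{
	struct pair *p = example_alloc();

	if (p) {
		free(p->a);
		free(p->b);
		free(p);
	}
	return 0;
}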
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 369cf065b6a4..0951db9ee519 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,58 +20,11 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
73 */ 26 */
74static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) 27void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
75{ 28{
76 pgd_t *pgd; 29 pgd_t *pgd;
77 pud_t *pud; 30 pud_t *pud;
@@ -94,8 +47,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
94 return; 47 return;
95 } 48 }
96 pte = pte_offset_kernel(pmd, vaddr); 49 pte = pte_offset_kernel(pmd, vaddr);
97 if (pgprot_val(flags)) 50 if (pte_val(pteval))
98 set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); 51 set_pte_present(&init_mm, vaddr, pte, pteval);
99 else 52 else
100 pte_clear(&init_mm, vaddr, pte); 53 pte_clear(&init_mm, vaddr, pte);
101 54
@@ -141,22 +94,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
141 __flush_tlb_one(vaddr); 94 __flush_tlb_one(vaddr);
142} 95}
143 96
144static int fixmaps;
145unsigned long __FIXADDR_TOP = 0xfffff000; 97unsigned long __FIXADDR_TOP = 0xfffff000;
146EXPORT_SYMBOL(__FIXADDR_TOP); 98EXPORT_SYMBOL(__FIXADDR_TOP);
147 99
148void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
149{
150 unsigned long address = __fix_to_virt(idx);
151
152 if (idx >= __end_of_fixed_addresses) {
153 BUG();
154 return;
155 }
156 set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
157 fixmaps++;
158}
159
160/** 100/**
161 * reserve_top_address - reserves a hole in the top of kernel address space 101 * reserve_top_address - reserves a hole in the top of kernel address space
162 * @reserve - size of hole to reserve 102 * @reserve - size of hole to reserve
@@ -164,11 +104,45 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
164 * Can be used to relocate the fixmap area and poke a hole in the top 104 * Can be used to relocate the fixmap area and poke a hole in the top
165 * of kernel address space to make room for a hypervisor. 105 * of kernel address space to make room for a hypervisor.
166 */ 106 */
167void reserve_top_address(unsigned long reserve) 107void __init reserve_top_address(unsigned long reserve)
168{ 108{
169 BUG_ON(fixmaps > 0); 109 BUG_ON(fixmaps_set > 0);
170 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", 110 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
171 (int)-reserve); 111 (int)-reserve);
172 __FIXADDR_TOP = -reserve - PAGE_SIZE; 112 __FIXADDR_TOP = -reserve - PAGE_SIZE;
173 __VMALLOC_RESERVE += reserve; 113 __VMALLOC_RESERVE += reserve;
174} 114}
115
116/*
117 * vmalloc=size forces the vmalloc area to be exactly 'size'
118 * bytes. This can be used to increase (or decrease) the
119 * vmalloc area - the default is 128m.
120 */
121static int __init parse_vmalloc(char *arg)
122{
123 if (!arg)
124 return -EINVAL;
125
126 /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
127 __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
128 return 0;
129}
130early_param("vmalloc", parse_vmalloc);
131
132/*
133 * reservetop=size reserves a hole at the top of the kernel address space which
134 * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
135 * so relocating the fixmap can be done before paging initialization.
136 */
137static int __init parse_reservetop(char *arg)
138{
139 unsigned long address;
140
141 if (!arg)
142 return -EINVAL;
143
144 address = memparse(arg, &arg);
145 reserve_top_address(address);
146 return 0;
147}
148early_param("reservetop", parse_reservetop);
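Both new early parameters lean on memparse() to accept k/m/g suffixes, so vmalloc=128m ends up as 128 << 20 bytes plus VMALLOC_OFFSET. The user-space sketch below approximates that suffix handling for illustration; it is not the kernel's memparse() implementation.

/*
 * User-space approximation (not the kernel's memparse()) of the k/m/g
 * suffix handling behind "vmalloc=" and "reservetop=".
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long parse_size(const char *s)
{
	char *end;
	unsigned long val = strtoul(s, &end, 0);

	switch (*end) {
	case 'g': case 'G':
		val <<= 10;		/* fall through */
	case 'm': case 'M':
		val <<= 10;		/* fall through */
	case 'k': case 'K':
		val <<= 10;
	}
	return val;
}

int main(void)
{
	/* "vmalloc=128m" ends up as 128 << 20 bytes (plus VMALLOC_OFFSET) */
	printf("%lu\n", parse_size("128m"));
	return 0;
}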
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
new file mode 100644
index 000000000000..16ae70fc57e7
--- /dev/null
+++ b/arch/x86/mm/srat_32.c
@@ -0,0 +1,283 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34#include <asm/e820.h>
35
36/*
37 * proximity macros and definitions
38 */
39#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
40#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
41#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
46
47#define MAX_CHUNKS_PER_NODE 3
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
57
58static int __initdata num_memory_chunks; /* total number of memory chunks */
59static u8 __initdata apicid_to_pxm[MAX_APICID];
60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
95
96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
98}
99
100/*
101 * Identify memory proximity domains and hot-remove capabilities.
102 * Fill node memory chunk list structure.
103 */
104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
106{
107 unsigned long long paddr, size;
108 unsigned long start_pfn, end_pfn;
109 u8 pxm;
110 struct node_memory_chunk_s *p, *q, *pend;
111
112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
119
120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
121 return; /* empty entry */
122
123 pxm = memory_affinity->proximity_domain & 0xff;
124
125 /* mark this node as "seen" in node bitmap */
126 BMAP_SET(pxm_bitmap, pxm);
127
128 /* calculate info for memory chunk structure */
129 paddr = memory_affinity->base_address;
130 size = memory_affinity->length;
131
132 start_pfn = paddr >> PAGE_SHIFT;
133 end_pfn = (paddr + size) >> PAGE_SHIFT;
134
135
136 if (num_memory_chunks >= MAXCHUNKS) {
137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
139 size/(1024*1024), paddr);
140 return;
141 }
142
143 /* Insertion sort based on base address */
144 pend = &node_memory_chunk[num_memory_chunks];
145 for (p = &node_memory_chunk[0]; p < pend; p++) {
146 if (start_pfn < p->start_pfn)
147 break;
148 }
149 if (p < pend) {
150 for (q = pend; q >= p; q--)
151 *(q + 1) = *q;
152 }
153 p->start_pfn = start_pfn;
154 p->end_pfn = end_pfn;
155 p->pxm = pxm;
156
157 num_memory_chunks++;
158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn,
162 pxm,
163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
164 "enabled and removable" : "enabled" ) );
165}
166
167/* Callback for SLIT parsing */
168void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
169{
170}
171
172void acpi_numa_arch_fixup(void)
173{
174}
175/*
176 * The SRAT table always lists ascending addresses, so can always
177 * assume that the first "start" address that you see is the real
178 * start of the node, and that the current "end" address is after
179 * the previous one.
180 */
181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{
183 /*
184 * Only add present memory as told by the e820.
185 * There is no guarantee from the SRAT that the memory it
186 * enumerates is present at boot time because it represents
187 * *possible* memory hotplug areas the same as normal RAM.
188 */
189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return -1;
193 }
194 if (memory_chunk->nid != nid)
195 return -1;
196
197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn;
199
200 if (node_start_pfn[nid] > memory_chunk->start_pfn)
201 node_start_pfn[nid] = memory_chunk->start_pfn;
202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
207}
208
209int __init get_memcfg_from_srat(void)
210{
211 int i, j, nid;
212
213
214 if (srat_disabled())
215 goto out_fail;
216
217 if (num_memory_chunks == 0) {
218 printk(KERN_WARNING
219 "could not finy any ACPI SRAT memory areas.\n");
220 goto out_fail;
221 }
222
223 /* Calculate total number of nodes in system from PXM bitmap and create
224 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
225 * to specify the range of _PXM values.)
226 */
227 /*
228 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
229 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
230 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
231 * approaches MAX_PXM_DOMAINS for i386.
232 */
233 nodes_clear(node_online_map);
234 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
235 if (BMAP_TEST(pxm_bitmap, i)) {
236 int nid = acpi_map_pxm_to_node(i);
237 node_set_online(nid);
238 }
239 }
240 BUG_ON(num_online_nodes() == 0);
241
242 /* set cnode id in memory chunk structure */
243 for (i = 0; i < num_memory_chunks; i++)
244 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
245
246 printk(KERN_DEBUG "pxm bitmap: ");
247 for (i = 0; i < sizeof(pxm_bitmap); i++) {
248 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
249 }
250 printk(KERN_CONT "\n");
251 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
252 num_online_nodes());
253 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
254 num_memory_chunks);
255
256 for (i = 0; i < MAX_APICID; i++)
257 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
258
259 for (j = 0; j < num_memory_chunks; j++){
260 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
261 printk(KERN_DEBUG
262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
264 if (node_read_chunk(chunk->nid, chunk))
265 continue;
266
267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn));
269 }
270
271 for_each_online_node(nid) {
272 unsigned long start = node_start_pfn[nid];
273 unsigned long end = min(node_end_pfn[nid], max_pfn);
274
275 memory_present(nid, start, end);
276 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
277 }
278 return 1;
279out_fail:
280 printk(KERN_ERR "failed to get NUMA memory information from SRAT"
281 " table\n");
282 return 0;
283}
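The SRAT parser above records which proximity domains it has seen in a packed bitmap, one bit per _PXM value. The small stand-alone program below reuses the same BMAP_SET()/BMAP_TEST() macros to show how that bitmap behaves; the surrounding main() is purely illustrative.

/*
 * Stand-alone demo (not kernel code) of the proximity-domain bitmap
 * used by the 32-bit SRAT parser: one bit per _PXM, eight to a byte.
 */
#include <stdio.h>

#define NODE_ARRAY_INDEX(x)	((x) / 8)	/* 8 bits/char */
#define NODE_ARRAY_OFFSET(x)	((x) % 8)	/* 8 bits/char */
#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))

int main(void)
{
	unsigned char pxm_bitmap[256 / 8] = { 0 };	/* MAX_PXM_DOMAINS is 256 */

	BMAP_SET(pxm_bitmap, 0);	/* SRAT mentioned proximity domain 0 */
	BMAP_SET(pxm_bitmap, 17);	/* ... and proximity domain 17 */

	printf("pxm 17 seen: %d, pxm 3 seen: %d\n",
	       !!BMAP_TEST(pxm_bitmap, 17), !!BMAP_TEST(pxm_bitmap, 3));
	return 0;
}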
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 99649dccad28..1b4763e26ea9 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -100,7 +100,19 @@ static __init inline int srat_disabled(void)
100/* Callback for SLIT parsing */ 100/* Callback for SLIT parsing */
101void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 101void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
102{ 102{
103 acpi_slit = slit; 103 unsigned length;
104 unsigned long phys;
105
106 length = slit->header.length;
107 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
108 PAGE_SIZE);
109
110 if (phys == -1L)
111 panic("Cannot save SLIT!\n");
112
113 acpi_slit = __va(phys);
114 memcpy(acpi_slit, slit, length);
115 reserve_early(phys, phys + length, "ACPI SLIT");
104} 116}
105 117
106/* Callback for Proximity Domain -> LAPIC mapping */ 118/* Callback for Proximity Domain -> LAPIC mapping */
@@ -299,7 +311,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
299 pxmram = 0; 311 pxmram = 0;
300 } 312 }
301 313
302 e820ram = end_pfn - absent_pages_in_range(0, end_pfn); 314 e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
303 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 315 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
304 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 316 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
305 printk(KERN_ERR 317 printk(KERN_ERR
@@ -376,7 +388,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
376 if (node == NUMA_NO_NODE) 388 if (node == NUMA_NO_NODE)
377 continue; 389 continue;
378 if (!node_isset(node, node_possible_map)) 390 if (!node_isset(node, node_possible_map))
379 numa_set_node(i, NUMA_NO_NODE); 391 numa_clear_node(i);
380 } 392 }
381 numa_init_array(); 393 numa_init_array();
382 return 0; 394 return 0;
@@ -495,6 +507,7 @@ int __node_distance(int a, int b)
495 507
496EXPORT_SYMBOL(__node_distance); 508EXPORT_SYMBOL(__node_distance);
497 509
510#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
498int memory_add_physaddr_to_nid(u64 start) 511int memory_add_physaddr_to_nid(u64 start)
499{ 512{
500 int i, ret = 0; 513 int i, ret = 0;
@@ -506,4 +519,4 @@ int memory_add_physaddr_to_nid(u64 start)
506 return ret; 519 return ret;
507} 520}
508EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 521EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
509 522#endif
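The reason the SLIT is copied into memory reserved via find_e820_area()/reserve_early() is that the firmware table mapping is only valid during early boot, while __node_distance() (exported above) keeps reading the saved copy for the lifetime of the system. The fragment below is an invented example of such a late consumer; only __node_distance() and for_each_online_node() are existing interfaces, the rest is a sketch.

/*
 * Invented late consumer of the saved SLIT; only __node_distance() and
 * for_each_online_node() are existing interfaces, the rest is a sketch.
 */
static int example_nearest_other_node(int from)
{
	int nid, best = from, best_dist = INT_MAX;

	for_each_online_node(nid) {
		int d = __node_distance(from, nid);

		if (nid != from && d < best_dist) {
			best_dist = d;
			best = nid;
		}
	}
	return best;
}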
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000000..d877c5b423ef
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,71 @@
1/*
2 * Written by Pekka Paalanen, 2008 <pq@iki.fi>
3 */
4#include <linux/module.h>
5#include <linux/io.h>
6
7#define MODULE_NAME "testmmiotrace"
8
9static unsigned long mmio_address;
10module_param(mmio_address, ulong, 0);
11MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
12
13static void do_write_test(void __iomem *p)
14{
15 unsigned int i;
16 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2)
19 iowrite16(i * 12 + 7, p + i);
20 for (i = (5 * 1024); i < (16 * 1024); i += 4)
21 iowrite32(i * 212371 + 13, p + i);
22}
23
24static void do_read_test(void __iomem *p)
25{
26 unsigned int i;
27 for (i = 0; i < 256; i++)
28 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2)
30 ioread16(p + i);
31 for (i = (5 * 1024); i < (16 * 1024); i += 4)
32 ioread32(p + i);
33}
34
35static void do_test(void)
36{
37 void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
38 if (!p) {
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return;
41 }
42 do_write_test(p);
43 do_read_test(p);
44 iounmap(p);
45}
46
47static int __init init(void)
48{
49 if (mmio_address == 0) {
50 pr_err(MODULE_NAME ": you have to use the module argument "
51 "mmio_address.\n");
52 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
53 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
54 return -ENXIO;
55 }
56
57 pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
58 "in PCI address space, and writing "
59 "rubbish in there.\n", mmio_address);
60 do_test();
61 return 0;
62}
63
64static void __exit cleanup(void)
65{
66 pr_debug(MODULE_NAME ": unloaded.\n");
67}
68
69module_init(init);
70module_exit(cleanup);
71MODULE_LICENSE("GPL");
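For reference, the test module above is exercised by loading it with an explicit, otherwise-unused MMIO window, e.g. insmod testmmiotrace.ko mmio_address=0xfbc00000 (the address is purely a placeholder), after mmiotrace itself has been enabled in the kernel's tracing framework; the exact enable step depends on the tracing setup of the running kernel. Since do_test() really writes into that physical window, the module's own warning about using it only on expendable hardware should be taken literally.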