Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile    |    2
-rw-r--r--  arch/x86/mm/extable.c   |    6
-rw-r--r--  arch/x86/mm/fault.c     |    7
-rw-r--r--  arch/x86/mm/init_32.c   |   49
-rw-r--r--  arch/x86/mm/init_64.c   |    2
-rw-r--r--  arch/x86/mm/iomap_32.c  |   10
-rw-r--r--  arch/x86/mm/ioremap.c   |   27
-rw-r--r--  arch/x86/mm/numa_64.c   |  217
-rw-r--r--  arch/x86/mm/pageattr.c  |   49
-rw-r--r--  arch/x86/mm/pat.c       |   74
-rw-r--r--  arch/x86/mm/srat_64.c   |    1
-rw-r--r--  arch/x86/mm/tlb.c       |  296
12 files changed, 670 insertions, 70 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..9f05157220f5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
 obj-y	:=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
	    pat.o pgtable.o gup.o
 
+obj-$(CONFIG_X86_SMP)		+= tlb.o
+
 obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 7e8db53528a7..61b41ca3b5a2 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs)
 
 	fixup = search_exception_tables(regs->ip);
 	if (fixup) {
+		/* If fixup is less than 16, it means uaccess error */
+		if (fixup->fixup < 16) {
+			current_thread_info()->uaccess_err = -EFAULT;
+			regs->ip += fixup->fixup;
+			return 1;
+		}
 		regs->ip = fixup->fixup;
 		return 1;
 	}
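Note on the hunk above: it treats exception-table fixup values below 16 as uaccess error markers (a small instruction skip plus -EFAULT) rather than as resume addresses. Below is a minimal userspace sketch of that dual-use table idea; all names, addresses, and the threshold convention are illustrative, not the kernel's.

/* sketch: a lookup table whose "fixup" field is either a small relative
 * adjustment (signalling an error to the caller) or an absolute resume
 * address, distinguished by a threshold -- illustrative only. */
#include <stdio.h>

struct fixup_entry {
	unsigned long fault_ip;	/* instruction that may fault */
	unsigned long fixup;	/* < 16: relative skip + error, else resume address */
};

static const struct fixup_entry table[] = {
	{ 0x1000, 4 },		/* uaccess-style: skip 4 bytes, report -EFAULT */
	{ 0x2000, 0x2100 },	/* classic: jump to recovery code at 0x2100 */
};

static int handle_fault(unsigned long ip, unsigned long *new_ip, int *err)
{
	for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		if (table[i].fault_ip != ip)
			continue;
		if (table[i].fixup < 16) {	/* encoded error case */
			*err = -14;		/* -EFAULT */
			*new_ip = ip + table[i].fixup;
		} else {			/* encoded resume address */
			*new_ip = table[i].fixup;
		}
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long ip;
	int err = 0;

	if (handle_fault(0x1000, &ip, &err))
		printf("resume at %#lx, err=%d\n", ip, err);
	return 0;
}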
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8f4b859a04b3..d3eee74f830a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -26,6 +26,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/magic.h>
 
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -432,6 +433,8 @@ static noinline void no_context(struct pt_regs *regs,
 		      unsigned long error_code, unsigned long address)
 {
 	struct task_struct *tsk = current;
+	unsigned long *stackend;
+
 #ifdef CONFIG_X86_64
 	unsigned long flags;
 	int sig;
@@ -468,6 +471,10 @@ static noinline void no_context(struct pt_regs *regs,
 
 	show_fault_oops(regs, error_code, address);
 
+	stackend = end_of_stack(tsk);
+	if (*stackend != STACK_END_MAGIC)
+		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
 	tsk->thread.cr2 = address;
 	tsk->thread.trap_no = 14;
 	tsk->thread.error_code = error_code;
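The new STACK_END_MAGIC test above is a canary check: a known word is planted at the far end of the task stack, and the oops path reports corruption if it has been overwritten. A standalone sketch of the same idea, with made-up names and a toy "stack":

/* sketch: place a magic word at the end of a region used as a stack and
 * detect overruns by checking it -- illustrative only. */
#include <stdio.h>
#include <string.h>

#define MY_STACK_END_MAGIC 0x57AC6E9DUL
#define STACK_WORDS 64

static unsigned long stack_area[STACK_WORDS];

static void stack_init(void)
{
	stack_area[0] = MY_STACK_END_MAGIC;	/* canary at the lowest usable word */
}

static int stack_overran(void)
{
	return stack_area[0] != MY_STACK_END_MAGIC;
}

int main(void)
{
	stack_init();

	/* simulate a routine that writes too deep and clobbers the canary */
	memset(stack_area, 0, 8 * sizeof(unsigned long));

	if (stack_overran())
		printf("Thread overran stack, or stack corrupted\n");
	return 0;
}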
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 88f1b10de3be..00263bf07a88 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,7 +49,6 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
-#include <asm/smp.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
@@ -138,6 +137,47 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	return pte_offset_kernel(pmd, 0);
 }
 
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+					   unsigned long vaddr, pte_t *lastpte)
+{
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * Something (early fixmap) may already have put a pte
+	 * page here, which causes the page table allocation
+	 * to become nonlinear. Attempt to fix it, and if it
+	 * is still nonlinear then we have to bug.
+	 */
+	int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+	int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
+	    && ((__pa(pte) >> PAGE_SHIFT) < table_start
+		|| (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+		pte_t *newpte;
+		int i;
+
+		BUG_ON(after_init_bootmem);
+		newpte = alloc_low_page();
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			set_pte(newpte + i, pte[i]);
+
+		paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+		set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+		BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+		__flush_tlb_all();
+
+		paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+		pte = newpte;
+	}
+	BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+	       && vaddr > fix_to_virt(FIX_KMAP_END)
+	       && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+	return pte;
+}
+
 /*
  * This function initializes a certain range of kernel virtual memory
  * with new bootmem page tables, everywhere page tables are missing in
@@ -154,6 +194,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 	unsigned long vaddr;
 	pgd_t *pgd;
 	pmd_t *pmd;
+	pte_t *pte = NULL;
 
 	vaddr = start;
 	pgd_idx = pgd_index(vaddr);
@@ -165,7 +206,8 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 		pmd = pmd + pmd_index(vaddr);
 		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
 						pmd++, pmd_idx++) {
-			one_page_table_init(pmd);
+			pte = page_table_kmap_check(one_page_table_init(pmd),
+						    pmd, vaddr, pte);
 
 			vaddr += PMD_SIZE;
 		}
@@ -508,7 +550,6 @@ static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
 	 * Fixed mappings, only the page table structure has to be
 	 * created - mappings will be set by set_fixmap():
 	 */
-	early_ioremap_clear();
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
 	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
 	page_table_range_init(vaddr, end, pgd_base);
@@ -801,7 +842,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse)
 	tables += PAGE_ALIGN(ptes * sizeof(pte_t));
 
 	/* for fixmap */
-	tables += PAGE_SIZE * 2;
+	tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
 
 	/*
 	 * RED-PEN putting page tables only on node 0 could
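The last hunk above sizes the early fixmap page-table reservation from the actual number of fixed addresses instead of a hard-coded two pages. A small arithmetic sketch of that computation, with invented constants standing in for the kernel's values:

/* sketch: reserve enough pte space for N fixed-map slots, rounded up to
 * whole pages -- the numbers are illustrative, not the kernel's. */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long pte_size = 8;		/* pretend sizeof(pte_t) with PAE */
	unsigned long nr_fixed_addresses = 1024;	/* pretend __end_of_fixed_addresses */

	unsigned long old_reservation = PAGE_SIZE * 2;
	unsigned long new_reservation =
		PAGE_ALIGN(nr_fixed_addresses * pte_size);

	printf("old: %lu bytes, new: %lu bytes\n",
	       old_reservation, new_reservation);
	return 0;
}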
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 23f68e77ad1f..e6d36b490250 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -596,7 +596,7 @@ static void __init init_gbpages(void)
 		direct_gbpages = 0;
 }
 
-static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
 						unsigned long end,
 						unsigned long page_size_mask)
 {
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index d0151d8ce452..ca53224fc56c 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -17,6 +17,7 @@
  */
 
 #include <asm/iomap.h>
+#include <asm/pat.h>
 #include <linux/module.h>
 
 /* Map 'pfn' using fixed map 'type' and protections 'prot'
@@ -29,6 +30,15 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
 
 	pagefault_disable();
 
+	/*
+	 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
+	 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
+	 * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
+	 * user, which is "WC if the MTRR is WC, UC if you can't do that."
+	 */
+	if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
+		prot = PAGE_KERNEL_UC_MINUS;
+
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
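The added check above falls back from write-combining to UC_MINUS when PAT is not available, leaving the final type to the MTRRs. A condensed userspace sketch of that selection logic; the enum and flag are stand-ins, not the kernel's types:

/* sketch: pick an effective memory type from a requested one, depending on
 * whether PAT is available -- illustrative, not the kernel's pgprot types. */
#include <stdio.h>

enum mem_type { MT_WB, MT_WC, MT_UC_MINUS, MT_UC };

static enum mem_type effective_type(int pat_enabled, enum mem_type requested)
{
	/* without PAT, WC cannot be expressed in the PTE; fall back to UC-
	 * and let the MTRR decide whether the range really is WC */
	if (!pat_enabled && requested == MT_WC)
		return MT_UC_MINUS;
	return requested;
}

int main(void)
{
	printf("PAT on : WC -> %d\n", effective_type(1, MT_WC));
	printf("PAT off: WC -> %d\n", effective_type(0, MT_WC));
	return 0;
}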
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bd85d42819e1..1448bcb7f22f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -367,7 +367,7 @@ EXPORT_SYMBOL(ioremap_nocache);
  *
  * Must be freed with iounmap.
  */
-void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
 	if (pat_enabled)
 		return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
@@ -557,34 +557,9 @@ void __init early_ioremap_init(void)
 	}
 }
 
-void __init early_ioremap_clear(void)
-{
-	pmd_t *pmd;
-
-	if (early_ioremap_debug)
-		printk(KERN_INFO "early_ioremap_clear()\n");
-
-	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
-	pmd_clear(pmd);
-	paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
-	__flush_tlb_all();
-}
-
 void __init early_ioremap_reset(void)
 {
-	enum fixed_addresses idx;
-	unsigned long addr, phys;
-	pte_t *pte;
-
 	after_paging_init = 1;
-	for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
-		addr = fix_to_virt(idx);
-		pte = early_ioremap_pte(addr);
-		if (pte_present(*pte)) {
-			phys = pte_val(*pte) & PAGE_MASK;
-			set_fixmap(idx, phys);
-		}
-	}
 }
 
 static void __init __early_set_fixmap(enum fixed_addresses idx,
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 71a14f89f89e..08d140fbc31b 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
 #include <asm/acpi.h>
 #include <asm/k8.h>
 
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
+
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
@@ -33,6 +39,21 @@ int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
+DEFINE_PER_CPU(int, node_number) = 0;
+EXPORT_PER_CPU_SYMBOL(node_number);
+
+/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+/*
+ * Which logical CPUs are on which nodes
+ */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
 #endif
 
 
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+	unsigned int node, num = 0;
+	cpumask_t *map;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES) {
+		for_each_node_mask(node, node_possible_map)
+			num = node;
+		nr_node_ids = num + 1;
+	}
+
+	/* allocate the map */
+	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+	DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
+
+	pr_debug("Node to cpumask map at %p for %d nodes\n",
+		 map, nr_node_ids);
+
+	/* node_to_cpumask() will now work */
+	node_to_cpumask_map = map;
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	/* early setting, no percpu area yet */
+	if (cpu_to_node_map) {
+		cpu_to_node_map[cpu] = node;
+		return;
+	}
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
+		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+		dump_stack();
+		return;
+	}
+#endif
+	per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	if (node != NUMA_NO_NODE)
+		per_cpu(node_number, cpu) = node;
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	cpumask_t *mask;
+	char buf[64];
+
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_ERR "node_to_cpumask_map NULL\n");
+		dump_stack();
+		return;
+	}
+
+	mask = &node_to_cpumask_map[node];
+	if (enable)
+		cpu_set(cpu, *mask);
+	else
+		cpu_clear(cpu, *mask);
+
+	cpulist_scnprintf(buf, sizeof(buf), mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+
+int cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!per_cpu_offset(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+		dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+
+/* empty cpumask */
+static const cpumask_t cpu_mask_none;
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const cpumask_t *cpumask_of_node(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
+			node);
+		dump_stack();
+		return (const cpumask_t *)&cpu_online_map;
+	}
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return &cpu_mask_none;
+	}
+	return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used. The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+		dump_stack();
+		return cpu_online_map;
+	}
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return cpu_mask_none;
+	}
+	return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
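setup_node_to_cpumask_map() above derives nr_node_ids as the highest possible node plus one before allocating the map. A self-contained sketch of that derivation over a plain bitmask; the mask handling here is simplified and is not the kernel's nodemask API:

/* sketch: size a per-node array from a bitmask of possible nodes by taking
 * the highest set bit plus one -- simplified, illustrative only. */
#include <stdio.h>

static unsigned int nr_ids_from_mask(unsigned long possible_mask)
{
	unsigned int node, highest = 0;

	for (node = 0; node < 8 * sizeof(possible_mask); node++)
		if (possible_mask & (1UL << node))
			highest = node;
	return highest + 1;
}

int main(void)
{
	unsigned long node_possible = (1UL << 0) | (1UL << 2) | (1UL << 5);

	printf("nr_node_ids = %u\n", nr_ids_from_mask(node_possible));	/* 6 */
	return 0;
}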
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e89d24815f26..84ba74820ad6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -534,6 +534,36 @@ out_unlock:
 	return 0;
 }
 
+static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
+			       int primary)
+{
+	/*
+	 * Ignore all non primary paths.
+	 */
+	if (!primary)
+		return 0;
+
+	/*
+	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
+	 * to have holes.
+	 * Also set numpages to '1' indicating that we processed cpa req for
+	 * one virtual address page and its pfn. TBD: numpages can be set based
+	 * on the initial value and the level returned by lookup_address().
+	 */
+	if (within(vaddr, PAGE_OFFSET,
+		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+		cpa->numpages = 1;
+		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
+		return 0;
+	} else {
+		WARN(1, KERN_WARNING "CPA: called for zero pte. "
+		     "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
+		     *cpa->vaddr);
+
+		return -EFAULT;
+	}
+}
+
 static int __change_page_attr(struct cpa_data *cpa, int primary)
 {
 	unsigned long address;
@@ -549,17 +579,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
 repeat:
 	kpte = lookup_address(address, &level);
 	if (!kpte)
-		return 0;
+		return __cpa_process_fault(cpa, address, primary);
 
 	old_pte = *kpte;
-	if (!pte_val(old_pte)) {
-		if (!primary)
-			return 0;
-		WARN(1, KERN_WARNING "CPA: called for zero pte. "
-		       "vaddr = %lx cpa->vaddr = %lx\n", address,
-			*cpa->vaddr);
-		return -EINVAL;
-	}
+	if (!pte_val(old_pte))
+		return __cpa_process_fault(cpa, address, primary);
 
 	if (level == PG_LEVEL_4K) {
 		pte_t new_pte;
@@ -657,12 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 	vaddr = *cpa->vaddr;
 
 	if (!(within(vaddr, PAGE_OFFSET,
-		    PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
-#ifdef CONFIG_X86_64
-	    || within(vaddr, PAGE_OFFSET + (1UL<<32),
-		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
-#endif
-	)) {
+		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
 
 		alias_cpa = *cpa;
 		temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
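__cpa_process_fault() above tolerates a missing PTE only when the address falls inside the kernel identity mapping, which it tests with within(). A tiny sketch of that half-open range check with invented bounds:

/* sketch: half-open range test used to decide whether a missing mapping is
 * an expected hole or a real error -- bounds are illustrative. */
#include <stdio.h>

static int within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

int main(void)
{
	unsigned long ident_start = 0xc0000000UL;	/* pretend PAGE_OFFSET */
	unsigned long ident_end   = 0xf0000000UL;	/* pretend end of direct map */

	printf("0xc1000000 inside: %d\n",
	       within(0xc1000000UL, ident_start, ident_end));	/* 1: ignore fault */
	printf("0xf1000000 inside: %d\n",
	       within(0xf1000000UL, ident_start, ident_end));	/* 0: report error */
	return 0;
}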
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 8b08fb955274..9127e31c7268 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -30,7 +30,7 @@
 #ifdef CONFIG_X86_PAT
 int __read_mostly pat_enabled = 1;
 
-void __cpuinit pat_disable(char *reason)
+void __cpuinit pat_disable(const char *reason)
 {
 	pat_enabled = 0;
 	printk(KERN_INFO "%s\n", reason);
@@ -42,6 +42,11 @@ static int __init nopat(char *str)
 	return 0;
 }
 early_param("nopat", nopat);
+#else
+static inline void pat_disable(const char *reason)
+{
+	(void)reason;
+}
 #endif
 
 
@@ -78,16 +83,20 @@ void pat_init(void)
 	if (!pat_enabled)
 		return;
 
-	/* Paranoia check. */
-	if (!cpu_has_pat && boot_pat_state) {
-		/*
-		 * If this happens we are on a secondary CPU, but
-		 * switched to PAT on the boot CPU. We have no way to
-		 * undo PAT.
-		 */
-		printk(KERN_ERR "PAT enabled, "
-		       "but not supported by secondary CPU\n");
-		BUG();
+	if (!cpu_has_pat) {
+		if (!boot_pat_state) {
+			pat_disable("PAT not supported by CPU.");
+			return;
+		} else {
+			/*
+			 * If this happens we are on a secondary CPU, but
+			 * switched to PAT on the boot CPU. We have no way to
+			 * undo PAT.
+			 */
+			printk(KERN_ERR "PAT enabled, "
+			       "but not supported by secondary CPU\n");
+			BUG();
+		}
 	}
 
 	/* Set PWT to Write-Combining. All other bits stay the same */
@@ -333,11 +342,23 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 				req_type & _PAGE_CACHE_MASK);
 	}
 
-	is_range_ram = pagerange_is_ram(start, end);
-	if (is_range_ram == 1)
-		return reserve_ram_pages_type(start, end, req_type, new_type);
-	else if (is_range_ram < 0)
-		return -EINVAL;
+	if (new_type)
+		*new_type = actual_type;
+
+	/*
+	 * For legacy reasons, some parts of the physical address range in the
+	 * legacy 1MB region is treated as non-RAM (even when listed as RAM in
+	 * the e820 tables). So we will track the memory attributes of this
+	 * legacy 1MB region using the linear memtype_list always.
+	 */
+	if (end >= ISA_END_ADDRESS) {
+		is_range_ram = pagerange_is_ram(start, end);
+		if (is_range_ram == 1)
+			return reserve_ram_pages_type(start, end, req_type,
+						      new_type);
+		else if (is_range_ram < 0)
+			return -EINVAL;
+	}
 
 	new  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
 	if (!new)
@@ -347,9 +368,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	new->end	= end;
 	new->type	= actual_type;
 
-	if (new_type)
-		*new_type = actual_type;
-
 	spin_lock(&memtype_lock);
 
 	if (cached_entry && start >= cached_start)
@@ -437,11 +455,19 @@ int free_memtype(u64 start, u64 end)
 	if (is_ISA_range(start, end - 1))
 		return 0;
 
-	is_range_ram = pagerange_is_ram(start, end);
-	if (is_range_ram == 1)
-		return free_ram_pages_type(start, end);
-	else if (is_range_ram < 0)
-		return -EINVAL;
+	/*
+	 * For legacy reasons, some parts of the physical address range in the
+	 * legacy 1MB region is treated as non-RAM (even when listed as RAM in
+	 * the e820 tables). So we will track the memory attributes of this
+	 * legacy 1MB region using the linear memtype_list always.
+	 */
+	if (end >= ISA_END_ADDRESS) {
+		is_range_ram = pagerange_is_ram(start, end);
+		if (is_range_ram == 1)
+			return free_ram_pages_type(start, end);
+		else if (is_range_ram < 0)
+			return -EINVAL;
+	}
 
 	spin_lock(&memtype_lock);
 	list_for_each_entry(entry, &memtype_list, nd) {
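Both reserve_memtype() and free_memtype() above now use page-based RAM tracking only above the 1MB ISA boundary and keep everything below it on the linear memtype list. A compact sketch of that dispatch; the helpers are stubs, not the kernel's functions:

/* sketch: route attribute tracking by physical range -- below 1MB always use
 * the linear list, above it prefer per-page tracking for RAM. Illustrative. */
#include <stdio.h>

#define ISA_END_ADDRESS 0x100000ULL	/* 1MB */

static int range_is_ram(unsigned long long start, unsigned long long end)
{
	(void)start; (void)end;
	return 1;			/* pretend e820 says RAM */
}

static const char *track_memtype(unsigned long long start, unsigned long long end)
{
	if (end >= ISA_END_ADDRESS && range_is_ram(start, end))
		return "per-page RAM tracking";
	return "linear memtype list";
}

int main(void)
{
	printf("0x90000-0xa0000     : %s\n", track_memtype(0x90000, 0xa0000));
	printf("0x10000000-0x10010000: %s\n",
	       track_memtype(0x10000000, 0x10010000));
	return 0;
}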
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..15df1baee100 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -21,6 +21,7 @@
 #include <asm/numa.h>
 #include <asm/e820.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 
 int acpi_numa __initdata;
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 000000000000..72a6d4ebe34d
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,296 @@
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+			= { &init_mm, 0, };
+
+#include <mach_ipi.h>
+/*
+ *	Smarter SMP flushing macros.
+ *		c/o Linus Torvalds.
+ *
+ *	These mean you can really definitely utterly forget about
+ *	writing to user space from interrupts. (Its not allowed anyway).
+ *
+ *	Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ *	More scalable flush, from Andi Kleen
+ *
+ *	To avoid global state use 8 different call vectors.
+ *	Each CPU uses a specific vector to trigger flushes on other
+ *	CPUs. Depending on the received vector the target CPUs look into
+ *	the right array slot for the flush data.
+ *
+ *	With more than 8 CPUs they are hashed to the 8 available
+ *	vectors. The limited global vector space forces us to this right now.
+ *	In future when interrupts are split into per CPU domains this could be
+ *	fixed, at the cost of triggering multiple IPIs in some cases.
+ */
+
+union smp_flush_state {
+	struct {
+		struct mm_struct *flush_mm;
+		unsigned long flush_va;
+		spinlock_t tlbstate_lock;
+		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
+	};
+	char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
+} ____cacheline_internodealigned_in_smp;
+
+/* State is put into the per CPU data section, but padded
+   to a full cache line because other CPUs can access it and we don't
+   want false sharing in the per cpu data segment. */
+static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
+
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ */
+void leave_mm(int cpu)
+{
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+		BUG();
+	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
+	load_cr3(swapper_pg_dir);
+}
+EXPORT_SYMBOL_GPL(leave_mm);
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *	Stop ipi delivery for the old mm. This is not synchronized with
+ *	the other cpus, but smp_invalidate_interrupt ignore flush ipis
+ *	for the wrong mm, and in the worst case we perform a superfluous
+ *	tlb flush.
+ * 1a2) set cpu mmu_state to TLBSTATE_OK
+ *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *	was in lazy tlb mode.
+ * 1a3) update cpu active_mm
+ *	Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ *	Now the other cpus will send tlb flush ipis.
+ * 1a4) change cr3.
+ * 1b) thread switch without mm change
+ *	cpu active_mm is correct, cpu0 already handles
+ *	flush ipis.
+ * 1b1) set cpu mmu_state to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ *	Atomically set the bit [other cpus will start sending flush ipis],
+ *	and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu mmu_state is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * Interrupts are disabled.
+ */
+
+/*
+ * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
+ * but still used for documentation purpose but the usage is slightly
+ * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
+ * entry calls in with the first parameter in %eax. Maybe define
+ * intrlinkage?
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void smp_invalidate_interrupt(struct pt_regs *regs)
+{
+	unsigned int cpu;
+	unsigned int sender;
+	union smp_flush_state *f;
+
+	cpu = smp_processor_id();
+	/*
+	 * orig_rax contains the negated interrupt vector.
+	 * Use that to determine where the sender put the data.
+	 */
+	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
+	f = &flush_state[sender];
+
+	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
+		goto out;
+		/*
+		 * This was a BUG() but until someone can quote me the
+		 * line from the intel manual that guarantees an IPI to
+		 * multiple CPUs is retried _only_ on the erroring CPUs
+		 * its staying as a return
+		 *
+		 * BUG();
+		 */
+
+	if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+			if (f->flush_va == TLB_FLUSH_ALL)
+				local_flush_tlb();
+			else
+				__flush_tlb_one(f->flush_va);
+		} else
+			leave_mm(cpu);
+	}
+out:
+	ack_APIC_irq();
+	smp_mb__before_clear_bit();
+	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
+	smp_mb__after_clear_bit();
+	inc_irq_stat(irq_tlb_count);
+}
+
+static void flush_tlb_others_ipi(const struct cpumask *cpumask,
+				 struct mm_struct *mm, unsigned long va)
+{
+	unsigned int sender;
+	union smp_flush_state *f;
+
+	/* Caller has disabled preemption */
+	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+	f = &flush_state[sender];
+
+	/*
+	 * Could avoid this lock when
+	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+	 * probably not worth checking this for a cache-hot lock.
+	 */
+	spin_lock(&f->tlbstate_lock);
+
+	f->flush_mm = mm;
+	f->flush_va = va;
+	cpumask_andnot(to_cpumask(f->flush_cpumask),
+		       cpumask, cpumask_of(smp_processor_id()));
+
+	/*
+	 * Make the above memory operations globally visible before
+	 * sending the IPI.
+	 */
+	smp_mb();
+	/*
+	 * We have to send the IPI only to
+	 * CPUs affected.
+	 */
+	send_IPI_mask(to_cpumask(f->flush_cpumask),
+		      INVALIDATE_TLB_VECTOR_START + sender);
+
+	while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
+		cpu_relax();
+
+	f->flush_mm = NULL;
+	f->flush_va = 0;
+	spin_unlock(&f->tlbstate_lock);
+}
+
+void native_flush_tlb_others(const struct cpumask *cpumask,
+			     struct mm_struct *mm, unsigned long va)
+{
+	if (is_uv_system()) {
+		unsigned int cpu;
+
+		cpu = get_cpu();
+		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+		if (cpumask)
+			flush_tlb_others_ipi(cpumask, mm, va);
+		put_cpu();
+		return;
+	}
+	flush_tlb_others_ipi(cpumask, mm, va);
+}
+
+static int __cpuinit init_smp_flush(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
+		spin_lock_init(&flush_state[i].tlbstate_lock);
+
+	return 0;
+}
+core_initcall(init_smp_flush);
+
+void flush_tlb_current_task(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	preempt_disable();
+
+	local_flush_tlb();
+	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+	preempt_enable();
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	preempt_disable();
+
+	if (current->active_mm == mm) {
+		if (current->mm)
+			local_flush_tlb();
+		else
+			leave_mm(smp_processor_id());
+	}
+	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+
+	preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	preempt_disable();
+
+	if (current->active_mm == mm) {
+		if (current->mm)
+			__flush_tlb_one(va);
+		else
+			leave_mm(smp_processor_id());
+	}
+
+	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(&mm->cpu_vm_mask, mm, va);
+
+	preempt_enable();
+}
+
+static void do_flush_tlb_all(void *info)
+{
+	unsigned long cpu = smp_processor_id();
+
+	__flush_tlb_all();
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+		leave_mm(cpu);
+}
+
+void flush_tlb_all(void)
+{
+	on_each_cpu(do_flush_tlb_all, NULL, 1);
+}