author     Ingo Molnar <mingo@elte.hu>  2011-02-14 05:55:18 -0500
committer  Ingo Molnar <mingo@elte.hu>  2011-02-14 05:55:18 -0500
commit     d2137d5af4259f50c19addb8246a186c9ffac325 (patch)
tree       2f7e309f9cf8ef2f2698532c226edda38021fe69 /arch/x86/mm
parent     f005fe12b90c5b9fe180a09209a893e09affa8aa (diff)
parent     795abaf1e4e188c4171e3cd3dbb11a9fcacaf505 (diff)
Merge branch 'linus' into x86/bootmem
Conflicts:
	arch/x86/mm/numa_64.c

Merge reason: fix the conflict, update to latest -rc and pick up this
dependent fix from Yinghai:

  e6d2e2b2b1e1: memblock: don't adjust size in memblock_find_base()

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--   arch/x86/mm/amdtopology_64.c     87
-rw-r--r--   arch/x86/mm/gup.c                28
-rw-r--r--   arch/x86/mm/init.c                3
-rw-r--r--   arch/x86/mm/init_32.c            22
-rw-r--r--   arch/x86/mm/kmemcheck/error.c     2
-rw-r--r--   arch/x86/mm/numa.c               22
-rw-r--r--   arch/x86/mm/numa_64.c           181
-rw-r--r--   arch/x86/mm/pageattr.c           25
-rw-r--r--   arch/x86/mm/pgtable.c            66
-rw-r--r--   arch/x86/mm/setup_nx.c            2
-rw-r--r--   arch/x86/mm/srat_32.c             2
-rw-r--r--   arch/x86/mm/srat_64.c            36
-rw-r--r--   arch/x86/mm/tlb.c                 5
13 files changed, 387 insertions, 94 deletions
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index ae6ad691a14a..49b334cdd64c 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -27,6 +27,7 @@
 #include <asm/amd_nb.h>
 
 static struct bootnode __initdata nodes[8];
+static unsigned char __initdata nodeids[8];
 static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
 
 static __init int find_northbridge(void)
@@ -66,20 +67,6 @@ static __init void early_get_boot_cpu_id(void)
 	if (smp_found_config)
 		early_get_smp_config();
 #endif
-	early_init_lapic_mapping();
-}
-
-int __init amd_get_nodes(struct bootnode *physnodes)
-{
-	int i;
-	int ret = 0;
-
-	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
-	}
-	return ret;
 }
 
 int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
@@ -114,7 +101,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
 
-		nodeid = limit & 7;
+		nodeids[i] = nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
 				pr_info("Skipping disabled node %d\n", i);
@@ -194,6 +181,76 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 	return 0;
 }
 
+#ifdef CONFIG_NUMA_EMU
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+void __init amd_get_nodes(struct bootnode *physnodes)
+{
+	int i;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[i].start = nodes[i].start;
+		physnodes[i].end = nodes[i].end;
+	}
+}
+
+static int __init find_node_by_addr(unsigned long addr)
+{
+	int ret = NUMA_NO_NODE;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		if (addr >= nodes[i].start && addr < nodes[i].end) {
+			ret = i;
+			break;
+		}
+	return ret;
+}
+
+/*
+ * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
+ * setup to represent the physical topology but reflect the emulated
+ * environment. For each emulated node, the real node which it appears on is
+ * found and a fake pxm to nid mapping is created which mirrors the actual
+ * locality. node_distance() then represents the correct distances between
+ * emulated nodes by using the fake acpi mappings to pxms.
+ */
+void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
+{
+	unsigned int bits;
+	unsigned int cores;
+	unsigned int apicid_base = 0;
+	int i;
+
+	bits = boot_cpu_data.x86_coreid_bits;
+	cores = 1 << bits;
+	early_get_boot_cpu_id();
+	if (boot_cpu_physical_apicid > 0)
+		apicid_base = boot_cpu_physical_apicid;
+
+	for (i = 0; i < nr_nodes; i++) {
+		int index;
+		int nid;
+		int j;
+
+		nid = find_node_by_addr(nodes[i].start);
+		if (nid == NUMA_NO_NODE)
+			continue;
+
+		index = nodeids[nid] << bits;
+		if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
+			for (j = apicid_base; j < cores + apicid_base; j++)
+				fake_apicid_to_node[index + j] = i;
+#ifdef CONFIG_ACPI_NUMA
+		__acpi_map_pxm_to_node(nid, i);
+#endif
+	}
+	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+}
+#endif /* CONFIG_NUMA_EMU */
+
 int __init amd_scan_nodes(void)
 {
 	unsigned int bits;
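
The amd_fake_nodes() hunk above maps each emulated node back onto the APIC-ID block of the physical node it was carved from: index = nodeids[nid] << bits picks the block of cores = 1 << bits APIC IDs owned by that physical node, and only the first emulated node found on a given physical node claims the block. The stand-alone sketch below illustrates just that index arithmetic; the node layout, core count and array sizes are invented for the example and are not taken from the patch.

    /*
     * User-space sketch (not kernel code) of the apicid-block arithmetic
     * used by amd_fake_nodes().  All values below are made up.
     */
    #include <stdio.h>

    #define MAX_LOCAL_APIC 32
    #define NUMA_NO_NODE   (-1)

    int main(void)
    {
        short fake_apicid_to_node[MAX_LOCAL_APIC];
        unsigned char nodeids[8] = { 0, 1 }; /* hardware node IDs from the northbridge */
        unsigned int bits = 2;               /* x86_coreid_bits: 1 << 2 = 4 cores per node */
        unsigned int cores = 1u << bits;
        unsigned int apicid_base = 0;
        int emu_nid, j;

        for (j = 0; j < MAX_LOCAL_APIC; j++)
            fake_apicid_to_node[j] = NUMA_NO_NODE;

        /* pretend four emulated nodes alternate between physical nodes 0 and 1 */
        for (emu_nid = 0; emu_nid < 4; emu_nid++) {
            int phys = emu_nid & 1;
            int index = nodeids[phys] << bits;

            if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
                for (j = apicid_base; j < (int)(cores + apicid_base); j++)
                    fake_apicid_to_node[index + j] = emu_nid;
        }

        for (j = 0; j < 8; j++)
            printf("apicid %d -> emulated node %d\n", j, fake_apicid_to_node[j]);
        return 0;
    }

With these made-up values, APIC IDs 0-3 end up on the first emulated node carved from physical node 0 and APIC IDs 4-7 on the first one carved from physical node 1.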
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #include <asm/pgtable.h>
 
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
 		get_page(page);
+		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
 
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 	VM_BUG_ON(page != compound_head(page));
 	VM_BUG_ON(page_count(page) == 0);
 	atomic_add(nr, &page->_count);
+	SetPageReferenced(page);
+}
+
+static inline void get_huge_page_tail(struct page *page)
+{
+	/*
+	 * __split_huge_page_refcount() cannot run
+	 * from under us.
+	 */
+	VM_BUG_ON(atomic_read(&page->_count) < 0);
+	atomic_inc(&page->_count);
 }
 
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
+		if (PageTail(page))
+			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		pmd_t pmd = *pmdp;
 
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
+		/*
+		 * The pmd_trans_splitting() check below explains why
+		 * pmdp_splitting_flush has to flush the tlb, to stop
+		 * this gup-fast code from running while we set the
+		 * splitting bit in the pmd. Returning zero will take
+		 * the slow path that will call wait_split_huge_page()
+		 * if the pmd is still in splitting state. gup-fast
+		 * can't because it has irq disabled and
+		 * wait_split_huge_page() would never return as the
+		 * tlb flush IPI wouldn't run.
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
 		if (unlikely(pmd_large(pmd))) {
 			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 35ee75d9061a..b8054e087ead 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -336,8 +336,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	/*
 	 * We just marked the kernel text read only above, now that
 	 * we are going to free part of that, we need to make that
-	 * writeable first.
+	 * writeable and non-executable first.
 	 */
+	set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
 	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
 
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401b..c821074b7f0b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -45,6 +45,7 @@
 #include <asm/bugs.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/paravirt.h>
@@ -226,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 
 static inline int is_kernel_text(unsigned long addr)
 {
-	if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+	if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
 		return 1;
 	return 0;
 }
@@ -715,6 +716,7 @@ void __init paging_init(void)
 	/*
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
+	olpc_dt_build_devicetree();
 	sparse_init();
 	zone_sizes_init();
 }
@@ -912,6 +914,23 @@ void set_kernel_text_ro(void)
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 }
 
+static void mark_nxdata_nx(void)
+{
+	/*
+	 * When this called, init has already been executed and released,
+	 * so everything past _etext sould be NX.
+	 */
+	unsigned long start = PFN_ALIGN(_etext);
+	/*
+	 * This comes from is_kernel_text upper limit. Also HPAGE where used:
+	 */
+	unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+	if (__supported_pte_mask & _PAGE_NX)
+		printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+	set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +965,7 @@ void mark_rodata_ro(void)
 	printk(KERN_INFO "Testing CPA: write protecting again\n");
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 #endif
+	mark_nxdata_nx();
 }
 #endif
 
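
mark_nxdata_nx() above computes its range from the page-aligned _etext up to __init_end rounded up to the next large-page boundary, since the kernel mapping in that area may use large pages. A quick user-space sketch of that rounding, with made-up addresses and a 4MB large-page size standing in for HPAGE_SIZE:

    /* Arithmetic sketch only; the addresses and page sizes are invented. */
    #include <stdio.h>

    #define PAGE_SIZE    4096UL
    #define HPAGE_SIZE   (4UL << 20)            /* 4MB large pages assumed here */
    #define HPAGE_MASK   (~(HPAGE_SIZE - 1))
    #define PFN_ALIGN(x) (((unsigned long)(x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long etext    = 0xc05f1234UL;  /* hypothetical _etext */
        unsigned long init_end = 0xc08a2000UL;  /* hypothetical __init_end */

        unsigned long start = PFN_ALIGN(etext);
        unsigned long size  = ((init_end + HPAGE_SIZE) & HPAGE_MASK) - start;

        printf("NX range: 0x%lx - 0x%lx (%luk)\n", start, start + size, size >> 10);
        return 0;
    }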
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
 	e->trace.entries = e->trace_entries;
 	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
 	e->trace.skip = 0;
-	save_stack_trace_bp(&e->trace, regs->bp);
+	save_stack_trace_regs(&e->trace, regs);
 
 	/* Round address down to nearest 16 bytes */
 	shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..ebf6d7887a38 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,28 @@
 #include <linux/topology.h>
 #include <linux/module.h>
 #include <linux/bootmem.h>
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+int __initdata numa_off;
+
+static __init int numa_setup(char *opt)
+{
+	if (!opt)
+		return -EINVAL;
+	if (!strncmp(opt, "off", 3))
+		numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+	if (!strncmp(opt, "fake=", 5))
+		numa_emu_cmdline(opt + 5);
+#endif
+#ifdef CONFIG_ACPI_NUMA
+	if (!strncmp(opt, "noacpi", 6))
+		acpi_numa = -1;
+#endif
+	return 0;
+}
+early_param("numa", numa_setup);
 
 /*
  * Which logical CPUs are on which nodes
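
The numa_setup() handler moved into numa.c above is a plain prefix parser for the boot option: everything after "numa=" is matched with strncmp against "off", "fake=" and "noacpi". A user-space sketch of the same dispatch, with the kernel hooks replaced by ordinary variables (numa_emu_cmdline() is stood in for by a string copy):

    /* Sketch of the option parsing; the surrounding globals are placeholders. */
    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    static int numa_off;
    static int acpi_numa;
    static char fake_cmdline[64];

    static int numa_setup(const char *opt)
    {
        if (!opt)
            return -EINVAL;
        if (!strncmp(opt, "off", 3))
            numa_off = 1;
        if (!strncmp(opt, "fake=", 5))
            snprintf(fake_cmdline, sizeof(fake_cmdline), "%s", opt + 5);
        if (!strncmp(opt, "noacpi", 6))
            acpi_numa = -1;
        return 0;
    }

    int main(void)
    {
        numa_setup("fake=8");   /* as if booted with numa=fake=8   */
        numa_setup("noacpi");   /* as if booted with numa=noacpi   */
        printf("numa_off=%d acpi_numa=%d fake=\"%s\"\n",
               numa_off, acpi_numa, fake_cmdline);
        return 0;
    }

Booting with numa=fake=8, for example, hands the string "8" on to the emulation code via numa_emu_cmdline().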
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7cc26ae0a15d..62cb634b5cf8 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
-int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
@@ -260,30 +259,35 @@ void __init numa_init_array(void)
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
 static char *cmdline __initdata;
 
+void __init numa_emu_cmdline(char *str)
+{
+	cmdline = str;
+}
+
 static int __init setup_physnodes(unsigned long start, unsigned long end,
 				int acpi, int amd)
 {
-	int nr_nodes = 0;
 	int ret = 0;
 	int i;
 
+	memset(physnodes, 0, sizeof(physnodes));
 #ifdef CONFIG_ACPI_NUMA
 	if (acpi)
-		nr_nodes = acpi_get_nodes(physnodes);
+		acpi_get_nodes(physnodes, start, end);
 #endif
 #ifdef CONFIG_AMD_NUMA
 	if (amd)
-		nr_nodes = amd_get_nodes(physnodes);
+		amd_get_nodes(physnodes);
 #endif
 	/*
 	 * Basic sanity checking on the physical node map: there may be errors
 	 * if the SRAT or AMD code incorrectly reported the topology or the mem=
 	 * kernel parameter is used.
 	 */
-	for (i = 0; i < nr_nodes; i++) {
+	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (physnodes[i].start == physnodes[i].end)
 			continue;
 		if (physnodes[i].start > end) {
@@ -298,17 +302,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
 			physnodes[i].start = start;
 		if (physnodes[i].end > end)
 			physnodes[i].end = end;
-	}
-
-	/*
-	 * Remove all nodes that have no memory or were truncated because of the
-	 * limited address range.
-	 */
-	for (i = 0; i < nr_nodes; i++) {
-		if (physnodes[i].start == physnodes[i].end)
-			continue;
-		physnodes[ret].start = physnodes[i].start;
-		physnodes[ret].end = physnodes[i].end;
 		ret++;
 	}
 
@@ -324,6 +317,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
 	return ret;
 }
 
+static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+{
+	int i;
+
+	BUG_ON(acpi && amd);
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		acpi_fake_nodes(nodes, nr_nodes);
+#endif
+#ifdef CONFIG_AMD_NUMA
+	if (amd)
+		amd_fake_nodes(nodes, nr_nodes);
+#endif
+	if (!acpi && !amd)
+		for (i = 0; i < nr_cpu_ids; i++)
+			numa_set_node(i, 0);
+}
+
 /*
  * Setups up nid to range from addr to addr + size. If the end
  * boundary is greater than max_addr, then max_addr is used instead.
@@ -352,8 +363,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
  * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
  * to max_addr. The return value is the number of nodes allocated.
 */
-static int __init split_nodes_interleave(u64 addr, u64 max_addr,
-					int nr_phys_nodes, int nr_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
 	u64 size;
@@ -384,7 +394,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 		return -1;
 	}
 
-	for (i = 0; i < nr_phys_nodes; i++)
+	for (i = 0; i < MAX_NUMNODES; i++)
 		if (physnodes[i].start != physnodes[i].end)
 			node_set(i, physnode_mask);
 
@@ -553,11 +563,9 @@ static int __init numa_emulation(unsigned long start_pfn,
 {
 	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
-	int num_phys_nodes;
 	int num_nodes;
 	int i;
 
-	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
 	/*
 	 * If the numa=fake command-line contains a 'M' or 'G', it represents
 	 * the fixed node size. Otherwise, if it is just a single number N,
@@ -572,7 +580,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 		unsigned long n;
 
 		n = simple_strtoul(cmdline, NULL, 0);
-		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
+		num_nodes = split_nodes_interleave(addr, max_addr, n);
 	}
 
 	if (num_nodes < 0)
@@ -596,7 +604,8 @@ static int __init numa_emulation(unsigned long start_pfn,
 	init_memory_mapping_high();
 	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	acpi_fake_nodes(nodes, num_nodes);
+	setup_physnodes(addr, max_addr, acpi, amd);
+	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
 	return 0;
 }
@@ -611,8 +620,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
+	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+			acpi, amd);
 	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
 		return;
+	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+			acpi, amd);
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
@@ -663,24 +676,6 @@ unsigned long __init numa_free_all_bootmem(void)
 	return pages;
 }
 
-static __init int numa_setup(char *opt)
-{
-	if (!opt)
-		return -EINVAL;
-	if (!strncmp(opt, "off", 3))
-		numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
-	if (!strncmp(opt, "fake=", 5))
-		cmdline = opt + 5;
-#endif
-#ifdef CONFIG_ACPI_NUMA
-	if (!strncmp(opt, "noacpi", 6))
-		acpi_numa = -1;
-#endif
-	return 0;
-}
-early_param("numa", numa_setup);
-
 #ifdef CONFIG_NUMA
 
 static __init int find_near_online_node(int node)
@@ -769,6 +764,7 @@ void __cpuinit numa_clear_node(int cpu)
 
 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
 
+#ifndef CONFIG_NUMA_EMU
 void __cpuinit numa_add_cpu(int cpu)
 {
 	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -778,34 +774,115 @@ void __cpuinit numa_remove_cpu(int cpu)
 {
 	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 }
+#else
+void __cpuinit numa_add_cpu(int cpu)
+{
+	unsigned long addr;
+	u16 apicid;
+	int physnid;
+	int nid = NUMA_NO_NODE;
+
+	apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+	if (apicid != BAD_APICID)
+		nid = apicid_to_node[apicid];
+	if (nid == NUMA_NO_NODE)
+		nid = early_cpu_to_node(cpu);
+	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+	/*
+	 * Use the starting address of the emulated node to find which physical
+	 * node it is allocated on.
+	 */
+	addr = node_start_pfn(nid) << PAGE_SHIFT;
+	for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
+		if (addr >= physnodes[physnid].start &&
+		    addr < physnodes[physnid].end)
+			break;
+
+	/*
+	 * Map the cpu to each emulated node that is allocated on the physical
+	 * node of the cpu's apic id.
+	 */
+	for_each_online_node(nid) {
+		addr = node_start_pfn(nid) << PAGE_SHIFT;
+		if (addr >= physnodes[physnid].start &&
+		    addr < physnodes[physnid].end)
+			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+	}
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	int i;
+
+	for_each_online_node(i)
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#endif /* !CONFIG_NUMA_EMU */
 
 #else /* CONFIG_DEBUG_PER_CPU_MAPS */
+static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	struct cpumask *mask;
+	char buf[64];
+
+	mask = node_to_cpumask_map[node];
+	if (!mask) {
+		pr_err("node_to_cpumask_map[%i] NULL\n", node);
+		dump_stack();
+		return NULL;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu",
+		cpu, node, buf);
+	return mask;
+}
 
 /*
  * --------- debug versions of the numa functions ---------
 */
+#ifndef CONFIG_NUMA_EMU
 static void __cpuinit numa_set_cpumask(int cpu, int enable)
 {
-	int node = early_cpu_to_node(cpu);
 	struct cpumask *mask;
-	char buf[64];
 
-	mask = node_to_cpumask_map[node];
-	if (mask == NULL) {
-		printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
-		dump_stack();
+	mask = debug_cpumask_set_cpu(cpu, enable);
+	if (!mask)
 		return;
-	}
 
 	if (enable)
 		cpumask_set_cpu(cpu, mask);
 	else
 		cpumask_clear_cpu(cpu, mask);
+}
+#else
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	struct cpumask *mask;
+	int i;
 
-	cpulist_scnprintf(buf, sizeof(buf), mask);
-	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+	for_each_online_node(i) {
+		unsigned long addr;
+
+		addr = node_start_pfn(i) << PAGE_SHIFT;
+		if (addr < physnodes[node].start ||
+		    addr >= physnodes[node].end)
+			continue;
+		mask = debug_cpumask_set_cpu(cpu, enable);
+		if (!mask)
+			return;
+
+		if (enable)
+			cpumask_set_cpu(cpu, mask);
+		else
+			cpumask_clear_cpu(cpu, mask);
+	}
 }
+#endif /* CONFIG_NUMA_EMU */
 
 void __cpuinit numa_add_cpu(int cpu)
 {
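
With NUMA emulation enabled, the numa_add_cpu() variant added above has to discover which physical node an emulated node lives on; it does so by comparing the emulated node's start address against the physnodes[] ranges filled in by setup_physnodes(). A stand-alone sketch of that range lookup, with invented ranges:

    /* Sketch only; the node ranges and start addresses are made up. */
    #include <stdio.h>

    struct bootnode { unsigned long start, end; };

    #define MAX_NODES 4

    static struct bootnode physnodes[MAX_NODES] = {
        { 0x00000000UL, 0x40000000UL },  /* physical node 0: 0 - 1GB   */
        { 0x40000000UL, 0x80000000UL },  /* physical node 1: 1GB - 2GB */
    };

    static int find_physnode(unsigned long addr)
    {
        int physnid;

        for (physnid = 0; physnid < MAX_NODES; physnid++)
            if (addr >= physnodes[physnid].start && addr < physnodes[physnid].end)
                return physnid;
        return -1;
    }

    int main(void)
    {
        /* start addresses of two hypothetical emulated nodes */
        printf("emulated node at 0x20000000 -> physical node %d\n",
               find_physnode(0x20000000UL));
        printf("emulated node at 0x60000000 -> physical node %d\n",
               find_physnode(0x60000000UL));
        return 0;
    }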
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..d343b3c81f3c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
 #include <linux/pfn.h>
 #include <linux/percpu.h>
 #include <linux/gfp.h>
+#include <linux/pci.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
@@ -260,8 +261,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	 * The BIOS area between 640k and 1Mb needs to be executable for
 	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 	 */
-	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_NX;
+#endif
 
 	/*
 	 * The kernel text needs to be executable for obvious reasons
@@ -393,7 +396,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 {
 	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
 	pte_t new_pte, old_pte, *tmp;
-	pgprot_t old_prot, new_prot;
+	pgprot_t old_prot, new_prot, req_prot;
 	int i, do_split = 1;
 	unsigned int level;
 
@@ -438,10 +441,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * We are safe now. Check whether the new pgprot is the same:
 	 */
 	old_pte = *kpte;
-	old_prot = new_prot = pte_pgprot(old_pte);
+	old_prot = new_prot = req_prot = pte_pgprot(old_pte);
 
-	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
-	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
 
 	/*
 	 * old_pte points to the large page base address. So we need
@@ -450,17 +453,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
 	cpa->pfn = pfn;
 
-	new_prot = static_protections(new_prot, address, pfn);
+	new_prot = static_protections(req_prot, address, pfn);
 
 	/*
 	 * We need to check the full range, whether
 	 * static_protection() requires a different pgprot for one of
 	 * the pages in the range we try to preserve:
 	 */
-	addr = address + PAGE_SIZE;
-	pfn++;
-	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
-		pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+	addr = address & pmask;
+	pfn = pte_pfn(old_pte);
+	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
 
 		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
 			goto out_unlock;
@@ -483,7 +486,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * that we limited the number of possible pages already to
 	 * the number of pages in the large page.
 	 */
-	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
 		/*
 		 * The address is aligned and the number of pages
 		 * covers the full page.
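
The last pageattr.c hunk tightens the "can we keep the large page?" test: the request must start on the large-page boundary (address == (address & pmask)) and cover exactly psize >> PAGE_SHIFT small pages. A tiny arithmetic sketch of that check for a 2MB page, with made-up numbers:

    /* Arithmetic sketch; the address and request size are invented. */
    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        unsigned long psize = 2UL << 20;       /* 2MB PMD-mapped large page      */
        unsigned long pmask = ~(psize - 1);
        unsigned long address = 0x01000000UL;  /* 16MB: 2MB aligned              */
        unsigned long numpages = 512;          /* request spans 512 4k pages = 2MB */

        if (address == (address & pmask) && numpages == (psize >> PAGE_SHIFT))
            printf("request covers the whole large page: no split needed\n");
        else
            printf("partial range: the large page has to be split\n");
        return 0;
    }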
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 	return changed;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp,
+			  pmd_t entry, int dirty)
+{
+	int changed = !pmd_same(*pmdp, entry);
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	if (changed && dirty) {
+		*pmdp = entry;
+		pmd_update_defer(vma->vm_mm, address, pmdp);
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	}
+
+	return changed;
+}
+#endif
+
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
 			      unsigned long addr, pte_t *ptep)
 {
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 	return ret;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long addr, pmd_t *pmdp)
+{
+	int ret = 0;
+
+	if (pmd_young(*pmdp))
+		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+					 (unsigned long *)pmdp);
+
+	if (ret)
+		pmd_update(vma->vm_mm, addr, pmdp);
+
+	return ret;
+}
+#endif
+
 int ptep_clear_flush_young(struct vm_area_struct *vma,
 			   unsigned long address, pte_t *ptep)
 {
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
 	return young;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+			   unsigned long address, pmd_t *pmdp)
+{
+	int young;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	young = pmdp_test_and_clear_young(vma, address, pmdp);
+	if (young)
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+	return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp)
+{
+	int set;
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+				(unsigned long *)pmdp);
+	if (set) {
+		pmd_update(vma->vm_mm, address, pmdp);
+		/* need tlb flush only to serialize against gup-fast */
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	}
+}
+#endif
+
 /**
  * reserve_top_address - reserves a hole in the top of kernel address space
  * @reserve - size of hole to reserve
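
pmdp_splitting_flush() above relies on test_and_set_bit() so that only the caller which actually transitions the splitting bit pays for the TLB flush that serializes against gup-fast. A user-space sketch of that "only the first setter flushes" pattern; the GCC __atomic_fetch_or() builtin stands in for the kernel's test_and_set_bit(), and the bit number is made up:

    /* Sketch of the pattern only; _PAGE_BIT_SPLITTING here is a placeholder. */
    #include <stdio.h>

    #define _PAGE_BIT_SPLITTING 9

    static unsigned long pmd_val;

    static int test_and_set_bit(int nr, unsigned long *addr)
    {
        unsigned long mask = 1UL << nr;
        unsigned long old = __atomic_fetch_or(addr, mask, __ATOMIC_SEQ_CST);

        return (old & mask) != 0;
    }

    static void splitting_flush(void)
    {
        int set = !test_and_set_bit(_PAGE_BIT_SPLITTING, &pmd_val);

        if (set)
            printf("bit newly set: flush the TLB to kick gup-fast off the pmd\n");
        else
            printf("bit already set: someone else already did the flush\n");
    }

    int main(void)
    {
        splitting_flush();  /* first caller flushes   */
        splitting_flush();  /* second caller does not */
        return 0;
    }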
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
 {
 	if (!cpu_has_nx) {
 		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
-		       "missing in CPU or disabled in BIOS!\n");
+		       "missing in CPU!\n");
 	} else {
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 		if (disable_nx) {
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index a17dffd136c1..ae96e7b8051d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
 static int __initdata num_memory_chunks; /* total number of memory chunks */
 static u8 __initdata apicid_to_pxm[MAX_APICID];
 
-int numa_off __initdata;
 int acpi_numa __initdata;
 
 static __init void bad_srat(void)
@@ -92,6 +91,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
 	/* mark this node as "seen" in node bitmap */
 	BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
 
+	/* don't need to check apic_id here, because it is always 8 bits */
 	apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
 
 	printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 0b961c8bffb4..4c03e13da138 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 	}
 
 	apic_id = pa->apic_id;
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+		return;
+	}
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
 	else
 		apic_id = pa->apic_id;
+
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+		return;
+	}
+
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
@@ -339,18 +349,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 
 void __init acpi_numa_arch_fixup(void) {}
 
-int __init acpi_get_nodes(struct bootnode *physnodes)
+#ifdef CONFIG_NUMA_EMU
+void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+			   unsigned long end)
 {
 	int i;
-	int ret = 0;
 
 	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
+		cutoff_node(i, start, end);
+		physnodes[i].start = nodes[i].start;
+		physnodes[i].end = nodes[i].end;
 	}
-	return ret;
 }
+#endif /* CONFIG_NUMA_EMU */
 
 /* Use the information discovered above to actually set up the nodes. */
 int __init acpi_scan_nodes(unsigned long start, unsigned long end)
@@ -497,8 +508,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 {
 	int i, j;
 
-	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
-			"topology.\n");
 	for (i = 0; i < num_nodes; i++) {
 		int nid, pxm;
 
@@ -518,6 +527,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 		    fake_apicid_to_node[j] == NUMA_NO_NODE)
 			fake_apicid_to_node[j] = i;
 	}
+
+	/*
+	 * If there are apicid-to-node mappings for physical nodes that do not
+	 * have a corresponding emulated node, it should default to a guaranteed
+	 * value.
+	 */
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		if (apicid_to_node[i] != NUMA_NO_NODE &&
+		    fake_apicid_to_node[i] == NUMA_NO_NODE)
+			fake_apicid_to_node[i] = 0;
+
 	for (i = 0; i < num_nodes; i++)
 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
 	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 12cdbb17ad18..6acc724d5d8f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 
 static void __cpuinit calculate_tlb_offset(void)
 {
-	int cpu, node, nr_node_vecs;
+	int cpu, node, nr_node_vecs, idx = 0;
 	/*
 	 * we are changing tlb_vector_offset for each CPU in runtime, but this
 	 * will not cause inconsistency, as the write is atomic under X86. we
@@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void)
 	nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
 
 	for_each_online_node(node) {
-		int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+		int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
 			nr_node_vecs;
 		int cpu_offset = 0;
 		for_each_cpu(cpu, cpumask_of_node(node)) {
@@ -248,6 +248,7 @@ static void __cpuinit calculate_tlb_offset(void)
 			cpu_offset++;
 			cpu_offset = cpu_offset % nr_node_vecs;
 		}
+		idx++;
 	}
 }
 
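
The tlb.c change switches node_offset from the node number to a sequential index because online node IDs can be sparse; with the old formula a high node number could push the offset past the invalidate-vector range. A small sketch comparing the two formulas for a sparse node set (the vector count matches the kernel constant, the node layout is invented):

    /* Sketch only; the sparse node numbering below is made up. */
    #include <stdio.h>

    #define NUM_INVALIDATE_TLB_VECTORS 8

    int main(void)
    {
        int online_nodes[] = { 0, 4 };      /* sparse node numbering */
        int nr_online_nodes = 2;
        int nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;
        int idx;

        for (idx = 0; idx < nr_online_nodes; idx++) {
            int node = online_nodes[idx];
            int old_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs;
            int new_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs;

            printf("node %d: old offset %2d, new offset %2d (valid vectors 0-%d)\n",
                   node, old_offset, new_offset, NUM_INVALIDATE_TLB_VECTORS - 1);
        }
        return 0;
    }

For node 4 the old formula yields offset 16, which is outside the 8 available vectors, while the sequential index keeps every node's offset in range.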