10 files changed, 148 insertions, 46 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..79b0b372d2d0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
-                        if (!vmalloc_sync_one(page_address(page), address))
+                        spinlock_t *pgt_lock;
+                        pmd_t *ret;
+                        pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                        spin_lock(pgt_lock);
+                        ret = vmalloc_sync_one(page_address(page), address);
+                        spin_unlock(pgt_lock);
+                        if (!ret)
                                break;
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
@@ -251,6 +260,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;
+        WARN_ON_ONCE(in_nmi());
        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
@@ -326,29 +337,7 @@ out:
 void vmalloc_sync_all(void)
 {
-        unsigned long address;
+        sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
-        for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-             address += PGDIR_SIZE) {
-                const pgd_t *pgd_ref = pgd_offset_k(address);
-                unsigned long flags;
-                struct page *page;
-                if (pgd_none(*pgd_ref))
-                        continue;
-                spin_lock_irqsave(&pgd_lock, flags);
-                list_for_each_entry(page, &pgd_list, lru) {
-                        pgd_t *pgd;
-                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                        if (pgd_none(*pgd))
-                                set_pgd(pgd, *pgd_ref);
-                        else
-                                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-                }
-                spin_unlock_irqrestore(&pgd_lock, flags);
-        }
 }
 /*
@@ -369,6 +358,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;
+        WARN_ON_ONCE(in_nmi());
        /*
         * Copy kernel mappings over when needed. This can also
         * happen within a race in page table update. In the later
@@ -894,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
        if (pmd_large(*pmd))
                return spurious_fault_check(error_code, (pte_t *) pmd);
+        /*
+         * Note: don't use pte_present() here, since it returns true
+         * if the _PAGE_PROTNONE bit is set.  However, this aliases the
+         * _PAGE_GLOBAL bit, which for kernel pages give false positives
+         * when CONFIG_DEBUG_PAGEALLOC is used.
+         */
        pte = pte_offset_kernel(pmd, address);
-        if (!pte_present(*pte))
+        if (!(pte_flags(*pte) & _PAGE_PRESENT))
                return 0;
        ret = spurious_fault_check(error_code, pte);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d6..558f2d332076 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -67,7 +67,7 @@ static __init void *alloc_low_page(void)
                panic("alloc_low_page: ran out of memory");
        adr = __va(pfn * PAGE_SIZE);
-        memset(adr, 0, PAGE_SIZE);
+        clear_page(adr);
        return adr;
 }
@@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE]
 static inline void save_pg_dir(void)
 {
-        memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
+        copy_page(swsusp_pg_dir, swapper_pg_dir);
 }
 #else /* !CONFIG_ACPI_SLEEP */
 static inline void save_pg_dir(void)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a6674689a20..c55f900fbf89 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -98,6 +98,43 @@ static int __init nonx32_setup(char *str)
 __setup("noexec32=", nonx32_setup);
 /*
+ * When memory was added/removed make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+        unsigned long address;
+        for (address = start; address <= end; address += PGDIR_SIZE) {
+                const pgd_t *pgd_ref = pgd_offset_k(address);
+                unsigned long flags;
+                struct page *page;
+                if (pgd_none(*pgd_ref))
+                        continue;
+                spin_lock_irqsave(&pgd_lock, flags);
+                list_for_each_entry(page, &pgd_list, lru) {
+                        pgd_t *pgd;
+                        spinlock_t *pgt_lock;
+                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                        pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                        spin_lock(pgt_lock);
+                        if (pgd_none(*pgd))
+                                set_pgd(pgd, *pgd_ref);
+                        else
+                                BUG_ON(pgd_page_vaddr(*pgd)
+                                       != pgd_page_vaddr(*pgd_ref));
+                        spin_unlock(pgt_lock);
+                }
+                spin_unlock_irqrestore(&pgd_lock, flags);
+        }
+}
+/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
@@ -293,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
                panic("alloc_low_page: ran out of memory");
        adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
-        memset(adr, 0, PAGE_SIZE);
+        clear_page(adr);
        *phys  = pfn * PAGE_SIZE;
        return adr;
 }
@@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start,
                             unsigned long end,
                             unsigned long page_size_mask)
 {
+        bool pgd_changed = false;
        unsigned long next, last_map_addr = end;
+        unsigned long addr;
        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);
+        addr = start;
        for (; start < end; start = next) {
                pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start,
                spin_lock(&init_mm.page_table_lock);
                pgd_populate(&init_mm, pgd, __va(pud_phys));
                spin_unlock(&init_mm.page_table_lock);
+                pgd_changed = true;
        }
+        if (pgd_changed)
+                sync_global_pgds(addr, end);
        __flush_tlb_all();
        return last_map_addr;
@@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
                }
        }
+        sync_global_pgds((unsigned long)start_page, end);
        return 0;
 }
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 970ed579d4e4..52d54bfc1ebb 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,7 +22,7 @@
 #include <asm/numa.h>
 #include <asm/mpspec.h>
 #include <asm/apic.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 static struct bootnode __initdata nodes[8];
 static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
@@ -54,8 +54,8 @@ static __init int find_northbridge(void)
 static __init void early_get_boot_cpu_id(void)
 {
        /*
-         * need to get boot_cpu_id so can use that to create apicid_to_node
+         * need to get the APIC ID of the BSP so can use that to
-         * in k8_scan_nodes()
+         * create apicid_to_node in k8_scan_nodes()
         */
 #ifdef CONFIG_X86_MPPARSE
        /*
@@ -212,7 +212,7 @@ int __init k8_scan_nodes(void)
        bits = boot_cpu_data.x86_coreid_bits;
        cores = (1<<bits);
        apicid_base = 0;
-        /* need to get boot_cpu_id early for system with apicid lifting */
+        /* get the APIC ID of the BSP early for systems with apicid lifting */
        early_get_boot_cpu_id();
        if (boot_cpu_physical_apicid > 0) {
                pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index b3b531a4f8e5..d87dd6d042d6 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
        if (!pte)
                return false;
+        WARN_ON_ONCE(in_nmi());
        if (error_code & 2)
                kmemcheck_access(regs, address, KMEMCHECK_WRITE);
        else
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
                b == 0xf0 || b == 0xf2 || b == 0xf3
                /* Group 2 */
                || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
-                || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+                || b == 0x64 || b == 0x65
                /* Group 3 */
                || b == 0x66
                /* Group 4 */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96c..4962f1aeda6f 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -18,7 +18,7 @@
 #include <asm/dma.h>
 #include <asm/numa.h>
 #include <asm/acpi.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..8be8c7d7bc89 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD                           \
        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
-static void pgd_ctor(pgd_t *pgd)
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+        BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+        virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+        return (struct mm_struct *)page->index;
+}
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 {
        /* If the pgd points to a shared pagetable level (either the
           ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd)
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);
-                paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
-                                         __pa(swapper_pg_dir) >> PAGE_SHIFT,
-                                         KERNEL_PGD_BOUNDARY,
-                                         KERNEL_PGD_PTRS);
        }
        /* list required to sync kernel mapping updates */
-        if (!SHARED_KERNEL_PMD)
+        if (!SHARED_KERNEL_PMD) {
+                pgd_set_mm(pgd, mm);
                pgd_list_add(pgd);
+        }
 }
 static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
         */
        spin_lock_irqsave(&pgd_lock, flags);
-        pgd_ctor(pgd);
+        pgd_ctor(mm, pgd);
        pgd_prepopulate_pmd(mm, pgd, pmds);
        spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index f9897f7a9ef1..9c0d0d399c30 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -420,9 +420,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
                return -1;
        }
-        for_each_node_mask(i, nodes_parsed)
+        for (i = 0; i < num_node_memblks; i++)
-                e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+                e820_register_active_regions(memblk_nodeid[i],
-                                                nodes[i].end >> PAGE_SHIFT);
+                                node_memblk_range[i].start >> PAGE_SHIFT,
+                                node_memblk_range[i].end >> PAGE_SHIFT);
        /* for out of order entries in SRAT */
        sort_node_map();
        if (!nodes_cover_memory(nodes)) {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..49358481c733 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
   want false sharing in the per cpu data segment. */
 static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
+static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
 /*
 * We cannot call mmdrop() because we are in interrupt context,
 * instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
        union smp_flush_state *f;
        /* Caller has disabled preemption */
-        sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+        sender = this_cpu_read(tlb_vector_offset);
        f = &flush_state[sender];
        /*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
        flush_tlb_others_ipi(cpumask, mm, va);
 }
+static void __cpuinit calculate_tlb_offset(void)
+{
+        int cpu, node, nr_node_vecs;
+        /*
+         * we are changing tlb_vector_offset for each CPU in runtime, but this
+         * will not cause inconsistency, as the write is atomic under X86. we
+         * might see more lock contentions in a short time, but after all CPU's
+         * tlb_vector_offset are changed, everything should go normal
+         *
+         * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
+         * waste some vectors.
+         **/
+        if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
+                nr_node_vecs = 1;
+        else
+                nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
+        for_each_online_node(node) {
+                int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+                        nr_node_vecs;
+                int cpu_offset = 0;
+                for_each_cpu(cpu, cpumask_of_node(node)) {
+                        per_cpu(tlb_vector_offset, cpu) = node_offset +
+                                cpu_offset;
+                        cpu_offset++;
+                        cpu_offset = cpu_offset % nr_node_vecs;
+                }
+        }
+}
+static int tlb_cpuhp_notify(struct notifier_block *n,
+                unsigned long action, void *hcpu)
+{
+        switch (action & 0xf) {
+        case CPU_ONLINE:
+        case CPU_DEAD:
+                calculate_tlb_offset();
+        }
+        return NOTIFY_OK;
+}
 static int __cpuinit init_smp_flush(void)
 {
        int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
        for (i = 0; i < ARRAY_SIZE(flush_state); i++)
                raw_spin_lock_init(&flush_state[i].tlbstate_lock);
+        calculate_tlb_offset();
+        hotcpu_notifier(tlb_cpuhp_notify, 0);
        return 0;
 }
 core_initcall(init_smp_flush);

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a204..79b0b372d2d0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
229		229
230	spin_lock_irqsave(&pgd_lock, flags);	230	spin_lock_irqsave(&pgd_lock, flags);
231	list_for_each_entry(page, &pgd_list, lru) {	231	list_for_each_entry(page, &pgd_list, lru) {
232	if (!vmalloc_sync_one(page_address(page), address))	232	spinlock_t *pgt_lock;
		233	pmd_t *ret;
		234
		235	pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
		236
		237	spin_lock(pgt_lock);
		238	ret = vmalloc_sync_one(page_address(page), address);
		239	spin_unlock(pgt_lock);
		240
		241	if (!ret)
233	break;	242	break;
234	}	243	}
235	spin_unlock_irqrestore(&pgd_lock, flags);	244	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -251,6 +260,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
251	if (!(address >= VMALLOC_START && address < VMALLOC_END))	260	if (!(address >= VMALLOC_START && address < VMALLOC_END))
252	return -1;	261	return -1;
253		262
		263	WARN_ON_ONCE(in_nmi());
		264
254	/*	265	/*
255	* Synchronize this task's top level page-table	266	* Synchronize this task's top level page-table
256	* with the 'reference' page table.	267	* with the 'reference' page table.
@@ -326,29 +337,7 @@ out:
326		337
327	void vmalloc_sync_all(void)	338	void vmalloc_sync_all(void)
328	{	339	{
329	unsigned long address;	340	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
330
331	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
332	address += PGDIR_SIZE) {
333
334	const pgd_t *pgd_ref = pgd_offset_k(address);
335	unsigned long flags;
336	struct page *page;
337
338	if (pgd_none(*pgd_ref))
339	continue;
340
341	spin_lock_irqsave(&pgd_lock, flags);
342	list_for_each_entry(page, &pgd_list, lru) {
343	pgd_t *pgd;
344	pgd = (pgd_t *)page_address(page) + pgd_index(address);
345	if (pgd_none(*pgd))
346	set_pgd(pgd, *pgd_ref);
347	else
348	BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
349	}
350	spin_unlock_irqrestore(&pgd_lock, flags);
351	}
352	}	341	}
353		342
354	/*	343	/*
@@ -369,6 +358,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
369	if (!(address >= VMALLOC_START && address < VMALLOC_END))	358	if (!(address >= VMALLOC_START && address < VMALLOC_END))
370	return -1;	359	return -1;
371		360
		361	WARN_ON_ONCE(in_nmi());
		362
372	/*	363	/*
373	* Copy kernel mappings over when needed. This can also	364	* Copy kernel mappings over when needed. This can also
374	* happen within a race in page table update. In the later	365	* happen within a race in page table update. In the later
@@ -894,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
894	if (pmd_large(*pmd))	885	if (pmd_large(*pmd))
895	return spurious_fault_check(error_code, (pte_t *) pmd);	886	return spurious_fault_check(error_code, (pte_t *) pmd);
896		887
		888	/*
		889	* Note: don't use pte_present() here, since it returns true
		890	* if the _PAGE_PROTNONE bit is set. However, this aliases the
		891	* _PAGE_GLOBAL bit, which for kernel pages give false positives
		892	* when CONFIG_DEBUG_PAGEALLOC is used.
		893	*/
897	pte = pte_offset_kernel(pmd, address);	894	pte = pte_offset_kernel(pmd, address);
898	if (!pte_present(*pte))	895	if (!(pte_flags(*pte) & _PAGE_PRESENT))
899	return 0;	896	return 0;
900		897
901	ret = spurious_fault_check(error_code, pte);	898	ret = spurious_fault_check(error_code, pte);


diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bca79091b9d6..558f2d332076 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c
@@ -67,7 +67,7 @@ static __init void *alloc_low_page(void)
67	panic("alloc_low_page: ran out of memory");	67	panic("alloc_low_page: ran out of memory");
68		68
69	adr = __va(pfn * PAGE_SIZE);	69	adr = __va(pfn * PAGE_SIZE);
70	memset(adr, 0, PAGE_SIZE);	70	clear_page(adr);
71	return adr;	71	return adr;
72	}	72	}
73		73
@@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE]
558		558
559	static inline void save_pg_dir(void)	559	static inline void save_pg_dir(void)
560	{	560	{
561	memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);	561	copy_page(swsusp_pg_dir, swapper_pg_dir);
562	}	562	}
563	#else /* !CONFIG_ACPI_SLEEP */	563	#else /* !CONFIG_ACPI_SLEEP */
564	static inline void save_pg_dir(void)	564	static inline void save_pg_dir(void)


diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a20..c55f900fbf89 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c
@@ -98,6 +98,43 @@ static int __init nonx32_setup(char *str)
98	__setup("noexec32=", nonx32_setup);	98	__setup("noexec32=", nonx32_setup);
99		99
100	/*	100	/*
		101	* When memory was added/removed make sure all the processes MM have
		102	* suitable PGD entries in the local PGD level page.
		103	*/
		104	void sync_global_pgds(unsigned long start, unsigned long end)
		105	{
		106	unsigned long address;
		107
		108	for (address = start; address <= end; address += PGDIR_SIZE) {
		109	const pgd_t *pgd_ref = pgd_offset_k(address);
		110	unsigned long flags;
		111	struct page *page;
		112
		113	if (pgd_none(*pgd_ref))
		114	continue;
		115
		116	spin_lock_irqsave(&pgd_lock, flags);
		117	list_for_each_entry(page, &pgd_list, lru) {
		118	pgd_t *pgd;
		119	spinlock_t *pgt_lock;
		120
		121	pgd = (pgd_t *)page_address(page) + pgd_index(address);
		122	pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
		123	spin_lock(pgt_lock);
		124
		125	if (pgd_none(*pgd))
		126	set_pgd(pgd, *pgd_ref);
		127	else
		128	BUG_ON(pgd_page_vaddr(*pgd)
		129	!= pgd_page_vaddr(*pgd_ref));
		130
		131	spin_unlock(pgt_lock);
		132	}
		133	spin_unlock_irqrestore(&pgd_lock, flags);
		134	}
		135	}
		136
		137	/*
101	* NOTE: This function is marked __ref because it calls __init function	138	* NOTE: This function is marked __ref because it calls __init function
102	* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.	139	* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
103	*/	140	*/
@@ -293,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
293	panic("alloc_low_page: ran out of memory");	330	panic("alloc_low_page: ran out of memory");
294		331
295	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);	332	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
296	memset(adr, 0, PAGE_SIZE);	333	clear_page(adr);
297	*phys = pfn * PAGE_SIZE;	334	*phys = pfn * PAGE_SIZE;
298	return adr;	335	return adr;
299	}	336	}
@@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start,
534	unsigned long end,	571	unsigned long end,
535	unsigned long page_size_mask)	572	unsigned long page_size_mask)
536	{	573	{
537		574	bool pgd_changed = false;
538	unsigned long next, last_map_addr = end;	575	unsigned long next, last_map_addr = end;
		576	unsigned long addr;
539		577
540	start = (unsigned long)__va(start);	578	start = (unsigned long)__va(start);
541	end = (unsigned long)__va(end);	579	end = (unsigned long)__va(end);
		580	addr = start;
542		581
543	for (; start < end; start = next) {	582	for (; start < end; start = next) {
544	pgd_t *pgd = pgd_offset_k(start);	583	pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start,
563	spin_lock(&init_mm.page_table_lock);	602	spin_lock(&init_mm.page_table_lock);
564	pgd_populate(&init_mm, pgd, __va(pud_phys));	603	pgd_populate(&init_mm, pgd, __va(pud_phys));
565	spin_unlock(&init_mm.page_table_lock);	604	spin_unlock(&init_mm.page_table_lock);
		605	pgd_changed = true;
566	}	606	}
		607
		608	if (pgd_changed)
		609	sync_global_pgds(addr, end);
		610
567	__flush_tlb_all();	611	__flush_tlb_all();
568		612
569	return last_map_addr;	613	return last_map_addr;
@@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1003	}	1047	}
1004		1048
1005	}	1049	}
		1050	sync_global_pgds((unsigned long)start_page, end);
1006	return 0;	1051	return 0;
1007	}	1052	}
1008		1053


diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 970ed579d4e4..52d54bfc1ebb 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c
@@ -22,7 +22,7 @@
22	#include <asm/numa.h>	22	#include <asm/numa.h>
23	#include <asm/mpspec.h>	23	#include <asm/mpspec.h>
24	#include <asm/apic.h>	24	#include <asm/apic.h>
25	#include <asm/k8.h>	25	#include <asm/amd_nb.h>
26		26
27	static struct bootnode __initdata nodes[8];	27	static struct bootnode __initdata nodes[8];
28	static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;	28	static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
@@ -54,8 +54,8 @@ static __init int find_northbridge(void)
54	static __init void early_get_boot_cpu_id(void)	54	static __init void early_get_boot_cpu_id(void)
55	{	55	{
56	/*	56	/*
57	* need to get boot_cpu_id so can use that to create apicid_to_node	57	* need to get the APIC ID of the BSP so can use that to
58	* in k8_scan_nodes()	58	* create apicid_to_node in k8_scan_nodes()
59	*/	59	*/
60	#ifdef CONFIG_X86_MPPARSE	60	#ifdef CONFIG_X86_MPPARSE
61	/*	61	/*
@@ -212,7 +212,7 @@ int __init k8_scan_nodes(void)
212	bits = boot_cpu_data.x86_coreid_bits;	212	bits = boot_cpu_data.x86_coreid_bits;
213	cores = (1<<bits);	213	cores = (1<<bits);
214	apicid_base = 0;	214	apicid_base = 0;
215	/* need to get boot_cpu_id early for system with apicid lifting */	215	/* get the APIC ID of the BSP early for systems with apicid lifting */
216	early_get_boot_cpu_id();	216	early_get_boot_cpu_id();
217	if (boot_cpu_physical_apicid > 0) {	217	if (boot_cpu_physical_apicid > 0) {
218	pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);	218	pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);


diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index b3b531a4f8e5..d87dd6d042d6 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
631	if (!pte)	631	if (!pte)
632	return false;	632	return false;
633		633
		634	WARN_ON_ONCE(in_nmi());
		635
634	if (error_code & 2)	636	if (error_code & 2)
635	kmemcheck_access(regs, address, KMEMCHECK_WRITE);	637	kmemcheck_access(regs, address, KMEMCHECK_WRITE);
636	else	638	else


diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index 63c19e27aa6f..324aa3f07237 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
9	b == 0xf0 || b == 0xf2 || b == 0xf3	9	b == 0xf0 || b == 0xf2 || b == 0xf3
10	/* Group 2 */	10	/* Group 2 */
11	|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26	11	|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12	|| b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e	12	|| b == 0x64 || b == 0x65
13	/* Group 3 */	13	/* Group 3 */
14	|| b == 0x66	14	|| b == 0x66
15	/* Group 4 */	15	/* Group 4 */


diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a7bcc23ef96c..4962f1aeda6f 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c
@@ -18,7 +18,7 @@
18	#include <asm/dma.h>	18	#include <asm/dma.h>
19	#include <asm/numa.h>	19	#include <asm/numa.h>
20	#include <asm/acpi.h>	20	#include <asm/acpi.h>
21	#include <asm/k8.h>	21	#include <asm/amd_nb.h>
22		22
23	struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;	23	struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24	EXPORT_SYMBOL(node_data);	24	EXPORT_SYMBOL(node_data);


diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..8be8c7d7bc89 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
87	#define UNSHARED_PTRS_PER_PGD \	87	#define UNSHARED_PTRS_PER_PGD \
88	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)	88	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
89		89
90	static void pgd_ctor(pgd_t *pgd)	90
		91	static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
		92	{
		93	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
		94	virt_to_page(pgd)->index = (pgoff_t)mm;
		95	}
		96
		97	struct mm_struct *pgd_page_get_mm(struct page *page)
		98	{
		99	return (struct mm_struct *)page->index;
		100	}
		101
		102	static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
91	{	103	{
92	/* If the pgd points to a shared pagetable level (either the	104	/* If the pgd points to a shared pagetable level (either the
93	ptes in non-PAE, or shared PMD in PAE), then just copy the	105	ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd)
98	clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,	110	clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
99	swapper_pg_dir + KERNEL_PGD_BOUNDARY,	111	swapper_pg_dir + KERNEL_PGD_BOUNDARY,
100	KERNEL_PGD_PTRS);	112	KERNEL_PGD_PTRS);
101	paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
102	__pa(swapper_pg_dir) >> PAGE_SHIFT,
103	KERNEL_PGD_BOUNDARY,
104	KERNEL_PGD_PTRS);
105	}	113	}
106		114
107	/* list required to sync kernel mapping updates */	115	/* list required to sync kernel mapping updates */
108	if (!SHARED_KERNEL_PMD)	116	if (!SHARED_KERNEL_PMD) {
		117	pgd_set_mm(pgd, mm);
109	pgd_list_add(pgd);	118	pgd_list_add(pgd);
		119	}
110	}	120	}
111		121
112	static void pgd_dtor(pgd_t *pgd)	122	static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
272	*/	282	*/
273	spin_lock_irqsave(&pgd_lock, flags);	283	spin_lock_irqsave(&pgd_lock, flags);
274		284
275	pgd_ctor(pgd);	285	pgd_ctor(mm, pgd);
276	pgd_prepopulate_pmd(mm, pgd, pmds);	286	pgd_prepopulate_pmd(mm, pgd, pmds);
277		287
278	spin_unlock_irqrestore(&pgd_lock, flags);	288	spin_unlock_irqrestore(&pgd_lock, flags);


diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index f9897f7a9ef1..9c0d0d399c30 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c
@@ -420,9 +420,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
420	return -1;	420	return -1;
421	}	421	}
422		422
423	for_each_node_mask(i, nodes_parsed)	423	for (i = 0; i < num_node_memblks; i++)
424	e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,	424	e820_register_active_regions(memblk_nodeid[i],
425	nodes[i].end >> PAGE_SHIFT);	425	node_memblk_range[i].start >> PAGE_SHIFT,
		426	node_memblk_range[i].end >> PAGE_SHIFT);
		427
426	/* for out of order entries in SRAT */	428	/* for out of order entries in SRAT */
427	sort_node_map();	429	sort_node_map();
428	if (!nodes_cover_memory(nodes)) {	430	if (!nodes_cover_memory(nodes)) {


diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c03f14ab6667..49358481c733 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
5	#include <linux/smp.h>	5	#include <linux/smp.h>
6	#include <linux/interrupt.h>	6	#include <linux/interrupt.h>
7	#include <linux/module.h>	7	#include <linux/module.h>
		8	#include <linux/cpu.h>
8		9
9	#include <asm/tlbflush.h>	10	#include <asm/tlbflush.h>
10	#include <asm/mmu_context.h>	11	#include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
52	want false sharing in the per cpu data segment. */	53	want false sharing in the per cpu data segment. */
53	static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];	54	static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
54		55
		56	static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
		57
55	/*	58	/*
56	* We cannot call mmdrop() because we are in interrupt context,	59	* We cannot call mmdrop() because we are in interrupt context,
57	* instead update mm->cpu_vm_mask.	60	* instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
173	union smp_flush_state *f;	176	union smp_flush_state *f;
174		177
175	/* Caller has disabled preemption */	178	/* Caller has disabled preemption */
176	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;	179	sender = this_cpu_read(tlb_vector_offset);
177	f = &flush_state[sender];	180	f = &flush_state[sender];
178		181
179	/*	182	/*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
218	flush_tlb_others_ipi(cpumask, mm, va);	221	flush_tlb_others_ipi(cpumask, mm, va);
219	}	222	}
220		223
		224	static void __cpuinit calculate_tlb_offset(void)
		225	{
		226	int cpu, node, nr_node_vecs;
		227	/*
		228	* we are changing tlb_vector_offset for each CPU in runtime, but this
		229	* will not cause inconsistency, as the write is atomic under X86. we
		230	* might see more lock contentions in a short time, but after all CPU's
		231	* tlb_vector_offset are changed, everything should go normal
		232	*
		233	* Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
		234	* waste some vectors.
		235	**/
		236	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
		237	nr_node_vecs = 1;
		238	else
		239	nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
		240
		241	for_each_online_node(node) {
		242	int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
		243	nr_node_vecs;
		244	int cpu_offset = 0;
		245	for_each_cpu(cpu, cpumask_of_node(node)) {
		246	per_cpu(tlb_vector_offset, cpu) = node_offset +
		247	cpu_offset;
		248	cpu_offset++;
		249	cpu_offset = cpu_offset % nr_node_vecs;
		250	}
		251	}
		252	}
		253
		254	static int tlb_cpuhp_notify(struct notifier_block *n,
		255	unsigned long action, void *hcpu)
		256	{
		257	switch (action & 0xf) {
		258	case CPU_ONLINE:
		259	case CPU_DEAD:
		260	calculate_tlb_offset();
		261	}
		262	return NOTIFY_OK;
		263	}
		264
221	static int __cpuinit init_smp_flush(void)	265	static int __cpuinit init_smp_flush(void)
222	{	266	{
223	int i;	267	int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
225	for (i = 0; i < ARRAY_SIZE(flush_state); i++)	269	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226	raw_spin_lock_init(&flush_state[i].tlbstate_lock);	270	raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227		271
		272	calculate_tlb_offset();
		273	hotcpu_notifier(tlb_cpuhp_notify, 0);
228	return 0;	274	return 0;
229	}	275	}
230	core_initcall(init_smp_flush);	276	core_initcall(init_smp_flush);