author     Linus Torvalds <torvalds@linux-foundation.org>   2018-08-13 19:29:35 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2018-08-13 19:29:35 -0400
commit     203b4fc903b644223a27ad3f25f3a0f3a3911d1d (patch)
tree       8c210b67a17b74b2a39c500891a20e23c2390cf6
parent     7edcf0d314f69e506ddd9562062b2a79fa965bb9 (diff)
parent     765d28f136291f9639e3c031a1070fb76d6625c7 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Thomas Gleixner:

 - Make lazy TLB mode even lazier to avoid pointless switch_mm()
   operations, which reduces CPU load by 1-2% for memcache workloads

 - Small cleanups and improvements all over the place

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm: Remove redundant check for kmem_cache_create()
  arm/asm/tlb.h: Fix build error implicit func declaration
  x86/mm/tlb: Make clear_asid_other() static
  x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off()
  x86/mm/tlb: Always use lazy TLB mode
  x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Restructure switch_mm_irqs_off()
  x86/mm/tlb: Leave lazy TLB mode at page table free time
  mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids
  x86/mm: Add TLB purge to free pmd/pte page interfaces
  ioremap: Update pgtable free interfaces with addr
  x86/mm: Disable ioremap free page handling on x86-PAE
-rw-r--r--  arch/arm/include/asm/tlb.h       |   8
-rw-r--r--  arch/arm64/mm/mmu.c              |   4
-rw-r--r--  arch/x86/include/asm/tlbflush.h  |  21
-rw-r--r--  arch/x86/mm/pgtable.c            |  64
-rw-r--r--  arch/x86/mm/tlb.c                | 224
-rw-r--r--  drivers/firmware/efi/efi.c       |   1
-rw-r--r--  include/asm-generic/pgtable.h    |   8
-rw-r--r--  include/asm-generic/tlb.h        |  10
-rw-r--r--  include/linux/mm_types.h         | 241
-rw-r--r--  kernel/fork.c                    |  15
-rw-r--r--  lib/ioremap.c                    |   4
-rw-r--r--  mm/init-mm.c                     |  11
-rw-r--r--  mm/memory.c                      |  22
13 files changed, 408 insertions(+), 225 deletions(-)
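Before the per-file diffs, here is a minimal userspace sketch of the mm_cpumask change summarized above: the cpumask now lives in a flexible cpu_bitmap[] array at the end of mm_struct and is sized at allocation time from nr_cpu_ids. The sketch is illustrative only and is not taken from this patch set; the names demo_mm, demo_nr_cpu_ids and demo_mm_alloc are made up for illustration.

/*
 * Illustrative sketch only -- not kernel code from this merge. It mimics how
 * a structure can carry a trailing, dynamically sized CPU bitmap the way
 * mm_struct's cpu_bitmap[] does after this pull.
 */
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#define BITS_PER_LONG	(CHAR_BIT * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct demo_mm {
	int map_count;			/* ordinary fixed-size members ... */
	unsigned long cpu_bitmap[];	/* flexible array, sized at alloc time */
};

/* Allocate one object plus a bitmap covering demo_nr_cpu_ids CPUs. */
static struct demo_mm *demo_mm_alloc(unsigned int demo_nr_cpu_ids)
{
	size_t size = sizeof(struct demo_mm) +
		      BITS_TO_LONGS(demo_nr_cpu_ids) * sizeof(long);
	struct demo_mm *mm = malloc(size);

	if (mm)
		memset(mm, 0, size);	/* clears the trailing bitmap too */
	return mm;
}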
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index d5562f9ce600..f854148c8d7c 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -292,5 +292,13 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 {
 }
 
+static inline void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+}
+
+static inline void tlb_flush_remove_tables_local(void *arg)
+{
+}
+
 #endif /* CONFIG_MMU */
 #endif
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 493ff75670ff..8ae5d7ae4af3 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -977,12 +977,12 @@ int pmd_clear_huge(pmd_t *pmdp)
 	return 1;
 }
 
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
 	return pud_none(*pud);
 }
 
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	return pmd_none(*pmd);
 }
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6690cd3fc8b1..511bf5fae8b8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-	/*
-	 * If we have PCID, then switching to init_mm is reasonably
-	 * fast. If we don't have PCID, then switching to init_mm is
-	 * quite slow, so we try to defer it in the hopes that we can
-	 * avoid it entirely. The latter approach runs the risk of
-	 * receiving otherwise unnecessary IPIs.
-	 *
-	 * This choice is just a heuristic. The tlb code can handle this
-	 * function returning true or false regardless of whether we have
-	 * PCID.
-	 */
-	return !static_cpu_has(X86_FEATURE_PCID);
-}
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -554,4 +538,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 		native_flush_tlb_others(mask, info)
 #endif
 
+extern void tlb_flush_remove_tables(struct mm_struct *mm);
+extern void tlb_flush_remove_tables_local(void *arg);
+
+#define HAVE_TLB_FLUSH_REMOVE_TABLES
+
 #endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 47b5951e592b..0f1683fcb196 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -329,9 +329,6 @@ static int __init pgd_cache_init(void)
 	 */
 	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
 				      SLAB_PANIC, NULL);
-	if (!pgd_cache)
-		return -ENOMEM;
-
 	return 0;
 }
 core_initcall(pgd_cache_init);
@@ -719,28 +716,50 @@ int pmd_clear_huge(pmd_t *pmd)
 	return 0;
 }
 
+#ifdef CONFIG_X86_64
 /**
  * pud_free_pmd_page - Clear pud entry and free pmd page.
  * @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
  *
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
  * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
  */
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
-	pmd_t *pmd;
+	pmd_t *pmd, *pmd_sv;
+	pte_t *pte;
 	int i;
 
 	if (pud_none(*pud))
 		return 1;
 
 	pmd = (pmd_t *)pud_page_vaddr(*pud);
+	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+	if (!pmd_sv)
+		return 0;
 
-	for (i = 0; i < PTRS_PER_PMD; i++)
-		if (!pmd_free_pte_page(&pmd[i]))
-			return 0;
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd_sv[i] = pmd[i];
+		if (!pmd_none(pmd[i]))
+			pmd_clear(&pmd[i]);
+	}
 
 	pud_clear(pud);
+
+	/* INVLPG to clear all paging-structure caches */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (!pmd_none(pmd_sv[i])) {
+			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+			free_page((unsigned long)pte);
+		}
+	}
+
+	free_page((unsigned long)pmd_sv);
 	free_page((unsigned long)pmd);
 
 	return 1;
@@ -749,11 +768,12 @@ int pud_free_pmd_page(pud_t *pud)
 /**
  * pmd_free_pte_page - Clear pmd entry and free pte page.
  * @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
  *
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
  * Return: 1 if clearing the entry succeeded. 0 otherwise.
  */
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	pte_t *pte;
 
@@ -762,8 +782,30 @@ int pmd_free_pte_page(pmd_t *pmd)
 
 	pte = (pte_t *)pmd_page_vaddr(*pmd);
 	pmd_clear(pmd);
+
+	/* INVLPG to clear all paging-structure caches */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
 	free_page((unsigned long)pte);
 
 	return 1;
 }
+
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+	return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+	return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb1f34c3c85..752dbf4e0e50 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -35,7 +36,7 @@
  * necessary invalidation by clearing out the 'ctx_id' which
  * forces a TLB flush when the context is loaded.
  */
-void clear_asid_other(void)
+static void clear_asid_other(void)
 {
 	u16 asid;
 
@@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -240,20 +244,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 				   next->context.ctx_id);
 
 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask(). We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
 	} else {
-		u16 new_asid;
-		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
 		/*
@@ -285,53 +310,60 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			sync_current_stack_to_mm(next);
 		}
 
-		/* Stop remote flushes for the previous mm */
-		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-				real_prev != &init_mm);
-		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		/*
+		 * Stop remote flushes for the previous mm.
+		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+		 * but the bitmap manipulation can cause cache line contention.
+		 */
+		if (real_prev != &init_mm) {
+			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+						mm_cpumask(real_prev)));
+			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		}
 
 		/*
 		 * Start remote flushes and then read tlb_gen.
 		 */
-		cpumask_set_cpu(cpu, mm_cpumask(next));
+		if (next != &init_mm)
+			cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+	}
 
-		if (need_flush) {
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			load_new_mm_cr3(next->pgd, new_asid, true);
-
-			/*
-			 * NB: This gets called via leave_mm() in the idle path
-			 * where RCU functions differently. Tracing normally
-			 * uses RCU, so we need to use the _rcuidle variant.
-			 *
-			 * (There is no good reason for this. The idle code should
-			 * be rearranged to call this before rcu_idle_enter().)
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-		} else {
-			/* The new ASID is already up to date. */
-			load_new_mm_cr3(next->pgd, new_asid, false);
-
-			/* See above wrt _rcuidle. */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-		}
-
-		/*
-		 * Record last user mm's context id, so we can avoid
-		 * flushing branch buffer with IBPB if we switch back
-		 * to the same user.
-		 */
-		if (next != &init_mm)
-			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
-	}
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, true);
+
+		/*
+		 * NB: This gets called via leave_mm() in the idle path
+		 * where RCU functions differently. Tracing normally
+		 * uses RCU, so we need to use the _rcuidle variant.
+		 *
+		 * (There is no good reason for this. The idle code should
+		 * be rearranged to call this before rcu_idle_enter().)
+		 */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, false);
+
+		/* See above wrt _rcuidle. */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+	}
+
+	/*
+	 * Record last user mm's context id, so we can avoid
+	 * flushing branch buffer with IBPB if we switch back
+	 * to the same user.
+	 */
+	if (next != &init_mm)
+		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
 }
@@ -354,20 +386,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	if (tlb_defer_switch_to_init_mm()) {
-		/*
-		 * There's a significant optimization that may be possible
-		 * here. We have accurate enough TLB flush tracking that we
-		 * don't need to maintain coherence of TLB per se when we're
-		 * lazy. We do, however, need to maintain coherence of
-		 * paging-structure caches. We could, in principle, leave our
-		 * old mm loaded and only switch to init_mm when
-		 * tlb_remove_page() happens.
-		 */
-		this_cpu_write(cpu_tlbstate.is_lazy, true);
-	} else {
-		switch_mm(NULL, &init_mm, NULL);
-	}
+	this_cpu_write(cpu_tlbstate.is_lazy, true);
 }
 
 /*
@@ -454,6 +473,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB. Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -560,6 +582,9 @@ static void flush_tlb_func_remote(void *info)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
+	cpumask_var_t lazymask;
+	unsigned int cpu;
+
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,8 +608,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
-		unsigned int cpu;
-
 		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, info);
 		if (cpumask)
@@ -592,8 +615,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 					       (void *)info, 1);
 		return;
 	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
-			       (void *)info, 1);
+
+	/*
+	 * A temporary cpumask is used in order to skip sending IPIs
+	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
+	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
+	 */
+	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
+				       (void *)info, 1);
+		return;
+	}
+
+	cpumask_copy(lazymask, cpumask);
+
+	for_each_cpu(cpu, lazymask) {
+		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_clear_cpu(cpu, lazymask);
+	}
+
+	smp_call_function_many(lazymask, flush_tlb_func_remote,
+			       (void *)info, 1);
+
+	free_cpumask_var(lazymask);
 }
 
 /*
@@ -646,6 +690,68 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	put_cpu();
 }
 
+void tlb_flush_remove_tables_local(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
+			this_cpu_read(cpu_tlbstate.is_lazy)) {
+		/*
+		 * We're in lazy mode. We need to at least flush our
+		 * paging-structure cache to avoid speculatively reading
+		 * garbage into our TLB. Since switching to init_mm is barely
+		 * slower than a minimal flush, just switch to init_mm.
+		 */
+		switch_mm_irqs_off(NULL, &init_mm, NULL);
+	}
+}
+
+static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
+				      struct cpumask *lazy_cpus)
+{
+	int cpu;
+
+	for_each_cpu(cpu, mm_cpumask(mm)) {
+		if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_set_cpu(cpu, lazy_cpus);
+	}
+}
+
+void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+	int cpu = get_cpu();
+	cpumask_var_t lazy_cpus;
+
+	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
+		put_cpu();
+		return;
+	}
+
+	if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
+		/*
+		 * If the cpumask allocation fails, do a brute force flush
+		 * on all the CPUs that have this mm loaded.
+		 */
+		smp_call_function_many(mm_cpumask(mm),
+				tlb_flush_remove_tables_local, (void *)mm, 1);
+		put_cpu();
+		return;
+	}
+
+	/*
+	 * CPUs with !is_lazy either received a TLB flush IPI while the user
+	 * pages in this address range were unmapped, or have context switched
+	 * and reloaded %CR3 since then.
+	 *
+	 * Shootdown IPIs at page table freeing time only need to be sent to
+	 * CPUs that may have out of date TLB contents.
+	 */
+	mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
+	smp_call_function_many(lazy_cpus,
+				tlb_flush_remove_tables_local, (void *)mm, 1);
+	free_cpumask_var(lazy_cpus);
+	put_cpu();
+}
 
 static void do_flush_tlb_all(void *info)
 {
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index d8a33a781a57..2a29dd9c986d 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -82,6 +82,7 @@ struct mm_struct efi_mm = {
 	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
 	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
 	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
+	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 };
 
 struct workqueue_struct *efi_rts_wq;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f59639afaa39..b081794ba135 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1019,8 +1019,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
-int pud_free_pmd_page(pud_t *pud);
-int pmd_free_pte_page(pmd_t *pmd);
+int pud_free_pmd_page(pud_t *pud, unsigned long addr);
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
 #else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
 static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 {
@@ -1046,11 +1046,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 {
 	return 0;
 }
-static inline int pud_free_pmd_page(pud_t *pud)
+static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
 	return 0;
 }
-static inline int pmd_free_pte_page(pmd_t *pmd)
+static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	return 0;
 }
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3063125197ad..e811ef7b8350 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -303,4 +303,14 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
+/*
+ * Used to flush the TLB when page tables are removed, when lazy
+ * TLB mode may cause a CPU to retain intermediate translations
+ * pointing to about-to-be-freed page table memory.
+ */
+#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
+#define tlb_flush_remove_tables(mm) do {} while (0)
+#define tlb_flush_remove_tables_local(mm) do {} while (0)
+#endif
+
 #endif /* _ASM_GENERIC__TLB_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 99ce070e7dcb..efdc24dd9e97 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,176 +335,183 @@ struct core_state {
 
 struct kioctx_table;
 struct mm_struct {
+	struct {
 	struct vm_area_struct *mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
 	u32 vmacache_seqnum;			/* per-thread vmacache */
 #ifdef CONFIG_MMU
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
 #endif
 	unsigned long mmap_base;	/* base of mmap area */
 	unsigned long mmap_legacy_base;	/* base of mmap area in bottom-up allocations */
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
 	/* Base adresses for compatible mmap() */
 	unsigned long mmap_compat_base;
 	unsigned long mmap_compat_legacy_base;
 #endif
 	unsigned long task_size;	/* size of task vm space */
 	unsigned long highest_vm_end;	/* highest vma end address */
 	pgd_t * pgd;
 
 	/**
 	 * @mm_users: The number of users including userspace.
 	 *
-	 * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
-	 * to 0 (i.e. when the task exits and there are no other temporary
-	 * reference holders), we also release a reference on @mm_count
-	 * (which may then free the &struct mm_struct if @mm_count also
-	 * drops to 0).
+	 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
+	 * drops to 0 (i.e. when the task exits and there are no other
+	 * temporary reference holders), we also release a reference on
+	 * @mm_count (which may then free the &struct mm_struct if
+	 * @mm_count also drops to 0).
 	 */
 	atomic_t mm_users;
 
 	/**
 	 * @mm_count: The number of references to &struct mm_struct
 	 * (@mm_users count as 1).
 	 *
 	 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
 	 * &struct mm_struct is freed.
 	 */
 	atomic_t mm_count;
 
 #ifdef CONFIG_MMU
 	atomic_long_t pgtables_bytes;	/* PTE page table pages */
 #endif
 	int map_count;			/* number of VMAs */
 
-	spinlock_t page_table_lock; /* Protects page tables and some counters */
+	spinlock_t page_table_lock; /* Protects page tables and some
+				     * counters
+				     */
 	struct rw_semaphore mmap_sem;
 
-	struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
-				  * together off init_mm.mmlist, and are protected
-				  * by mmlist_lock
-				  */
+	struct list_head mmlist; /* List of maybe swapped mm's. These
+				  * are globally strung together off
+				  * init_mm.mmlist, and are protected
+				  * by mmlist_lock
+				  */
 
 
 	unsigned long hiwater_rss; /* High-watermark of RSS usage */
 	unsigned long hiwater_vm;  /* High-water virtual memory usage */
 
 	unsigned long total_vm;	   /* Total pages mapped */
 	unsigned long locked_vm;   /* Pages that have PG_mlocked set */
 	unsigned long pinned_vm;   /* Refcount permanently increased */
 	unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
 	unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
 	unsigned long stack_vm;	   /* VM_STACK */
 	unsigned long def_flags;
 
 	spinlock_t arg_lock; /* protect the below fields */
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
 	/*
 	 * Special counters, in some configurations protected by the
 	 * page_table_lock, in other configurations by being atomic.
 	 */
 	struct mm_rss_stat rss_stat;
-
-	struct linux_binfmt *binfmt;
 
-	cpumask_var_t cpu_vm_mask_var;
+	struct linux_binfmt *binfmt;
 
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
-	unsigned long flags; /* Must use atomic bitops to access the bits */
+	unsigned long flags; /* Must use atomic bitops to access */
 
 	struct core_state *core_state; /* coredumping support */
 #ifdef CONFIG_MEMBARRIER
 	atomic_t membarrier_state;
 #endif
 #ifdef CONFIG_AIO
 	spinlock_t			ioctx_lock;
 	struct kioctx_table __rcu	*ioctx_table;
 #endif
 #ifdef CONFIG_MEMCG
 	/*
 	 * "owner" points to a task that is regarded as the canonical
 	 * user/owner of this mm. All of the following must be true in
 	 * order for it to be changed:
 	 *
 	 * current == mm->owner
 	 * current->mm != mm
 	 * new_owner->mm == mm
 	 * new_owner->alloc_lock is held
 	 */
 	struct task_struct __rcu *owner;
 #endif
 	struct user_namespace *user_ns;
 
 	/* store ref to file /proc/<pid>/exe symlink points to */
 	struct file __rcu *exe_file;
 #ifdef CONFIG_MMU_NOTIFIER
 	struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
-#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	struct cpumask cpumask_allocation;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	/*
 	 * numa_next_scan is the next time that the PTEs will be marked
-	 * pte_numa. NUMA hinting faults will gather statistics and migrate
-	 * pages to new nodes if necessary.
+	 * pte_numa. NUMA hinting faults will gather statistics and
+	 * migrate pages to new nodes if necessary.
 	 */
 	unsigned long numa_next_scan;
 
 	/* Restart point for scanning and setting pte_numa */
 	unsigned long numa_scan_offset;
 
 	/* numa_scan_seq prevents two threads setting pte_numa */
 	int numa_scan_seq;
 #endif
 	/*
-	 * An operation with batched TLB flushing is going on. Anything that
-	 * can move process memory needs to flush the TLB when moving a
-	 * PROT_NONE or PROT_NUMA mapped page.
+	 * An operation with batched TLB flushing is going on. Anything
+	 * that can move process memory needs to flush the TLB when
+	 * moving a PROT_NONE or PROT_NUMA mapped page.
 	 */
 	atomic_t tlb_flush_pending;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	/* See flush_tlb_batched_pending() */
 	bool tlb_flush_batched;
 #endif
 	struct uprobes_state uprobes_state;
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
 	struct work_struct async_put_work;
 
 #if IS_ENABLED(CONFIG_HMM)
 	/* HMM needs to track a few things per mm */
 	struct hmm *hmm;
 #endif
-} __randomize_layout;
+	} __randomize_layout;
+
+	/*
+	 * The mm_cpumask needs to be at the end of mm_struct, because it
+	 * is dynamically sized based on nr_cpu_ids.
+	 */
+	unsigned long cpu_bitmap[];
+};
 
 extern struct mm_struct init_mm;
 
+/* Pointer magic because the dynamic array size confuses some compilers. */
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
-#endif
-	cpumask_clear(mm->cpu_vm_mask_var);
+	unsigned long cpu_bitmap = (unsigned long)mm;
+
+	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
+	cpumask_clear((struct cpumask *)cpu_bitmap);
 }
 
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 {
-	return mm->cpu_vm_mask_var;
+	return (struct cpumask *)&mm->cpu_bitmap;
 }
 
 struct mmu_gather;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1b27babc4c78..9d8d0e016fc6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2276,6 +2276,8 @@ static void sighand_ctor(void *data)
 
 void __init proc_caches_init(void)
 {
+	unsigned int mm_size;
+
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2292,15 +2294,16 @@ void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
+
 	/*
-	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
-	 * whole struct cpumask for the OFFSTACK case. We could change
-	 * this to *only* allocate as much of it as required by the
-	 * maximum number of CPU's we can ever have. The cpumask_allocation
-	 * is at the end of the structure, exactly for that reason.
+	 * The mm_cpumask is located at the end of mm_struct, and is
+	 * dynamically sized based on the maximum CPU number this system
+	 * can have, taking hotplug into account (nr_cpu_ids).
 	 */
+	mm_size = sizeof(struct mm_struct) + cpumask_size();
+
 	mm_cachep = kmem_cache_create_usercopy("mm_struct",
-			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			offsetof(struct mm_struct, saved_auxv),
 			sizeof_field(struct mm_struct, saved_auxv),
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 54e5bbaa3200..517f5853ffed 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -92,7 +92,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 		if (ioremap_pmd_enabled() &&
 		    ((next - addr) == PMD_SIZE) &&
 		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
-		    pmd_free_pte_page(pmd)) {
+		    pmd_free_pte_page(pmd, addr)) {
 			if (pmd_set_huge(pmd, phys_addr + addr, prot))
 				continue;
 		}
@@ -119,7 +119,7 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 		if (ioremap_pud_enabled() &&
 		    ((next - addr) == PUD_SIZE) &&
 		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
-		    pud_free_pmd_page(pud)) {
+		    pud_free_pmd_page(pud, addr)) {
 			if (pud_set_huge(pud, phys_addr + addr, prot))
 				continue;
 		}
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f0179c9c04c2..a787a319211e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,16 @@
 #define INIT_MM_CONTEXT(name)
 #endif
 
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -25,5 +35,6 @@ struct mm_struct init_mm = {
 	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns	= &init_user_ns,
+	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 	INIT_MM_CONTEXT(init_mm)
 };
diff --git a/mm/memory.c b/mm/memory.c
index c5e87a3a82ba..3d0a74ab70f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 
-/*
- * See the comment near struct mmu_table_batch.
- */
-
 static void tlb_remove_table_smp_sync(void *arg)
 {
-	/* Simply deliver the interrupt */
+	struct mm_struct __maybe_unused *mm = arg;
+	/*
+	 * On most architectures this does nothing. Simply delivering the
+	 * interrupt is enough to prevent races with software page table
+	 * walking like that done in get_user_pages_fast.
+	 *
+	 * See the comment near struct mmu_table_batch.
+	 */
+	tlb_flush_remove_tables_local(mm);
 }
 
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -344,7 +348,7 @@ static void tlb_remove_table_one(void *table)
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling. See the comment near struct mmu_table_batch.
 	 */
-	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+	smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
 	__tlb_remove_table(table);
 }
 
@@ -365,6 +369,8 @@ void tlb_table_flush(struct mmu_gather *tlb)
 {
 	struct mmu_table_batch **batch = &tlb->batch;
 
+	tlb_flush_remove_tables(tlb->mm);
+
 	if (*batch) {
 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
 		*batch = NULL;
@@ -387,7 +393,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
-			tlb_remove_table_one(table);
+			tlb_remove_table_one(table, tlb);
 			return;
 		}
 		(*batch)->nr = 0;