author     Linus Torvalds <torvalds@linux-foundation.org>   2018-08-13 19:29:35 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2018-08-13 19:29:35 -0400
commit     203b4fc903b644223a27ad3f25f3a0f3a3911d1d (patch)
tree       8c210b67a17b74b2a39c500891a20e23c2390cf6
parent     7edcf0d314f69e506ddd9562062b2a79fa965bb9 (diff)
parent     765d28f136291f9639e3c031a1070fb76d6625c7 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Thomas Gleixner:

 - Make lazy TLB mode even lazier to avoid pointless switch_mm()
   operations, which reduces CPU load by 1-2% for memcache workloads

 - Small cleanups and improvements all over the place

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm: Remove redundant check for kmem_cache_create()
  arm/asm/tlb.h: Fix build error implicit func declaration
  x86/mm/tlb: Make clear_asid_other() static
  x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off()
  x86/mm/tlb: Always use lazy TLB mode
  x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Restructure switch_mm_irqs_off()
  x86/mm/tlb: Leave lazy TLB mode at page table free time
  mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids
  x86/mm: Add TLB purge to free pmd/pte page interfaces
  ioremap: Update pgtable free interfaces with addr
  x86/mm: Disable ioremap free page handling on x86-PAE
-rw-r--r--  arch/arm/include/asm/tlb.h       |   8
-rw-r--r--  arch/arm64/mm/mmu.c              |   4
-rw-r--r--  arch/x86/include/asm/tlbflush.h  |  21
-rw-r--r--  arch/x86/mm/pgtable.c            |  64
-rw-r--r--  arch/x86/mm/tlb.c                | 224
-rw-r--r--  drivers/firmware/efi/efi.c       |   1
-rw-r--r--  include/asm-generic/pgtable.h    |   8
-rw-r--r--  include/asm-generic/tlb.h        |  10
-rw-r--r--  include/linux/mm_types.h         | 241
-rw-r--r--  kernel/fork.c                    |  15
-rw-r--r--  lib/ioremap.c                    |   4
-rw-r--r--  mm/init-mm.c                     |  11
-rw-r--r--  mm/memory.c                      |  22
13 files changed, 408 insertions(+), 225 deletions(-)
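Before the per-file diffs, here is a minimal userspace sketch of the mm_cpumask change summarized above: the cpumask now lives in a flexible cpu_bitmap[] array at the end of mm_struct and is sized at allocation time from nr_cpu_ids. The sketch is illustrative only and is not taken from this patch set; the names demo_mm, demo_nr_cpu_ids and demo_mm_alloc are made up for illustration.

/*
 * Illustrative sketch only -- not kernel code from this merge. It mimics how
 * a structure can carry a trailing, dynamically sized CPU bitmap the way
 * mm_struct's cpu_bitmap[] does after this pull.
 */
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#define BITS_PER_LONG	(CHAR_BIT * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct demo_mm {
	int map_count;			/* ordinary fixed-size members ... */
	unsigned long cpu_bitmap[];	/* flexible array, sized at alloc time */
};

/* Allocate one object plus a bitmap covering demo_nr_cpu_ids CPUs. */
static struct demo_mm *demo_mm_alloc(unsigned int demo_nr_cpu_ids)
{
	size_t size = sizeof(struct demo_mm) +
		      BITS_TO_LONGS(demo_nr_cpu_ids) * sizeof(long);
	struct demo_mm *mm = malloc(size);

	if (mm)
		memset(mm, 0, size);	/* clears the trailing bitmap too */
	return mm;
}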
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index d5562f9ce600..f854148c8d7c 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -292,5 +292,13 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 {
 }
 
+static inline void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+}
+
+static inline void tlb_flush_remove_tables_local(void *arg)
+{
+}
+
 #endif /* CONFIG_MMU */
 #endif
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 493ff75670ff..8ae5d7ae4af3 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -977,12 +977,12 @@ int pmd_clear_huge(pmd_t *pmdp)
 	return 1;
 }
 
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
 	return pud_none(*pud);
 }
 
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	return pmd_none(*pmd);
 }
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6690cd3fc8b1..511bf5fae8b8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-	/*
-	 * If we have PCID, then switching to init_mm is reasonably
-	 * fast. If we don't have PCID, then switching to init_mm is
-	 * quite slow, so we try to defer it in the hopes that we can
-	 * avoid it entirely. The latter approach runs the risk of
-	 * receiving otherwise unnecessary IPIs.
-	 *
-	 * This choice is just a heuristic. The tlb code can handle this
-	 * function returning true or false regardless of whether we have
-	 * PCID.
-	 */
-	return !static_cpu_has(X86_FEATURE_PCID);
-}
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -554,4 +538,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 		native_flush_tlb_others(mask, info)
 #endif
 
+extern void tlb_flush_remove_tables(struct mm_struct *mm);
+extern void tlb_flush_remove_tables_local(void *arg);
+
+#define HAVE_TLB_FLUSH_REMOVE_TABLES
+
 #endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 47b5951e592b..0f1683fcb196 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -329,9 +329,6 @@ static int __init pgd_cache_init(void)
 	 */
 	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
 				      SLAB_PANIC, NULL);
-	if (!pgd_cache)
-		return -ENOMEM;
-
 	return 0;
 }
 core_initcall(pgd_cache_init);
@@ -719,28 +716,50 @@ int pmd_clear_huge(pmd_t *pmd)
 	return 0;
 }
 
+#ifdef CONFIG_X86_64
 /**
  * pud_free_pmd_page - Clear pud entry and free pmd page.
  * @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
  *
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
  * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
  */
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
-	pmd_t *pmd;
+	pmd_t *pmd, *pmd_sv;
+	pte_t *pte;
 	int i;
 
 	if (pud_none(*pud))
 		return 1;
 
 	pmd = (pmd_t *)pud_page_vaddr(*pud);
+	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+	if (!pmd_sv)
+		return 0;
 
-	for (i = 0; i < PTRS_PER_PMD; i++)
-		if (!pmd_free_pte_page(&pmd[i]))
-			return 0;
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd_sv[i] = pmd[i];
+		if (!pmd_none(pmd[i]))
+			pmd_clear(&pmd[i]);
+	}
 
 	pud_clear(pud);
+
+	/* INVLPG to clear all paging-structure caches */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (!pmd_none(pmd_sv[i])) {
+			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+			free_page((unsigned long)pte);
+		}
+	}
+
+	free_page((unsigned long)pmd_sv);
 	free_page((unsigned long)pmd);
 
 	return 1;
@@ -749,11 +768,12 @@ int pud_free_pmd_page(pud_t *pud)
 /**
  * pmd_free_pte_page - Clear pmd entry and free pte page.
  * @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
  *
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
  * Return: 1 if clearing the entry succeeded. 0 otherwise.
  */
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	pte_t *pte;
 
@@ -762,8 +782,30 @@ int pmd_free_pte_page(pmd_t *pmd)
 
 	pte = (pte_t *)pmd_page_vaddr(*pmd);
 	pmd_clear(pmd);
+
+	/* INVLPG to clear all paging-structure caches */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
 	free_page((unsigned long)pte);
 
 	return 1;
 }
+
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+	return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+	return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb1f34c3c85..752dbf4e0e50 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -35,7 +36,7 @@
  * necessary invalidation by clearing out the 'ctx_id' which
  * forces a TLB flush when the context is loaded.
  */
-void clear_asid_other(void)
+static void clear_asid_other(void)
 {
 	u16 asid;
 
@@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -240,20 +244,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 				   next->context.ctx_id);
 
 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask(). We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
 	} else {
-		u16 new_asid;
-		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
 		/*
@@ -285,53 +310,60 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			sync_current_stack_to_mm(next);
 		}
 
-		/* Stop remote flushes for the previous mm */
-		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-				real_prev != &init_mm);
-		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		/*
+		 * Stop remote flushes for the previous mm.
+		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+		 * but the bitmap manipulation can cause cache line contention.
+		 */
+		if (real_prev != &init_mm) {
+			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+						mm_cpumask(real_prev)));
+			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		}
 
 		/*
 		 * Start remote flushes and then read tlb_gen.
 		 */
-		cpumask_set_cpu(cpu, mm_cpumask(next));
+		if (next != &init_mm)
+			cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+	}
 
-		if (need_flush) {
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			load_new_mm_cr3(next->pgd, new_asid, true);
-
-			/*
-			 * NB: This gets called via leave_mm() in the idle path
-			 * where RCU functions differently. Tracing normally
-			 * uses RCU, so we need to use the _rcuidle variant.
-			 *
-			 * (There is no good reason for this. The idle code should
-			 * be rearranged to call this before rcu_idle_enter().)
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-		} else {
-			/* The new ASID is already up to date. */
-			load_new_mm_cr3(next->pgd, new_asid, false);
-
-			/* See above wrt _rcuidle. */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-		}
-
-		/*
-		 * Record last user mm's context id, so we can avoid
-		 * flushing branch buffer with IBPB if we switch back
-		 * to the same user.
-		 */
-		if (next != &init_mm)
-			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
-	}
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, true);
+
+		/*
+		 * NB: This gets called via leave_mm() in the idle path
+		 * where RCU functions differently. Tracing normally
+		 * uses RCU, so we need to use the _rcuidle variant.
+		 *
+		 * (There is no good reason for this. The idle code should
+		 * be rearranged to call this before rcu_idle_enter().)
+		 */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, false);
+
+		/* See above wrt _rcuidle. */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+	}
+
+	/*
+	 * Record last user mm's context id, so we can avoid
+	 * flushing branch buffer with IBPB if we switch back
+	 * to the same user.
+	 */
+	if (next != &init_mm)
+		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
 }
@@ -354,20 +386,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	if (tlb_defer_switch_to_init_mm()) {
-		/*
-		 * There's a significant optimization that may be possible
-		 * here. We have accurate enough TLB flush tracking that we
-		 * don't need to maintain coherence of TLB per se when we're
-		 * lazy. We do, however, need to maintain coherence of
-		 * paging-structure caches. We could, in principle, leave our
-		 * old mm loaded and only switch to init_mm when
-		 * tlb_remove_page() happens.
-		 */
-		this_cpu_write(cpu_tlbstate.is_lazy, true);
-	} else {
-		switch_mm(NULL, &init_mm, NULL);
-	}
+	this_cpu_write(cpu_tlbstate.is_lazy, true);
 }
 
 /*
@@ -454,6 +473,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB. Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -560,6 +582,9 @@ static void flush_tlb_func_remote(void *info)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
+	cpumask_var_t lazymask;
+	unsigned int cpu;
+
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,8 +608,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
-		unsigned int cpu;
-
 		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, info);
 		if (cpumask)
@@ -592,8 +615,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 					       (void *)info, 1);
 		return;
 	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
-			       (void *)info, 1);
+
+	/*
+	 * A temporary cpumask is used in order to skip sending IPIs
+	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
+	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
+	 */
+	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
+				       (void *)info, 1);
+		return;
+	}
+
+	cpumask_copy(lazymask, cpumask);
+
+	for_each_cpu(cpu, lazymask) {
+		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_clear_cpu(cpu, lazymask);
+	}
+
+	smp_call_function_many(lazymask, flush_tlb_func_remote,
+			       (void *)info, 1);
+
+	free_cpumask_var(lazymask);
 }
 
 /*
@@ -646,6 +690,68 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	put_cpu();
 }
 
+void tlb_flush_remove_tables_local(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
+			this_cpu_read(cpu_tlbstate.is_lazy)) {
+		/*
+		 * We're in lazy mode. We need to at least flush our
+		 * paging-structure cache to avoid speculatively reading
+		 * garbage into our TLB. Since switching to init_mm is barely
+		 * slower than a minimal flush, just switch to init_mm.
+		 */
+		switch_mm_irqs_off(NULL, &init_mm, NULL);
+	}
+}
+
+static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
+				      struct cpumask *lazy_cpus)
+{
+	int cpu;
+
+	for_each_cpu(cpu, mm_cpumask(mm)) {
+		if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_set_cpu(cpu, lazy_cpus);
+	}
+}
+
+void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+	int cpu = get_cpu();
+	cpumask_var_t lazy_cpus;
+
+	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
+		put_cpu();
+		return;
+	}
+
+	if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
+		/*
+		 * If the cpumask allocation fails, do a brute force flush
+		 * on all the CPUs that have this mm loaded.
+		 */
+		smp_call_function_many(mm_cpumask(mm),
+				tlb_flush_remove_tables_local, (void *)mm, 1);
+		put_cpu();
+		return;
+	}
+
+	/*
+	 * CPUs with !is_lazy either received a TLB flush IPI while the user
+	 * pages in this address range were unmapped, or have context switched
+	 * and reloaded %CR3 since then.
+	 *
+	 * Shootdown IPIs at page table freeing time only need to be sent to
+	 * CPUs that may have out of date TLB contents.
+	 */
+	mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
+	smp_call_function_many(lazy_cpus,
+				tlb_flush_remove_tables_local, (void *)mm, 1);
+	free_cpumask_var(lazy_cpus);
+	put_cpu();
+}
 
 static void do_flush_tlb_all(void *info)
 {
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index d8a33a781a57..2a29dd9c986d 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -82,6 +82,7 @@ struct mm_struct efi_mm = {
 	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
 	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
 	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
+	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 };
 
 struct workqueue_struct *efi_rts_wq;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f59639afaa39..b081794ba135 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1019,8 +1019,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
-int pud_free_pmd_page(pud_t *pud);
-int pmd_free_pte_page(pmd_t *pmd);
+int pud_free_pmd_page(pud_t *pud, unsigned long addr);
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
 #else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
 static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 {
@@ -1046,11 +1046,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 {
 	return 0;
 }
-static inline int pud_free_pmd_page(pud_t *pud)
+static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
 	return 0;
 }
-static inline int pmd_free_pte_page(pmd_t *pmd)
+static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	return 0;
 }
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3063125197ad..e811ef7b8350 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -303,4 +303,14 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
+/*
+ * Used to flush the TLB when page tables are removed, when lazy
+ * TLB mode may cause a CPU to retain intermediate translations
+ * pointing to about-to-be-freed page table memory.
+ */
+#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
+#define tlb_flush_remove_tables(mm) do {} while (0)
+#define tlb_flush_remove_tables_local(mm) do {} while (0)
+#endif
+
 #endif /* _ASM_GENERIC__TLB_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 99ce070e7dcb..efdc24dd9e97 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,176 +335,183 @@ struct core_state {
 
 struct kioctx_table;
 struct mm_struct {
+	struct {
 	struct vm_area_struct *mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
 	u32 vmacache_seqnum;			/* per-thread vmacache */
 #ifdef CONFIG_MMU
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
 #endif
 	unsigned long mmap_base;	/* base of mmap area */
 	unsigned long mmap_legacy_base;	/* base of mmap area in bottom-up allocations */
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
 	/* Base adresses for compatible mmap() */
 	unsigned long mmap_compat_base;
 	unsigned long mmap_compat_legacy_base;
 #endif
 	unsigned long task_size;	/* size of task vm space */
 	unsigned long highest_vm_end;	/* highest vma end address */
 	pgd_t * pgd;
 
 	/**
 	 * @mm_users: The number of users including userspace.
 	 *
-	 * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
-	 * to 0 (i.e. when the task exits and there are no other temporary
-	 * reference holders), we also release a reference on @mm_count
-	 * (which may then free the &struct mm_struct if @mm_count also
-	 * drops to 0).
+	 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
+	 * drops to 0 (i.e. when the task exits and there are no other
+	 * temporary reference holders), we also release a reference on
+	 * @mm_count (which may then free the &struct mm_struct if
+	 * @mm_count also drops to 0).
 	 */
 	atomic_t mm_users;
 
 	/**
 	 * @mm_count: The number of references to &struct mm_struct
 	 * (@mm_users count as 1).
 	 *
 	 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
 	 * &struct mm_struct is freed.
 	 */
 	atomic_t mm_count;
 
 #ifdef CONFIG_MMU
 	atomic_long_t pgtables_bytes;	/* PTE page table pages */
 #endif
 	int map_count;			/* number of VMAs */
 
-	spinlock_t page_table_lock; /* Protects page tables and some counters */
+	spinlock_t page_table_lock; /* Protects page tables and some
+				     * counters
+				     */
 	struct rw_semaphore mmap_sem;
 
-	struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
-				  * together off init_mm.mmlist, and are protected
-				  * by mmlist_lock
-				  */
+	struct list_head mmlist; /* List of maybe swapped mm's. These
+				  * are globally strung together off
+				  * init_mm.mmlist, and are protected
+				  * by mmlist_lock
+				  */
 
 
 	unsigned long hiwater_rss; /* High-watermark of RSS usage */
 	unsigned long hiwater_vm;  /* High-water virtual memory usage */
 
 	unsigned long total_vm;	   /* Total pages mapped */
 	unsigned long locked_vm;   /* Pages that have PG_mlocked set */
 	unsigned long pinned_vm;   /* Refcount permanently increased */
 	unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
 	unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
 	unsigned long stack_vm;	   /* VM_STACK */
 	unsigned long def_flags;
 
 	spinlock_t arg_lock; /* protect the below fields */
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
 	/*
 	 * Special counters, in some configurations protected by the
 	 * page_table_lock, in other configurations by being atomic.
 	 */
 	struct mm_rss_stat rss_stat;
-
-	struct linux_binfmt *binfmt;
 
-	cpumask_var_t cpu_vm_mask_var;
+	struct linux_binfmt *binfmt;
 
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
-	unsigned long flags; /* Must use atomic bitops to access the bits */
+	unsigned long flags; /* Must use atomic bitops to access */
 
 	struct core_state *core_state; /* coredumping support */
 #ifdef CONFIG_MEMBARRIER
 	atomic_t membarrier_state;
 #endif
 #ifdef CONFIG_AIO
 	spinlock_t			ioctx_lock;
 	struct kioctx_table __rcu	*ioctx_table;
 #endif
 #ifdef CONFIG_MEMCG
 	/*
 	 * "owner" points to a task that is regarded as the canonical
 	 * user/owner of this mm. All of the following must be true in
 	 * order for it to be changed:
 	 *
 	 * current == mm->owner
 	 * current->mm != mm
 	 * new_owner->mm == mm
 	 * new_owner->alloc_lock is held
 	 */
 	struct task_struct __rcu *owner;
 #endif
 	struct user_namespace *user_ns;
 
 	/* store ref to file /proc/<pid>/exe symlink points to */
 	struct file __rcu *exe_file;
 #ifdef CONFIG_MMU_NOTIFIER
 	struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
-#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	struct cpumask cpumask_allocation;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	/*
 	 * numa_next_scan is the next time that the PTEs will be marked
-	 * pte_numa. NUMA hinting faults will gather statistics and migrate
-	 * pages to new nodes if necessary.
+	 * pte_numa. NUMA hinting faults will gather statistics and
+	 * migrate pages to new nodes if necessary.
 	 */
 	unsigned long numa_next_scan;
 
 	/* Restart point for scanning and setting pte_numa */
 	unsigned long numa_scan_offset;
 
 	/* numa_scan_seq prevents two threads setting pte_numa */
 	int numa_scan_seq;
 #endif
 	/*
-	 * An operation with batched TLB flushing is going on. Anything that
-	 * can move process memory needs to flush the TLB when moving a
-	 * PROT_NONE or PROT_NUMA mapped page.
+	 * An operation with batched TLB flushing is going on. Anything
+	 * that can move process memory needs to flush the TLB when
+	 * moving a PROT_NONE or PROT_NUMA mapped page.
 	 */
 	atomic_t tlb_flush_pending;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	/* See flush_tlb_batched_pending() */
 	bool tlb_flush_batched;
 #endif
 	struct uprobes_state uprobes_state;
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
 	struct work_struct async_put_work;
 
 #if IS_ENABLED(CONFIG_HMM)
 	/* HMM needs to track a few things per mm */
 	struct hmm *hmm;
 #endif
-} __randomize_layout;
+	} __randomize_layout;
+
+	/*
+	 * The mm_cpumask needs to be at the end of mm_struct, because it
+	 * is dynamically sized based on nr_cpu_ids.
+	 */
+	unsigned long cpu_bitmap[];
+};
 
 extern struct mm_struct init_mm;
 
+/* Pointer magic because the dynamic array size confuses some compilers. */
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
-#endif
-	cpumask_clear(mm->cpu_vm_mask_var);
+	unsigned long cpu_bitmap = (unsigned long)mm;
+
+	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
+	cpumask_clear((struct cpumask *)cpu_bitmap);
 }
 
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 {
-	return mm->cpu_vm_mask_var;
+	return (struct cpumask *)&mm->cpu_bitmap;
 }
 
 struct mmu_gather;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1b27babc4c78..9d8d0e016fc6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2276,6 +2276,8 @@ static void sighand_ctor(void *data)
 
 void __init proc_caches_init(void)
 {
+	unsigned int mm_size;
+
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2292,15 +2294,16 @@ void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
+
 	/*
-	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
-	 * whole struct cpumask for the OFFSTACK case. We could change
-	 * this to *only* allocate as much of it as required by the
-	 * maximum number of CPU's we can ever have. The cpumask_allocation
-	 * is at the end of the structure, exactly for that reason.
+	 * The mm_cpumask is located at the end of mm_struct, and is
+	 * dynamically sized based on the maximum CPU number this system
+	 * can have, taking hotplug into account (nr_cpu_ids).
 	 */
+	mm_size = sizeof(struct mm_struct) + cpumask_size();
+
 	mm_cachep = kmem_cache_create_usercopy("mm_struct",
-			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			offsetof(struct mm_struct, saved_auxv),
 			sizeof_field(struct mm_struct, saved_auxv),
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 54e5bbaa3200..517f5853ffed 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -92,7 +92,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 		if (ioremap_pmd_enabled() &&
 		    ((next - addr) == PMD_SIZE) &&
 		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
-		    pmd_free_pte_page(pmd)) {
+		    pmd_free_pte_page(pmd, addr)) {
 			if (pmd_set_huge(pmd, phys_addr + addr, prot))
 				continue;
 		}
@@ -119,7 +119,7 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 		if (ioremap_pud_enabled() &&
 		    ((next - addr) == PUD_SIZE) &&
 		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
-		    pud_free_pmd_page(pud)) {
+		    pud_free_pmd_page(pud, addr)) {
 			if (pud_set_huge(pud, phys_addr + addr, prot))
 				continue;
 		}
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f0179c9c04c2..a787a319211e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,16 @@
 #define INIT_MM_CONTEXT(name)
 #endif
 
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -25,5 +35,6 @@ struct mm_struct init_mm = {
 	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns	= &init_user_ns,
+	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 	INIT_MM_CONTEXT(init_mm)
 };
diff --git a/mm/memory.c b/mm/memory.c
index c5e87a3a82ba..3d0a74ab70f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 
-/*
- * See the comment near struct mmu_table_batch.
- */
-
 static void tlb_remove_table_smp_sync(void *arg)
 {
-	/* Simply deliver the interrupt */
+	struct mm_struct __maybe_unused *mm = arg;
+	/*
+	 * On most architectures this does nothing. Simply delivering the
+	 * interrupt is enough to prevent races with software page table
+	 * walking like that done in get_user_pages_fast.
+	 *
+	 * See the comment near struct mmu_table_batch.
+	 */
+	tlb_flush_remove_tables_local(mm);
 }
 
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -344,7 +348,7 @@ static void tlb_remove_table_one(void *table)
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling. See the comment near struct mmu_table_batch.
 	 */
-	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+	smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
 	__tlb_remove_table(table);
 }
 
@@ -365,6 +369,8 @@ void tlb_table_flush(struct mmu_gather *tlb)
 {
 	struct mmu_table_batch **batch = &tlb->batch;
 
+	tlb_flush_remove_tables(tlb->mm);
+
 	if (*batch) {
 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
 		*batch = NULL;
@@ -387,7 +393,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
-			tlb_remove_table_one(table);
+			tlb_remove_table_one(table, tlb);
 			return;
 		}
 		(*batch)->nr = 0;