author:    Linus Torvalds <torvalds@linux-foundation.org>  2018-08-13 19:29:35 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org>  2018-08-13 19:29:35 -0400
commit:    203b4fc903b644223a27ad3f25f3a0f3a3911d1d
tree:      8c210b67a17b74b2a39c500891a20e23c2390cf6
parent:    7edcf0d314f69e506ddd9562062b2a79fa965bb9
parent:    765d28f136291f9639e3c031a1070fb76d6625c7
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Thomas Gleixner:
- Make lazy TLB mode even lazier to avoid pointless switch_mm()
operations, which reduces CPU load by 1-2% for memcache workloads
- Small cleanups and improvements all over the place
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mm: Remove redundant check for kmem_cache_create()
arm/asm/tlb.h: Fix build error implicit func declaration
x86/mm/tlb: Make clear_asid_other() static
x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off()
x86/mm/tlb: Always use lazy TLB mode
x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
x86/mm/tlb: Make lazy TLB mode lazier
x86/mm/tlb: Restructure switch_mm_irqs_off()
x86/mm/tlb: Leave lazy TLB mode at page table free time
mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids
x86/mm: Add TLB purge to free pmd/pte page interfaces
ioremap: Update pgtable free interfaces with addr
x86/mm: Disable ioremap free page handling on x86-PAE
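
The core idea behind the "lazier" lazy TLB mode above is a per-CPU is_lazy flag: a CPU that goes idle or runs a kernel thread keeps the old mm loaded, stays in mm_cpumask(), and is simply skipped when TLB flush IPIs are sent; it catches up later through the tlb_gen check in switch_mm_irqs_off() or the new page-table-free shootdown. The following stand-alone C sketch only illustrates that skip decision; struct cpu_state, its fields and the printed output are invented for this example and are not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Toy per-CPU state: is the mm loaded, and is the CPU lazy about it? */
struct cpu_state {
	bool in_mm_cpumask;	/* bit set in mm_cpumask(mm) */
	bool is_lazy;		/* lazy TLB mode, e.g. idle or kernel thread */
};

/* Decide which CPUs a TLB shootdown for this mm would actually interrupt. */
static void flush_tlb_others(const struct cpu_state cpus[], int nr)
{
	for (int cpu = 0; cpu < nr; cpu++) {
		if (!cpus[cpu].in_mm_cpumask)
			continue;	/* never loaded this mm */
		if (cpus[cpu].is_lazy) {
			printf("cpu%d: lazy, skip IPI\n", cpu);
			continue;	/* caught up lazily at switch_mm() time */
		}
		printf("cpu%d: send flush IPI\n", cpu);
	}
}

int main(void)
{
	struct cpu_state cpus[NR_CPUS] = {
		[0] = { true, false },	/* actively running the process */
		[1] = { true, true },	/* went idle with the mm still loaded */
		[5] = { true, false },
	};

	flush_tlb_others(cpus, NR_CPUS);
	return 0;
}

A lazy CPU that missed flushes must not reuse stale translations, which is why the series compares the mm's tlb_gen when leaving lazy mode in switch_mm_irqs_off() and adds tlb_flush_remove_tables() for the page-table-freeing case.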
Diffstat:
 arch/arm/include/asm/tlb.h      |   8
 arch/arm64/mm/mmu.c             |   4
 arch/x86/include/asm/tlbflush.h |  21
 arch/x86/mm/pgtable.c           |  64
 arch/x86/mm/tlb.c               | 224
 drivers/firmware/efi/efi.c      |   1
 include/asm-generic/pgtable.h   |   8
 include/asm-generic/tlb.h       |  10
 include/linux/mm_types.h        | 241
 kernel/fork.c                   |  15
 lib/ioremap.c                   |   4
 mm/init-mm.c                    |  11
 mm/memory.c                     |  22
 13 files changed, 408 insertions(+), 225 deletions(-)
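
A second theme visible in the diffstat is the mm_cpumask() change: include/linux/mm_types.h replaces the fixed cpumask in mm_struct with a trailing flexible array (cpu_bitmap[]), and kernel/fork.c sizes the mm_struct slab as sizeof(struct mm_struct) + cpumask_size(), so each mm carries only as many bitmap words as the machine's possible-CPU count requires. A minimal user-space sketch of that allocation pattern follows; struct mm_like and its fields are hypothetical stand-ins, not the kernel's definitions.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BITS_PER_LONG		(CHAR_BIT * sizeof(unsigned long))
#define BITS_TO_LONGS(n)	(((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct mm_like {
	int map_count;			/* ...ordinary fixed-size fields... */
	unsigned long cpu_bitmap[];	/* dynamically sized, must stay last */
};

int main(void)
{
	unsigned int nr_cpu_ids = 16;	/* stand-in for the detected possible-CPU count */
	size_t size = sizeof(struct mm_like) +
		      BITS_TO_LONGS(nr_cpu_ids) * sizeof(unsigned long);
	struct mm_like *mm = malloc(size);

	if (!mm)
		return 1;
	memset(mm, 0, size);		/* a cleared bitmap is an empty cpumask */
	printf("allocated %zu bytes for %u possible CPUs\n", size, nr_cpu_ids);
	free(mm);
	return 0;
}

Statically defined mm_structs (init_mm, efi_mm) cannot use the runtime size, which is why their initializers in the diff below size cpu_bitmap[] to NR_CPUS explicitly.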
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index d5562f9ce600..f854148c8d7c 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -292,5 +292,13 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 {
 }
 
+static inline void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+}
+
+static inline void tlb_flush_remove_tables_local(void *arg)
+{
+}
+
 #endif /* CONFIG_MMU */
 #endif
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 493ff75670ff..8ae5d7ae4af3 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -977,12 +977,12 @@ int pmd_clear_huge(pmd_t *pmdp)
 	return 1;
 }
 
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
 	return pud_none(*pud);
 }
 
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	return pmd_none(*pmd);
 }
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6690cd3fc8b1..511bf5fae8b8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-	/*
-	 * If we have PCID, then switching to init_mm is reasonably
-	 * fast. If we don't have PCID, then switching to init_mm is
-	 * quite slow, so we try to defer it in the hopes that we can
-	 * avoid it entirely. The latter approach runs the risk of
-	 * receiving otherwise unnecessary IPIs.
-	 *
-	 * This choice is just a heuristic. The tlb code can handle this
-	 * function returning true or false regardless of whether we have
-	 * PCID.
-	 */
-	return !static_cpu_has(X86_FEATURE_PCID);
-}
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -554,4 +538,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 	native_flush_tlb_others(mask, info)
 #endif
 
+extern void tlb_flush_remove_tables(struct mm_struct *mm);
+extern void tlb_flush_remove_tables_local(void *arg);
+
+#define HAVE_TLB_FLUSH_REMOVE_TABLES
+
 #endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 47b5951e592b..0f1683fcb196 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -329,9 +329,6 @@ static int __init pgd_cache_init(void)
 	 */
 	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
 				      SLAB_PANIC, NULL);
-	if (!pgd_cache)
-		return -ENOMEM;
-
 	return 0;
 }
 core_initcall(pgd_cache_init);
@@ -719,28 +716,50 @@ int pmd_clear_huge(pmd_t *pmd)
 	return 0;
 }
 
+#ifdef CONFIG_X86_64
 /**
  * pud_free_pmd_page - Clear pud entry and free pmd page.
  * @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
  *
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
  * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
  */
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
-	pmd_t *pmd;
+	pmd_t *pmd, *pmd_sv;
+	pte_t *pte;
 	int i;
 
 	if (pud_none(*pud))
 		return 1;
 
 	pmd = (pmd_t *)pud_page_vaddr(*pud);
+	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+	if (!pmd_sv)
+		return 0;
 
-	for (i = 0; i < PTRS_PER_PMD; i++)
-		if (!pmd_free_pte_page(&pmd[i]))
-			return 0;
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd_sv[i] = pmd[i];
+		if (!pmd_none(pmd[i]))
+			pmd_clear(&pmd[i]);
+	}
 
 	pud_clear(pud);
+
+	/* INVLPG to clear all paging-structure caches */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (!pmd_none(pmd_sv[i])) {
+			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+			free_page((unsigned long)pte);
+		}
+	}
+
+	free_page((unsigned long)pmd_sv);
 	free_page((unsigned long)pmd);
 
 	return 1;
@@ -749,11 +768,12 @@ int pud_free_pmd_page(pud_t *pud)
 /**
  * pmd_free_pte_page - Clear pmd entry and free pte page.
  * @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
  *
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	pte_t *pte;
 
@@ -762,8 +782,30 @@ int pmd_free_pte_page(pmd_t *pmd)
 
 	pte = (pte_t *)pmd_page_vaddr(*pmd);
 	pmd_clear(pmd);
+
+	/* INVLPG to clear all paging-structure caches */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
 	free_page((unsigned long)pte);
 
 	return 1;
 }
+
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+	return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+	return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb1f34c3c85..752dbf4e0e50 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -35,7 +36,7 @@
  * necessary invalidation by clearing out the 'ctx_id' which
  * forces a TLB flush when the context is loaded.
  */
-void clear_asid_other(void)
+static void clear_asid_other(void)
 {
 	u16 asid;
 
@@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -240,20 +244,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			   next->context.ctx_id);
 
 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask().  We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
 	} else {
-		u16 new_asid;
-		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
 		/*
@@ -285,53 +310,60 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			sync_current_stack_to_mm(next);
 		}
 
-		/* Stop remote flushes for the previous mm */
-		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-				real_prev != &init_mm);
-		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		/*
+		 * Stop remote flushes for the previous mm.
+		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+		 * but the bitmap manipulation can cause cache line contention.
+		 */
+		if (real_prev != &init_mm) {
+			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+						mm_cpumask(real_prev)));
+			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		}
 
 		/*
 		 * Start remote flushes and then read tlb_gen.
 		 */
-		cpumask_set_cpu(cpu, mm_cpumask(next));
+		if (next != &init_mm)
+			cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+	}
 
-		if (need_flush) {
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			load_new_mm_cr3(next->pgd, new_asid, true);
-
-			/*
-			 * NB: This gets called via leave_mm() in the idle path
-			 * where RCU functions differently. Tracing normally
-			 * uses RCU, so we need to use the _rcuidle variant.
-			 *
-			 * (There is no good reason for this. The idle code should
-			 * be rearranged to call this before rcu_idle_enter().)
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-		} else {
-			/* The new ASID is already up to date. */
-			load_new_mm_cr3(next->pgd, new_asid, false);
-
-			/* See above wrt _rcuidle. */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-		}
-
-		/*
-		 * Record last user mm's context id, so we can avoid
-		 * flushing branch buffer with IBPB if we switch back
-		 * to the same user.
-		 */
-		if (next != &init_mm)
-			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
-	}
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, true);
+
+		/*
+		 * NB: This gets called via leave_mm() in the idle path
+		 * where RCU functions differently. Tracing normally
+		 * uses RCU, so we need to use the _rcuidle variant.
+		 *
+		 * (There is no good reason for this. The idle code should
+		 * be rearranged to call this before rcu_idle_enter().)
+		 */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, false);
+
+		/* See above wrt _rcuidle. */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+	}
+
+	/*
+	 * Record last user mm's context id, so we can avoid
+	 * flushing branch buffer with IBPB if we switch back
+	 * to the same user.
+	 */
+	if (next != &init_mm)
+		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
 }
@@ -354,20 +386,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	if (tlb_defer_switch_to_init_mm()) {
-		/*
-		 * There's a significant optimization that may be possible
-		 * here. We have accurate enough TLB flush tracking that we
-		 * don't need to maintain coherence of TLB per se when we're
-		 * lazy. We do, however, need to maintain coherence of
-		 * paging-structure caches. We could, in principle, leave our
-		 * old mm loaded and only switch to init_mm when
-		 * tlb_remove_page() happens.
-		 */
-		this_cpu_write(cpu_tlbstate.is_lazy, true);
-	} else {
-		switch_mm(NULL, &init_mm, NULL);
-	}
+	this_cpu_write(cpu_tlbstate.is_lazy, true);
 }
 
 /*
@@ -454,6 +473,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB. Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -560,6 +582,9 @@ static void flush_tlb_func_remote(void *info)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
+	cpumask_var_t lazymask;
+	unsigned int cpu;
+
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,8 +608,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
-		unsigned int cpu;
-
 		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, info);
 		if (cpumask)
@@ -592,8 +615,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 					       (void *)info, 1);
 		return;
 	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+	/*
+	 * A temporary cpumask is used in order to skip sending IPIs
+	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
+	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
+	 */
+	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
 			       (void *)info, 1);
+		return;
+	}
+
+	cpumask_copy(lazymask, cpumask);
+
+	for_each_cpu(cpu, lazymask) {
+		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_clear_cpu(cpu, lazymask);
+	}
+
+	smp_call_function_many(lazymask, flush_tlb_func_remote,
+			       (void *)info, 1);
+
+	free_cpumask_var(lazymask);
 }
 
 /*
@@ -646,6 +690,68 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	put_cpu();
 }
 
+void tlb_flush_remove_tables_local(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
+			this_cpu_read(cpu_tlbstate.is_lazy)) {
+		/*
+		 * We're in lazy mode. We need to at least flush our
+		 * paging-structure cache to avoid speculatively reading
+		 * garbage into our TLB. Since switching to init_mm is barely
+		 * slower than a minimal flush, just switch to init_mm.
+		 */
+		switch_mm_irqs_off(NULL, &init_mm, NULL);
+	}
+}
+
+static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
+				      struct cpumask *lazy_cpus)
+{
+	int cpu;
+
+	for_each_cpu(cpu, mm_cpumask(mm)) {
+		if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_set_cpu(cpu, lazy_cpus);
+	}
+}
+
+void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+	int cpu = get_cpu();
+	cpumask_var_t lazy_cpus;
+
+	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
+		put_cpu();
+		return;
+	}
+
+	if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
+		/*
+		 * If the cpumask allocation fails, do a brute force flush
+		 * on all the CPUs that have this mm loaded.
+		 */
+		smp_call_function_many(mm_cpumask(mm),
+				tlb_flush_remove_tables_local, (void *)mm, 1);
+		put_cpu();
+		return;
+	}
+
+	/*
+	 * CPUs with !is_lazy either received a TLB flush IPI while the user
+	 * pages in this address range were unmapped, or have context switched
+	 * and reloaded %CR3 since then.
+	 *
+	 * Shootdown IPIs at page table freeing time only need to be sent to
+	 * CPUs that may have out of date TLB contents.
+	 */
+	mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
+	smp_call_function_many(lazy_cpus,
+			tlb_flush_remove_tables_local, (void *)mm, 1);
+	free_cpumask_var(lazy_cpus);
+	put_cpu();
+}
 
 static void do_flush_tlb_all(void *info)
 {
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index d8a33a781a57..2a29dd9c986d 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -82,6 +82,7 @@ struct mm_struct efi_mm = {
 	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
 	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
 	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
+	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 };
 
 struct workqueue_struct *efi_rts_wq;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f59639afaa39..b081794ba135 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1019,8 +1019,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
-int pud_free_pmd_page(pud_t *pud);
-int pmd_free_pte_page(pmd_t *pmd);
+int pud_free_pmd_page(pud_t *pud, unsigned long addr);
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
 #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
 static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 {
@@ -1046,11 +1046,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 {
 	return 0;
 }
-static inline int pud_free_pmd_page(pud_t *pud)
+static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
 	return 0;
 }
-static inline int pmd_free_pte_page(pmd_t *pmd)
+static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	return 0;
 }
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3063125197ad..e811ef7b8350 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -303,4 +303,14 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
+/*
+ * Used to flush the TLB when page tables are removed, when lazy
+ * TLB mode may cause a CPU to retain intermediate translations
+ * pointing to about-to-be-freed page table memory.
+ */
+#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
+#define tlb_flush_remove_tables(mm) do {} while (0)
+#define tlb_flush_remove_tables_local(mm) do {} while (0)
+#endif
+
 #endif /* _ASM_GENERIC__TLB_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 99ce070e7dcb..efdc24dd9e97 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,176 +335,183 @@ struct core_state {
 
 struct kioctx_table;
 struct mm_struct {
-	struct vm_area_struct *mmap;		/* list of VMAs */
-	struct rb_root mm_rb;
-	u32 vmacache_seqnum;			/* per-thread vmacache */
+	struct {
+		struct vm_area_struct *mmap;	/* list of VMAs */
+		struct rb_root mm_rb;
+		u32 vmacache_seqnum;		/* per-thread vmacache */
 #ifdef CONFIG_MMU
 		unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
 #endif
 		unsigned long mmap_base;	/* base of mmap area */
 		unsigned long mmap_legacy_base;	/* base of mmap area in bottom-up allocations */
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
 		/* Base adresses for compatible mmap() */
 		unsigned long mmap_compat_base;
 		unsigned long mmap_compat_legacy_base;
 #endif
 		unsigned long task_size;	/* size of task vm space */
 		unsigned long highest_vm_end;	/* highest vma end address */
 		pgd_t * pgd;
 
 		/**
 		 * @mm_users: The number of users including userspace.
 		 *
-		 * Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
-		 * to 0 (i.e. when the task exits and there are no other temporary
-		 * reference holders), we also release a reference on @mm_count
-		 * (which may then free the &struct mm_struct if @mm_count also
-		 * drops to 0).
+		 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
+		 * drops to 0 (i.e. when the task exits and there are no other
+		 * temporary reference holders), we also release a reference on
+		 * @mm_count (which may then free the &struct mm_struct if
+		 * @mm_count also drops to 0).
 		 */
 		atomic_t mm_users;
 
 		/**
 		 * @mm_count: The number of references to &struct mm_struct
 		 * (@mm_users count as 1).
 		 *
 		 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
 		 * &struct mm_struct is freed.
 		 */
 		atomic_t mm_count;
 
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* PTE page table pages */
 #endif
 		int map_count;			/* number of VMAs */
 
-	spinlock_t page_table_lock; /* Protects page tables and some counters */
-	struct rw_semaphore mmap_sem;
+		spinlock_t page_table_lock; /* Protects page tables and some
+					     * counters
+					     */
+		struct rw_semaphore mmap_sem;
 
-	struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
-				  * together off init_mm.mmlist, and are protected
-				  * by mmlist_lock
-				  */
+		struct list_head mmlist; /* List of maybe swapped mm's. These
+					  * are globally strung together off
+					  * init_mm.mmlist, and are protected
+					  * by mmlist_lock
+					  */
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
 		unsigned long hiwater_vm;  /* High-water virtual memory usage */
 
 		unsigned long total_vm;	   /* Total pages mapped */
 		unsigned long locked_vm;   /* Pages that have PG_mlocked set */
 		unsigned long pinned_vm;   /* Refcount permanently increased */
 		unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
 		unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
 		unsigned long stack_vm;	   /* VM_STACK */
 		unsigned long def_flags;
 
 		spinlock_t arg_lock; /* protect the below fields */
 		unsigned long start_code, end_code, start_data, end_data;
 		unsigned long start_brk, brk, start_stack;
 		unsigned long arg_start, arg_end, env_start, env_end;
 
 		unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
 		/*
 		 * Special counters, in some configurations protected by the
 		 * page_table_lock, in other configurations by being atomic.
 		 */
 		struct mm_rss_stat rss_stat;
-
-	struct linux_binfmt *binfmt;
 
-	cpumask_var_t cpu_vm_mask_var;
+		struct linux_binfmt *binfmt;
 
 		/* Architecture-specific MM context */
 		mm_context_t context;
 
-	unsigned long flags; /* Must use atomic bitops to access the bits */
+		unsigned long flags; /* Must use atomic bitops to access */
 
 		struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_MEMBARRIER
 		atomic_t membarrier_state;
 #endif
 #ifdef CONFIG_AIO
 		spinlock_t			ioctx_lock;
 		struct kioctx_table __rcu	*ioctx_table;
 #endif
 #ifdef CONFIG_MEMCG
 		/*
 		 * "owner" points to a task that is regarded as the canonical
 		 * user/owner of this mm. All of the following must be true in
 		 * order for it to be changed:
 		 *
 		 * current == mm->owner
 		 * current->mm != mm
 		 * new_owner->mm == mm
 		 * new_owner->alloc_lock is held
 		 */
 		struct task_struct __rcu *owner;
 #endif
 		struct user_namespace *user_ns;
 
 		/* store ref to file /proc/<pid>/exe symlink points to */
 		struct file __rcu *exe_file;
 #ifdef CONFIG_MMU_NOTIFIER
 		struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 		pgtable_t pmd_huge_pte; /* protected by page_table_lock */
-#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	struct cpumask cpumask_allocation;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 		/*
 		 * numa_next_scan is the next time that the PTEs will be marked
-	 * pte_numa. NUMA hinting faults will gather statistics and migrate
-	 * pages to new nodes if necessary.
+		 * pte_numa. NUMA hinting faults will gather statistics and
+		 * migrate pages to new nodes if necessary.
 		 */
 		unsigned long numa_next_scan;
 
 		/* Restart point for scanning and setting pte_numa */
 		unsigned long numa_scan_offset;
 
 		/* numa_scan_seq prevents two threads setting pte_numa */
 		int numa_scan_seq;
 #endif
 		/*
-	 * An operation with batched TLB flushing is going on. Anything that
-	 * can move process memory needs to flush the TLB when moving a
-	 * PROT_NONE or PROT_NUMA mapped page.
+		 * An operation with batched TLB flushing is going on. Anything
+		 * that can move process memory needs to flush the TLB when
+		 * moving a PROT_NONE or PROT_NUMA mapped page.
 		 */
 		atomic_t tlb_flush_pending;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 		/* See flush_tlb_batched_pending() */
 		bool tlb_flush_batched;
 #endif
 		struct uprobes_state uprobes_state;
 #ifdef CONFIG_HUGETLB_PAGE
 		atomic_long_t hugetlb_usage;
 #endif
 		struct work_struct async_put_work;
 
 #if IS_ENABLED(CONFIG_HMM)
 		/* HMM needs to track a few things per mm */
 		struct hmm *hmm;
 #endif
 	} __randomize_layout;
+
+	/*
+	 * The mm_cpumask needs to be at the end of mm_struct, because it
+	 * is dynamically sized based on nr_cpu_ids.
+	 */
+	unsigned long cpu_bitmap[];
+};
 
 extern struct mm_struct init_mm;
 
+/* Pointer magic because the dynamic array size confuses some compilers. */
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
-#endif
-	cpumask_clear(mm->cpu_vm_mask_var);
+	unsigned long cpu_bitmap = (unsigned long)mm;
+
+	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
+	cpumask_clear((struct cpumask *)cpu_bitmap);
 }
 
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 {
-	return mm->cpu_vm_mask_var;
+	return (struct cpumask *)&mm->cpu_bitmap;
 }
 
 struct mmu_gather;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1b27babc4c78..9d8d0e016fc6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2276,6 +2276,8 @@ static void sighand_ctor(void *data)
 
 void __init proc_caches_init(void)
 {
+	unsigned int mm_size;
+
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2292,15 +2294,16 @@ void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
+
 	/*
-	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
-	 * whole struct cpumask for the OFFSTACK case. We could change
-	 * this to *only* allocate as much of it as required by the
-	 * maximum number of CPU's we can ever have. The cpumask_allocation
-	 * is at the end of the structure, exactly for that reason.
+	 * The mm_cpumask is located at the end of mm_struct, and is
+	 * dynamically sized based on the maximum CPU number this system
+	 * can have, taking hotplug into account (nr_cpu_ids).
 	 */
+	mm_size = sizeof(struct mm_struct) + cpumask_size();
+
 	mm_cachep = kmem_cache_create_usercopy("mm_struct",
-			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			offsetof(struct mm_struct, saved_auxv),
 			sizeof_field(struct mm_struct, saved_auxv),
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 54e5bbaa3200..517f5853ffed 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -92,7 +92,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 		if (ioremap_pmd_enabled() &&
 		    ((next - addr) == PMD_SIZE) &&
 		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
-		    pmd_free_pte_page(pmd)) {
+		    pmd_free_pte_page(pmd, addr)) {
 			if (pmd_set_huge(pmd, phys_addr + addr, prot))
 				continue;
 		}
@@ -119,7 +119,7 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 		if (ioremap_pud_enabled() &&
 		    ((next - addr) == PUD_SIZE) &&
 		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
-		    pud_free_pmd_page(pud)) {
+		    pud_free_pmd_page(pud, addr)) {
 			if (pud_set_huge(pud, phys_addr + addr, prot))
 				continue;
 		}
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f0179c9c04c2..a787a319211e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,16 @@
 #define INIT_MM_CONTEXT(name)
 #endif
 
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -25,5 +35,6 @@ struct mm_struct init_mm = {
 	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns	= &init_user_ns,
+	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 	INIT_MM_CONTEXT(init_mm)
 };
diff --git a/mm/memory.c b/mm/memory.c
index c5e87a3a82ba..3d0a74ab70f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 
-/*
- * See the comment near struct mmu_table_batch.
- */
-
 static void tlb_remove_table_smp_sync(void *arg)
 {
-	/* Simply deliver the interrupt */
+	struct mm_struct __maybe_unused *mm = arg;
+	/*
+	 * On most architectures this does nothing. Simply delivering the
+	 * interrupt is enough to prevent races with software page table
+	 * walking like that done in get_user_pages_fast.
+	 *
+	 * See the comment near struct mmu_table_batch.
+	 */
+	tlb_flush_remove_tables_local(mm);
 }
 
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -344,7 +348,7 @@ static void tlb_remove_table_one(void *table)
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling. See the comment near struct mmu_table_batch.
 	 */
-	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+	smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
 	__tlb_remove_table(table);
 }
 
@@ -365,6 +369,8 @@ void tlb_table_flush(struct mmu_gather *tlb)
 {
 	struct mmu_table_batch **batch = &tlb->batch;
 
+	tlb_flush_remove_tables(tlb->mm);
+
 	if (*batch) {
 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
 		*batch = NULL;
@@ -387,7 +393,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
-			tlb_remove_table_one(table);
+			tlb_remove_table_one(table, tlb);
 			return;
 		}
 		(*batch)->nr = 0;
