author		Rik van Riel <riel@redhat.com>		2013-12-18 20:08:44 -0500
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-01-09 15:24:23 -0500
commit		d303cf4624824971d94b4e2c7c95df052d14aa81 (patch)
tree		710211f3a52e8b0ac893dba86bbbb2afcb3df3c9
parent		57f74b6ecebf59991677dd2da0f0433e8be6c945 (diff)
mm: fix TLB flush race between migration, and change_protection_range
commit 20841405940e7be0617612d521e206e4b6b325db upstream.

There are a few subtle races, between change_protection_range (used by
mprotect and change_prot_numa) on one side, and NUMA page migration and
compaction on the other side.

The basic race is that there is a time window between when the PTE gets
made non-present (PROT_NONE or NUMA), and the TLB is flushed.  During
that time, a CPU may continue writing to the page.

This is fine most of the time, however compaction or the NUMA migration
code may come in, and migrate the page away.  When that happens, the CPU
may continue writing, through the cached translation, to what is no
longer the current memory location of the process.

This only affects x86, which has a somewhat optimistic pte_accessible.
All other architectures appear to be safe, and will either always flush,
or flush whenever there is a valid mapping, even with no permissions
(SPARC).

The basic race looks like this:

	CPU A			CPU B			CPU C

						load TLB entry
	make entry PTE/PMD_NUMA
				fault on entry
						read/write old page
				start migrating page
				change PTE/PMD to new page
						read/write old page [*]
	flush TLB
						reload TLB from new entry
						read/write new page
						lose data

	[*] the old page may belong to a new user at this point!

The obvious fix is to flush remote TLB entries, by making sure that
pte_accessible is aware of the fact that PROT_NONE and PROT_NUMA memory
may still be accessible if there is a TLB flush pending for the mm.

This should fix both NUMA migration and compaction.

[mgorman@suse.de: fix build]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Alex Thorlton <athorlton@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
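[Editorial illustration, not part of the patch: a minimal userspace model of the
check the patched x86 pte_accessible() performs.  The struct layout, flag values
and helper bodies below are simplified stand-ins, not the real kernel
definitions; those are in the hunks that follow.  The point it demonstrates is
that a PROT_NONE/NUMA pte is still treated as accessible while a batched
protection change has its TLB flush pending, so concurrent migration will flush
remote TLBs rather than leave stale translations behind.]

	#include <stdbool.h>
	#include <stdio.h>

	/* Illustrative flag values only; the real x86 bits differ. */
	#define _PAGE_PRESENT	0x001UL
	#define _PAGE_PROTNONE	0x100UL
	#define _PAGE_NUMA	0x200UL

	struct mm_struct {
		bool tlb_flush_pending;	/* set/cleared by change_protection_range() */
	};

	static bool mm_tlb_flush_pending(struct mm_struct *mm)
	{
		return mm->tlb_flush_pending;
	}

	/* Mirrors the patched x86 pte_accessible(): present ptes are accessible,
	 * and so are PROT_NONE/NUMA ptes while a flush is pending for the mm. */
	static bool pte_accessible(struct mm_struct *mm, unsigned long pte_flags)
	{
		if (pte_flags & _PAGE_PRESENT)
			return true;

		if ((pte_flags & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
		    mm_tlb_flush_pending(mm))
			return true;

		return false;
	}

	int main(void)
	{
		struct mm_struct mm = { .tlb_flush_pending = true };

		/* NUMA-hinting pte with a flush pending: migration must flush. */
		printf("%d\n", pte_accessible(&mm, _PAGE_NUMA));	/* 1 */

		mm.tlb_flush_pending = false;
		printf("%d\n", pte_accessible(&mm, _PAGE_NUMA));	/* 0 */

		return 0;
	}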
-rw-r--r--	arch/sparc/include/asm/pgtable_64.h	4
-rw-r--r--	arch/x86/include/asm/pgtable.h		11
-rw-r--r--	include/asm-generic/pgtable.h		2
-rw-r--r--	include/linux/mm_types.h		44
-rw-r--r--	kernel/fork.c				1
-rw-r--r--	mm/huge_memory.c			7
-rw-r--r--	mm/mprotect.c				2
-rw-r--r--	mm/pgtable-generic.c			5
8 files changed, 69 insertions, 7 deletions
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 7619f2f792af..dfb0019bf05b 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -616,7 +616,7 @@ static inline unsigned long pte_present(pte_t pte)
 }
 
 #define pte_accessible pte_accessible
-static inline unsigned long pte_accessible(pte_t a)
+static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
 {
 	return pte_val(a) & _PAGE_VALID;
 }
@@ -806,7 +806,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 	 * SUN4V NOTE: _PAGE_VALID is the same value in both the SUN4U
 	 * and SUN4V pte layout, so this inline test is fine.
 	 */
-	if (likely(mm != &init_mm) && pte_accessible(orig))
+	if (likely(mm != &init_mm) && pte_accessible(mm, orig))
 		tlb_batch_add(mm, addr, ptep, orig, fullmm);
 }
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1e672234c4ff..5460bf923e16 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -415,9 +415,16 @@ static inline int pte_present(pte_t a)
 }
 
 #define pte_accessible pte_accessible
-static inline int pte_accessible(pte_t a)
+static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 {
-	return pte_flags(a) & _PAGE_PRESENT;
+	if (pte_flags(a) & _PAGE_PRESENT)
+		return true;
+
+	if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+			mm_tlb_flush_pending(mm))
+		return true;
+
+	return false;
 }
 
 static inline int pte_hidden(pte_t pte)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a59ff51b0166..b58268a5ddd4 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -220,7 +220,7 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 #endif
 
 #ifndef pte_accessible
-# define pte_accessible(pte)		((void)(pte),1)
+# define pte_accessible(mm, pte)	((void)(pte), 1)
 #endif
 
 #ifndef flush_tlb_fix_spurious_fault
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4a189ba6b128..49f0ada525a8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -437,6 +437,14 @@ struct mm_struct {
 	 */
 	int first_nid;
 #endif
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+	/*
+	 * An operation with batched TLB flushing is going on. Anything that
+	 * can move process memory needs to flush the TLB when moving a
+	 * PROT_NONE or PROT_NUMA mapped page.
+	 */
+	bool tlb_flush_pending;
+#endif
 	struct uprobes_state uprobes_state;
 };
 
@@ -457,4 +465,40 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return mm->cpu_vm_mask_var;
 }
 
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+/*
+ * Memory barriers to keep this state in sync are graciously provided by
+ * the page table locks, outside of which no page table modifications happen.
+ * The barriers below prevent the compiler from re-ordering the instructions
+ * around the memory barriers that are already present in the code.
+ */
+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+{
+	barrier();
+	return mm->tlb_flush_pending;
+}
+static inline void set_tlb_flush_pending(struct mm_struct *mm)
+{
+	mm->tlb_flush_pending = true;
+	barrier();
+}
+/* Clearing is done after a TLB flush, which also provides a barrier. */
+static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+{
+	barrier();
+	mm->tlb_flush_pending = false;
+}
+#else
+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+{
+	return false;
+}
+static inline void set_tlb_flush_pending(struct mm_struct *mm)
+{
+}
+static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+{
+}
+#endif
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 80d92e987f21..ff7be9dac4c1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -544,6 +544,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	mm->cached_hole_size = ~0UL;
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
+	clear_tlb_flush_pending(mm);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b2e803e14ea9..6bd22902d289 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1352,6 +1352,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	/*
+	 * The page_table_lock above provides a memory barrier
+	 * with change_protection_range.
+	 */
+	if (mm_tlb_flush_pending(mm))
+		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
+	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
 	 * and pmd_numa cleared.
 	 */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index d4d5399c7aba..e9f65aaa3182 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -206,6 +206,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
+	set_tlb_flush_pending(mm);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -217,6 +218,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	/* Only flush the TLB if we actually modified any entries: */
 	if (pages)
 		flush_tlb_range(vma, start, end);
+	clear_tlb_flush_pending(mm);
 
 	return pages;
 }
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index eb900bbaa713..4b62a16fc3c1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -86,9 +86,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 		       pte_t *ptep)
 {
+	struct mm_struct *mm = (vma)->vm_mm;
 	pte_t pte;
-	pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
-	if (pte_accessible(pte))
+	pte = ptep_get_and_clear(mm, address, ptep);
+	if (pte_accessible(mm, pte))
 		flush_tlb_page(vma, address);
 	return pte;
 }