aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author: Rik van Riel <riel@redhat.com> 2013-12-18 20:08:44 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-12-18 22:04:51 -0500
commit: 20841405940e7be0617612d521e206e4b6b325db (patch)
tree: ff60aa7674876d90e25db4046d9916f73680682b /mm
parent: de466bd628e8d663fdf3f791bc8db318ee85c714 (diff)
mm: fix TLB flush race between migration, and change_protection_range
There are a few subtle races, between change_protection_range (used by mprotect and change_prot_numa) on one side, and NUMA page migration and compaction on the other side.

The basic race is that there is a time window between when the PTE gets made non-present (PROT_NONE or NUMA), and the TLB is flushed.

During that time, a CPU may continue writing to the page.

This is fine most of the time, however compaction or the NUMA migration code may come in, and migrate the page away.

When that happens, the CPU may continue writing, through the cached translation, to what is no longer the current memory location of the process.

This only affects x86, which has a somewhat optimistic pte_accessible. All other architectures appear to be safe, and will either always flush, or flush whenever there is a valid mapping, even with no permissions (SPARC).

The basic race looks like this:

CPU A                     CPU B                        CPU C

                                                       load TLB entry
make entry PTE/PMD_NUMA
                          fault on entry
                                                       read/write old page
                          start migrating page
                          change PTE/PMD to new page
                                                       read/write old page [*]
flush TLB
                                                       reload TLB from new entry
                                                       read/write new page
                                                       lose data

[*] the old page may belong to a new user at this point!

The obvious fix is to flush remote TLB entries, by making sure that pte_accessible is aware of the fact that PROT_NONE and PROT_NUMA memory may still be accessible if there is a TLB flush pending for the mm.

This should fix both NUMA migration and compaction.

[mgorman@suse.de: fix build]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/huge_memory.c7
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/pgtable-generic.c5
3 files changed, 12 insertions, 2 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7de1bf85f683..3d2783e10596 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1377,6 +1377,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1377 } 1377 }
1378 1378
1379 /* 1379 /*
1380 * The page_table_lock above provides a memory barrier
1381 * with change_protection_range.
1382 */
1383 if (mm_tlb_flush_pending(mm))
1384 flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
1385
1386 /*
1380 * Migrate the THP to the requested node, returns with page unlocked 1387 * Migrate the THP to the requested node, returns with page unlocked
1381 * and pmd_numa cleared. 1388 * and pmd_numa cleared.
1382 */ 1389 */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index f8421722acb9..bb53a6591aea 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -188,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
188 BUG_ON(addr >= end); 188 BUG_ON(addr >= end);
189 pgd = pgd_offset(mm, addr); 189 pgd = pgd_offset(mm, addr);
190 flush_cache_range(vma, addr, end); 190 flush_cache_range(vma, addr, end);
191 set_tlb_flush_pending(mm);
191 do { 192 do {
192 next = pgd_addr_end(addr, end); 193 next = pgd_addr_end(addr, end);
193 if (pgd_none_or_clear_bad(pgd)) 194 if (pgd_none_or_clear_bad(pgd))
@@ -199,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
199 /* Only flush the TLB if we actually modified any entries: */ 200 /* Only flush the TLB if we actually modified any entries: */
200 if (pages) 201 if (pages)
201 flush_tlb_range(vma, start, end); 202 flush_tlb_range(vma, start, end);
203 clear_tlb_flush_pending(mm);
202 204
203 return pages; 205 return pages;
204} 206}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e84cad27a801..a8b919925934 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
110pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, 110pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
111 pte_t *ptep) 111 pte_t *ptep)
112{ 112{
113 struct mm_struct *mm = (vma)->vm_mm;
113 pte_t pte; 114 pte_t pte;
114 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); 115 pte = ptep_get_and_clear(mm, address, ptep);
115 if (pte_accessible(pte)) 116 if (pte_accessible(mm, pte))
116 flush_tlb_page(vma, address); 117 flush_tlb_page(vma, address);
117 return pte; 118 return pte;
118} 119}