aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAaron Lu <aaron.lu@intel.com>2016-11-10 04:16:33 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2016-11-17 12:46:56 -0500
commit5d1904204c99596b50a700f092fe49d78edba400 (patch)
treec51b0321e4dd99246d4c61bcb1d7e38fa47aec08
parent961b708e95181041f403251f660bc70be3ff6ba3 (diff)
mremap: fix race between mremap() and page cleaning
Prior to 3.15, there was a race between zap_pte_range() and page_mkclean() where writes to a page could be lost. Dave Hansen discovered by inspection that there is a similar race between move_ptes() and page_mkclean(). We've been able to reproduce the issue by enlarging the race window with a msleep(), but have not been able to hit it without modifying the code. So, we think it's a real issue, but is difficult or impossible to hit in practice. The zap_pte_range() issue is fixed by commit 1cf35d47712d ("mm: split 'tlb_flush_mmu()' into tlb flushing and memory freeing parts"). And this patch is to fix the race between page_mkclean() and mremap(). Here is one possible way to hit the race: suppose a process mmapped a file with READ | WRITE and SHARED, it has two threads and they are bound to 2 different CPUs, e.g. CPU1 and CPU2. mmap returned X, then thread 1 did a write to addr X so that CPU1 now has a writable TLB for addr X on it. Thread 2 starts mremapping from addr X to Y while thread 1 cleaned the page and then did another write to the old addr X again. The 2nd write from thread 1 could succeed but the value will get lost. thread 1 thread 2 (bound to CPU1) (bound to CPU2) 1: write 1 to addr X to get a writeable TLB on this CPU 2: mremap starts 3: move_ptes emptied PTE for addr X and setup new PTE for addr Y and then dropped PTL for X and Y 4: page laundering for N by doing fadvise FADV_DONTNEED. When done, pageframe N is deemed clean. 5: *write 2 to addr X 6: tlb flush for addr X 7: munmap (Y, pagesize) to make the page unmapped 8: fadvise with FADV_DONTNEED again to kick the page off the pagecache 9: pread the page from file to verify the value. If 1 is there, it means we have lost the written 2. *the write may or may not cause a segmentation fault, depending on whether the TLB entry is still on the CPU. Please note that this is only one specific way of how the race could occur; it doesn't mean that the race can only occur in exactly the above configuration, e.g. 
more than 2 threads could be involved and fadvise() could be done in another thread, etc. For anonymous pages, they could race between mremap() and page reclaim: THP: a huge PMD is moved by mremap to a new huge PMD, then the new huge PMD gets unmapped/split/paged out before the TLB flush happened for the old huge PMD in move_page_tables() and we could still write data to it. The normal anonymous page has a similar situation. To fix this, check for any dirty PTE in move_ptes()/move_huge_pmd() and if any, do the flush before dropping the PTL. If we did the flush for every move_ptes()/move_huge_pmd() call then we do not need to do the flush in move_page_tables() for the whole range. But if we didn't, we still need to do the whole range flush. Alternatively, we can track which part of the range was flushed in move_ptes()/move_huge_pmd() and which wasn't, to avoid flushing the whole range in move_page_tables(). But that would require multiple TLB flushes for the different sub-ranges and should be less efficient than the single whole range flush. KBuild test on my Sandybridge desktop doesn't show any noticeable change. v4.9-rc4: real 5m14.048s user 32m19.800s sys 4m50.320s With this commit: real 5m13.888s user 32m19.330s sys 4m51.200s Reported-by: Dave Hansen <dave.hansen@intel.com> Signed-off-by: Aaron Lu <aaron.lu@intel.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/huge_mm.h2
-rw-r--r--mm/huge_memory.c9
-rw-r--r--mm/mremap.c30
3 files changed, 30 insertions, 11 deletions
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9b9f65d99873..e35e6de633b9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -22,7 +22,7 @@ extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
22 unsigned char *vec); 22 unsigned char *vec);
23extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 23extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
24 unsigned long new_addr, unsigned long old_end, 24 unsigned long new_addr, unsigned long old_end,
25 pmd_t *old_pmd, pmd_t *new_pmd); 25 pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush);
26extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 26extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
27 unsigned long addr, pgprot_t newprot, 27 unsigned long addr, pgprot_t newprot,
28 int prot_numa); 28 int prot_numa);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cdcd25cb30fe..eff3de359d50 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1426,11 +1426,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1426 1426
1427bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 1427bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1428 unsigned long new_addr, unsigned long old_end, 1428 unsigned long new_addr, unsigned long old_end,
1429 pmd_t *old_pmd, pmd_t *new_pmd) 1429 pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
1430{ 1430{
1431 spinlock_t *old_ptl, *new_ptl; 1431 spinlock_t *old_ptl, *new_ptl;
1432 pmd_t pmd; 1432 pmd_t pmd;
1433 struct mm_struct *mm = vma->vm_mm; 1433 struct mm_struct *mm = vma->vm_mm;
1434 bool force_flush = false;
1434 1435
1435 if ((old_addr & ~HPAGE_PMD_MASK) || 1436 if ((old_addr & ~HPAGE_PMD_MASK) ||
1436 (new_addr & ~HPAGE_PMD_MASK) || 1437 (new_addr & ~HPAGE_PMD_MASK) ||
@@ -1455,6 +1456,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1455 new_ptl = pmd_lockptr(mm, new_pmd); 1456 new_ptl = pmd_lockptr(mm, new_pmd);
1456 if (new_ptl != old_ptl) 1457 if (new_ptl != old_ptl)
1457 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1458 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1459 if (pmd_present(*old_pmd) && pmd_dirty(*old_pmd))
1460 force_flush = true;
1458 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 1461 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1459 VM_BUG_ON(!pmd_none(*new_pmd)); 1462 VM_BUG_ON(!pmd_none(*new_pmd));
1460 1463
@@ -1467,6 +1470,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1467 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); 1470 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1468 if (new_ptl != old_ptl) 1471 if (new_ptl != old_ptl)
1469 spin_unlock(new_ptl); 1472 spin_unlock(new_ptl);
1473 if (force_flush)
1474 flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
1475 else
1476 *need_flush = true;
1470 spin_unlock(old_ptl); 1477 spin_unlock(old_ptl);
1471 return true; 1478 return true;
1472 } 1479 }
diff --git a/mm/mremap.c b/mm/mremap.c
index da22ad2a5678..6ccecc03f56a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,11 +104,13 @@ static pte_t move_soft_dirty_pte(pte_t pte)
104static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, 104static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
105 unsigned long old_addr, unsigned long old_end, 105 unsigned long old_addr, unsigned long old_end,
106 struct vm_area_struct *new_vma, pmd_t *new_pmd, 106 struct vm_area_struct *new_vma, pmd_t *new_pmd,
107 unsigned long new_addr, bool need_rmap_locks) 107 unsigned long new_addr, bool need_rmap_locks, bool *need_flush)
108{ 108{
109 struct mm_struct *mm = vma->vm_mm; 109 struct mm_struct *mm = vma->vm_mm;
110 pte_t *old_pte, *new_pte, pte; 110 pte_t *old_pte, *new_pte, pte;
111 spinlock_t *old_ptl, *new_ptl; 111 spinlock_t *old_ptl, *new_ptl;
112 bool force_flush = false;
113 unsigned long len = old_end - old_addr;
112 114
113 /* 115 /*
114 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma 116 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
@@ -146,6 +148,14 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
146 new_pte++, new_addr += PAGE_SIZE) { 148 new_pte++, new_addr += PAGE_SIZE) {
147 if (pte_none(*old_pte)) 149 if (pte_none(*old_pte))
148 continue; 150 continue;
151
152 /*
153 * We are remapping a dirty PTE, make sure to
154 * flush TLB before we drop the PTL for the
155 * old PTE or we may race with page_mkclean().
156 */
157 if (pte_present(*old_pte) && pte_dirty(*old_pte))
158 force_flush = true;
149 pte = ptep_get_and_clear(mm, old_addr, old_pte); 159 pte = ptep_get_and_clear(mm, old_addr, old_pte);
150 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); 160 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
151 pte = move_soft_dirty_pte(pte); 161 pte = move_soft_dirty_pte(pte);
@@ -156,6 +166,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
156 if (new_ptl != old_ptl) 166 if (new_ptl != old_ptl)
157 spin_unlock(new_ptl); 167 spin_unlock(new_ptl);
158 pte_unmap(new_pte - 1); 168 pte_unmap(new_pte - 1);
169 if (force_flush)
170 flush_tlb_range(vma, old_end - len, old_end);
171 else
172 *need_flush = true;
159 pte_unmap_unlock(old_pte - 1, old_ptl); 173 pte_unmap_unlock(old_pte - 1, old_ptl);
160 if (need_rmap_locks) 174 if (need_rmap_locks)
161 drop_rmap_locks(vma); 175 drop_rmap_locks(vma);
@@ -201,13 +215,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
201 if (need_rmap_locks) 215 if (need_rmap_locks)
202 take_rmap_locks(vma); 216 take_rmap_locks(vma);
203 moved = move_huge_pmd(vma, old_addr, new_addr, 217 moved = move_huge_pmd(vma, old_addr, new_addr,
204 old_end, old_pmd, new_pmd); 218 old_end, old_pmd, new_pmd,
219 &need_flush);
205 if (need_rmap_locks) 220 if (need_rmap_locks)
206 drop_rmap_locks(vma); 221 drop_rmap_locks(vma);
207 if (moved) { 222 if (moved)
208 need_flush = true;
209 continue; 223 continue;
210 }
211 } 224 }
212 split_huge_pmd(vma, old_pmd, old_addr); 225 split_huge_pmd(vma, old_pmd, old_addr);
213 if (pmd_trans_unstable(old_pmd)) 226 if (pmd_trans_unstable(old_pmd))
@@ -220,11 +233,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
220 extent = next - new_addr; 233 extent = next - new_addr;
221 if (extent > LATENCY_LIMIT) 234 if (extent > LATENCY_LIMIT)
222 extent = LATENCY_LIMIT; 235 extent = LATENCY_LIMIT;
223 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 236 move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
224 new_vma, new_pmd, new_addr, need_rmap_locks); 237 new_pmd, new_addr, need_rmap_locks, &need_flush);
225 need_flush = true;
226 } 238 }
227 if (likely(need_flush)) 239 if (need_flush)
228 flush_tlb_range(vma, old_end-len, old_addr); 240 flush_tlb_range(vma, old_end-len, old_addr);
229 241
230 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 242 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);