author     Michel Lespinasse <walken@google.com>           2012-10-08 19:31:36 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-10-09 03:22:41 -0400
commit     108d6642ad81bb1d62b401490a334d2c12397517 (patch)
tree       27df7d1777d80b9dddeaefaac928b726ff82a816 /mm
parent     9826a516ff77c5820e591211e4f3e58ff36f46be (diff)
mm anon rmap: remove anon_vma_moveto_tail
mremap() had a clever optimization where move_ptes() did not take the anon_vma lock to avoid a race with anon rmap users such as page migration. Instead, the avc's were ordered in such a way that the origin vma was always visited by rmap before the destination. This ordering, together with the use of page table locks, made that rmap usage safe.

However, we want to replace the use of linked lists in anon rmap with an interval tree, and this will make it harder to impose such an ordering as the interval tree will always be sorted by the avc->vma->vm_pgoff value. For now, let's replace the anon_vma_moveto_tail() ordering function with proper anon_vma locking in move_ptes(). Once we have the anon interval tree in place, we will re-introduce an optimization to avoid taking these locks in the most common cases.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/mmap.c     3
-rw-r--r--  mm/mremap.c  14
-rw-r--r--  mm/rmap.c    45
3 files changed, 6 insertions(+), 56 deletions(-)
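In outline, the mremap.c hunks below make move_ptes() bracket the PTE move with the anon_vma lock whenever the vma is anonymous, rather than relying on same_anon_vma list ordering. The following is only a condensed sketch of the resulting locking order, not the real function: the name is invented for illustration, the pte-copy loop and the i_mmap_mutex handling are reduced to comments, and the lock helpers are the anon_vma_lock()/anon_vma_unlock() API used by the patch itself.

/*
 * Condensed illustration of move_ptes() locking after this patch.
 * Not the real function: only the new anon_vma bracketing is shown;
 * everything else is elided to comments.
 */
static void move_ptes_sketch(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	/* file-backed case: mapping->i_mmap_mutex is taken here (unchanged) */

	/*
	 * New with this patch: hold the anon_vma lock across the whole PTE
	 * move so a concurrent rmap walk (e.g. page migration) cannot miss
	 * a pte while it is being moved between the two locations.
	 */
	if (anon_vma)
		anon_vma_lock(anon_vma);

	/* ... take old/new page table locks, copy and clear the ptes ... */

	if (anon_vma)
		anon_vma_unlock(anon_vma);

	/* file-backed case: mapping->i_mmap_mutex is dropped here (unchanged) */
}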
diff --git a/mm/mmap.c b/mm/mmap.c
index 5ac533f88e99..66984aab7915 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2378,8 +2378,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			 */
 			VM_BUG_ON(faulted_in_anon_vma);
 			*vmap = new_vma;
-		} else
-			anon_vma_moveto_tail(new_vma);
+		}
 	} else {
 		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
diff --git a/mm/mremap.c b/mm/mremap.c
index cc06d0e48d05..5588bb6e9295 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -74,6 +74,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		unsigned long new_addr)
 {
 	struct address_space *mapping = NULL;
+	struct anon_vma *anon_vma = vma->anon_vma;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
@@ -88,6 +89,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		mapping = vma->vm_file->f_mapping;
 		mutex_lock(&mapping->i_mmap_mutex);
 	}
+	if (anon_vma)
+		anon_vma_lock(anon_vma);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -114,6 +117,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		spin_unlock(new_ptl);
 	pte_unmap(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
+	if (anon_vma)
+		anon_vma_unlock(anon_vma);
 	if (mapping)
 		mutex_unlock(&mapping->i_mmap_mutex);
 }
@@ -221,15 +226,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
 	if (moved_len < old_len) {
 		/*
-		 * Before moving the page tables from the new vma to
-		 * the old vma, we need to be sure the old vma is
-		 * queued after new vma in the same_anon_vma list to
-		 * prevent SMP races with rmap_walk (that could lead
-		 * rmap_walk to miss some page table).
-		 */
-		anon_vma_moveto_tail(vma);
-
-		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
diff --git a/mm/rmap.c b/mm/rmap.c
index 7b5b51d25fc5..8cbd62fde0f1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -269,51 +269,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 }
 
 /*
- * Some rmap walk that needs to find all ptes/hugepmds without false
- * negatives (like migrate and split_huge_page) running concurrent
- * with operations that copy or move pagetables (like mremap() and
- * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
- * list to be in a certain order: the dst_vma must be placed after the
- * src_vma in the list. This is always guaranteed by fork() but
- * mremap() needs to call this function to enforce it in case the
- * dst_vma isn't newly allocated and chained with the anon_vma_clone()
- * function but just an extension of a pre-existing vma through
- * vma_merge.
- *
- * NOTE: the same_anon_vma list can still be changed by other
- * processes while mremap runs because mremap doesn't hold the
- * anon_vma mutex to prevent modifications to the list while it
- * runs. All we need to enforce is that the relative order of this
- * process vmas isn't changing (we don't care about other vmas
- * order). Each vma corresponds to an anon_vma_chain structure so
- * there's no risk that other processes calling anon_vma_moveto_tail()
- * and changing the same_anon_vma list under mremap() will screw with
- * the relative order of this process vmas in the list, because we
- * they can't alter the order of any vma that belongs to this
- * process. And there can't be another anon_vma_moveto_tail() running
- * concurrently with mremap() coming from this process because we hold
- * the mmap_sem for the whole mremap(). fork() ordering dependency
- * also shouldn't be affected because fork() only cares that the
- * parent vmas are placed in the list before the child vmas and
- * anon_vma_moveto_tail() won't reorder vmas from either the fork()
- * parent or child.
- */
-void anon_vma_moveto_tail(struct vm_area_struct *dst)
-{
-	struct anon_vma_chain *pavc;
-	struct anon_vma *root = NULL;
-
-	list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
-		struct anon_vma *anon_vma = pavc->anon_vma;
-		VM_BUG_ON(pavc->vma != dst);
-		root = lock_anon_vma_root(root, anon_vma);
-		list_del(&pavc->same_anon_vma);
-		list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
-	}
-	unlock_anon_vma_root(root);
-}
-
-/*
  * Attach vma to its own anon_vma, as well as to the anon_vmas that
  * the corresponding VMA in the parent process is attached to.
  * Returns 0 on success, non-zero on failure.