author     Michel Lespinasse <walken@google.com>           2012-10-08 19:31:50 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-10-09 03:22:42 -0400
commit     38a76013ad809beb0b52f60d365c960d035bd83c
tree       c63ba707ab17dd1ff1e90650faf74570daa3cf9f  /mm/mremap.c
parent     523d4e2008fd4a68b1a164e63e8c75b7b20f07e0
mm: avoid taking rmap locks in move_ptes()
During mremap(), the destination VMA is generally placed after the
original vma in rmap traversal order: in move_vma(), we always have
new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >=
vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one.

When the destination VMA is placed after the original in rmap traversal
order, we can avoid taking the rmap locks in move_ptes().

Essentially, this reintroduces the optimization that had been disabled in
"mm anon rmap: remove anon_vma_moveto_tail". The difference is that we
don't try to impose the rmap traversal order; instead we just rely on
things being in the desired order in the common case and fall back to
taking locks in the uncommon case. Also we skip the i_mmap_mutex in
addition to the anon_vma lock: in both cases, the vmas are traversed in
increasing vm_pgoff order with ties resolved in tree insertion order.
Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mremap.c')
 -rw-r--r--  mm/mremap.c | 57
 1 file changed, 39 insertions(+), 18 deletions(-)
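The decision described in the commit message is made on the mm/mmap.c side of this patch, where copy_vma() now reports need_rmap_locks back to move_vma(); that file falls outside this diffstat. As a minimal, self-contained sketch of the rule, using a hypothetical helper name rather than the kernel code itself:

```c
#include <stdbool.h>

/*
 * Hypothetical helper (illustration only): the rmap locks can be skipped
 * when the destination vma is known to come after the source in rmap
 * traversal order. A freshly allocated new_vma always qualifies, since
 * new_pgoff >= vma->vm_pgoff and ties are resolved in tree insertion
 * order (the new vma is inserted later). If vma_merge() folded the
 * destination into an adjacent vma, the guarantee only holds when the
 * merged vma's vm_pgoff is strictly greater than the source's.
 */
static bool mremap_need_rmap_locks(unsigned long old_pgoff,
                                   unsigned long new_pgoff,
                                   bool merged_with_adjacent_vma)
{
        if (!merged_with_adjacent_vma)
                return false;           /* fresh vma sorts after the old one */
        return new_pgoff <= old_pgoff;  /* ordering no longer guaranteed */
}
```

In the diff below, move_vma() receives this verdict through copy_vma()'s new &need_rmap_locks argument, and the error-unwind call to move_page_tables() passes true unconditionally, since the reverse move places the destination before the source in rmap traversal order.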
diff --git a/mm/mremap.c b/mm/mremap.c
index 5588bb6e9295..3b639a4b26bd 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                 unsigned long old_addr, unsigned long old_end,
                 struct vm_area_struct *new_vma, pmd_t *new_pmd,
-                unsigned long new_addr)
+                unsigned long new_addr, bool need_rmap_locks)
 {
         struct address_space *mapping = NULL;
-        struct anon_vma *anon_vma = vma->anon_vma;
+        struct anon_vma *anon_vma = NULL;
         struct mm_struct *mm = vma->vm_mm;
         pte_t *old_pte, *new_pte, pte;
         spinlock_t *old_ptl, *new_ptl;
 
-        if (vma->vm_file) {
-                /*
-                 * Subtle point from Rajesh Venkatasubramanian: before
-                 * moving file-based ptes, we must lock truncate_pagecache
-                 * out, since it might clean the dst vma before the src vma,
-                 * and we propagate stale pages into the dst afterward.
-                 */
-                mapping = vma->vm_file->f_mapping;
-                mutex_lock(&mapping->i_mmap_mutex);
+        /*
+         * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+         * locks to ensure that rmap will always observe either the old or the
+         * new ptes. This is the easiest way to avoid races with
+         * truncate_pagecache(), page migration, etc...
+         *
+         * When need_rmap_locks is false, we use other ways to avoid
+         * such races:
+         *
+         * - During exec() shift_arg_pages(), we use a specially tagged vma
+         *   which rmap call sites look for using is_vma_temporary_stack().
+         *
+         * - During mremap(), new_vma is often known to be placed after vma
+         *   in rmap traversal order. This ensures rmap will always observe
+         *   either the old pte, or the new pte, or both (the page table locks
+         *   serialize access to individual ptes, but only rmap traversal
+         *   order guarantees that we won't miss both the old and new ptes).
+         */
+        if (need_rmap_locks) {
+                if (vma->vm_file) {
+                        mapping = vma->vm_file->f_mapping;
+                        mutex_lock(&mapping->i_mmap_mutex);
+                }
+                if (vma->anon_vma) {
+                        anon_vma = vma->anon_vma;
+                        anon_vma_lock(anon_vma);
+                }
         }
-        if (anon_vma)
-                anon_vma_lock(anon_vma);
 
         /*
          * We don't have to worry about the ordering of src and dst
@@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 
 unsigned long move_page_tables(struct vm_area_struct *vma,
                 unsigned long old_addr, struct vm_area_struct *new_vma,
-                unsigned long new_addr, unsigned long len)
+                unsigned long new_addr, unsigned long len,
+                bool need_rmap_locks)
 {
         unsigned long extent, next, old_end;
         pmd_t *old_pmd, *new_pmd;
@@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                 if (extent > LATENCY_LIMIT)
                         extent = LATENCY_LIMIT;
                 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-                          new_vma, new_pmd, new_addr);
+                          new_vma, new_pmd, new_addr, need_rmap_locks);
                 need_flush = true;
         }
         if (likely(need_flush))
@@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
         unsigned long hiwater_vm;
         int split = 0;
         int err;
+        bool need_rmap_locks;
 
         /*
          * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                 return err;
 
         new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+                           &need_rmap_locks);
         if (!new_vma)
                 return -ENOMEM;
 
-        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+                                     need_rmap_locks);
         if (moved_len < old_len) {
                 /*
                  * On error, move entries back from new area to old,
                  * which will succeed since page tables still there,
                  * and then proceed to unmap new area instead of old.
                  */
-                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+                                 true);
                 vma = new_vma;
                 old_len = new_len;
                 old_addr = new_addr;
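For context on the exec() case named in the new comment block: is_vma_temporary_stack() is defined elsewhere in the tree and is not part of this diff. Reproduced approximately from kernels of this era (details may differ), it treats a stack-like vma as temporary while exec() still has the VM_STACK_INCOMPLETE_SETUP bits set on it:

```c
/* Approximate reference copy, not part of this patch. */
static inline bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
        int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

        if (!maybe_stack)
                return false;

        if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
                                        VM_STACK_INCOMPLETE_SETUP)
                return true;

        return false;
}
```

Rmap call sites check for such vmas and handle them specially, which is how shift_arg_pages() can move ptes without move_ptes() taking the rmap locks.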