about summary refs log tree commit diff stats
path: root/mm/mlock.c
diff options
context:
space:
mode:
author: Vlastimil Babka <vbabka@suse.cz> 2013-09-11 17:22:35 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-09-11 18:58:01 -0400
commit7a8010cd36273ff5f6fea5201ef9232f30cebbd9 (patch)
tree3805f3d9a8a1f1c1c555ef31bc1bdb51fb51e33e /mm/mlock.c
parent5b40998ae35cf64561868370e6c9f3d3e94b6bf7 (diff)
mm: munlock: manual pte walk in fast path instead of follow_page_mask()
Currently munlock_vma_pages_range() calls follow_page_mask() to obtain each individual struct page. This entails repeated full page table translations and page table lock taken for each page separately. This patch avoids the costly follow_page_mask() where possible, by iterating over ptes within single pmd under single page table lock. The first pte is obtained by get_locked_pte() for non-THP page acquired by the initial follow_page_mask(). The rest of the on-stack pagevec for munlock is filled up using pte_walk as long as pte_present() and vm_normal_page() are sufficient to obtain the struct page. After this patch, a 14% speedup was measured for munlocking a 56GB large memory area with THP disabled. Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Cc: Jörn Engel <joern@logfs.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Michel Lespinasse <walken@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mlock.c')
-rw-r--r--mm/mlock.c110
1 files changed, 79 insertions, 31 deletions
diff --git a/mm/mlock.c b/mm/mlock.c
index 19a934dce5d6..d63802663242 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -280,8 +280,7 @@ static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
280 * The second phase finishes the munlock only for pages where isolation 280 * The second phase finishes the munlock only for pages where isolation
281 * succeeded. 281 * succeeded.
282 * 282 *
283 * Note that pvec is modified during the process. Before returning 283 * Note that the pagevec may be modified during the process.
284 * pagevec_reinit() is called on it.
285 */ 284 */
286static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) 285static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
287{ 286{
@@ -356,8 +355,60 @@ skip_munlock:
356 */ 355 */
357 if (pagevec_count(&pvec_putback)) 356 if (pagevec_count(&pvec_putback))
358 __putback_lru_fast(&pvec_putback, pgrescued); 357 __putback_lru_fast(&pvec_putback, pgrescued);
358}
359
360/*
361 * Fill up pagevec for __munlock_pagevec using pte walk
362 *
363 * The function expects that the struct page corresponding to @start address is
364 * a non-THP page already pinned and in the @pvec, and that it belongs to @zone.
365 *
366 * The rest of @pvec is filled by subsequent pages within the same pmd and same
367 * zone, as long as the pte's are present and vm_normal_page() succeeds. These
368 * pages also get pinned.
369 *
370 * Returns the address of the next page that should be scanned. This equals
371 * @start + PAGE_SIZE when no page could be added by the pte walk.
372 */
373static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
374 struct vm_area_struct *vma, int zoneid, unsigned long start,
375 unsigned long end)
376{
377 pte_t *pte;
378 spinlock_t *ptl;
379
380 /*
381 * Initialize pte walk starting at the already pinned page where we
382 * are sure that there is a pte.
383 */
384 pte = get_locked_pte(vma->vm_mm, start, &ptl);
385 end = min(end, pmd_addr_end(start, end));
386
387 /* The page next to the pinned page is the first we will try to get */
388 start += PAGE_SIZE;
389 while (start < end) {
390 struct page *page = NULL;
391 pte++;
392 if (pte_present(*pte))
393 page = vm_normal_page(vma, start, *pte);
394 /*
395 * Break if page could not be obtained or the page's node+zone does not
396 * match
397 */
398 if (!page || page_zone_id(page) != zoneid)
399 break;
359 400
360 pagevec_reinit(pvec); 401 get_page(page);
402 /*
403 * Increase the address that will be returned *before* the
404 * eventual break due to pvec becoming full by adding the page
405 */
406 start += PAGE_SIZE;
407 if (pagevec_add(pvec, page) == 0)
408 break;
409 }
410 pte_unmap_unlock(pte, ptl);
411 return start;
361} 412}
362 413
363/* 414/*
@@ -381,17 +432,16 @@ skip_munlock:
381void munlock_vma_pages_range(struct vm_area_struct *vma, 432void munlock_vma_pages_range(struct vm_area_struct *vma,
382 unsigned long start, unsigned long end) 433 unsigned long start, unsigned long end)
383{ 434{
384 struct pagevec pvec;
385 struct zone *zone = NULL;
386
387 pagevec_init(&pvec, 0);
388 vma->vm_flags &= ~VM_LOCKED; 435 vma->vm_flags &= ~VM_LOCKED;
389 436
390 while (start < end) { 437 while (start < end) {
391 struct page *page; 438 struct page *page = NULL;
392 unsigned int page_mask, page_increm; 439 unsigned int page_mask, page_increm;
393 struct zone *pagezone; 440 struct pagevec pvec;
441 struct zone *zone;
442 int zoneid;
394 443
444 pagevec_init(&pvec, 0);
395 /* 445 /*
396 * Although FOLL_DUMP is intended for get_dump_page(), 446 * Although FOLL_DUMP is intended for get_dump_page(),
397 * it just so happens that its special treatment of the 447 * it just so happens that its special treatment of the
@@ -400,22 +450,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
400 * has sneaked into the range, we won't oops here: great). 450 * has sneaked into the range, we won't oops here: great).
401 */ 451 */
402 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, 452 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
403 &page_mask); 453 &page_mask);
454
404 if (page && !IS_ERR(page)) { 455 if (page && !IS_ERR(page)) {
405 pagezone = page_zone(page);
406 /* The whole pagevec must be in the same zone */
407 if (pagezone != zone) {
408 if (pagevec_count(&pvec))
409 __munlock_pagevec(&pvec, zone);
410 zone = pagezone;
411 }
412 if (PageTransHuge(page)) { 456 if (PageTransHuge(page)) {
413 /*
414 * THP pages are not handled by pagevec due
415 * to their possible split (see below).
416 */
417 if (pagevec_count(&pvec))
418 __munlock_pagevec(&pvec, zone);
419 lock_page(page); 457 lock_page(page);
420 /* 458 /*
421 * Any THP page found by follow_page_mask() may 459 * Any THP page found by follow_page_mask() may
@@ -428,21 +466,31 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
428 put_page(page); /* follow_page_mask() */ 466 put_page(page); /* follow_page_mask() */
429 } else { 467 } else {
430 /* 468 /*
431 * Non-huge pages are handled in batches 469 * Non-huge pages are handled in batches via
432 * via pagevec. The pin from 470 * pagevec. The pin from follow_page_mask()
433 * follow_page_mask() prevents them from 471 * prevents them from collapsing by THP.
434 * collapsing by THP. 472 */
473 pagevec_add(&pvec, page);
474 zone = page_zone(page);
475 zoneid = page_zone_id(page);
476
477 /*
478 * Try to fill the rest of pagevec using fast
479 * pte walk. This will also update start to
480 * the next page to process. Then munlock the
481 * pagevec.
435 */ 482 */
436 if (pagevec_add(&pvec, page) == 0) 483 start = __munlock_pagevec_fill(&pvec, vma,
437 __munlock_pagevec(&pvec, zone); 484 zoneid, start, end);
485 __munlock_pagevec(&pvec, zone);
486 goto next;
438 } 487 }
439 } 488 }
440 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); 489 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
441 start += page_increm * PAGE_SIZE; 490 start += page_increm * PAGE_SIZE;
491next:
442 cond_resched(); 492 cond_resched();
443 } 493 }
444 if (pagevec_count(&pvec))
445 __munlock_pagevec(&pvec, zone);
446} 494}
447 495
448/* 496/*