about summary refs log tree commit diff stats
path: root/mm/mlock.c
diff options
context:
space:
mode:
author: Vlastimil Babka <vbabka@suse.cz> 2013-09-11 17:22:35 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-09-11 18:58:01 -0400
commit7a8010cd36273ff5f6fea5201ef9232f30cebbd9 (patch)
tree3805f3d9a8a1f1c1c555ef31bc1bdb51fb51e33e /mm/mlock.c
parent5b40998ae35cf64561868370e6c9f3d3e94b6bf7 (diff)
mm: munlock: manual pte walk in fast path instead of follow_page_mask()
Currently munlock_vma_pages_range() calls follow_page_mask() to obtain each individual struct page. This entails repeated full page table translations and page table lock taken for each page separately. This patch avoids the costly follow_page_mask() where possible, by iterating over ptes within single pmd under single page table lock. The first pte is obtained by get_locked_pte() for non-THP page acquired by the initial follow_page_mask(). The rest of the on-stack pagevec for munlock is filled up using pte_walk as long as pte_present() and vm_normal_page() are sufficient to obtain the struct page. After this patch, a 14% speedup was measured for munlocking a 56GB large memory area with THP disabled. Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Cc: Jörn Engel <joern@logfs.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Michel Lespinasse <walken@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mlock.c')
-rw-r--r--mm/mlock.c110
1 files changed, 79 insertions, 31 deletions
diff --git a/mm/mlock.c b/mm/mlock.c
index 19a934dce5d6..d63802663242 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -280,8 +280,7 @@ static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
280 * The second phase finishes the munlock only for pages where isolation 280 * The second phase finishes the munlock only for pages where isolation
281 * succeeded. 281 * succeeded.
282 * 282 *
283 * Note that pvec is modified during the process. Before returning 283 * Note that the pagevec may be modified during the process.
284 * pagevec_reinit() is called on it.
285 */ 284 */
286static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) 285static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
287{ 286{
@@ -356,8 +355,60 @@ skip_munlock:
356 */ 355 */
357 if (pagevec_count(&pvec_putback)) 356 if (pagevec_count(&pvec_putback))
358 __putback_lru_fast(&pvec_putback, pgrescued); 357 __putback_lru_fast(&pvec_putback, pgrescued);
358}
359
360/*
361 * Fill up pagevec for __munlock_pagevec using pte walk
362 *
363 * The function expects that the struct page corresponding to @start address is
364 * a non-THP page already pinned and in the @pvec, and that it belongs to @zone.
365 *
366 * The rest of @pvec is filled by subsequent pages within the same pmd and same
367 * zone, as long as the pte's are present and vm_normal_page() succeeds. These
368 * pages also get pinned.
369 *
370 * Returns the address of the next page that should be scanned. This equals
371 * @start + PAGE_SIZE when no page could be added by the pte walk.
372 */
373static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
374 struct vm_area_struct *vma, int zoneid, unsigned long start,
375 unsigned long end)
376{
377 pte_t *pte;
378 spinlock_t *ptl;
379
380 /*
381 * Initialize pte walk starting at the already pinned page where we
382 * are sure that there is a pte.
383 */
384 pte = get_locked_pte(vma->vm_mm, start, &ptl);
385 end = min(end, pmd_addr_end(start, end));
386
387 /* The page next to the pinned page is the first we will try to get */
388 start += PAGE_SIZE;
389 while (start < end) {
390 struct page *page = NULL;
391 pte++;
392 if (pte_present(*pte))
393 page = vm_normal_page(vma, start, *pte);
394 /*
395 * Break if page could not be obtained or the page's node+zone does not
396 * match
397 */
398 if (!page || page_zone_id(page) != zoneid)
399 break;
359 400
360 pagevec_reinit(pvec); 401 get_page(page);
402 /*
403 * Increase the address that will be returned *before* the
404 * eventual break due to pvec becoming full by adding the page
405 */
406 start += PAGE_SIZE;
407 if (pagevec_add(pvec, page) == 0)
408 break;
409 }
410 pte_unmap_unlock(pte, ptl);
411 return start;
361} 412}
362 413
363/* 414/*
@@ -381,17 +432,16 @@ skip_munlock:
381void munlock_vma_pages_range(struct vm_area_struct *vma, 432void munlock_vma_pages_range(struct vm_area_struct *vma,
382 unsigned long start, unsigned long end) 433 unsigned long start, unsigned long end)
383{ 434{
384 struct pagevec pvec;
385 struct zone *zone = NULL;
386
387 pagevec_init(&pvec, 0);
388 vma->vm_flags &= ~VM_LOCKED; 435 vma->vm_flags &= ~VM_LOCKED;
389 436
390 while (start < end) { 437 while (start < end) {
391 struct page *page; 438 struct page *page = NULL;
392 unsigned int page_mask, page_increm; 439 unsigned int page_mask, page_increm;
393 struct zone *pagezone; 440 struct pagevec pvec;
441 struct zone *zone;
442 int zoneid;
394 443
444 pagevec_init(&pvec, 0);
395 /* 445 /*
396 * Although FOLL_DUMP is intended for get_dump_page(), 446 * Although FOLL_DUMP is intended for get_dump_page(),
397 * it just so happens that its special treatment of the 447 * it just so happens that its special treatment of the
@@ -400,22 +450,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
400 * has sneaked into the range, we won't oops here: great). 450 * has sneaked into the range, we won't oops here: great).
401 */ 451 */
402 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, 452 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
403 &page_mask); 453 &page_mask);
454
404 if (page && !IS_ERR(page)) { 455 if (page && !IS_ERR(page)) {
405 pagezone = page_zone(page);
406 /* The whole pagevec must be in the same zone */
407 if (pagezone != zone) {
408 if (pagevec_count(&pvec))
409 __munlock_pagevec(&pvec, zone);
410 zone = pagezone;
411 }
412 if (PageTransHuge(page)) { 456 if (PageTransHuge(page)) {
413 /*
414 * THP pages are not handled by pagevec due
415 * to their possible split (see below).
416 */
417 if (pagevec_count(&pvec))
418 __munlock_pagevec(&pvec, zone);
419 lock_page(page); 457 lock_page(page);
420 /* 458 /*
421 * Any THP page found by follow_page_mask() may 459 * Any THP page found by follow_page_mask() may
@@ -428,21 +466,31 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
428 put_page(page); /* follow_page_mask() */ 466 put_page(page); /* follow_page_mask() */
429 } else { 467 } else {
430 /* 468 /*
431 * Non-huge pages are handled in batches 469 * Non-huge pages are handled in batches via
432 * via pagevec. The pin from 470 * pagevec. The pin from follow_page_mask()
433 * follow_page_mask() prevents them from 471 * prevents them from collapsing by THP.
434 * collapsing by THP. 472 */
473 pagevec_add(&pvec, page);
474 zone = page_zone(page);
475 zoneid = page_zone_id(page);
476
477 /*
478 * Try to fill the rest of pagevec using fast
479 * pte walk. This will also update start to
480 * the next page to process. Then munlock the
481 * pagevec.
435 */ 482 */
436 if (pagevec_add(&pvec, page) == 0) 483 start = __munlock_pagevec_fill(&pvec, vma,
437 __munlock_pagevec(&pvec, zone); 484 zoneid, start, end);
485 __munlock_pagevec(&pvec, zone);
486 goto next;
438 } 487 }
439 } 488 }
440 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); 489 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
441 start += page_increm * PAGE_SIZE; 490 start += page_increm * PAGE_SIZE;
491next:
442 cond_resched(); 492 cond_resched();
443 } 493 }
444 if (pagevec_count(&pvec))
445 __munlock_pagevec(&pvec, zone);
446} 494}
447 495
448/* 496/*