author     Vlastimil Babka <vbabka@suse.cz>                  2013-09-11 17:22:35 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2013-09-11 18:58:01 -0400
commit     7a8010cd36273ff5f6fea5201ef9232f30cebbd9 (patch)
tree       3805f3d9a8a1f1c1c555ef31bc1bdb51fb51e33e /mm/mlock.c
parent     5b40998ae35cf64561868370e6c9f3d3e94b6bf7 (diff)
mm: munlock: manual pte walk in fast path instead of follow_page_mask()
Currently, munlock_vma_pages_range() calls follow_page_mask() to obtain each
individual struct page. This entails a full page table translation and taking
the page table lock separately for each page.
This patch avoids the costly follow_page_mask() where possible by iterating
over ptes within a single pmd under a single page table lock. The first pte is
obtained by get_locked_pte() for the non-THP page acquired by the initial
follow_page_mask(). The rest of the on-stack pagevec for munlock is then filled
by the pte walk, as long as pte_present() and vm_normal_page() are sufficient
to obtain the struct page.
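In outline, the batched fill works as sketched below. This is only an
illustrative rendering of the approach: the helper name munlock_fill_sketch,
the chosen includes, and the comments are not part of the patch, and the
authoritative code is the new __munlock_pagevec_fill() in the diff further
down.

```c
#include <linux/kernel.h>      /* min() */
#include <linux/mm.h>          /* get_locked_pte(), vm_normal_page(), ... */
#include <linux/pagevec.h>     /* struct pagevec, pagevec_add() */

/*
 * Illustrative sketch of the batched fill: one page table translation and
 * one page table lock per pmd range instead of one per page.
 */
static unsigned long munlock_fill_sketch(struct pagevec *pvec,
                struct vm_area_struct *vma, int zoneid,
                unsigned long start, unsigned long end)
{
        spinlock_t *ptl;
        /* The page at @start is already pinned, so a pte is known to exist. */
        pte_t *pte = get_locked_pte(vma->vm_mm, start, &ptl);

        /* Stay within the current pmd; the caller iterates over the range. */
        end = min(end, pmd_addr_end(start, end));
        start += PAGE_SIZE;             /* the first page is already in @pvec */

        while (start < end) {
                struct page *page = NULL;

                pte++;
                if (pte_present(*pte))
                        page = vm_normal_page(vma, start, *pte);
                /* Any complication: bail out and let the slow path handle it. */
                if (!page || page_zone_id(page) != zoneid)
                        break;

                get_page(page);                 /* pin, as follow_page_mask() would */
                start += PAGE_SIZE;             /* advance before a possible break */
                if (pagevec_add(pvec, page) == 0)
                        break;                  /* pagevec is full */
        }
        pte_unmap_unlock(pte, ptl);
        return start;   /* next address for the caller to process */
}
```

When the walk hits anything it cannot handle (a non-present pte, a mapping
rejected by vm_normal_page(), a page from a different zone, or a full
pagevec), it simply stops and returns the next address to scan, and the caller
resumes with follow_page_mask() from there.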
After this patch, a 14% speedup was measured for munlocking a 56 GB memory
area with THP disabled.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jörn Engel <joern@logfs.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michel Lespinasse <walken@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mlock.c')
-rw-r--r--  mm/mlock.c  110
1 file changed, 79 insertions(+), 31 deletions(-)
```diff
diff --git a/mm/mlock.c b/mm/mlock.c
index 19a934dce5d6..d63802663242 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -280,8 +280,7 @@ static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
  * The second phase finishes the munlock only for pages where isolation
  * succeeded.
  *
- * Note that pvec is modified during the process. Before returning
- * pagevec_reinit() is called on it.
+ * Note that the pagevec may be modified during the process.
  */
 static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 {
@@ -356,8 +355,60 @@ skip_munlock:
          */
         if (pagevec_count(&pvec_putback))
                 __putback_lru_fast(&pvec_putback, pgrescued);
+}
+
+/*
+ * Fill up pagevec for __munlock_pagevec using pte walk
+ *
+ * The function expects that the struct page corresponding to @start address is
+ * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
+ *
+ * The rest of @pvec is filled by subsequent pages within the same pmd and same
+ * zone, as long as the pte's are present and vm_normal_page() succeeds. These
+ * pages also get pinned.
+ *
+ * Returns the address of the next page that should be scanned. This equals
+ * @start + PAGE_SIZE when no page could be added by the pte walk.
+ */
+static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
+                struct vm_area_struct *vma, int zoneid, unsigned long start,
+                unsigned long end)
+{
+        pte_t *pte;
+        spinlock_t *ptl;
+
+        /*
+         * Initialize pte walk starting at the already pinned page where we
+         * are sure that there is a pte.
+         */
+        pte = get_locked_pte(vma->vm_mm, start, &ptl);
+        end = min(end, pmd_addr_end(start, end));
+
+        /* The page next to the pinned page is the first we will try to get */
+        start += PAGE_SIZE;
+        while (start < end) {
+                struct page *page = NULL;
+                pte++;
+                if (pte_present(*pte))
+                        page = vm_normal_page(vma, start, *pte);
+                /*
+                 * Break if page could not be obtained or the page's node+zone does not
+                 * match
+                 */
+                if (!page || page_zone_id(page) != zoneid)
+                        break;
 
-        pagevec_reinit(pvec);
+                get_page(page);
+                /*
+                 * Increase the address that will be returned *before* the
+                 * eventual break due to pvec becoming full by adding the page
+                 */
+                start += PAGE_SIZE;
+                if (pagevec_add(pvec, page) == 0)
+                        break;
+        }
+        pte_unmap_unlock(pte, ptl);
+        return start;
 }
 
 /*
@@ -381,17 +432,16 @@ skip_munlock:
 void munlock_vma_pages_range(struct vm_area_struct *vma,
                              unsigned long start, unsigned long end)
 {
-        struct pagevec pvec;
-        struct zone *zone = NULL;
-
-        pagevec_init(&pvec, 0);
         vma->vm_flags &= ~VM_LOCKED;
 
         while (start < end) {
-                struct page *page;
+                struct page *page = NULL;
                 unsigned int page_mask, page_increm;
-                struct zone *pagezone;
+                struct pagevec pvec;
+                struct zone *zone;
+                int zoneid;
 
+                pagevec_init(&pvec, 0);
                 /*
                  * Although FOLL_DUMP is intended for get_dump_page(),
                  * it just so happens that its special treatment of the
@@ -400,22 +450,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
                  * has sneaked into the range, we won't oops here: great).
                  */
                 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
                                 &page_mask);
+
                 if (page && !IS_ERR(page)) {
-                        pagezone = page_zone(page);
-                        /* The whole pagevec must be in the same zone */
-                        if (pagezone != zone) {
-                                if (pagevec_count(&pvec))
-                                        __munlock_pagevec(&pvec, zone);
-                                zone = pagezone;
-                        }
                         if (PageTransHuge(page)) {
-                                /*
-                                 * THP pages are not handled by pagevec due
-                                 * to their possible split (see below).
-                                 */
-                                if (pagevec_count(&pvec))
-                                        __munlock_pagevec(&pvec, zone);
                                 lock_page(page);
                                 /*
                                  * Any THP page found by follow_page_mask() may
@@ -428,21 +466,31 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
                                 put_page(page); /* follow_page_mask() */
                         } else {
                                 /*
-                                 * Non-huge pages are handled in batches
-                                 * via pagevec. The pin from
-                                 * follow_page_mask() prevents them from
-                                 * collapsing by THP.
+                                 * Non-huge pages are handled in batches via
+                                 * pagevec. The pin from follow_page_mask()
+                                 * prevents them from collapsing by THP.
                                  */
-                                if (pagevec_add(&pvec, page) == 0)
-                                        __munlock_pagevec(&pvec, zone);
+                                pagevec_add(&pvec, page);
+                                zone = page_zone(page);
+                                zoneid = page_zone_id(page);
+
+                                /*
+                                 * Try to fill the rest of pagevec using fast
+                                 * pte walk. This will also update start to
+                                 * the next page to process. Then munlock the
+                                 * pagevec.
+                                 */
+                                start = __munlock_pagevec_fill(&pvec, vma,
+                                                zoneid, start, end);
+                                __munlock_pagevec(&pvec, zone);
+                                goto next;
                         }
                 }
                 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
                 start += page_increm * PAGE_SIZE;
+next:
                 cond_resched();
         }
-        if (pagevec_count(&pvec))
-                __munlock_pagevec(&pvec, zone);
 }
 
 /*
```