author     Huang Ying <ying.huang@intel.com>              2017-09-06 19:24:36 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org> 2017-09-06 20:27:29 -0400
commit     ec560175c0b6fce86994bdf036754d48122c5c87 (patch)
tree       7aacd0beae098c785452a8a8361e13e7ffe2bc73 /mm/swap_state.c
parent     c4fa63092f216737b60c789968371d9960a598e5 (diff)
mm, swap: VMA based swap readahead
The swap readahead is an important mechanism to reduce swap-in latency.
Although a purely sequential memory access pattern isn't very popular for
anonymous memory, space locality is still considered valid.

In the original swap readahead implementation, consecutive blocks in the
swap device are read ahead based on a global estimation of space locality.
But consecutive blocks in the swap device merely reflect the order of page
reclaim; they don't necessarily reflect the access pattern in virtual
memory.  And different tasks in the system may have different access
patterns, which makes the global space locality estimation incorrect.

In this patch, when a page fault occurs, the virtual pages near the fault
address are read ahead instead of the swap slots near the faulting swap
slot in the swap device.  This avoids reading ahead unrelated swap slots.
At the same time, swap readahead is changed to work per-VMA instead of
globally, so that the different access patterns of different VMAs can be
distinguished and different readahead policies applied accordingly.  The
original core readahead detection and scaling algorithm is reused, because
it is an effective algorithm for detecting space locality.

The tests and results are as follows.

Common test condition
=====================

Test machine: Xeon E5 v3 (2 sockets, 72 threads, 32G RAM)
Swap device: NVMe disk

Micro-benchmark with combined access pattern
============================================

vm-scalability, sequential swap test case, 4 processes to eat 50G of
virtual memory space, repeating the sequential memory writes for 300
seconds.  The first round of writing triggers swap-out; the following
rounds trigger sequential swap-in and swap-out.

At the same time, run the vm-scalability random swap test case in the
background: 8 processes to eat 30G of virtual memory space, repeating the
random memory writes for 300 seconds.  This triggers random swap-in in the
background.

This is a combined workload with sequential and random memory accesses at
the same time.  The result (for the sequential workload) is as follows:

                        Base            Optimized
                        ----            ---------
throughput              345413 KB/s     414029 KB/s (+19.9%)
latency.average         97.14 us        61.06 us (-37.1%)
latency.50th            2 us            1 us
latency.60th            2 us            1 us
latency.70th            98 us           2 us
latency.80th            160 us          2 us
latency.90th            260 us          217 us
latency.95th            346 us          369 us
latency.99th            1.34 ms         1.09 ms
ra_hit%                 52.69%          99.98%

The original swap readahead algorithm is confused by the background random
access workload, so its readahead hit rate is lower.  The VMA-based
readahead algorithm works much better.

Linpack
=======

The test memory size is bigger than the RAM to trigger swapping.

                        Base            Optimized
                        ----            ---------
elapsed_time            393.49 s        329.88 s (-16.2%)
ra_hit%                 86.21%          98.82%

The Linpack score shows no visible change between the base and optimized
kernels, but the elapsed time is reduced and the readahead hit rate is
improved, so the optimized kernel does better in the startup and teardown
stages.  The high absolute readahead hit rate also shows that space
locality is still valid in some practical workloads.
Link: http://lkml.kernel.org/r/20170807054038.1843-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
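
As an aside (not part of the patch): the difference between the two schemes
can be shown with a small userspace sketch.  All values below (addresses,
swap slot, window size) are made up, and the clamping only mirrors, in
simplified form, what swap_ra_clamp_pfn() does later in this diff.

#include <stdio.h>

#define PAGE_SHIFT 12UL                 /* assumption: 4K pages */

int main(void)
{
        /* Hypothetical fault: the virtual address and the swap slot it maps to. */
        unsigned long fault_addr = 0x7f3a00042000UL;
        unsigned long swap_slot  = 91345;       /* offset in the swap device */
        unsigned long vm_start   = 0x7f3a00000000UL;
        unsigned long vm_end     = 0x7f3a00100000UL;
        unsigned long win = 8;                  /* readahead window in pages */

        unsigned long fpfn = fault_addr >> PAGE_SHIFT;
        unsigned long left = (win - 1) / 2;

        /* Old scheme: consecutive slots in the swap device around the faulting slot. */
        printf("slot-based: read swap offsets [%lu, %lu)\n",
               swap_slot, swap_slot + win);

        /* New scheme: virtual pages around the fault address, clamped to the VMA. */
        unsigned long start = fpfn - left;
        unsigned long end   = fpfn + win - left;
        if (start < (vm_start >> PAGE_SHIFT))
                start = vm_start >> PAGE_SHIFT;
        if (end > (vm_end >> PAGE_SHIFT))
                end = vm_end >> PAGE_SHIFT;
        printf("VMA-based:  read virtual pfns  [%#lx, %#lx)\n", start, end);
        return 0;
}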
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r--  mm/swap_state.c  |  215
1 file changed, 198 insertions(+), 17 deletions(-)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index a901afe9da61..3885fef7bdf5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = {
 
 struct address_space *swapper_spaces[MAX_SWAPFILES];
 static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
+bool swap_vma_readahead = true;
+
+#define SWAP_RA_MAX_ORDER_DEFAULT       3
+
+static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;
+
+#define SWAP_RA_WIN_SHIFT       (PAGE_SHIFT / 2)
+#define SWAP_RA_HITS_MASK       ((1UL << SWAP_RA_WIN_SHIFT) - 1)
+#define SWAP_RA_HITS_MAX        SWAP_RA_HITS_MASK
+#define SWAP_RA_WIN_MASK        (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
+
+#define SWAP_RA_HITS(v)         ((v) & SWAP_RA_HITS_MASK)
+#define SWAP_RA_WIN(v)          (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
+#define SWAP_RA_ADDR(v)         ((v) & PAGE_MASK)
+
+#define SWAP_RA_VAL(addr, win, hits)                            \
+        (((addr) & PAGE_MASK) |                                 \
+         (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |    \
+         ((hits) & SWAP_RA_HITS_MASK))
+
+/* Initial readahead hits is 4 to start up with a small window */
+#define GET_SWAP_RA_VAL(vma)                                    \
+        (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
 
 #define INC_CACHE_INFO(x)       do { swap_cache_info.x++; } while (0)
 #define ADD_CACHE_INFO(x, nr)   do { swap_cache_info.x += (nr); } while (0)
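
A note on the macros just added (this sketch is not part of the patch): the
per-VMA readahead state, i.e. the last faulting address, the readahead window
and the hit count, is packed into a single long so it can be read and updated
with one atomic_long_read()/atomic_long_set().  A minimal userspace
re-creation of the packing, assuming 4K pages (PAGE_SHIFT = 12, which leaves
6 bits each for the window and hit fields):

#include <stdio.h>

/* Assumptions for the sketch: 4K pages, so the low 12 bits of an address are free. */
#define PAGE_SHIFT              12UL
#define PAGE_MASK               (~((1UL << PAGE_SHIFT) - 1))

#define SWAP_RA_WIN_SHIFT       (PAGE_SHIFT / 2)        /* 6 */
#define SWAP_RA_HITS_MASK       ((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_WIN_MASK        (~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)         ((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)          (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)         ((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)                            \
        (((addr) & PAGE_MASK) |                                 \
         (((unsigned long)(win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \
         ((hits) & SWAP_RA_HITS_MASK))

int main(void)
{
        /* Pack a (made-up) fault address, a window of 8 pages and 3 hits ... */
        unsigned long v = SWAP_RA_VAL(0x7f3a00042000UL, 8, 3);

        /* ... and unpack the three fields again. */
        printf("addr=%#lx win=%lu hits=%lu\n",
               SWAP_RA_ADDR(v), SWAP_RA_WIN(v), SWAP_RA_HITS(v));
        return 0;
}

With 4K pages both fields are limited to 63, which is also why
SWAP_RA_HITS_MAX is defined as SWAP_RA_HITS_MASK above.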
@@ -297,21 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
  * lock getting page table operations atomic even if we drop the page
  * lock before returning.
  */
-struct page * lookup_swap_cache(swp_entry_t entry)
+struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
+                               unsigned long addr)
 {
         struct page *page;
+        unsigned long ra_info;
+        int win, hits, readahead;
 
         page = find_get_page(swap_address_space(entry), swp_offset(entry));
 
-        if (page && likely(!PageTransCompound(page))) {
+        INC_CACHE_INFO(find_total);
+        if (page) {
                 INC_CACHE_INFO(find_success);
-                if (TestClearPageReadahead(page)) {
-                        atomic_inc(&swapin_readahead_hits);
+                if (unlikely(PageTransCompound(page)))
+                        return page;
+                readahead = TestClearPageReadahead(page);
+                if (vma) {
+                        ra_info = GET_SWAP_RA_VAL(vma);
+                        win = SWAP_RA_WIN(ra_info);
+                        hits = SWAP_RA_HITS(ra_info);
+                        if (readahead)
+                                hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+                        atomic_long_set(&vma->swap_readahead_info,
+                                        SWAP_RA_VAL(addr, win, hits));
+                }
+                if (readahead) {
                         count_vm_event(SWAP_RA_HIT);
+                        if (!vma)
+                                atomic_inc(&swapin_readahead_hits);
                 }
         }
-
-        INC_CACHE_INFO(find_total);
         return page;
 }
 
@@ -426,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
         return retpage;
 }
 
-static unsigned long swapin_nr_pages(unsigned long offset)
+static unsigned int __swapin_nr_pages(unsigned long prev_offset,
+                                      unsigned long offset,
+                                      int hits,
+                                      int max_pages,
+                                      int prev_win)
 {
-        static unsigned long prev_offset;
-        unsigned int pages, max_pages, last_ra;
-        static atomic_t last_readahead_pages;
-
-        max_pages = 1 << READ_ONCE(page_cluster);
-        if (max_pages <= 1)
-                return 1;
+        unsigned int pages, last_ra;
 
         /*
          * This heuristic has been found to work well on both sequential and
          * random loads, swapping to hard disk or to SSD: please don't ask
          * what the "+ 2" means, it just happens to work well, that's all.
          */
-        pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+        pages = hits + 2;
         if (pages == 2) {
                 /*
                  * We can have no readahead hits to judge by: but must not get
@@ -450,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset)
                  */
                 if (offset != prev_offset + 1 && offset != prev_offset - 1)
                         pages = 1;
-                prev_offset = offset;
         } else {
                 unsigned int roundup = 4;
                 while (roundup < pages)
@@ -462,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset)
                 pages = max_pages;
 
         /* Don't shrink readahead too fast */
-        last_ra = atomic_read(&last_readahead_pages) / 2;
+        last_ra = prev_win / 2;
         if (pages < last_ra)
                 pages = last_ra;
+
+        return pages;
+}
+
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+        static unsigned long prev_offset;
+        unsigned int hits, pages, max_pages;
+        static atomic_t last_readahead_pages;
+
+        max_pages = 1 << READ_ONCE(page_cluster);
+        if (max_pages <= 1)
+                return 1;
+
+        hits = atomic_xchg(&swapin_readahead_hits, 0);
+        pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
+                                  atomic_read(&last_readahead_pages));
+        if (!hits)
+                prev_offset = offset;
         atomic_set(&last_readahead_pages, pages);
 
         return pages;
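
As an aside (not part of the patch), the scaling heuristic that was just
factored out into __swapin_nr_pages() can be exercised on its own.  The
sketch below copies that logic into a plain userspace function and feeds it
a made-up sequence of hit counts (max_pages = 32 is arbitrary) to show the
window growing on hits and shrinking slowly afterwards:

#include <stdio.h>

/* Same logic as __swapin_nr_pages() above, minus the kernel types. */
static unsigned int swapin_window(unsigned long prev_offset,
                                  unsigned long offset,
                                  int hits, int max_pages, int prev_win)
{
        unsigned int pages, last_ra;

        pages = hits + 2;
        if (pages == 2) {
                /* No hits: stay at one page unless the access is sequential. */
                if (offset != prev_offset + 1 && offset != prev_offset - 1)
                        pages = 1;
        } else {
                /* Round the window up to a power of two. */
                unsigned int roundup = 4;

                while (roundup < pages)
                        roundup <<= 1;
                pages = roundup;
        }
        if (pages > (unsigned int)max_pages)
                pages = max_pages;

        /* Don't shrink readahead too fast. */
        last_ra = prev_win / 2;
        if (pages < last_ra)
                pages = last_ra;
        return pages;
}

int main(void)
{
        int hits[] = { 0, 3, 7, 2, 0, 0 };      /* made-up hit counts */
        unsigned int win = 1;
        unsigned long offset = 100;
        int i;

        /* Simulate sequential faults so the no-hit case keeps a 2-page window. */
        for (i = 0; i < (int)(sizeof(hits) / sizeof(hits[0])); i++) {
                win = swapin_window(offset, offset + 1, hits[i], 32, win);
                offset++;
                printf("hits=%d -> window=%u pages\n", hits[i], win);
        }
        return 0;
}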
@@ -570,3 +624,130 @@ void exit_swap_address_space(unsigned int type)
         synchronize_rcu();
         kvfree(spaces);
 }
+
+static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
+                                     unsigned long faddr,
+                                     unsigned long lpfn,
+                                     unsigned long rpfn,
+                                     unsigned long *start,
+                                     unsigned long *end)
+{
+        *start = max3(lpfn, PFN_DOWN(vma->vm_start),
+                      PFN_DOWN(faddr & PMD_MASK));
+        *end = min3(rpfn, PFN_DOWN(vma->vm_end),
+                    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+}
+
+struct page *swap_readahead_detect(struct vm_fault *vmf,
+                                   struct vma_swap_readahead *swap_ra)
+{
+        struct vm_area_struct *vma = vmf->vma;
+        unsigned long swap_ra_info;
+        struct page *page;
+        swp_entry_t entry;
+        unsigned long faddr, pfn, fpfn;
+        unsigned long start, end;
+        pte_t *pte;
+        unsigned int max_win, hits, prev_win, win, left;
+#ifndef CONFIG_64BIT
+        pte_t *tpte;
+#endif
+
+        faddr = vmf->address;
+        entry = pte_to_swp_entry(vmf->orig_pte);
+        if ((unlikely(non_swap_entry(entry))))
+                return NULL;
+        page = lookup_swap_cache(entry, vma, faddr);
+        if (page)
+                return page;
+
+        max_win = 1 << READ_ONCE(swap_ra_max_order);
+        if (max_win == 1) {
+                swap_ra->win = 1;
+                return NULL;
+        }
+
+        fpfn = PFN_DOWN(faddr);
+        swap_ra_info = GET_SWAP_RA_VAL(vma);
+        pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
+        prev_win = SWAP_RA_WIN(swap_ra_info);
+        hits = SWAP_RA_HITS(swap_ra_info);
+        swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
+                                               max_win, prev_win);
+        atomic_long_set(&vma->swap_readahead_info,
+                        SWAP_RA_VAL(faddr, win, 0));
+
+        if (win == 1)
+                return NULL;
+
+        /* Copy the PTEs because the page table may be unmapped */
+        if (fpfn == pfn + 1)
+                swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
+        else if (pfn == fpfn + 1)
+                swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
+                                  &start, &end);
+        else {
+                left = (win - 1) / 2;
+                swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
+                                  &start, &end);
+        }
+        swap_ra->nr_pte = end - start;
+        swap_ra->offset = fpfn - start;
+        pte = vmf->pte - swap_ra->offset;
+#ifdef CONFIG_64BIT
+        swap_ra->ptes = pte;
+#else
+        tpte = swap_ra->ptes;
+        for (pfn = start; pfn != end; pfn++)
+                *tpte++ = *pte++;
+#endif
+
+        return NULL;
+}
+
+struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+                                    struct vm_fault *vmf,
+                                    struct vma_swap_readahead *swap_ra)
+{
+        struct blk_plug plug;
+        struct vm_area_struct *vma = vmf->vma;
+        struct page *page;
+        pte_t *pte, pentry;
+        swp_entry_t entry;
+        unsigned int i;
+        bool page_allocated;
+
+        if (swap_ra->win == 1)
+                goto skip;
+
+        blk_start_plug(&plug);
+        for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
+             i++, pte++) {
+                pentry = *pte;
+                if (pte_none(pentry))
+                        continue;
+                if (pte_present(pentry))
+                        continue;
+                entry = pte_to_swp_entry(pentry);
+                if (unlikely(non_swap_entry(entry)))
+                        continue;
+                page = __read_swap_cache_async(entry, gfp_mask, vma,
+                                               vmf->address, &page_allocated);
+                if (!page)
+                        continue;
+                if (page_allocated) {
+                        swap_readpage(page, false);
+                        if (i != swap_ra->offset &&
+                            likely(!PageTransCompound(page))) {
+                                SetPageReadahead(page);
+                                count_vm_event(SWAP_RA);
+                        }
+                }
+                put_page(page);
+        }
+        blk_finish_plug(&plug);
+        lru_add_drain();
+skip:
+        return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
+                                     swap_ra->win == 1);
+}