author		Minchan Kim <minchan@kernel.org>	2019-09-25 19:49:15 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-25 20:51:41 -0400
commit		1a4e58cce84ee88129d5d49c064bd2852b481357 (patch)
tree		dbada87e6437233779c7d7b0c4434ddbb04a8830
parent		8940b34a4e082ae11498ddae8432f2ac07685d1c (diff)
mm: introduce MADV_PAGEOUT
When a process expects no accesses to a certain memory range for a long
time, it can hint the kernel that the pages can be reclaimed instantly
while their data is preserved for future use.  This reduces workingset
eviction and so ends up improving performance.

This patch introduces the new MADV_PAGEOUT hint to the madvise(2)
syscall.  MADV_PAGEOUT can be used by a process to mark a memory range
as not expected to be used for a long time, so that the kernel reclaims
*any LRU* pages in it instantly.  The hint helps the kernel decide which
pages to evict proactively.

A note: this intentionally does not apply the SWAP_CLUSTER_MAX LRU page
isolation limit because the work is automatically bounded by the PMD
size.  If the PMD size (e.g., 256 pages) turns out to cause trouble, we
could later limit it to SWAP_CLUSTER_MAX[1].

- man-page material

MADV_PAGEOUT (since Linux x.x)

Do not expect access in the near future, so pages in the specified
regions can be reclaimed instantly regardless of memory pressure.  Thus,
an access within the range after a successful operation may cause a
major page fault but, unlike MADV_DONTNEED, never loses the up-to-date
contents.  Pages belonging to a shared mapping are only processed if a
write access is allowed for the calling process.

MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or
VM_PFNMAP pages.

[1] https://lore.kernel.org/lkml/20190710194719.GS29695@dhcp22.suse.cz/

[minchan@kernel.org: clear PG_active on MADV_PAGEOUT]
  Link: http://lkml.kernel.org/r/20190802200643.GA181880@google.com
[akpm@linux-foundation.org: resolve conflicts with hmm.git]
Link: http://lkml.kernel.org/r/20190726023435.214162-5-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: kbuild test robot <lkp@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: Daniel Colascione <dancol@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Oleksandr Natalenko <oleksandr@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Sonny Rao <sonnyrao@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tim Murray <timmurray@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
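As an illustration of the man-page material above (not part of the patch), here is a
minimal userspace sketch of how a process might apply the hint.  The region size is an
arbitrary choice, and the fallback #define is only needed where the uapi headers predate
this patch; the value 21 matches the definition added below.

	/* sketch: page out an anonymous region we will not touch for a while */
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	#ifndef MADV_PAGEOUT
	#define MADV_PAGEOUT 21		/* added by this patch; absent from older headers */
	#endif

	int main(void)
	{
		const size_t len = 64 << 20;	/* 64MB, chosen arbitrarily */
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		memset(buf, 0xaa, len);		/* fault the pages in */

		/* hint: this range will not be accessed for a long time */
		if (madvise(buf, len, MADV_PAGEOUT))
			perror("madvise(MADV_PAGEOUT)");	/* e.g. EINVAL on older kernels */

		/* later accesses may take major faults, but the contents are intact */
		printf("first byte after pageout: 0x%02x\n", (unsigned char)buf[0]);

		munmap(buf, len);
		return 0;
	}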
-rw-r--r--  arch/alpha/include/uapi/asm/mman.h        1
-rw-r--r--  arch/mips/include/uapi/asm/mman.h         1
-rw-r--r--  arch/parisc/include/uapi/asm/mman.h       1
-rw-r--r--  arch/xtensa/include/uapi/asm/mman.h       1
-rw-r--r--  include/linux/swap.h                      1
-rw-r--r--  include/uapi/asm-generic/mman-common.h    1
-rw-r--r--  mm/madvise.c                            189
-rw-r--r--  mm/vmscan.c                              56
8 files changed, 251 insertions(+), 0 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index f3258fbf03d0..a18ec7f63888 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -69,6 +69,7 @@
 #define MADV_KEEPONFORK 19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 00ad09fc5eb1..57dc2ac4f8bd 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -96,6 +96,7 @@
 #define MADV_KEEPONFORK 19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index eb14e3a7b8f3..6fd8871e4081 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -49,6 +49,7 @@
 #define MADV_DOFORK	11	/* do inherit across fork */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 #define MADV_MERGEABLE   65	/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 66	/* KSM may not merge identical pages */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index f926b00ff11f..e5e643752947 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -104,6 +104,7 @@
 #define MADV_KEEPONFORK 19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ce997edb8bb..063c0c1e112b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -365,6 +365,7 @@ extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 23431faf0eb6..c160a5354eb6 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -68,6 +68,7 @@
 #define MADV_KEEPONFORK 19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0
diff --git a/mm/madvise.c b/mm/madvise.c
index e1aee62967c3..54c5639774b6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -44,6 +44,7 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -461,6 +462,191 @@ static long madvise_cold(struct vm_area_struct *vma,
 	return 0;
 }
 
+static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct mmu_gather *tlb = walk->private;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	LIST_HEAD(page_list);
+	struct page *page;
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+		unsigned long next = pmd_addr_end(addr, end);
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			if (page_mapcount(page) != 1)
+				goto huge_unlock;
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			tlb_remove_tlb_entry(tlb, pmd, addr);
+		}
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+
+		if (!isolate_lru_page(page))
+			list_add(&page->lru, &page_list);
+huge_unlock:
+		spin_unlock(ptl);
+		reclaim_pages(&page_list);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+regular_page:
+#endif
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/*
+		 * creating a THP page is expensive so split it only if we
+		 * are sure it's worth. Split it if we are only owner.
+		 */
+		if (PageTransCompound(page)) {
+			if (page_mapcount(page) != 1)
+				break;
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				break;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				break;
+			}
+			unlock_page(page);
+			put_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+
+		if (!isolate_lru_page(page))
+			list_add(&page->lru, &page_list);
+	}
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end)
+{
+	tlb_start_vma(tlb, vma);
+	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL);
+	tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		return true;
+	if (!vma->vm_file)
+		return false;
+	/*
+	 * paging out pagecache only for non-anonymous mappings that correspond
+	 * to the files the calling process could (if tried) open for writing;
+	 * otherwise we'd be including shared non-exclusive mappings, which
+	 * opens a side channel.
+	 */
+	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	if (!can_do_pageout(vma))
+		return 0;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
@@ -843,6 +1029,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COLD:
 		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -865,6 +1053,7 @@ madvise_behavior_valid(int behavior)
 	case MADV_DONTNEED:
 	case MADV_FREE:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d8bbaf068c35..e5d52d6a24af 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2145,6 +2145,62 @@ static void shrink_active_list(unsigned long nr_to_scan,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+	int nid = -1;
+	unsigned long nr_reclaimed = 0;
+	LIST_HEAD(node_page_list);
+	struct reclaim_stat dummy_stat;
+	struct page *page;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		if (nid == -1) {
+			nid = page_to_nid(page);
+			INIT_LIST_HEAD(&node_page_list);
+		}
+
+		if (nid == page_to_nid(page)) {
+			ClearPageActive(page);
+			list_move(&page->lru, &node_page_list);
+			continue;
+		}
+
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+
+		nid = -1;
+	}
+
+	if (!list_empty(&node_page_list)) {
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.