diff options
author | Ebru Akagunduz <ebru.akagunduz@gmail.com> | 2016-07-26 18:26:46 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-26 19:19:19 -0400 |
commit | 0db501f7a34c11d3b964205e5b6d00692a648035 (patch) | |
tree | 1e0abb724957a37157af899319770086e10e1dc5 | |
parent | 47f863ea220067f5c87096893876f44500fcc8c9 (diff) |
mm, thp: convert from optimistic swapin collapsing to conservative
To detect whether khugepaged swapin is worthwhile, this patch checks the
number of young pages. There should be at least half of HPAGE_PMD_NR
young pages to justify swapin.
Link: http://lkml.kernel.org/r/1468109451-1615-1-git-send-email-ebru.akagunduz@gmail.com
Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/trace/events/huge_memory.h | 19 | ||||
-rw-r--r-- | mm/khugepaged.c | 38 |
2 files changed, 34 insertions, 23 deletions
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 830d47d5ca41..04f58acda8e8 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h | |||
@@ -13,7 +13,7 @@ | |||
13 | EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ | 13 | EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ |
14 | EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ | 14 | EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ |
15 | EM( SCAN_PAGE_RO, "no_writable_page") \ | 15 | EM( SCAN_PAGE_RO, "no_writable_page") \ |
16 | EM( SCAN_NO_REFERENCED_PAGE, "no_referenced_page") \ | 16 | EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ |
17 | EM( SCAN_PAGE_NULL, "page_null") \ | 17 | EM( SCAN_PAGE_NULL, "page_null") \ |
18 | EM( SCAN_SCAN_ABORT, "scan_aborted") \ | 18 | EM( SCAN_SCAN_ABORT, "scan_aborted") \ |
19 | EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \ | 19 | EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \ |
@@ -47,7 +47,7 @@ SCAN_STATUS | |||
47 | TRACE_EVENT(mm_khugepaged_scan_pmd, | 47 | TRACE_EVENT(mm_khugepaged_scan_pmd, |
48 | 48 | ||
49 | TP_PROTO(struct mm_struct *mm, struct page *page, bool writable, | 49 | TP_PROTO(struct mm_struct *mm, struct page *page, bool writable, |
50 | bool referenced, int none_or_zero, int status, int unmapped), | 50 | int referenced, int none_or_zero, int status, int unmapped), |
51 | 51 | ||
52 | TP_ARGS(mm, page, writable, referenced, none_or_zero, status, unmapped), | 52 | TP_ARGS(mm, page, writable, referenced, none_or_zero, status, unmapped), |
53 | 53 | ||
@@ -55,7 +55,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, | |||
55 | __field(struct mm_struct *, mm) | 55 | __field(struct mm_struct *, mm) |
56 | __field(unsigned long, pfn) | 56 | __field(unsigned long, pfn) |
57 | __field(bool, writable) | 57 | __field(bool, writable) |
58 | __field(bool, referenced) | 58 | __field(int, referenced) |
59 | __field(int, none_or_zero) | 59 | __field(int, none_or_zero) |
60 | __field(int, status) | 60 | __field(int, status) |
61 | __field(int, unmapped) | 61 | __field(int, unmapped) |
@@ -108,14 +108,14 @@ TRACE_EVENT(mm_collapse_huge_page, | |||
108 | TRACE_EVENT(mm_collapse_huge_page_isolate, | 108 | TRACE_EVENT(mm_collapse_huge_page_isolate, |
109 | 109 | ||
110 | TP_PROTO(struct page *page, int none_or_zero, | 110 | TP_PROTO(struct page *page, int none_or_zero, |
111 | bool referenced, bool writable, int status), | 111 | int referenced, bool writable, int status), |
112 | 112 | ||
113 | TP_ARGS(page, none_or_zero, referenced, writable, status), | 113 | TP_ARGS(page, none_or_zero, referenced, writable, status), |
114 | 114 | ||
115 | TP_STRUCT__entry( | 115 | TP_STRUCT__entry( |
116 | __field(unsigned long, pfn) | 116 | __field(unsigned long, pfn) |
117 | __field(int, none_or_zero) | 117 | __field(int, none_or_zero) |
118 | __field(bool, referenced) | 118 | __field(int, referenced) |
119 | __field(bool, writable) | 119 | __field(bool, writable) |
120 | __field(int, status) | 120 | __field(int, status) |
121 | ), | 121 | ), |
@@ -138,25 +138,28 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, | |||
138 | 138 | ||
139 | TRACE_EVENT(mm_collapse_huge_page_swapin, | 139 | TRACE_EVENT(mm_collapse_huge_page_swapin, |
140 | 140 | ||
141 | TP_PROTO(struct mm_struct *mm, int swapped_in, int ret), | 141 | TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret), |
142 | 142 | ||
143 | TP_ARGS(mm, swapped_in, ret), | 143 | TP_ARGS(mm, swapped_in, referenced, ret), |
144 | 144 | ||
145 | TP_STRUCT__entry( | 145 | TP_STRUCT__entry( |
146 | __field(struct mm_struct *, mm) | 146 | __field(struct mm_struct *, mm) |
147 | __field(int, swapped_in) | 147 | __field(int, swapped_in) |
148 | __field(int, referenced) | ||
148 | __field(int, ret) | 149 | __field(int, ret) |
149 | ), | 150 | ), |
150 | 151 | ||
151 | TP_fast_assign( | 152 | TP_fast_assign( |
152 | __entry->mm = mm; | 153 | __entry->mm = mm; |
153 | __entry->swapped_in = swapped_in; | 154 | __entry->swapped_in = swapped_in; |
155 | __entry->referenced = referenced; | ||
154 | __entry->ret = ret; | 156 | __entry->ret = ret; |
155 | ), | 157 | ), |
156 | 158 | ||
157 | TP_printk("mm=%p, swapped_in=%d, ret=%d", | 159 | TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d", |
158 | __entry->mm, | 160 | __entry->mm, |
159 | __entry->swapped_in, | 161 | __entry->swapped_in, |
162 | __entry->referenced, | ||
160 | __entry->ret) | 163 | __entry->ret) |
161 | ); | 164 | ); |
162 | 165 | ||
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 566148489e33..7dbee698d6aa 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c | |||
@@ -27,7 +27,7 @@ enum scan_result { | |||
27 | SCAN_EXCEED_NONE_PTE, | 27 | SCAN_EXCEED_NONE_PTE, |
28 | SCAN_PTE_NON_PRESENT, | 28 | SCAN_PTE_NON_PRESENT, |
29 | SCAN_PAGE_RO, | 29 | SCAN_PAGE_RO, |
30 | SCAN_NO_REFERENCED_PAGE, | 30 | SCAN_LACK_REFERENCED_PAGE, |
31 | SCAN_PAGE_NULL, | 31 | SCAN_PAGE_NULL, |
32 | SCAN_SCAN_ABORT, | 32 | SCAN_SCAN_ABORT, |
33 | SCAN_PAGE_COUNT, | 33 | SCAN_PAGE_COUNT, |
@@ -500,8 +500,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
500 | { | 500 | { |
501 | struct page *page = NULL; | 501 | struct page *page = NULL; |
502 | pte_t *_pte; | 502 | pte_t *_pte; |
503 | int none_or_zero = 0, result = 0; | 503 | int none_or_zero = 0, result = 0, referenced = 0; |
504 | bool referenced = false, writable = false; | 504 | bool writable = false; |
505 | 505 | ||
506 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 506 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
507 | _pte++, address += PAGE_SIZE) { | 507 | _pte++, address += PAGE_SIZE) { |
@@ -580,11 +580,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
580 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 580 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
581 | VM_BUG_ON_PAGE(PageLRU(page), page); | 581 | VM_BUG_ON_PAGE(PageLRU(page), page); |
582 | 582 | ||
583 | /* If there is no mapped pte young don't collapse the page */ | 583 | /* There should be enough young pte to collapse the page */ |
584 | if (pte_young(pteval) || | 584 | if (pte_young(pteval) || |
585 | page_is_young(page) || PageReferenced(page) || | 585 | page_is_young(page) || PageReferenced(page) || |
586 | mmu_notifier_test_young(vma->vm_mm, address)) | 586 | mmu_notifier_test_young(vma->vm_mm, address)) |
587 | referenced = true; | 587 | referenced++; |
588 | } | 588 | } |
589 | if (likely(writable)) { | 589 | if (likely(writable)) { |
590 | if (likely(referenced)) { | 590 | if (likely(referenced)) { |
@@ -869,7 +869,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) | |||
869 | 869 | ||
870 | static bool __collapse_huge_page_swapin(struct mm_struct *mm, | 870 | static bool __collapse_huge_page_swapin(struct mm_struct *mm, |
871 | struct vm_area_struct *vma, | 871 | struct vm_area_struct *vma, |
872 | unsigned long address, pmd_t *pmd) | 872 | unsigned long address, pmd_t *pmd, |
873 | int referenced) | ||
873 | { | 874 | { |
874 | pte_t pteval; | 875 | pte_t pteval; |
875 | int swapped_in = 0, ret = 0; | 876 | int swapped_in = 0, ret = 0; |
@@ -887,12 +888,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
887 | if (!is_swap_pte(pteval)) | 888 | if (!is_swap_pte(pteval)) |
888 | continue; | 889 | continue; |
889 | swapped_in++; | 890 | swapped_in++; |
891 | /* we only decide to swapin, if there is enough young ptes */ | ||
892 | if (referenced < HPAGE_PMD_NR/2) { | ||
893 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); | ||
894 | return false; | ||
895 | } | ||
890 | ret = do_swap_page(&fe, pteval); | 896 | ret = do_swap_page(&fe, pteval); |
897 | |||
891 | /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ | 898 | /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ |
892 | if (ret & VM_FAULT_RETRY) { | 899 | if (ret & VM_FAULT_RETRY) { |
893 | down_read(&mm->mmap_sem); | 900 | down_read(&mm->mmap_sem); |
894 | if (hugepage_vma_revalidate(mm, address)) { | 901 | if (hugepage_vma_revalidate(mm, address)) { |
895 | /* vma is no longer available, don't continue to swapin */ | 902 | /* vma is no longer available, don't continue to swapin */ |
903 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); | ||
896 | return false; | 904 | return false; |
897 | } | 905 | } |
898 | /* check if the pmd is still valid */ | 906 | /* check if the pmd is still valid */ |
@@ -900,7 +908,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
900 | return false; | 908 | return false; |
901 | } | 909 | } |
902 | if (ret & VM_FAULT_ERROR) { | 910 | if (ret & VM_FAULT_ERROR) { |
903 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0); | 911 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); |
904 | return false; | 912 | return false; |
905 | } | 913 | } |
906 | /* pte is unmapped now, we need to map it */ | 914 | /* pte is unmapped now, we need to map it */ |
@@ -908,7 +916,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
908 | } | 916 | } |
909 | fe.pte--; | 917 | fe.pte--; |
910 | pte_unmap(fe.pte); | 918 | pte_unmap(fe.pte); |
911 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1); | 919 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); |
912 | return true; | 920 | return true; |
913 | } | 921 | } |
914 | 922 | ||
@@ -916,7 +924,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
916 | unsigned long address, | 924 | unsigned long address, |
917 | struct page **hpage, | 925 | struct page **hpage, |
918 | struct vm_area_struct *vma, | 926 | struct vm_area_struct *vma, |
919 | int node) | 927 | int node, int referenced) |
920 | { | 928 | { |
921 | pmd_t *pmd, _pmd; | 929 | pmd_t *pmd, _pmd; |
922 | pte_t *pte; | 930 | pte_t *pte; |
@@ -973,7 +981,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
973 | * If it fails, we release mmap_sem and jump out_nolock. | 981 | * If it fails, we release mmap_sem and jump out_nolock. |
974 | * Continuing to collapse causes inconsistency. | 982 | * Continuing to collapse causes inconsistency. |
975 | */ | 983 | */ |
976 | if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) { | 984 | if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { |
977 | mem_cgroup_cancel_charge(new_page, memcg, true); | 985 | mem_cgroup_cancel_charge(new_page, memcg, true); |
978 | up_read(&mm->mmap_sem); | 986 | up_read(&mm->mmap_sem); |
979 | goto out_nolock; | 987 | goto out_nolock; |
@@ -1084,12 +1092,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
1084 | { | 1092 | { |
1085 | pmd_t *pmd; | 1093 | pmd_t *pmd; |
1086 | pte_t *pte, *_pte; | 1094 | pte_t *pte, *_pte; |
1087 | int ret = 0, none_or_zero = 0, result = 0; | 1095 | int ret = 0, none_or_zero = 0, result = 0, referenced = 0; |
1088 | struct page *page = NULL; | 1096 | struct page *page = NULL; |
1089 | unsigned long _address; | 1097 | unsigned long _address; |
1090 | spinlock_t *ptl; | 1098 | spinlock_t *ptl; |
1091 | int node = NUMA_NO_NODE, unmapped = 0; | 1099 | int node = NUMA_NO_NODE, unmapped = 0; |
1092 | bool writable = false, referenced = false; | 1100 | bool writable = false; |
1093 | 1101 | ||
1094 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 1102 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
1095 | 1103 | ||
@@ -1177,14 +1185,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
1177 | if (pte_young(pteval) || | 1185 | if (pte_young(pteval) || |
1178 | page_is_young(page) || PageReferenced(page) || | 1186 | page_is_young(page) || PageReferenced(page) || |
1179 | mmu_notifier_test_young(vma->vm_mm, address)) | 1187 | mmu_notifier_test_young(vma->vm_mm, address)) |
1180 | referenced = true; | 1188 | referenced++; |
1181 | } | 1189 | } |
1182 | if (writable) { | 1190 | if (writable) { |
1183 | if (referenced) { | 1191 | if (referenced) { |
1184 | result = SCAN_SUCCEED; | 1192 | result = SCAN_SUCCEED; |
1185 | ret = 1; | 1193 | ret = 1; |
1186 | } else { | 1194 | } else { |
1187 | result = SCAN_NO_REFERENCED_PAGE; | 1195 | result = SCAN_LACK_REFERENCED_PAGE; |
1188 | } | 1196 | } |
1189 | } else { | 1197 | } else { |
1190 | result = SCAN_PAGE_RO; | 1198 | result = SCAN_PAGE_RO; |
@@ -1194,7 +1202,7 @@ out_unmap: | |||
1194 | if (ret) { | 1202 | if (ret) { |
1195 | node = khugepaged_find_target_node(); | 1203 | node = khugepaged_find_target_node(); |
1196 | /* collapse_huge_page will return with the mmap_sem released */ | 1204 | /* collapse_huge_page will return with the mmap_sem released */ |
1197 | collapse_huge_page(mm, address, hpage, vma, node); | 1205 | collapse_huge_page(mm, address, hpage, vma, node, referenced); |
1198 | } | 1206 | } |
1199 | out: | 1207 | out: |
1200 | trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, | 1208 | trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, |