aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEbru Akagunduz <ebru.akagunduz@gmail.com>2016-07-26 18:25:03 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-07-26 19:19:19 -0400
commit8a966ed746d63c8103d496da85973eeeec01d77f (patch)
treea96bcdb72409c34910440549c2e4fa248a468c4e
parent70652f6ec0566ae6b4147d88c6d043c68484227f (diff)
mm: make swapin readahead to improve thp collapse rate
This patch makes swapin readahead to improve thp collapse rate. When khugepaged scanned pages, there can be a few of the pages in swap area. With the patch THP can collapse 4kB pages into a THP when there are up to max_ptes_swap swap ptes in a 2MB range. The patch was tested with a test program that allocates 400MB of memory, writes to it, and then sleeps. I force the system to swap out all. Afterwards, the test program touches the area by writing, it skips a page in each 20 pages of the area. Without the patch, system did not swap in readahead. THP rate was %65 of the program of the memory, it did not change over time. With this patch, after 10 minutes of waiting khugepaged had collapsed %99 of the program's memory. [kirill.shutemov@linux.intel.com: trivial cleanup of exit path of the function] [kirill.shutemov@linux.intel.com: __collapse_huge_page_swapin(): drop unused 'pte' parameter] [kirill.shutemov@linux.intel.com: do not hold anon_vma lock during swap in] Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com> Acked-by: Rik van Riel <riel@redhat.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Xie XiuQi <xiexiuqi@huawei.com> Cc: Cyrill Gorcunov <gorcunov@openvz.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: David Rientjes <rientjes@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/trace/events/huge_memory.h24
-rw-r--r--mm/huge_memory.c43
-rw-r--r--mm/internal.h4
-rw-r--r--mm/memory.c2
4 files changed, 69 insertions, 4 deletions
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index fad6539c9d68..bda21183eb05 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -135,5 +135,29 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
135 __print_symbolic(__entry->status, SCAN_STATUS)) 135 __print_symbolic(__entry->status, SCAN_STATUS))
136); 136);
137 137
/*
 * Tracepoint mm_collapse_huge_page_swapin.
 *
 * Records three values and prints them as "mm=%p, swapped_in=%d, ret=%d":
 *   mm         - the mm_struct pointer handed to the tracepoint
 *   swapped_in - integer counter supplied by the caller; in
 *                __collapse_huge_page_swapin() it is bumped once per swap
 *                pte before do_swap_page() is called on it
 *   ret        - integer status supplied by the caller; that same caller
 *                passes 0 when do_swap_page() reported VM_FAULT_ERROR and
 *                1 after the whole range has been walked
 */
138TRACE_EVENT(mm_collapse_huge_page_swapin,
139
140 TP_PROTO(struct mm_struct *mm, int swapped_in, int ret),
141
142 TP_ARGS(mm, swapped_in, ret),
143
144 TP_STRUCT__entry(
145 __field(struct mm_struct *, mm)
146 __field(int, swapped_in)
147 __field(int, ret)
148 ),
149
150 TP_fast_assign(
151 __entry->mm = mm;
152 __entry->swapped_in = swapped_in;
153 __entry->ret = ret;
154 ),
155
156 TP_printk("mm=%p, swapped_in=%d, ret=%d",
157 __entry->mm,
158 __entry->swapped_in,
159 __entry->ret)
160);
161
138#endif /* __HUGE_MEMORY_H */ 162#endif /* __HUGE_MEMORY_H */
139#include <trace/define_trace.h> 163#include <trace/define_trace.h>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ed474483a620..b11351579e7a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2373,6 +2373,44 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
2373 return !(vma->vm_flags & VM_NO_THP); 2373 return !(vma->vm_flags & VM_NO_THP);
2374} 2374}
2375 2375
2376/*
2377 * Bring missing pages in from swap, to complete THP collapse.
2378 * Only done if khugepaged_scan_pmd believes it is worthwhile.
2379 *
2380 * Called and returns without pte mapped or spinlocks held,
2381 * but with mmap_sem held to protect against vma changes.
 *
 * @mm:      address space being collapsed; also reported in the tracepoint
 * @vma:     vma passed through to do_swap_page()
 * @address: start of the range; HPAGE_PMD_NR ptes of PAGE_SIZE each are
 *           walked from here
 * @pmd:     pmd whose pte table is mapped for the walk
 *
 * Returns nothing. The outcome is only visible through the
 * mm_collapse_huge_page_swapin tracepoint: (swapped_in, 0) when
 * do_swap_page() reports VM_FAULT_ERROR, (swapped_in, 1) once the whole
 * range has been walked.
2382 */
2383
2384static void __collapse_huge_page_swapin(struct mm_struct *mm,
2385 struct vm_area_struct *vma,
2386 unsigned long address, pmd_t *pmd)
2387{
2388 unsigned long _address;
2389 pte_t *pte, pteval;
2390 int swapped_in = 0, ret = 0;
2391
2392 pte = pte_offset_map(pmd, address);
2393 for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
2394 pte++, _address += PAGE_SIZE) {
2395 pteval = *pte;
 /* Only swap entries need work; every other pte is left alone. */
2396 if (!is_swap_pte(pteval))
2397 continue;
 /* Counted before the fault is issued, so this counts attempts. */
2398 swapped_in++;
 /*
  * NOTE(review): ALLOW_RETRY|RETRY_NOWAIT presumably keeps
  * do_swap_page() from sleeping with mmap_sem dropped - confirm
  * against mm/memory.c.
  */
2399 ret = do_swap_page(mm, vma, _address, pte, pmd,
2400 FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
2401 pteval);
 /*
  * Only VM_FAULT_ERROR aborts the walk; any other result lets the
  * loop continue with the next pte. No pte_unmap() on this path:
  * per the comment below, the pte is no longer mapped once
  * do_swap_page() has returned (presumably on failure as well -
  * verify against do_swap_page()).
  */
2402 if (ret & VM_FAULT_ERROR) {
2403 trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
2404 return;
2405 }
2406 /* pte is unmapped now, we need to map it */
2407 pte = pte_offset_map(pmd, _address);
2408 }
 /*
  * The loop increment leaves pte one entry past the last one visited;
  * step back to that entry before handing the pointer to pte_unmap().
  */
2409 pte--;
2410 pte_unmap(pte);
2411 trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
2412}
2413
2376static void collapse_huge_page(struct mm_struct *mm, 2414static void collapse_huge_page(struct mm_struct *mm,
2377 unsigned long address, 2415 unsigned long address,
2378 struct page **hpage, 2416 struct page **hpage,
@@ -2440,6 +2478,8 @@ static void collapse_huge_page(struct mm_struct *mm,
2440 goto out; 2478 goto out;
2441 } 2479 }
2442 2480
2481 __collapse_huge_page_swapin(mm, vma, address, pmd);
2482
2443 anon_vma_lock_write(vma->anon_vma); 2483 anon_vma_lock_write(vma->anon_vma);
2444 2484
2445 pte = pte_offset_map(pmd, address); 2485 pte = pte_offset_map(pmd, address);
@@ -2516,9 +2556,6 @@ static void collapse_huge_page(struct mm_struct *mm,
2516 result = SCAN_SUCCEED; 2556 result = SCAN_SUCCEED;
2517out_up_write: 2557out_up_write:
2518 up_write(&mm->mmap_sem); 2558 up_write(&mm->mmap_sem);
2519 trace_mm_collapse_huge_page(mm, isolated, result);
2520 return;
2521
2522out_nolock: 2559out_nolock:
2523 trace_mm_collapse_huge_page(mm, isolated, result); 2560 trace_mm_collapse_huge_page(mm, isolated, result);
2524 return; 2561 return;
diff --git a/mm/internal.h b/mm/internal.h
index fbfba0cc2c35..e1531758122b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,6 +36,10 @@
36/* Do not use these with a slab allocator */ 36/* Do not use these with a slab allocator */
37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) 37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
38 38
/*
 * do_swap_page() is defined in mm/memory.c. It is declared here so that
 * other mm code can call it; __collapse_huge_page_swapin() in
 * mm/huge_memory.c uses it to fault in swap ptes before a THP collapse.
 */
39extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
40 unsigned long address, pte_t *page_table, pmd_t *pmd,
41 unsigned int flags, pte_t orig_pte);
42
39void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 43void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
40 unsigned long floor, unsigned long ceiling); 44 unsigned long floor, unsigned long ceiling);
41 45
diff --git a/mm/memory.c b/mm/memory.c
index a329149e1c54..5e6eadd127e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2522,7 +2522,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
2522 * We return with the mmap_sem locked or unlocked in the same cases 2522 * We return with the mmap_sem locked or unlocked in the same cases
2523 * as does filemap_fault(). 2523 * as does filemap_fault().
2524 */ 2524 */
2525static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2525int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2526 unsigned long address, pte_t *page_table, pmd_t *pmd, 2526 unsigned long address, pte_t *page_table, pmd_t *pmd,
2527 unsigned int flags, pte_t orig_pte) 2527 unsigned int flags, pte_t orig_pte)
2528{ 2528{