diff options
author | Ebru Akagunduz <ebru.akagunduz@gmail.com> | 2016-07-26 18:25:03 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-26 19:19:19 -0400 |
commit | 8a966ed746d63c8103d496da85973eeeec01d77f (patch) | |
tree | a96bcdb72409c34910440549c2e4fa248a468c4e | |
parent | 70652f6ec0566ae6b4147d88c6d043c68484227f (diff) |
mm: make swapin readahead to improve thp collapse rate
This patch adds swapin readahead to khugepaged to improve the THP collapse rate. When khugepaged scans pages, a few of them can be in the swap area. With this patch, khugepaged can collapse 4kB pages into a THP even when up to max_ptes_swap of the ptes in a 2MB range are swap ptes.
The patch was tested with a test program that allocates 800MB of memory, writes to it, and then sleeps. I force the system to swap out all of it. Afterwards, the test program touches the area by writing to it, skipping one page in every 20 pages of the area.
Without the patch, the system did no swapin readahead: THPs covered 65% of the program's memory, and that did not change over time. With this patch, after 10 minutes of waiting khugepaged had collapsed 99% of the program's memory into THPs.
[kirill.shutemov@linux.intel.com: trivial cleanup of exit path of the function]
[kirill.shutemov@linux.intel.com: __collapse_huge_page_swapin(): drop unused 'pte' parameter]
[kirill.shutemov@linux.intel.com: do not hold anon_vma lock during swap in]
Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Xie XiuQi <xiexiuqi@huawei.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/trace/events/huge_memory.h | 24 | ||||
-rw-r--r-- | mm/huge_memory.c | 43 | ||||
-rw-r--r-- | mm/internal.h | 4 | ||||
-rw-r--r-- | mm/memory.c | 2 |
4 files changed, 69 insertions, 4 deletions
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index fad6539c9d68..bda21183eb05 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h | |||
@@ -135,5 +135,29 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, | |||
135 | __print_symbolic(__entry->status, SCAN_STATUS)) | 135 | __print_symbolic(__entry->status, SCAN_STATUS)) |
136 | ); | 136 | ); |
137 | 137 | ||
138 | TRACE_EVENT(mm_collapse_huge_page_swapin, | ||
139 | |||
140 | TP_PROTO(struct mm_struct *mm, int swapped_in, int ret), | ||
141 | |||
142 | TP_ARGS(mm, swapped_in, ret), | ||
143 | |||
144 | TP_STRUCT__entry( | ||
145 | __field(struct mm_struct *, mm) | ||
146 | __field(int, swapped_in) | ||
147 | __field(int, ret) | ||
148 | ), | ||
149 | |||
150 | TP_fast_assign( | ||
151 | __entry->mm = mm; | ||
152 | __entry->swapped_in = swapped_in; | ||
153 | __entry->ret = ret; | ||
154 | ), | ||
155 | |||
156 | TP_printk("mm=%p, swapped_in=%d, ret=%d", | ||
157 | __entry->mm, | ||
158 | __entry->swapped_in, | ||
159 | __entry->ret) | ||
160 | ); | ||
161 | |||
138 | #endif /* __HUGE_MEMORY_H */ | 162 | #endif /* __HUGE_MEMORY_H */ |
139 | #include <trace/define_trace.h> | 163 | #include <trace/define_trace.h> |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ed474483a620..b11351579e7a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -2373,6 +2373,44 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) | |||
2373 | return !(vma->vm_flags & VM_NO_THP); | 2373 | return !(vma->vm_flags & VM_NO_THP); |
2374 | } | 2374 | } |
2375 | 2375 | ||
2376 | /* | ||
2377 | * Bring missing pages in from swap, to complete THP collapse. | ||
2378 | * Only done if khugepaged_scan_pmd believes it is worthwhile. | ||
2379 | * | ||
2380 | * Called and returns without pte mapped or spinlocks held, | ||
2381 | * but with mmap_sem held to protect against vma changes. | ||
2382 | */ | ||
2383 | |||
2384 | static void __collapse_huge_page_swapin(struct mm_struct *mm, | ||
2385 | struct vm_area_struct *vma, | ||
2386 | unsigned long address, pmd_t *pmd) | ||
2387 | { | ||
2388 | unsigned long _address; | ||
2389 | pte_t *pte, pteval; | ||
2390 | int swapped_in = 0, ret = 0; | ||
2391 | |||
2392 | pte = pte_offset_map(pmd, address); | ||
2393 | for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE; | ||
2394 | pte++, _address += PAGE_SIZE) { | ||
2395 | pteval = *pte; | ||
2396 | if (!is_swap_pte(pteval)) | ||
2397 | continue; | ||
2398 | swapped_in++; | ||
2399 | ret = do_swap_page(mm, vma, _address, pte, pmd, | ||
2400 | FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT, | ||
2401 | pteval); | ||
2402 | if (ret & VM_FAULT_ERROR) { | ||
2403 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0); | ||
2404 | return; | ||
2405 | } | ||
2406 | /* pte is unmapped now, we need to map it */ | ||
2407 | pte = pte_offset_map(pmd, _address); | ||
2408 | } | ||
2409 | pte--; | ||
2410 | pte_unmap(pte); | ||
2411 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1); | ||
2412 | } | ||
2413 | |||
2376 | static void collapse_huge_page(struct mm_struct *mm, | 2414 | static void collapse_huge_page(struct mm_struct *mm, |
2377 | unsigned long address, | 2415 | unsigned long address, |
2378 | struct page **hpage, | 2416 | struct page **hpage, |
@@ -2440,6 +2478,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2440 | goto out; | 2478 | goto out; |
2441 | } | 2479 | } |
2442 | 2480 | ||
2481 | __collapse_huge_page_swapin(mm, vma, address, pmd); | ||
2482 | |||
2443 | anon_vma_lock_write(vma->anon_vma); | 2483 | anon_vma_lock_write(vma->anon_vma); |
2444 | 2484 | ||
2445 | pte = pte_offset_map(pmd, address); | 2485 | pte = pte_offset_map(pmd, address); |
@@ -2516,9 +2556,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2516 | result = SCAN_SUCCEED; | 2556 | result = SCAN_SUCCEED; |
2517 | out_up_write: | 2557 | out_up_write: |
2518 | up_write(&mm->mmap_sem); | 2558 | up_write(&mm->mmap_sem); |
2519 | trace_mm_collapse_huge_page(mm, isolated, result); | ||
2520 | return; | ||
2521 | |||
2522 | out_nolock: | 2559 | out_nolock: |
2523 | trace_mm_collapse_huge_page(mm, isolated, result); | 2560 | trace_mm_collapse_huge_page(mm, isolated, result); |
2524 | return; | 2561 | return; |
diff --git a/mm/internal.h b/mm/internal.h index fbfba0cc2c35..e1531758122b 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -36,6 +36,10 @@ | |||
36 | /* Do not use these with a slab allocator */ | 36 | /* Do not use these with a slab allocator */ |
37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) | 37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) |
38 | 38 | ||
39 | extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
40 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
41 | unsigned int flags, pte_t orig_pte); | ||
42 | |||
39 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 43 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
40 | unsigned long floor, unsigned long ceiling); | 44 | unsigned long floor, unsigned long ceiling); |
41 | 45 | ||
diff --git a/mm/memory.c b/mm/memory.c index a329149e1c54..5e6eadd127e7 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -2522,7 +2522,7 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
2522 | * We return with the mmap_sem locked or unlocked in the same cases | 2522 | * We return with the mmap_sem locked or unlocked in the same cases |
2523 | * as does filemap_fault(). | 2523 | * as does filemap_fault(). |
2524 | */ | 2524 | */ |
2525 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2525 | int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2526 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2526 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
2527 | unsigned int flags, pte_t orig_pte) | 2527 | unsigned int flags, pte_t orig_pte) |
2528 | { | 2528 | { |