author    Andrea Arcangeli <aarcange@redhat.com>    2016-05-12 18:42:25 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2016-05-12 18:52:50 -0400
commit    6d0a07edd17cfc12fdc1f36de8072fa17cc3666f
tree      a80f20857e658de5aaa8ffa769f32d2f3bf7a9a5
parent    7496fea9a6bf644afe360af795b121a77635b37d
mm: thp: calculate the mapcount correctly for THP pages during WP faults
This will provide full accuracy to the mapcount calculation in the write
protect faults, so page pinning will not get broken by false positive
copy-on-writes.

total_mapcount() isn't the right calculation needed in reuse_swap_page(),
so this introduces a page_trans_huge_mapcount() that is effectively the
fully accurate return value of page_mapcount() when dealing with
Transparent Hugepages. However, we only use page_trans_huge_mapcount()
during COW faults, where it is strictly needed, due to its higher runtime
cost.

This also provides, at practically zero cost, the total_mapcount
information, which is needed to know if we can still relocate the page
anon_vma to the local vma. If page_trans_huge_mapcount() returns 1 we can
reuse the page no matter if it's a pte or a pmd_trans_huge triggering the
fault, but we can only relocate the page anon_vma to the local
vma->anon_vma if we're sure it's only this "vma" mapping the whole THP
physical range.

Kirill A. Shutemov discovered the problem with moving the page anon_vma
to the local vma->anon_vma in a previous version of this patch, and
another problem in the way page_move_anon_rmap() was called.

Andrew Morton discovered that CONFIG_SWAP=n wouldn't build in a previous
version, because reuse_swap_page must be a macro in order to call
page_trans_huge_mapcount from swap.h, so this uses a macro again instead
of an inline function. With this change at least it's a less dangerous
usage than before, because "page" is now used only once, while with the
previous code reuse_swap_page(page++) would have called page_mapcount on
page+1 and would have increased page twice instead of just once.

Dean Luick noticed an uninitialized variable that could result in a rmap
inefficiency for the non-THP case in a previous version.

Mike Marciniszyn said:

: Our RDMA tests are seeing an issue with memory locking that bisects to
: commit 61f5d698cc97 ("mm: re-enable THP")
:
: The test program registers two rather large MRs (512M) and RDMA writes
: data to a passive peer using the first and RDMA reads it back into the
: second MR and compares that data. The sizes are chosen randomly between
: 0 and 1024 bytes.
:
: The test will get through a few (<= 4) iterations and then gets a
: compare error.
:
: Tracing indicates the kernel logical addresses associated with the
: individual pages at registration ARE correct, and the data in the
: "RDMA read response only" packets ARE correct.
:
: The "corruption" occurs when the packet crosses two pages that are not
: physically contiguous. The second page reads back as zero in the
: program.
:
: It looks like the user VA at the point of the compare error no longer
: points to the same physical address as was registered.
:
: This patch totally resolves the issue!

Link: http://lkml.kernel.org/r/1462547040-1737-2-git-send-email-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Reviewed-by: Dean Luick <dean.luick@intel.com>
Tested-by: Alex Williamson <alex.williamson@redhat.com>
Tested-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Tested-by: Josh Collier <josh.d.collier@intel.com>
Cc: Marc Haber <mh+linux-kernel@zugschlus.de>
Cc: <stable@vger.kernel.org>	[4.5]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
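The CONFIG_SWAP=n macro hazard called out above is easy to demonstrate in
isolation. Below is a minimal userspace sketch, not kernel code: the stub
functions merely stand in for PageTransCompound(), page_mapcount() and
page_trans_huge_mapcount(), but the two macro shapes match the old and
new swap.h definitions.

#include <stdio.h>

static int is_compound(const int *page) { (void)page; return 0; }
static int mapcount(const int *page) { return *page; }
static int trans_huge_mapcount(const int *page, int *total)
{
	int mc = mapcount(page);
	if (total)
		*total = mc;
	return mc;
}

/* Old CONFIG_SWAP=n shape: "page" expands twice. */
#define old_reuse_swap_page(page) \
	(!is_compound(page) && mapcount(page) == 1)

/* New CONFIG_SWAP=n shape: "page" expands exactly once. */
#define new_reuse_swap_page(page, total_mapcount) \
	(trans_huge_mapcount(page, total_mapcount) == 1)

int main(void)
{
	int pages[3] = { 1, 1, 1 };
	const int *p;

	p = pages;
	(void)old_reuse_swap_page(p++);		/* side effect runs twice */
	printf("old macro advanced p by %td pages\n", p - pages);

	p = pages;
	(void)new_reuse_swap_page(p++, NULL);	/* side effect runs once */
	printf("new macro advanced p by %td pages\n", p - pages);
	return 0;
}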
-rw-r--r--	include/linux/mm.h	 9
-rw-r--r--	include/linux/swap.h	 6
-rw-r--r--	mm/huge_memory.c	71
-rw-r--r--	mm/memory.c	22
-rw-r--r--	mm/swapfile.c	13
5 files changed, 95 insertions(+), 26 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 864d7221de84..8f468e0d2534 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -500,11 +500,20 @@ static inline int page_mapcount(struct page *page)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int total_mapcount(struct page *page);
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
 #else
 static inline int total_mapcount(struct page *page)
 {
 	return page_mapcount(page);
 }
+static inline int page_trans_huge_mapcount(struct page *page,
+					   int *total_mapcount)
+{
+	int mapcount = page_mapcount(page);
+	if (total_mapcount)
+		*total_mapcount = mapcount;
+	return mapcount;
+}
 #endif
 
 static inline struct page *virt_to_head_page(const void *x)
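The fallback above makes the new helper's contract visible: the return
value is the mapcount relevant to the reuse decision, and the optional
out-parameter reports the total. A minimal userspace sketch of the two
calling conventions used later in this patch (the struct and the values
are made up for illustration):

#include <stdio.h>

struct page { int mapcount; };	/* made-up stand-in, not the kernel type */

static int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
{
	int mapcount = page->mapcount;

	if (total_mapcount)
		*total_mapcount = mapcount;
	return mapcount;
}

int main(void)
{
	struct page page = { .mapcount = 1 };
	int total;

	/* Caller that only needs the reuse answer, like do_swap_page(). */
	if (page_trans_huge_mapcount(&page, NULL) == 1)
		printf("page can be reused\n");

	/* Caller that also needs the total, like do_wp_page(). */
	if (page_trans_huge_mapcount(&page, &total) == 1 && total == 1)
		printf("anon_vma can be made local to this vma\n");
	return 0;
}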
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0a4cd4703f40..ad220359f1b0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
-extern int reuse_swap_page(struct page *);
+extern bool reuse_swap_page(struct page *, int *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 
@@ -513,8 +513,8 @@ static inline int swp_swapcount(swp_entry_t entry)
 	return 0;
 }
 
-#define reuse_swap_page(page) \
-	(!PageTransCompound(page) && page_mapcount(page) == 1)
+#define reuse_swap_page(page, total_mapcount) \
+	(page_trans_huge_mapcount(page, total_mapcount) == 1)
 
 static inline int try_to_free_swap(struct page *page)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f7daa7de8f48..b49ee126d4d1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
 	/*
 	 * We can only reuse the page if nobody else maps the huge page or it's
-	 * part. We can do it by checking page_mapcount() on each sub-page, but
-	 * it's expensive.
-	 * The cheaper way is to check page_count() to be equal 1: every
-	 * mapcount takes page reference reference, so this way we can
-	 * guarantee, that the PMD is the only mapping.
-	 * This can give false negative if somebody pinned the page, but that's
-	 * fine.
+	 * part.
 	 */
-	if (page_mapcount(page) == 1 && page_count(page) == 1) {
+	if (page_trans_huge_mapcount(page, NULL) == 1) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -2079,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (pte_write(pteval)) {
 			writable = true;
 		} else {
-			if (PageSwapCache(page) && !reuse_swap_page(page)) {
+			if (PageSwapCache(page) &&
+			    !reuse_swap_page(page, NULL)) {
 				unlock_page(page);
 				result = SCAN_SWAP_CACHE_PAGE;
 				goto out;
@@ -3223,6 +3218,64 @@ int total_mapcount(struct page *page)
 }
 
 /*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying them. At the same time this returns the total_mapcount too.
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(), however we only use
+ * page_trans_huge_mapcount() in the copy-on-write faults where we
+ * need full accuracy to avoid breaking page pinning, because
+ * page_trans_huge_mapcount() is slower than page_mapcount().
+ */
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+{
+	int i, ret, _total_mapcount, mapcount;
+
+	/* hugetlbfs shouldn't call it */
+	VM_BUG_ON_PAGE(PageHuge(page), page);
+
+	if (likely(!PageTransCompound(page))) {
+		mapcount = atomic_read(&page->_mapcount) + 1;
+		if (total_mapcount)
+			*total_mapcount = mapcount;
+		return mapcount;
+	}
+
+	page = compound_head(page);
+
+	_total_mapcount = ret = 0;
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		mapcount = atomic_read(&page[i]._mapcount) + 1;
+		ret = max(ret, mapcount);
+		_total_mapcount += mapcount;
+	}
+	if (PageDoubleMap(page)) {
+		ret -= 1;
+		_total_mapcount -= HPAGE_PMD_NR;
+	}
+	mapcount = compound_mapcount(page);
+	ret += mapcount;
+	_total_mapcount += mapcount;
+	if (total_mapcount)
+		*total_mapcount = _total_mapcount;
+	return ret;
+}
+
+/*
  * This function splits huge page into normal pages. @page can point to any
  * subpage of huge page to split. Split doesn't change the position of @page.
  *
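A worked example may help with the max-versus-total distinction the
comment block above draws. The following userspace sketch reimplements
only the counting arithmetic of page_trans_huge_mapcount() on simulated
per-subpage counts; HPAGE_PMD_NR is assumed to be 512 as on x86 with 4k
base pages, and the atomic reads and struct page layout are deliberately
left out:

#include <stdio.h>

#define HPAGE_PMD_NR 512	/* assumption: x86, 4k base pages */

static int thp_mapcount(const int subpage_mapcount[HPAGE_PMD_NR],
			int compound_mapcount, int double_map,
			int *total_mapcount)
{
	int i, ret = 0, total = 0;

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		if (subpage_mapcount[i] > ret)
			ret = subpage_mapcount[i];
		total += subpage_mapcount[i];
	}
	/*
	 * A double-mapped THP carries one extra per-subpage count
	 * mirroring the PMD mapping; back out that double accounting
	 * the same way the kernel function does.
	 */
	if (double_map) {
		ret -= 1;
		total -= HPAGE_PMD_NR;
	}
	ret += compound_mapcount;
	total += compound_mapcount;
	if (total_mapcount)
		*total_mapcount = total;
	return ret;
}

int main(void)
{
	int sub[HPAGE_PMD_NR];
	int i, max, total;

	/* Two processes PTE-map disjoint halves of the same THP:
	 * max == 1, so each faulter may reuse its own subpages, but
	 * total == 512, so the anon_vma must not be made vma-local. */
	for (i = 0; i < HPAGE_PMD_NR; i++)
		sub[i] = 1;
	max = thp_mapcount(sub, 0, 0, &total);
	printf("PTE-mapped halves: max=%d total=%d\n", max, total);

	/* The same THP PMD-mapped by two processes instead:
	 * max == 2, so no reuse at all. */
	for (i = 0; i < HPAGE_PMD_NR; i++)
		sub[i] = 0;
	max = thp_mapcount(sub, 2, 0, &total);
	printf("two PMD mappings:  max=%d total=%d\n", max, total);
	return 0;
}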
diff --git a/mm/memory.c b/mm/memory.c
index 52c218e2b724..07493e34ab7e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2373,6 +2373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page) && !PageKsm(old_page)) {
+		int total_mapcount;
 		if (!trylock_page(old_page)) {
 			get_page(old_page);
 			pte_unmap_unlock(page_table, ptl);
@@ -2387,13 +2388,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			}
 			put_page(old_page);
 		}
-		if (reuse_swap_page(old_page)) {
-			/*
-			 * The page is all ours. Move it to our anon_vma so
-			 * the rmap code will not search our parent or siblings.
-			 * Protected against the rmap code by the page lock.
-			 */
-			page_move_anon_rmap(old_page, vma, address);
+		if (reuse_swap_page(old_page, &total_mapcount)) {
+			if (total_mapcount == 1) {
+				/*
+				 * The page is all ours. Move it to
+				 * our anon_vma so the rmap code will
+				 * not search our parent or siblings.
+				 * Protected against the rmap code by
+				 * the page lock.
+				 */
+				page_move_anon_rmap(compound_head(old_page),
+						    vma, address);
+			}
 			unlock_page(old_page);
 			return wp_page_reuse(mm, vma, address, page_table, ptl,
 					     orig_pte, old_page, 0, 0);
@@ -2617,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	dec_mm_counter_fast(mm, MM_SWAPENTS);
 	pte = mk_pte(page, vma->vm_page_prot);
-	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		flags &= ~FAULT_FLAG_WRITE;
 		ret |= VM_FAULT_WRITE;
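Note the switch from page_move_anon_rmap(old_page, ...) to
compound_head(old_page) above: a write-protect fault on a PTE-mapped THP
hands do_wp_page() a subpage, while the anon_vma is recorded on the head
page only. A userspace sketch with stand-in types (the kernel actually
keeps the anon_vma in page->mapping with a flag bit; that detail is
omitted here):

#include <stdio.h>

struct anon_vma { const char *owner; };
struct page {
	struct page *head;		/* stand-in for compound_head() */
	struct anon_vma *anon_vma;	/* only meaningful on the head */
};

static struct page *compound_head(struct page *page)
{
	return page->head ? page->head : page;
}

static void page_move_anon_rmap(struct page *page, struct anon_vma *av)
{
	page->anon_vma = av;	/* must be the head, or the update is lost */
}

int main(void)
{
	struct page head = { .head = NULL, .anon_vma = NULL };
	struct page tail = { .head = &head, .anon_vma = NULL };
	struct anon_vma local = { .owner = "this vma" };

	/* Fault on a subpage: move the rmap on the head, not the tail. */
	page_move_anon_rmap(compound_head(&tail), &local);
	printf("head anon_vma owner: %s\n",
	       head.anon_vma ? head.anon_vma->owner : "(unset)");
	return 0;
}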
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 83874eced5bf..031713ab40ce 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -922,18 +922,19 @@ out:
  * to it. And as a side-effect, free up its swap: because the old content
  * on disk will never be read, and seeking back there to write new content
  * later would only waste time away from clustering.
+ *
+ * NOTE: total_mapcount should not be relied upon by the caller if
+ * reuse_swap_page() returns false, but it may be always overwritten
+ * (see the other implementation for CONFIG_SWAP=n).
  */
-int reuse_swap_page(struct page *page)
+bool reuse_swap_page(struct page *page, int *total_mapcount)
 {
 	int count;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	if (unlikely(PageKsm(page)))
-		return 0;
-	/* The page is part of THP and cannot be reused */
-	if (PageTransCompound(page))
-		return 0;
-	count = page_mapcount(page);
+		return false;
+	count = page_trans_huge_mapcount(page, total_mapcount);
 	if (count <= 1 && PageSwapCache(page)) {
 		count += page_swapcount(page);
 		if (count == 1 && !PageWriteback(page)) {
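The hunk above is cut off inside reuse_swap_page(). The sketch below
models the visible shape of the test with stub inputs in place of the
kernel predicates; the final "count <= 1" return is an assumption about
the untruncated function, flagged as such in the comments:

#include <stdbool.h>
#include <stdio.h>

static bool reuse_decision(int mapcount, bool in_swap_cache, int swapcount,
			   bool under_writeback, bool *free_swap_slot)
{
	int count = mapcount;	/* via page_trans_huge_mapcount() above */

	*free_swap_slot = false;
	if (count <= 1 && in_swap_cache) {
		count += swapcount;
		if (count == 1 && !under_writeback)
			*free_swap_slot = true;	/* stale swap copy can go */
	}
	/* Assumed tail of the truncated function: exclusive iff <= 1. */
	return count <= 1;
}

int main(void)
{
	bool reuse, free_slot;

	/* Mapped once, one leftover swap reference: not reusable. */
	reuse = reuse_decision(1, true, 1, false, &free_slot);
	printf("reuse=%d free_slot=%d\n", reuse, free_slot);

	/* Mapped once, swap reference already gone: reusable, and the
	 * swap slot can be freed as a side effect. */
	reuse = reuse_decision(1, true, 0, false, &free_slot);
	printf("reuse=%d free_slot=%d\n", reuse, free_slot);
	return 0;
}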