author     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>  2016-01-15 19:54:33 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>        2016-01-15 20:56:32 -0500
commit     e90309c9f7722db4ff5bce3b9e6e04d1460f2553 (patch)
tree       79a9d09bdfcc8c653877f82b84de4f48e0472ab1
parent     a46e63764eb6d0252ab4e96f96ad447594673274 (diff)
thp: allow mlocked THP again
Before the THP refcounting rework, a THP was not allowed to cross a VMA
boundary, so if we had a THP and split it, PG_mlocked could be safely
transferred to the small pages.

With the new THP refcounting and a naive approach to mlocking we can end
up with this scenario:

 1. we have an mlocked THP, which belongs to one VM_LOCKED VMA;
 2. the process does munlock() on *part* of the THP:
     - the VMA is split into two, one of them VM_LOCKED;
     - the huge PMD is split into a PTE table;
     - the THP is still mlocked;
 3. split_huge_page():
     - it transfers PG_mlocked to *all* small pages, regardless of whether
       they belong to any VM_LOCKED VMA.

We probably could munlock() all small pages on split_huge_page(), but I
think we already have an accounting issue at step two.

Instead of forbidding mlocked pages altogether, we just avoid mlocking
PTE-mapped THPs and munlock THPs on split_huge_pmd().

This means PTE-mapped THPs will be on the normal LRU lists and will be
split under memory pressure by vmscan. After the split, vmscan will
detect the unevictable small pages and mlock them.

With this approach we shouldn't hit a situation like the one described
above.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
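For illustration only (not part of the patch): the minimal userspace sketch
below reproduces step 2 of the scenario above by mlock()ing a region that may
be THP-backed and then munlock()ing only part of it, which splits the VMA
(and, with this patch, the huge PMD). The 2MB huge page size and the use of
MADV_HUGEPAGE are assumptions; whether the region is actually backed by a THP
depends on kernel configuration and memory availability.

/*
 * Hypothetical illustration of the partial-munlock scenario; assumes
 * 2MB transparent huge pages and CONFIG_TRANSPARENT_HUGEPAGE=y.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)

int main(void)
{
        size_t len = 2 * HPAGE_SIZE;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        madvise(p, len, MADV_HUGEPAGE); /* ask for THP backing */
        memset(p, 0, len);              /* fault the pages in */

        if (mlock(p, len))              /* step 1: whole region mlocked */
                perror("mlock");

        /* step 2: munlock() only part of the region -> the VMA is split */
        if (munlock(p + HPAGE_SIZE / 2, HPAGE_SIZE))
                perror("munlock");

        munmap(p, len);
        return 0;
}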
-rw-r--r--  mm/gup.c           6
-rw-r--r--  mm/huge_memory.c  37
-rw-r--r--  mm/memory.c        6
-rw-r--r--  mm/mlock.c        68
-rw-r--r--  mm/rmap.c          3
-rw-r--r--  mm/swap.c          1
6 files changed, 88 insertions, 33 deletions
diff --git a/mm/gup.c b/mm/gup.c
index 70d65e4015a4..e95b0cb6ed81 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -143,6 +143,10 @@ retry:
                         mark_page_accessed(page);
         }
         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+                /* Do not mlock pte-mapped THP */
+                if (PageTransCompound(page))
+                        goto out;
+
                 /*
                  * The preliminary mapping check is mainly to avoid the
                  * pointless overhead of lock_page on the ZERO_PAGE
@@ -920,8 +924,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
         gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
         if (vma->vm_flags & VM_LOCKONFAULT)
                 gup_flags &= ~FOLL_POPULATE;
-        if (vma->vm_flags & VM_LOCKED)
-                gup_flags |= FOLL_SPLIT;
         /*
          * We want to touch writable mappings with a write fault in order
          * to break COW, except for shared mappings because these don't COW
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4acf55b31f7c..f283cb7c480e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -874,8 +874,6 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
         if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
                 return VM_FAULT_FALLBACK;
-        if (vma->vm_flags & VM_LOCKED)
-                return VM_FAULT_FALLBACK;
         if (unlikely(anon_vma_prepare(vma)))
                 return VM_FAULT_OOM;
         if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
@@ -1344,7 +1342,20 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                 update_mmu_cache_pmd(vma, addr, pmd);
         }
         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
-                if (page->mapping && trylock_page(page)) {
+                /*
+                 * We don't mlock() pte-mapped THPs. This way we can avoid
+                 * leaking mlocked pages into non-VM_LOCKED VMAs.
+                 *
+                 * In most cases the pmd is the only mapping of the page as we
+                 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+                 * writable private mappings in populate_vma_page_range().
+                 *
+                 * The only scenario when we have the page shared here is if we
+                 * mlocking read-only mapping shared over fork(). We skip
+                 * mlocking such pages.
+                 */
+                if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+                                page->mapping && trylock_page(page)) {
                         lru_add_drain();
                         if (page->mapping)
                                 mlock_vma_page(page);
@@ -2209,8 +2220,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
         if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
             (vma->vm_flags & VM_NOHUGEPAGE))
                 return false;
-        if (vma->vm_flags & VM_LOCKED)
-                return false;
         if (!vma->anon_vma || vma->vm_ops)
                 return false;
         if (is_vma_temporary_stack(vma))
@@ -2851,14 +2860,28 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
         spinlock_t *ptl;
         struct mm_struct *mm = vma->vm_mm;
+        struct page *page = NULL;
         unsigned long haddr = address & HPAGE_PMD_MASK;
 
         mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
         ptl = pmd_lock(mm, pmd);
-        if (likely(pmd_trans_huge(*pmd)))
-                __split_huge_pmd_locked(vma, pmd, haddr, false);
+        if (unlikely(!pmd_trans_huge(*pmd)))
+                goto out;
+        page = pmd_page(*pmd);
+        __split_huge_pmd_locked(vma, pmd, haddr, false);
+        if (PageMlocked(page))
+                get_page(page);
+        else
+                page = NULL;
+out:
         spin_unlock(ptl);
         mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+        if (page) {
+                lock_page(page);
+                munlock_vma_page(page);
+                unlock_page(page);
+                put_page(page);
+        }
 }
 
 static void split_huge_pmd_address(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 9d5b40892d4d..5a73c6ed8e5c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2160,15 +2160,15 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 
         pte_unmap_unlock(page_table, ptl);
         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-        /* THP pages are never mlocked */
-        if (old_page && !PageTransCompound(old_page)) {
+        if (old_page) {
                 /*
                  * Don't let another task, with possibly unlocked vma,
                  * keep the mlocked page.
                  */
                 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
                         lock_page(old_page);    /* LRU manipulation */
-                        munlock_vma_page(old_page);
+                        if (PageMlocked(old_page))
+                                munlock_vma_page(old_page);
                         unlock_page(old_page);
                 }
                 page_cache_release(old_page);
diff --git a/mm/mlock.c b/mm/mlock.c
index c6b139ad356a..9197b6721a1e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
         /* Serialize with page migration */
         BUG_ON(!PageLocked(page));
 
+        VM_BUG_ON_PAGE(PageTail(page), page);
+        VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
         if (!TestSetPageMlocked(page)) {
                 mod_zone_page_state(page_zone(page), NR_MLOCK,
                                     hpage_nr_pages(page));
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page)
         /* For try_to_munlock() and to serialize with page migration */
         BUG_ON(!PageLocked(page));
 
+        VM_BUG_ON_PAGE(PageTail(page), page);
+
         /*
          * Serialize with any parallel __split_huge_page_refcount() which
          * might otherwise copy PageMlocked to part of the tail pages before
@@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
                 if (!page || page_zone_id(page) != zoneid)
                         break;
 
+                /*
+                 * Do not use pagevec for PTE-mapped THP,
+                 * munlock_vma_pages_range() will handle them.
+                 */
+                if (PageTransCompound(page))
+                        break;
+
                 get_page(page);
                 /*
                  * Increase the address that will be returned *before* the
@@ -443,29 +455,43 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
                 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
                                 &page_mask);
 
-                if (page && !IS_ERR(page) && !PageTransCompound(page)) {
-                        /*
-                         * Non-huge pages are handled in batches via
-                         * pagevec. The pin from follow_page_mask()
-                         * prevents them from collapsing by THP.
-                         */
-                        pagevec_add(&pvec, page);
-                        zone = page_zone(page);
-                        zoneid = page_zone_id(page);
+                if (page && !IS_ERR(page)) {
+                        if (PageTransTail(page)) {
+                                VM_BUG_ON_PAGE(PageMlocked(page), page);
+                                put_page(page); /* follow_page_mask() */
+                        } else if (PageTransHuge(page)) {
+                                lock_page(page);
+                                /*
+                                 * Any THP page found by follow_page_mask() may
+                                 * have gotten split before reaching
+                                 * munlock_vma_page(), so we need to recompute
+                                 * the page_mask here.
+                                 */
+                                page_mask = munlock_vma_page(page);
+                                unlock_page(page);
+                                put_page(page); /* follow_page_mask() */
+                        } else {
+                                /*
+                                 * Non-huge pages are handled in batches via
+                                 * pagevec. The pin from follow_page_mask()
+                                 * prevents them from collapsing by THP.
+                                 */
+                                pagevec_add(&pvec, page);
+                                zone = page_zone(page);
+                                zoneid = page_zone_id(page);
 
-                        /*
-                         * Try to fill the rest of pagevec using fast
-                         * pte walk. This will also update start to
-                         * the next page to process. Then munlock the
-                         * pagevec.
-                         */
-                        start = __munlock_pagevec_fill(&pvec, vma,
-                                        zoneid, start, end);
-                        __munlock_pagevec(&pvec, zone);
-                        goto next;
+                                /*
+                                 * Try to fill the rest of pagevec using fast
+                                 * pte walk. This will also update start to
+                                 * the next page to process. Then munlock the
+                                 * pagevec.
+                                 */
+                                start = __munlock_pagevec_fill(&pvec, vma,
+                                                zoneid, start, end);
+                                __munlock_pagevec(&pvec, zone);
+                                goto next;
+                        }
                 }
-                /* It's a bug to munlock in the middle of a THP page */
-                VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
                 page_increm = 1 + page_mask;
                 start += page_increm * PAGE_SIZE;
 next:
diff --git a/mm/rmap.c b/mm/rmap.c
index 84271cc39d1e..31d8866fb562 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1282,6 +1282,9 @@ static void page_remove_anon_compound_rmap(struct page *page)
                 nr = HPAGE_PMD_NR;
         }
 
+        if (unlikely(PageMlocked(page)))
+                clear_page_mlock(page);
+
         if (nr) {
                 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
                 deferred_split_huge_page(page);
diff --git a/mm/swap.c b/mm/swap.c
index 3d65480422e8..abffc33bb975 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -358,6 +358,7 @@ static void __lru_cache_activate_page(struct page *page)
  */
 void mark_page_accessed(struct page *page)
 {
+        page = compound_head(page);
         if (!PageActive(page) && !PageUnevictable(page) &&
                         PageReferenced(page)) {
 