Diffstat (limited to 'mm/huge_memory.c'):
 mm/huge_memory.c | 156
 1 file changed, 76 insertions(+), 80 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 817a875f2b8c..fc00c8cb5a82 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -171,12 +171,7 @@ static int start_khugepaged(void)
 }
 
 static atomic_t huge_zero_refcount;
-static struct page *huge_zero_page __read_mostly;
-
-static inline bool is_huge_zero_page(struct page *page)
-{
-        return ACCESS_ONCE(huge_zero_page) == page;
-}
+struct page *huge_zero_page __read_mostly;
 
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
@@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
         return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
 }
 
-static inline struct page *alloc_hugepage_vma(int defrag,
-                                              struct vm_area_struct *vma,
-                                              unsigned long haddr, int nd,
-                                              gfp_t extra_gfp)
-{
-        return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
-                               HPAGE_PMD_ORDER, vma, haddr, nd);
-}
-
 /* Caller must hold page table lock. */
 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
                 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmd,
                                unsigned int flags)
 {
+        gfp_t gfp;
         struct page *page;
         unsigned long haddr = address & HPAGE_PMD_MASK;
 
@@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 }
                 return 0;
         }
-        page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                        vma, haddr, numa_node_id(), 0);
+        gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+        page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
         if (unlikely(!page)) {
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
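
(Note: with the hunks above, the fault path now computes the gfp mask itself and hands it to alloc_hugepage_vma() together with the allocation order. The standalone sketch below mirrors what the existing alloc_hugepage_gfpmask() helper computes; the flag values are made up for illustration and are not the kernel's real gfp bits.)

#include <stdio.h>

/* Stand-in values; the real __GFP_WAIT and GFP_TRANSHUGE are kernel gfp bits. */
#define FAKE_GFP_WAIT      0x10u
#define FAKE_GFP_TRANSHUGE 0xf0u   /* assumed to include FAKE_GFP_WAIT */

/* Mirrors alloc_hugepage_gfpmask(): drop the "may sleep/compact" bit
 * unless defrag is enabled, then OR in any extra flags. */
static unsigned int hugepage_gfpmask(int defrag, unsigned int extra_gfp)
{
        return (FAKE_GFP_TRANSHUGE & ~(defrag ? 0 : FAKE_GFP_WAIT)) | extra_gfp;
}

int main(void)
{
        printf("defrag on:  %#x\n", hugepage_gfpmask(1, 0)); /* keeps the wait bit */
        printf("defrag off: %#x\n", hugepage_gfpmask(0, 0)); /* clears the wait bit */
        return 0;
}
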
@@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         spin_unlock(ptl);
 alloc:
         if (transparent_hugepage_enabled(vma) &&
-            !transparent_hugepage_debug_cow())
-                new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                              vma, haddr, numa_node_id(), 0);
-        else
+            !transparent_hugepage_debug_cow()) {
+                gfp_t gfp;
+
+                gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+                new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+        } else
                 new_page = NULL;
 
         if (unlikely(!new_page)) {
@@ -1222,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                 return ERR_PTR(-EFAULT);
 
         /* Full NUMA hinting faults to serialise migration in fault paths */
-        if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+        if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
                 goto out;
 
         page = pmd_page(*pmd);
@@ -1273,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         bool migrated = false;
         int flags = 0;
 
+        /* A PROT_NONE fault should not end up here */
+        BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
         ptl = pmd_lock(mm, pmdp);
         if (unlikely(!pmd_same(pmd, *pmdp)))
                 goto out_unlock;
@@ -1283,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * check_same as the page may no longer be mapped.
          */
         if (unlikely(pmd_trans_migrating(*pmdp))) {
+                page = pmd_page(*pmdp);
                 spin_unlock(ptl);
-                wait_migrate_huge_page(vma->anon_vma, pmdp);
+                wait_on_page_locked(page);
                 goto out;
         }
 
@@ -1352,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
         /*
          * Migrate the THP to the requested node, returns with page unlocked
-         * and pmd_numa cleared.
+         * and access rights restored.
          */
         spin_unlock(ptl);
         migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1365,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         goto out;
 clear_pmdnuma:
         BUG_ON(!PageLocked(page));
-        pmd = pmd_mknonnuma(pmd);
+        pmd = pmd_modify(pmd, vma->vm_page_prot);
         set_pmd_at(mm, haddr, pmdp, pmd);
-        VM_BUG_ON(pmd_numa(*pmdp));
         update_mmu_cache_pmd(vma, addr, pmdp);
         unlock_page(page);
 out_unlock:
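
(Note: clear_pmdnuma now rebuilds the pmd from the vma's normal protection via pmd_modify() instead of the removed pmd_mknonnuma() helper. The userspace program below is only an analogy for the PROT_NONE-based hinting scheme that replaces the old _PAGE_NUMA bits; the kernel rewrites the page table entry directly and never calls mprotect().)

#include <sys/mman.h>

int main(void)
{
        size_t len = 2UL << 20; /* one 2MB "huge page" worth of address space */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;

        mprotect(buf, len, PROT_NONE);              /* "NUMA hinting" protection */
        mprotect(buf, len, PROT_READ | PROT_WRITE); /* access rights restored */
        buf[0] = 1;                                 /* accesses fault normally again */

        return munmap(buf, len);
}
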
@@ -1423,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         return ret;
 }
 
-int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-                unsigned long addr, unsigned long end,
-                unsigned char *vec)
-{
-        spinlock_t *ptl;
-        int ret = 0;
-
-        if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-                /*
-                 * All logical pages in the range are present
-                 * if backed by a huge page.
-                 */
-                spin_unlock(ptl);
-                memset(vec, 1, (end - addr) >> PAGE_SHIFT);
-                ret = 1;
-        }
-
-        return ret;
-}
-
 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                   unsigned long old_addr,
                   unsigned long new_addr, unsigned long old_end,
@@ -1510,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
         if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                 pmd_t entry;
-                ret = 1;
-                if (!prot_numa) {
+
+                /*
+                 * Avoid trapping faults against the zero page. The read-only
+                 * data is likely to be read-cached on the local CPU and
+                 * local/remote hits to the zero page are not interesting.
+                 */
+                if (prot_numa && is_huge_zero_pmd(*pmd)) {
+                        spin_unlock(ptl);
+                        return 0;
+                }
+
+                if (!prot_numa || !pmd_protnone(*pmd)) {
+                        ret = 1;
                         entry = pmdp_get_and_clear_notify(mm, addr, pmd);
-                        if (pmd_numa(entry))
-                                entry = pmd_mknonnuma(entry);
                         entry = pmd_modify(entry, newprot);
                         ret = HPAGE_PMD_NR;
                         set_pmd_at(mm, addr, pmd, entry);
                         BUG_ON(pmd_write(entry));
-                } else {
-                        struct page *page = pmd_page(*pmd);
-
-                        /*
-                         * Do not trap faults against the zero page. The
-                         * read-only data is likely to be read-cached on the
-                         * local CPU cache and it is less useful to know about
-                         * local vs remote hits on the zero page.
-                         */
-                        if (!is_huge_zero_page(page) &&
-                            !pmd_numa(*pmd)) {
-                                pmdp_set_numa(mm, addr, pmd);
-                                ret = HPAGE_PMD_NR;
-                        }
                 }
                 spin_unlock(ptl);
         }
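
(Note: the hunk above reorders change_huge_pmd() so the huge zero page is skipped entirely for NUMA hinting and the pmd is only rewritten when it is not already PROT_NONE. Below is a compact, standalone sketch of that decision with locking, mmu notifiers and the actual pmd update left out; the helper name and the x86-64 value of HPAGE_PMD_NR are illustrative.)

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512        /* 2MB / 4KB, as on x86-64; illustrative */

/* Sketch of the reworked decision: skip the huge zero page for NUMA
 * hinting, and only touch the pmd when it is not already PROT_NONE. */
static int huge_pmd_prot_change(bool prot_numa, bool is_zero_pmd, bool already_protnone)
{
        if (prot_numa && is_zero_pmd)
                return 0;                /* zero page hits are not interesting */
        if (!prot_numa || !already_protnone)
                return HPAGE_PMD_NR;     /* one pmd covers HPAGE_PMD_NR pages */
        return 0;                        /* already PROT_NONE: nothing to do */
}

int main(void)
{
        printf("%d\n", huge_pmd_prot_change(true, false, false));  /* 512 */
        printf("%d\n", huge_pmd_prot_change(true, true, false));   /* 0 */
        printf("%d\n", huge_pmd_prot_change(true, false, true));   /* 0 */
        return 0;
}
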
@@ -1797,9 +1764,9 @@ static int __split_huge_page_map(struct page *page,
                 pte_t *pte, entry;
                 BUG_ON(PageCompound(page+i));
                 /*
-                 * Note that pmd_numa is not transferred deliberately
-                 * to avoid any possibility that pte_numa leaks to
-                 * a PROT_NONE VMA by accident.
+                 * Note that NUMA hinting access restrictions are not
+                 * transferred to avoid any possibility of altering
+                 * permissions across VMAs.
                  */
                 entry = mk_pte(page + i, vma->vm_page_prot);
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2148,7 +2115,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 {
         struct page *page;
         pte_t *_pte;
-        int referenced = 0, none = 0;
+        int none = 0;
+        bool referenced = false, writable = false;
         for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
              _pte++, address += PAGE_SIZE) {
                 pte_t pteval = *_pte;
@@ -2158,7 +2126,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                         else
                                 goto out;
                 }
-                if (!pte_present(pteval) || !pte_write(pteval))
+                if (!pte_present(pteval))
                         goto out;
                 page = vm_normal_page(vma, address, pteval);
                 if (unlikely(!page))
@@ -2168,9 +2136,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 VM_BUG_ON_PAGE(!PageAnon(page), page);
                 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
-                /* cannot use mapcount: can't collapse if there's a gup pin */
-                if (page_count(page) != 1)
-                        goto out;
                 /*
                  * We can do it before isolate_lru_page because the
                  * page can't be freed from under us. NOTE: PG_lock
@@ -2179,6 +2144,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                  */
                 if (!trylock_page(page))
                         goto out;
+
+                /*
+                 * cannot use mapcount: can't collapse if there's a gup pin.
+                 * The page must only be referenced by the scanned process
+                 * and page swap cache.
+                 */
+                if (page_count(page) != 1 + !!PageSwapCache(page)) {
+                        unlock_page(page);
+                        goto out;
+                }
+                if (pte_write(pteval)) {
+                        writable = true;
+                } else {
+                        if (PageSwapCache(page) && !reuse_swap_page(page)) {
+                                unlock_page(page);
+                                goto out;
+                        }
+                        /*
+                         * Page is not in the swap cache. It can be collapsed
+                         * into a THP.
+                         */
+                }
+
                 /*
                  * Isolate the page to avoid collapsing an hugepage
                  * currently in use by the VM.
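
(Note: the new pin check above allows exactly one extra reference when the page sits in the swap cache. A standalone sketch of that accounting, with stub inputs in place of struct page.)

#include <stdbool.h>
#include <assert.h>

/* Sketch of the relaxed pin check: a collapse candidate may be referenced
 * by this mapping plus, at most, the swap cache; anything beyond that is
 * treated as a gup pin. Stub inputs; the kernel reads these from struct page. */
static bool no_extra_pins(int page_refcount, bool in_swap_cache)
{
        return page_refcount == 1 + (in_swap_cache ? 1 : 0);
}

int main(void)
{
        assert(no_extra_pins(1, false));   /* mapped only: ok */
        assert(no_extra_pins(2, true));    /* mapped + swap cache: ok */
        assert(!no_extra_pins(2, false));  /* extra reference: likely a gup pin */
        return 0;
}
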
@@ -2195,9 +2183,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 /* If there is no mapped pte young don't collapse the page */
                 if (pte_young(pteval) || PageReferenced(page) ||
                     mmu_notifier_test_young(vma->vm_mm, address))
-                        referenced = 1;
+                        referenced = true;
         }
-        if (likely(referenced))
+        if (likely(referenced && writable))
                 return 1;
 out:
         release_pte_pages(pte, _pte);
@@ -2550,11 +2538,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
         pmd_t *pmd;
         pte_t *pte, *_pte;
-        int ret = 0, referenced = 0, none = 0;
+        int ret = 0, none = 0;
         struct page *page;
         unsigned long _address;
         spinlock_t *ptl;
         int node = NUMA_NO_NODE;
+        bool writable = false, referenced = false;
 
         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -2573,8 +2562,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                         else
                                 goto out_unmap;
                 }
-                if (!pte_present(pteval) || !pte_write(pteval))
+                if (!pte_present(pteval))
                         goto out_unmap;
+                if (pte_write(pteval))
+                        writable = true;
+
                 page = vm_normal_page(vma, _address, pteval);
                 if (unlikely(!page))
                         goto out_unmap;
@@ -2591,14 +2583,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 VM_BUG_ON_PAGE(PageCompound(page), page);
                 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
                         goto out_unmap;
-                /* cannot use mapcount: can't collapse if there's a gup pin */
-                if (page_count(page) != 1)
+                /*
+                 * cannot use mapcount: can't collapse if there's a gup pin.
+                 * The page must only be referenced by the scanned process
+                 * and page swap cache.
+                 */
+                if (page_count(page) != 1 + !!PageSwapCache(page))
                         goto out_unmap;
                 if (pte_young(pteval) || PageReferenced(page) ||
                     mmu_notifier_test_young(vma->vm_mm, address))
-                        referenced = 1;
+                        referenced = true;
         }
-        if (referenced)
+        if (referenced && writable)
                 ret = 1;
 out_unmap:
         pte_unmap_unlock(pte, ptl);
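
(Note: both khugepaged paths now track writability separately, so a pmd range qualifies for collapse when at least one pte is writable and at least one is young/referenced, instead of requiring every pte to be writable. A standalone sketch of that verdict with a stub pte representation; the none/swap handling of the real scan is omitted.)

#include <stdbool.h>

struct fake_pte { bool present, write, young; };

/* Returns true when the scanned range is worth collapsing into a THP. */
static bool worth_collapsing(const struct fake_pte *ptes, int n)
{
        bool referenced = false, writable = false;

        for (int i = 0; i < n; i++) {
                if (!ptes[i].present)
                        return false;    /* simplified: no pte_none/swap cases */
                if (ptes[i].write)
                        writable = true;
                if (ptes[i].young)
                        referenced = true;
        }
        return referenced && writable;
}

int main(void)
{
        struct fake_pte ptes[3] = {
                { true, false, true  },  /* read-only but young */
                { true, true,  false },  /* writable */
                { true, false, false },
        };
        return worth_collapsing(ptes, 3) ? 0 : 1;  /* collapses: exit 0 */
}
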