Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c | 156
1 file changed, 76 insertions(+), 80 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 817a875f2b8c..fc00c8cb5a82 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -171,12 +171,7 @@ static int start_khugepaged(void)
 }
 
 static atomic_t huge_zero_refcount;
-static struct page *huge_zero_page __read_mostly;
-
-static inline bool is_huge_zero_page(struct page *page)
-{
-	return ACCESS_ONCE(huge_zero_page) == page;
-}
+struct page *huge_zero_page __read_mostly;
 
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
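
Note: the zero-page helper does not go away here. Dropping `static` from huge_zero_page suggests is_huge_zero_page() moves to a shared header (presumably <linux/huge_mm.h>; the header is outside this diff, so this is an assumption) so other files can test a page against the huge zero page. A minimal sketch, assuming it keeps the removed body:

	/* sketch: header-side helper; huge_zero_page is now a global */
	extern struct page *huge_zero_page;

	static inline bool is_huge_zero_page(struct page *page)
	{
		return ACCESS_ONCE(huge_zero_page) == page;
	}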
@@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
 }
 
-static inline struct page *alloc_hugepage_vma(int defrag,
-					      struct vm_area_struct *vma,
-					      unsigned long haddr, int nd,
-					      gfp_t extra_gfp)
-{
-	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
-			       HPAGE_PMD_ORDER, vma, haddr, nd);
-}
-
 /* Caller must hold page table lock. */
 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pmd_t *pmd,
 			unsigned int flags)
 {
+	gfp_t gfp;
 	struct page *page;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
 
@@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		return 0;
 	}
-	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-			vma, haddr, numa_node_id(), 0);
+	gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
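
Note: callers now build the gfp mask themselves and pass an explicit order. Judging by the wrapper removed above, the new alloc_hugepage_vma() is presumably a thin wrapper around alloc_pages_vma() defined outside this diff; a sketch, assuming the node is still picked with numa_node_id():

	/* sketch of the assumed replacement wrapper (not part of this hunk) */
	#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
		alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id())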
@@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_unlock(ptl);
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
-	    !transparent_hugepage_debug_cow())
-		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					      vma, haddr, numa_node_id(), 0);
-	else
+	    !transparent_hugepage_debug_cow()) {
+		gfp_t gfp;
+
+		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+	} else
 		new_page = NULL;
 
 	if (unlikely(!new_page)) {
@@ -1222,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		return ERR_PTR(-EFAULT);
 
 	/* Full NUMA hinting faults to serialise migration in fault paths */
-	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
 		goto out;
 
 	page = pmd_page(*pmd);
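
Note: pmd_numa() is gone throughout; a NUMA hinting entry is now recognised as an ordinary protection-less ("protnone") entry. The definition is per-architecture and not part of this diff; on x86 it presumably amounts to something like the sketch below. Clearing the hint is then just pmd_modify() back to the VMA's protection, as the clear_pmdnuma hunk further down shows.

	/* sketch: a present-but-inaccessible pmd doubles as the NUMA hint */
	static inline int pmd_protnone(pmd_t pmd)
	{
		return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
			== _PAGE_PROTNONE;
	}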
@@ -1273,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	bool migrated = false;
 	int flags = 0;
 
+	/* A PROT_NONE fault should not end up here */
+	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
 	ptl = pmd_lock(mm, pmdp);
 	if (unlikely(!pmd_same(pmd, *pmdp)))
 		goto out_unlock;
@@ -1283,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * check_same as the page may no longer be mapped.
 	 */
 	if (unlikely(pmd_trans_migrating(*pmdp))) {
+		page = pmd_page(*pmdp);
 		spin_unlock(ptl);
-		wait_migrate_huge_page(vma->anon_vma, pmdp);
+		wait_on_page_locked(page);
 		goto out;
 	}
 
@@ -1352,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
-	 * and pmd_numa cleared.
+	 * and access rights restored.
 	 */
 	spin_unlock(ptl);
 	migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1365,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 clear_pmdnuma:
 	BUG_ON(!PageLocked(page));
-	pmd = pmd_mknonnuma(pmd);
+	pmd = pmd_modify(pmd, vma->vm_page_prot);
 	set_pmd_at(mm, haddr, pmdp, pmd);
-	VM_BUG_ON(pmd_numa(*pmdp));
 	update_mmu_cache_pmd(vma, addr, pmdp);
 	unlock_page(page);
 out_unlock:
@@ -1423,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	return ret;
 }
 
-int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end,
-		unsigned char *vec)
-{
-	spinlock_t *ptl;
-	int ret = 0;
-
-	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-		/*
-		 * All logical pages in the range are present
-		 * if backed by a huge page.
-		 */
-		spin_unlock(ptl);
-		memset(vec, 1, (end - addr) >> PAGE_SHIFT);
-		ret = 1;
-	}
-
-	return ret;
-}
-
 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 		  unsigned long old_addr,
 		  unsigned long new_addr, unsigned long old_end,
@@ -1510,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		pmd_t entry;
-		ret = 1;
-		if (!prot_numa) {
+
+		/*
+		 * Avoid trapping faults against the zero page. The read-only
+		 * data is likely to be read-cached on the local CPU and
+		 * local/remote hits to the zero page are not interesting.
+		 */
+		if (prot_numa && is_huge_zero_pmd(*pmd)) {
+			spin_unlock(ptl);
+			return 0;
+		}
+
+		if (!prot_numa || !pmd_protnone(*pmd)) {
+			ret = 1;
 			entry = pmdp_get_and_clear_notify(mm, addr, pmd);
-			if (pmd_numa(entry))
-				entry = pmd_mknonnuma(entry);
 			entry = pmd_modify(entry, newprot);
 			ret = HPAGE_PMD_NR;
 			set_pmd_at(mm, addr, pmd, entry);
 			BUG_ON(pmd_write(entry));
-		} else {
-			struct page *page = pmd_page(*pmd);
-
-			/*
-			 * Do not trap faults against the zero page. The
-			 * read-only data is likely to be read-cached on the
-			 * local CPU cache and it is less useful to know about
-			 * local vs remote hits on the zero page.
-			 */
-			if (!is_huge_zero_page(page) &&
-			    !pmd_numa(*pmd)) {
-				pmdp_set_numa(mm, addr, pmd);
-				ret = HPAGE_PMD_NR;
-			}
 		}
 		spin_unlock(ptl);
 	}
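
Note: with the protnone scheme there is no pmdp_set_numa(); arming the hint is a plain pmd_modify() with a PROT_NONE-style protection supplied by the caller, and the zero-page check is hoisted to the top of the locked section. A sketch of the assumed caller side (change_prot_numa() and the prot_numa path live outside this diff):

	/* sketch: NUMA balancing passes a PROT_NONE protection, so the
	 * pmd_modify(entry, newprot) above is what strips the access bits */
	change_huge_pmd(vma, pmd, addr, PAGE_NONE, 1);

The !pmd_protnone(*pmd) test skips entries that already carry a hint, so they are not cleared and re-armed pointlessly.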
@@ -1797,9 +1764,9 @@ static int __split_huge_page_map(struct page *page,
 		pte_t *pte, entry;
 		BUG_ON(PageCompound(page+i));
 		/*
-		 * Note that pmd_numa is not transferred deliberately
-		 * to avoid any possibility that pte_numa leaks to
-		 * a PROT_NONE VMA by accident.
+		 * Note that NUMA hinting access restrictions are not
+		 * transferred to avoid any possibility of altering
+		 * permissions across VMAs.
 		 */
 		entry = mk_pte(page + i, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2148,7 +2115,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 {
 	struct page *page;
 	pte_t *_pte;
-	int referenced = 0, none = 0;
+	int none = 0;
+	bool referenced = false, writable = false;
 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
@@ -2158,7 +2126,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			else
 				goto out;
 		}
-		if (!pte_present(pteval) || !pte_write(pteval))
+		if (!pte_present(pteval))
 			goto out;
 		page = vm_normal_page(vma, address, pteval);
 		if (unlikely(!page))
@@ -2168,9 +2136,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
 		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
-		/* cannot use mapcount: can't collapse if there's a gup pin */
-		if (page_count(page) != 1)
-			goto out;
 		/*
 		 * We can do it before isolate_lru_page because the
 		 * page can't be freed from under us. NOTE: PG_lock
@@ -2179,6 +2144,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 */
 		if (!trylock_page(page))
 			goto out;
+
+		/*
+		 * cannot use mapcount: can't collapse if there's a gup pin.
+		 * The page must only be referenced by the scanned process
+		 * and page swap cache.
+		 */
+		if (page_count(page) != 1 + !!PageSwapCache(page)) {
+			unlock_page(page);
+			goto out;
+		}
+		if (pte_write(pteval)) {
+			writable = true;
+		} else {
+			if (PageSwapCache(page) && !reuse_swap_page(page)) {
+				unlock_page(page);
+				goto out;
+			}
+			/*
+			 * Page is not in the swap cache. It can be collapsed
+			 * into a THP.
+			 */
+		}
+
 		/*
 		 * Isolate the page to avoid collapsing an hugepage
 		 * currently in use by the VM.
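
Note: the refcount arithmetic in the new pin check. For an anonymous page mapped by exactly one process, page_count() is 1 (the mapping's reference); if the page also sits in the swap cache, the cache holds one more. Anything beyond that indicates an extra pin (gup and friends) and the collapse must be abandoned. A worked example of the check, with a hypothetical local for clarity:

	/* sketch: expected references for an exclusively-owned anon page */
	int expected = 1 + !!PageSwapCache(page);	/* 1, or 2 if in swap cache */
	if (page_count(page) != expected)
		goto out;				/* extra reference: gup pin etc. */

Read-only PTEs no longer abort the walk up front; a read-only page that is in the swap cache is only accepted if reuse_swap_page() confirms it can be reused exclusively by this process.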
@@ -2195,9 +2183,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		/* If there is no mapped pte young don't collapse the page */
 		if (pte_young(pteval) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
-			referenced = 1;
+			referenced = true;
 	}
-	if (likely(referenced))
+	if (likely(referenced && writable))
 		return 1;
 out:
 	release_pte_pages(pte, _pte);
@@ -2550,11 +2538,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
-	int ret = 0, referenced = 0, none = 0;
+	int ret = 0, none = 0;
 	struct page *page;
 	unsigned long _address;
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE;
+	bool writable = false, referenced = false;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -2573,8 +2562,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		else
 			goto out_unmap;
 		}
-		if (!pte_present(pteval) || !pte_write(pteval))
+		if (!pte_present(pteval))
 			goto out_unmap;
+		if (pte_write(pteval))
+			writable = true;
+
 		page = vm_normal_page(vma, _address, pteval);
 		if (unlikely(!page))
 			goto out_unmap;
@@ -2591,14 +2583,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		VM_BUG_ON_PAGE(PageCompound(page), page);
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
-		/* cannot use mapcount: can't collapse if there's a gup pin */
-		if (page_count(page) != 1)
+		/*
+		 * cannot use mapcount: can't collapse if there's a gup pin.
+		 * The page must only be referenced by the scanned process
+		 * and page swap cache.
+		 */
+		if (page_count(page) != 1 + !!PageSwapCache(page))
 			goto out_unmap;
 		if (pte_young(pteval) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
-			referenced = 1;
+			referenced = true;
 	}
-	if (referenced)
+	if (referenced && writable)
 		ret = 1;
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
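
Note: both scan paths get the same relaxation. Previously a single read-only PTE aborted the collapse; now read-only PTEs are tolerated as long as at least one PTE in the range is writable. A compressed view of the gate (hypothetical names, not literal patch code):

	/* old gate: every pte present && writable, and some pte referenced */
	/* new gate: every pte present, some pte writable, some pte referenced */
	bool collapse_ok = all_present && any_writable && any_referenced;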