Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c	249
1 file changed, 249 insertions(+), 0 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f9ecc2aeadfc..85742ac5b32e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -757,6 +757,60 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
+{
+	if (likely(vma->vm_flags & VM_WRITE))
+		pud = pud_mkwrite(pud);
+	return pud;
+}
+
+static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t entry;
+	spinlock_t *ptl;
+
+	ptl = pud_lock(mm, pud);
+	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
+	if (pfn_t_devmap(pfn))
+		entry = pud_mkdevmap(entry);
+	if (write) {
+		entry = pud_mkyoung(pud_mkdirty(entry));
+		entry = maybe_pud_mkwrite(entry, vma);
+	}
+	set_pud_at(mm, addr, pud, entry);
+	update_mmu_cache_pud(vma, addr, pud);
+	spin_unlock(ptl);
+}
+
+int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+			pud_t *pud, pfn_t pfn, bool write)
+{
+	pgprot_t pgprot = vma->vm_page_prot;
+	/*
+	 * If we had pud_special, we could avoid all these restrictions,
+	 * but we need to be consistent with PTEs and architectures that
+	 * can't support a 'special' bit.
+	 */
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+						(VM_PFNMAP|VM_MIXEDMAP));
+	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+	BUG_ON(!pfn_t_devmap(pfn));
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return VM_FAULT_SIGBUS;
+
+	track_pfn_insert(vma, &pgprot, pfn);
+
+	insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+	return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd)
 {
@@ -887,6 +941,123 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud)
+{
+	pud_t _pud;
+
+	/*
+	 * We should set the dirty bit only for FOLL_WRITE but for now
+	 * the dirty bit in the pud is meaningless. And if the dirty
+	 * bit will become meaningful and we'll only set it with
+	 * FOLL_WRITE, an atomic set_bit will be required on the pud to
+	 * set the young bit, instead of the current set_pud_at.
+	 */
+	_pud = pud_mkyoung(pud_mkdirty(*pud));
+	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
+				pud, _pud, 1))
+		update_mmu_cache_pud(vma, addr, pud);
+}
+
+struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud, int flags)
+{
+	unsigned long pfn = pud_pfn(*pud);
+	struct mm_struct *mm = vma->vm_mm;
+	struct dev_pagemap *pgmap;
+	struct page *page;
+
+	assert_spin_locked(pud_lockptr(mm, pud));
+
+	if (flags & FOLL_WRITE && !pud_write(*pud))
+		return NULL;
+
+	if (pud_present(*pud) && pud_devmap(*pud))
+		/* pass */;
+	else
+		return NULL;
+
+	if (flags & FOLL_TOUCH)
+		touch_pud(vma, addr, pud);
+
+	/*
+	 * device mapped pages can only be returned if the
+	 * caller will manage the page reference count.
+	 */
+	if (!(flags & FOLL_GET))
+		return ERR_PTR(-EEXIST);
+
+	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+	pgmap = get_dev_pagemap(pfn, NULL);
+	if (!pgmap)
+		return ERR_PTR(-EFAULT);
+	page = pfn_to_page(pfn);
+	get_page(page);
+	put_dev_pagemap(pgmap);
+
+	return page;
+}
+
+int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+		  struct vm_area_struct *vma)
+{
+	spinlock_t *dst_ptl, *src_ptl;
+	pud_t pud;
+	int ret;
+
+	dst_ptl = pud_lock(dst_mm, dst_pud);
+	src_ptl = pud_lockptr(src_mm, src_pud);
+	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+	ret = -EAGAIN;
+	pud = *src_pud;
+	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+		goto out_unlock;
+
+	/*
+	 * When page table lock is held, the huge zero pud should not be
+	 * under splitting since we don't split the page itself, only pud to
+	 * a page table.
+	 */
+	if (is_huge_zero_pud(pud)) {
+		/* No huge zero pud yet */
+	}
+
+	pudp_set_wrprotect(src_mm, addr, src_pud);
+	pud = pud_mkold(pud_wrprotect(pud));
+	set_pud_at(dst_mm, addr, dst_pud, pud);
+
+	ret = 0;
+out_unlock:
+	spin_unlock(src_ptl);
+	spin_unlock(dst_ptl);
+	return ret;
+}
+
+void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
+{
+	pud_t entry;
+	unsigned long haddr;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
+	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
+		goto unlock;
+
+	entry = pud_mkyoung(orig_pud);
+	if (write)
+		entry = pud_mkdirty(entry);
+	haddr = vmf->address & HPAGE_PUD_MASK;
+	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
+		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
+
+unlock:
+	spin_unlock(vmf->ptl);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	pmd_t entry;
@@ -1601,6 +1772,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 	return NULL;
 }
 
+/*
+ * Returns true if a given pud maps a thp, false otherwise.
+ *
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
+ */
+spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
+{
+	spinlock_t *ptl;
+
+	ptl = pud_lock(vma->vm_mm, pud);
+	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+		return ptl;
+	spin_unlock(ptl);
+	return NULL;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		 pud_t *pud, unsigned long addr)
+{
+	pud_t orig_pud;
+	spinlock_t *ptl;
+
+	ptl = __pud_trans_huge_lock(pud, vma);
+	if (!ptl)
+		return 0;
+	/*
+	 * For architectures like ppc64 we look at deposited pgtable
+	 * when calling pudp_huge_get_and_clear. So do the
+	 * pgtable_trans_huge_withdraw after finishing pudp related
+	 * operations.
+	 */
+	orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
+			tlb->fullmm);
+	tlb_remove_pud_tlb_entry(tlb, pud, addr);
+	if (vma_is_dax(vma)) {
+		spin_unlock(ptl);
+		/* No zero page support yet */
+	} else {
+		/* No support for anonymous PUD pages yet */
+		BUG();
+	}
+	return 1;
+}
+
+static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long haddr)
+{
+	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
+	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
+	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+
+	count_vm_event(THP_SPLIT_PMD);
+
+	pudp_huge_clear_flush_notify(vma, haddr, pud);
+}
+
+void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long address)
+{
+	spinlock_t *ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & HPAGE_PUD_MASK;
+
+	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
+	ptl = pud_lock(mm, pud);
+	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+		goto out;
+	__split_huge_pud_locked(vma, pud, haddr);
+
+out:
+	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 		unsigned long haddr, pmd_t *pmd)
 {
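
For orientation only, a minimal sketch (not part of this patch) of how a devmap-backed driver's PUD-sized fault path could call the newly exported vmf_insert_pfn_pud(). The handler name and the my_dev_pud_pfn() lookup are invented for illustration; a real caller would also handle alignment, races, and fallback to smaller mappings.

/*
 * Illustrative sketch only -- not part of this diff.  Shows the calling
 * convention of vmf_insert_pfn_pud(); my_dev_pud_pfn() is a made-up,
 * driver-specific lookup that must return a pfn_t_devmap() pfn backing a
 * full PUD-aligned range.
 */
static int my_dev_pud_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long haddr = vmf->address & HPAGE_PUD_MASK;
	pfn_t pfn;

	/* Fall back if a whole PUD-sized mapping does not fit the VMA. */
	if (haddr < vma->vm_start || haddr + HPAGE_PUD_SIZE > vma->vm_end)
		return VM_FAULT_FALLBACK;

	pfn = my_dev_pud_pfn(vma, haddr);	/* hypothetical lookup */

	return vmf_insert_pfn_pud(vma, haddr, vmf->pud, pfn,
				  vmf->flags & FAULT_FLAG_WRITE);
}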