author		Matthew Wilcox <willy@linux.intel.com>	2017-02-24 17:57:02 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-02-24 20:46:54 -0500
commit		a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517 (patch)
tree		54d78e89c63e519cb9e00fdab9efbf3189ef2f5e /mm/memory.c
parent		a2d581675d485eb7188f521f36efc114639a3096 (diff)
mm, x86: add support for PUD-sized transparent hugepages
The current transparent hugepage code only supports PMDs.  This patch adds
support for transparent use of PUDs with DAX.  It does not include support
for anonymous pages.  x86 support code is also added.

Most of this patch simply parallels the work that was done for huge PMDs.
The only major difference is how the new ->pud_entry method in mm_walk
works.  The ->pmd_entry method replaces the ->pte_entry method, whereas
the ->pud_entry method works along with either ->pmd_entry or ->pte_entry.
The pagewalk code takes care of locking the PUD before calling ->pud_entry,
so handlers do not need to worry whether the PUD is stable.

[dave.jiang@intel.com: fix SMP x86 32bit build for native_pud_clear()]
  Link: http://lkml.kernel.org/r/148719066814.31111.3239231168815337012.stgit@djiang5-desk3.ch.intel.com
[dave.jiang@intel.com: native_pud_clear missing on i386 build]
  Link: http://lkml.kernel.org/r/148640375195.69754.3315433724330910314.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545059381.17912.8602162635537598445.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Alexander Kapshuk <alexander.kapshuk@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
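
For context, below is a minimal sketch of how a pagewalk user might register
the new ->pud_entry callback alongside ->pmd_entry.  It assumes the 4.11-era
pagewalk API (struct mm_walk declared in <linux/mm.h>, the old-style
walk_page_range(start, end, &walk) call); the callback names, the thp_counts
structure and count_huge_mappings() are hypothetical and made up for
illustration, not part of this patch:

/*
 * Illustrative only: count huge PUD and PMD mappings in a VMA using the
 * pagewalk API extended by this patch.  Per the commit message above, the
 * pagewalk core locks the PUD before calling ->pud_entry, so the handler
 * can inspect *pud without taking the lock itself.
 */
#include <linux/mm.h>

struct thp_counts {			/* hypothetical result structure */
	unsigned long puds;
	unsigned long pmds;
};

static int count_pud(pud_t *pud, unsigned long addr, unsigned long next,
		     struct mm_walk *walk)
{
	struct thp_counts *counts = walk->private;

	if (pud_trans_huge(*pud) || pud_devmap(*pud))
		counts->puds++;
	return 0;			/* 0 continues the walk */
}

static int count_pmd(pmd_t *pmd, unsigned long addr, unsigned long next,
		     struct mm_walk *walk)
{
	struct thp_counts *counts = walk->private;

	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
		counts->pmds++;
	return 0;
}

/* Hypothetical helper, not part of this patch. */
static int count_huge_mappings(struct vm_area_struct *vma,
			       struct thp_counts *counts)
{
	struct mm_walk walk = {
		.pud_entry = count_pud,	/* new method added by this series */
		.pmd_entry = count_pmd,	/* coexists with ->pud_entry */
		.mm	   = vma->vm_mm,
		.private   = counts,
	};

	return walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

Because ->pud_entry supplements rather than replaces ->pmd_entry and
->pte_entry, existing pagewalk users keep working unchanged and can opt in
to PUD handling by adding a ->pud_entry callback.
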
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	88
1 file changed, 81 insertions(+), 7 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index e721e8eba570..41e2a2d4b2a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1001,7 +1001,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
 			int err;
-			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
 			err = copy_huge_pmd(dst_mm, src_mm,
 					    dst_pmd, src_pmd, addr, vma);
 			if (err == -ENOMEM)
@@ -1032,6 +1032,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
 	src_pud = pud_offset(src_pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
+		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+			int err;
+
+			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+			err = copy_huge_pud(dst_mm, src_mm,
+					    dst_pud, src_pud, addr, vma);
+			if (err == -ENOMEM)
+				return -ENOMEM;
+			if (!err)
+				continue;
+			/* fall through */
+		}
 		if (pud_none_or_clear_bad(src_pud))
 			continue;
 		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
@@ -1263,9 +1275,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
+		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+			if (next - addr != HPAGE_PUD_SIZE) {
+				VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+				split_huge_pud(vma, pud, addr);
+			} else if (zap_huge_pud(tlb, vma, pud, addr))
+				goto next;
+			/* fall through */
+		}
 		if (pud_none_or_clear_bad(pud))
 			continue;
 		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+next:
+		cond_resched();
 	} while (pud++, addr = next, addr != end);
 
 	return addr;
@@ -3490,6 +3512,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
 	return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
 }
 
+static int create_huge_pud(struct vm_fault *vmf)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/* No support for anonymous transparent PUD pages yet */
+	if (vma_is_anonymous(vmf->vma))
+		return VM_FAULT_FALLBACK;
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+	return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/* No support for anonymous transparent PUD pages yet */
+	if (vma_is_anonymous(vmf->vma))
+		return VM_FAULT_FALLBACK;
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+	return VM_FAULT_FALLBACK;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3605,14 +3651,41 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	};
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
-	pud_t *pud;
 	int ret;
 
 	pgd = pgd_offset(mm, address);
-	pud = pud_alloc(mm, pgd, address);
-	if (!pud)
+
+	vmf.pud = pud_alloc(mm, pgd, address);
+	if (!vmf.pud)
 		return VM_FAULT_OOM;
-	vmf.pmd = pmd_alloc(mm, pud, address);
+	if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+		vmf.flags |= FAULT_FLAG_SIZE_PUD;
+		ret = create_huge_pud(&vmf);
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+	} else {
+		pud_t orig_pud = *vmf.pud;
+
+		barrier();
+		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+			unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+			vmf.flags |= FAULT_FLAG_SIZE_PUD;
+
+			/* NUMA case for anonymous PUDs would go here */
+
+			if (dirty && !pud_write(orig_pud)) {
+				ret = wp_huge_pud(&vmf, orig_pud);
+				if (!(ret & VM_FAULT_FALLBACK))
+					return ret;
+			} else {
+				huge_pud_set_accessed(&vmf, orig_pud);
+				return 0;
+			}
+		}
+	}
+
+	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
 	if (!vmf.pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
@@ -3743,13 +3816,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
  */
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
+	spinlock_t *ptl;
 	pmd_t *new = pmd_alloc_one(mm, address);
 	if (!new)
 		return -ENOMEM;
 
 	smp_wmb(); /* See comment in __pte_alloc */
 
-	spin_lock(&mm->page_table_lock);
+	ptl = pud_lock(mm, pud);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (!pud_present(*pud)) {
 		mm_inc_nr_pmds(mm);
@@ -3763,7 +3837,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	} else /* Another has populated it */
 		pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	return 0;
 }
 #endif /* __PAGETABLE_PMD_FOLDED */