author		Matthew Wilcox <willy@linux.intel.com>	2017-02-24 17:57:02 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-02-24 20:46:54 -0500
commit		a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517 (patch)
tree		54d78e89c63e519cb9e00fdab9efbf3189ef2f5e /mm/memory.c
parent		a2d581675d485eb7188f521f36efc114639a3096 (diff)
mm, x86: add support for PUD-sized transparent hugepages
The current transparent hugepage code only supports PMDs.  This patch adds
support for transparent use of PUDs with DAX.  It does not include support
for anonymous pages.  x86 support code is also added.

Most of this patch simply parallels the work that was done for huge PMDs.
The only major difference is how the new ->pud_entry method in mm_walk
works.  The ->pmd_entry method replaces the ->pte_entry method, whereas
the ->pud_entry method works along with either ->pmd_entry or ->pte_entry.
The pagewalk code takes care of locking the PUD before calling ->pud_entry,
so handlers do not need to worry whether the PUD is stable.

[dave.jiang@intel.com: fix SMP x86 32bit build for native_pud_clear()]
  Link: http://lkml.kernel.org/r/148719066814.31111.3239231168815337012.stgit@djiang5-desk3.ch.intel.com
[dave.jiang@intel.com: native_pud_clear missing on i386 build]
  Link: http://lkml.kernel.org/r/148640375195.69754.3315433724330910314.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545059381.17912.8602162635537598445.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Alexander Kapshuk <alexander.kapshuk@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
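
For context, below is a minimal sketch of how a pagewalk user might register
the new ->pud_entry callback alongside ->pmd_entry.  It assumes the 4.11-era
pagewalk API (struct mm_walk declared in <linux/mm.h>, the old-style
walk_page_range(start, end, &walk) call); the callback names, the thp_counts
structure and count_huge_mappings() are hypothetical and made up for
illustration, not part of this patch:

/*
 * Illustrative only: count huge PUD and PMD mappings in a VMA using the
 * pagewalk API extended by this patch.  Per the commit message above, the
 * pagewalk core locks the PUD before calling ->pud_entry, so the handler
 * can inspect *pud without taking the lock itself.
 */
#include <linux/mm.h>

struct thp_counts {			/* hypothetical result structure */
	unsigned long puds;
	unsigned long pmds;
};

static int count_pud(pud_t *pud, unsigned long addr, unsigned long next,
		     struct mm_walk *walk)
{
	struct thp_counts *counts = walk->private;

	if (pud_trans_huge(*pud) || pud_devmap(*pud))
		counts->puds++;
	return 0;			/* 0 continues the walk */
}

static int count_pmd(pmd_t *pmd, unsigned long addr, unsigned long next,
		     struct mm_walk *walk)
{
	struct thp_counts *counts = walk->private;

	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
		counts->pmds++;
	return 0;
}

/* Hypothetical helper, not part of this patch. */
static int count_huge_mappings(struct vm_area_struct *vma,
			       struct thp_counts *counts)
{
	struct mm_walk walk = {
		.pud_entry = count_pud,	/* new method added by this series */
		.pmd_entry = count_pmd,	/* coexists with ->pud_entry */
		.mm	   = vma->vm_mm,
		.private   = counts,
	};

	return walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

Because ->pud_entry supplements rather than replaces ->pmd_entry and
->pte_entry, existing pagewalk users keep working unchanged and can opt in
to PUD handling by adding a ->pud_entry callback.
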
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	88
1 file changed, 81 insertions(+), 7 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index e721e8eba570..41e2a2d4b2a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1001,7 +1001,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
 			int err;
-			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
 			err = copy_huge_pmd(dst_mm, src_mm,
 					    dst_pmd, src_pmd, addr, vma);
 			if (err == -ENOMEM)
@@ -1032,6 +1032,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
 	src_pud = pud_offset(src_pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
+		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+			int err;
+
+			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+			err = copy_huge_pud(dst_mm, src_mm,
+					    dst_pud, src_pud, addr, vma);
+			if (err == -ENOMEM)
+				return -ENOMEM;
+			if (!err)
+				continue;
+			/* fall through */
+		}
 		if (pud_none_or_clear_bad(src_pud))
 			continue;
 		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
@@ -1263,9 +1275,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
+		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+			if (next - addr != HPAGE_PUD_SIZE) {
+				VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+				split_huge_pud(vma, pud, addr);
+			} else if (zap_huge_pud(tlb, vma, pud, addr))
+				goto next;
+			/* fall through */
+		}
 		if (pud_none_or_clear_bad(pud))
 			continue;
 		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+next:
+		cond_resched();
 	} while (pud++, addr = next, addr != end);
 
 	return addr;
@@ -3490,6 +3512,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
 	return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
 }
 
+static int create_huge_pud(struct vm_fault *vmf)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/* No support for anonymous transparent PUD pages yet */
+	if (vma_is_anonymous(vmf->vma))
+		return VM_FAULT_FALLBACK;
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+	return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/* No support for anonymous transparent PUD pages yet */
+	if (vma_is_anonymous(vmf->vma))
+		return VM_FAULT_FALLBACK;
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+	return VM_FAULT_FALLBACK;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3605,14 +3651,41 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	};
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
-	pud_t *pud;
 	int ret;
 
 	pgd = pgd_offset(mm, address);
-	pud = pud_alloc(mm, pgd, address);
-	if (!pud)
+
+	vmf.pud = pud_alloc(mm, pgd, address);
+	if (!vmf.pud)
 		return VM_FAULT_OOM;
-	vmf.pmd = pmd_alloc(mm, pud, address);
+	if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+		vmf.flags |= FAULT_FLAG_SIZE_PUD;
+		ret = create_huge_pud(&vmf);
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+	} else {
+		pud_t orig_pud = *vmf.pud;
+
+		barrier();
+		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+			unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+			vmf.flags |= FAULT_FLAG_SIZE_PUD;
+
+			/* NUMA case for anonymous PUDs would go here */
+
+			if (dirty && !pud_write(orig_pud)) {
+				ret = wp_huge_pud(&vmf, orig_pud);
+				if (!(ret & VM_FAULT_FALLBACK))
+					return ret;
+			} else {
+				huge_pud_set_accessed(&vmf, orig_pud);
+				return 0;
+			}
+		}
+	}
+
+	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
 	if (!vmf.pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
@@ -3743,13 +3816,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
  */
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
+	spinlock_t *ptl;
 	pmd_t *new = pmd_alloc_one(mm, address);
 	if (!new)
 		return -ENOMEM;
 
 	smp_wmb(); /* See comment in __pte_alloc */
 
-	spin_lock(&mm->page_table_lock);
+	ptl = pud_lock(mm, pud);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (!pud_present(*pud)) {
 		mm_inc_nr_pmds(mm);
@@ -3763,7 +3837,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	} else /* Another has populated it */
 		pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	return 0;
 }
 #endif /* __PAGETABLE_PMD_FOLDED */