diff options
author | Mel Gorman <mgorman@suse.de> | 2012-10-25 08:16:31 -0400 |
---|---|---|
committer | Mel Gorman <mgorman@suse.de> | 2012-12-11 09:42:39 -0500 |
commit | d10e63f29488b0f312a443f9507ea9b6fd3c9090 (patch) | |
tree | b39e3caa5d25e9e5ebad84c606a724e25c6b8e91 | |
parent | 1ba6e0b50b479cbadb8f05ebde3020da9ac87201 (diff) |
mm: numa: Create basic numa page hinting infrastructure
Note: This patch started as "mm/mpol: Create special PROT_NONE
infrastructure" and preserves the basic idea but steals *very*
heavily from "autonuma: numa hinting page faults entry points" for
the actual fault handlers without the migration parts. The end
result is barely recognisable as either patch so all Signed-off
and Reviewed-bys are dropped. If Peter, Ingo and Andrea are ok with
this version, I will re-add the signed-offs-by to reflect the history.
In order to facilitate a lazy -- fault driven -- migration of pages, create
a special transient PAGE_NUMA variant; the 'spurious' protection faults it
generates can then be used to drive our migrations.
The meaning of PAGE_NUMA depends on the architecture but on x86 it is
effectively PROT_NONE. Actual PROT_NONE mappings will not generate these
NUMA faults for the reason that the page fault code checks the permission on
the VMA (and will throw a segmentation fault on actual PROT_NONE mappings),
before it ever calls handle_mm_fault.
[dhillf@gmail.com: Fix typo]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
-rw-r--r-- | include/linux/huge_mm.h | 10 | ||||
-rw-r--r-- | mm/huge_memory.c | 22 | ||||
-rw-r--r-- | mm/memory.c | 112 |
3 files changed, 141 insertions, 3 deletions
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b31cb7da0346..a1d26a98c655 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -159,6 +159,10 @@ static inline struct page *compound_trans_head(struct page *page) | |||
159 | } | 159 | } |
160 | return page; | 160 | return page; |
161 | } | 161 | } |
162 | |||
163 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, | ||
164 | pmd_t pmd, pmd_t *pmdp); | ||
165 | |||
162 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | 166 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ |
163 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) | 167 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) |
164 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) | 168 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) |
@@ -195,6 +199,12 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, | |||
195 | { | 199 | { |
196 | return 0; | 200 | return 0; |
197 | } | 201 | } |
202 | |||
203 | static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, | ||
204 | pmd_t pmd, pmd_t *pmdp) | ||
205 | { | ||
206 | } | ||
207 | |||
198 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 208 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
199 | 209 | ||
200 | #endif /* _LINUX_HUGE_MM_H */ | 210 | #endif /* _LINUX_HUGE_MM_H */ |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cd24aa562144..f5f37630c54d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1018,6 +1018,28 @@ out: | |||
1018 | return page; | 1018 | return page; |
1019 | } | 1019 | } |
1020 | 1020 | ||
1021 | /* NUMA hinting page fault entry point for trans huge pmds */ | ||
1022 | int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, | ||
1023 | pmd_t pmd, pmd_t *pmdp) | ||
1024 | { | ||
1025 | struct page *page; | ||
1026 | unsigned long haddr = addr & HPAGE_PMD_MASK; | ||
1027 | |||
1028 | spin_lock(&mm->page_table_lock); | ||
1029 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1030 | goto out_unlock; | ||
1031 | |||
1032 | page = pmd_page(pmd); | ||
1033 | pmd = pmd_mknonnuma(pmd); | ||
1034 | set_pmd_at(mm, haddr, pmdp, pmd); | ||
1035 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
1036 | update_mmu_cache_pmd(vma, addr, pmdp); | ||
1037 | |||
1038 | out_unlock: | ||
1039 | spin_unlock(&mm->page_table_lock); | ||
1040 | return 0; | ||
1041 | } | ||
1042 | |||
1021 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1043 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1022 | pmd_t *pmd, unsigned long addr) | 1044 | pmd_t *pmd, unsigned long addr) |
1023 | { | 1045 | { |
diff --git a/mm/memory.c b/mm/memory.c index cd8e0daf1912..e30616f2cc3d 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -3448,6 +3448,103 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3448 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3448 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3449 | } | 3449 | } |
3450 | 3450 | ||
3451 | int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3452 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3453 | { | ||
3454 | struct page *page; | ||
3455 | spinlock_t *ptl; | ||
3456 | |||
3457 | /* | ||
3458 | * The "pte" at this point cannot be used safely without | ||
3459 | * validation through pte_unmap_same(). It's of NUMA type but | ||
3460 | * the pfn may be screwed if the read is non atomic. | ||
3461 | * | ||
3462 | * ptep_modify_prot_start is not called as this is clearing | ||
3463 | * the _PAGE_NUMA bit and it is not really expected that there | ||
3464 | * would be concurrent hardware modifications to the PTE. | ||
3465 | */ | ||
3466 | ptl = pte_lockptr(mm, pmd); | ||
3467 | spin_lock(ptl); | ||
3468 | if (unlikely(!pte_same(*ptep, pte))) | ||
3469 | goto out_unlock; | ||
3470 | pte = pte_mknonnuma(pte); | ||
3471 | set_pte_at(mm, addr, ptep, pte); | ||
3472 | update_mmu_cache(vma, addr, ptep); | ||
3473 | |||
3474 | page = vm_normal_page(vma, addr, pte); | ||
3475 | if (!page) { | ||
3476 | pte_unmap_unlock(ptep, ptl); | ||
3477 | return 0; | ||
3478 | } | ||
3479 | |||
3480 | out_unlock: | ||
3481 | pte_unmap_unlock(ptep, ptl); | ||
3482 | return 0; | ||
3483 | } | ||
3484 | |||
3485 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3486 | #ifdef CONFIG_NUMA_BALANCING | ||
3487 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3488 | unsigned long addr, pmd_t *pmdp) | ||
3489 | { | ||
3490 | pmd_t pmd; | ||
3491 | pte_t *pte, *orig_pte; | ||
3492 | unsigned long _addr = addr & PMD_MASK; | ||
3493 | unsigned long offset; | ||
3494 | spinlock_t *ptl; | ||
3495 | bool numa = false; | ||
3496 | |||
3497 | spin_lock(&mm->page_table_lock); | ||
3498 | pmd = *pmdp; | ||
3499 | if (pmd_numa(pmd)) { | ||
3500 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3501 | numa = true; | ||
3502 | } | ||
3503 | spin_unlock(&mm->page_table_lock); | ||
3504 | |||
3505 | if (!numa) | ||
3506 | return 0; | ||
3507 | |||
3508 | /* we're in a page fault so some vma must be in the range */ | ||
3509 | BUG_ON(!vma); | ||
3510 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3511 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3512 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3513 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3514 | pte += offset >> PAGE_SHIFT; | ||
3515 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3516 | pte_t pteval = *pte; | ||
3517 | struct page *page; | ||
3518 | if (!pte_present(pteval)) | ||
3519 | continue; | ||
3520 | if (!pte_numa(pteval)) | ||
3521 | continue; | ||
3522 | if (addr >= vma->vm_end) { | ||
3523 | vma = find_vma(mm, addr); | ||
3524 | /* there's a pte present so there must be a vma */ | ||
3525 | BUG_ON(!vma); | ||
3526 | BUG_ON(addr < vma->vm_start); | ||
3527 | } | ||
3528 | if (pte_numa(pteval)) { | ||
3529 | pteval = pte_mknonnuma(pteval); | ||
3530 | set_pte_at(mm, addr, pte, pteval); | ||
3531 | } | ||
3532 | page = vm_normal_page(vma, addr, pteval); | ||
3533 | if (unlikely(!page)) | ||
3534 | continue; | ||
3535 | } | ||
3536 | pte_unmap_unlock(orig_pte, ptl); | ||
3537 | |||
3538 | return 0; | ||
3539 | } | ||
3540 | #else | ||
3541 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3542 | unsigned long addr, pmd_t *pmdp) | ||
3543 | { | ||
3544 | BUG(); | ||
3545 | } | ||
3546 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3547 | |||
3451 | /* | 3548 | /* |
3452 | * These routines also need to handle stuff like marking pages dirty | 3549 | * These routines also need to handle stuff like marking pages dirty |
3453 | * and/or accessed for architectures that don't do it in hardware (most | 3550 | * and/or accessed for architectures that don't do it in hardware (most |
@@ -3486,6 +3583,9 @@ int handle_pte_fault(struct mm_struct *mm, | |||
3486 | pte, pmd, flags, entry); | 3583 | pte, pmd, flags, entry); |
3487 | } | 3584 | } |
3488 | 3585 | ||
3586 | if (pte_numa(entry)) | ||
3587 | return do_numa_page(mm, vma, address, entry, pte, pmd); | ||
3588 | |||
3489 | ptl = pte_lockptr(mm, pmd); | 3589 | ptl = pte_lockptr(mm, pmd); |
3490 | spin_lock(ptl); | 3590 | spin_lock(ptl); |
3491 | if (unlikely(!pte_same(*pte, entry))) | 3591 | if (unlikely(!pte_same(*pte, entry))) |
@@ -3554,9 +3654,11 @@ retry: | |||
3554 | 3654 | ||
3555 | barrier(); | 3655 | barrier(); |
3556 | if (pmd_trans_huge(orig_pmd)) { | 3656 | if (pmd_trans_huge(orig_pmd)) { |
3557 | if (flags & FAULT_FLAG_WRITE && | 3657 | if (pmd_numa(*pmd)) |
3558 | !pmd_write(orig_pmd) && | 3658 | return do_huge_pmd_numa_page(mm, address, |
3559 | !pmd_trans_splitting(orig_pmd)) { | 3659 | orig_pmd, pmd); |
3660 | |||
3661 | if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { | ||
3560 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3662 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3561 | orig_pmd); | 3663 | orig_pmd); |
3562 | /* | 3664 | /* |
@@ -3568,10 +3670,14 @@ retry: | |||
3568 | goto retry; | 3670 | goto retry; |
3569 | return ret; | 3671 | return ret; |
3570 | } | 3672 | } |
3673 | |||
3571 | return 0; | 3674 | return 0; |
3572 | } | 3675 | } |
3573 | } | 3676 | } |
3574 | 3677 | ||
3678 | if (pmd_numa(*pmd)) | ||
3679 | return do_pmd_numa_page(mm, vma, address, pmd); | ||
3680 | |||
3575 | /* | 3681 | /* |
3576 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3682 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3577 | * run pte_offset_map on the pmd, if an huge pmd could | 3683 | * run pte_offset_map on the pmd, if an huge pmd could |