author     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>   2015-02-11 18:27:12 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>       2015-02-11 20:06:04 -0500
commit     077fcf116c8c2bd7ee9487b645aa3b50368db7e1 (patch)
tree       29e2513e00bcc29395a19c696a6d14f52e3c5b1d /mm
parent     24e2716f63e613cf15d3beba3faa0711bcacc427 (diff)
mm/thp: allocate transparent hugepages on local node
This makes sure that we try to allocate hugepages from the local node if allowed by mempolicy.  If we can't, we fall back to small page allocation based on mempolicy.  This is based on the observation that allocating pages on the local node is more beneficial than allocating hugepages on a remote node.

With this patch applied we may see transparent huge page allocation failures if the current node doesn't have enough free hugepages.  Before this patch such failures resulted in us retrying the allocation on other nodes in the NUMA node mask.

[akpm@linux-foundation.org: fix comment, add CONFIG_TRANSPARENT_HUGEPAGE dependency]
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
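For orientation, the decision the new mm/mempolicy.c helper makes can be condensed roughly as follows.  This is a simplified sketch, not part of the patch: the helper name thp_try_local_node() is hypothetical and the cpuset retry loop is omitted; the real implementation is alloc_hugepage_vma() in the second hunk below, and it relies on the mm-internal APIs (get_vma_policy(), policy_nodemask(), alloc_pages_vma(), ...) shown there.

/*
 * Condensed sketch of the new THP allocation policy (hypothetical helper
 * name; cpuset retry handling omitted).  See alloc_hugepage_vma() in the
 * mm/mempolicy.c hunk below for the real implementation.
 */
static struct page *thp_try_local_node(gfp_t gfp, struct vm_area_struct *vma,
				       unsigned long addr, int order)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);
	int node = numa_node_id();
	nodemask_t *nmask;

	/* Interleave keeps its usual node rotation and fallback. */
	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
		mpol_cond_put(pol);
		return alloc_pages_vma(gfp, order, vma, addr, node);
	}

	nmask = policy_nodemask(gfp, pol);
	if (!nmask || node_isset(node, *nmask)) {
		/* Local node allowed: try it and do NOT fall back to other nodes. */
		mpol_cond_put(pol);
		return alloc_pages_exact_node(node, gfp, order);
	}

	/* Local node not in the policy mask: use the normal fallback allocator. */
	mpol_cond_put(pol);
	return alloc_pages_vma(gfp, order, vma, addr, node);
}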
Diffstat (limited to 'mm')
-rw-r--r--  mm/huge_memory.c  24
-rw-r--r--  mm/mempolicy.c    72
2 files changed, 81 insertions, 15 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 889713180980..0531ea7dd7cf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -761,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
 }
 
-static inline struct page *alloc_hugepage_vma(int defrag,
-					      struct vm_area_struct *vma,
-					      unsigned long haddr, int nd,
-					      gfp_t extra_gfp)
-{
-	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
-			       HPAGE_PMD_ORDER, vma, haddr, nd);
-}
-
 /* Caller must hold page table lock. */
 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -790,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			       unsigned long address, pmd_t *pmd,
 			       unsigned int flags)
 {
+	gfp_t gfp;
 	struct page *page;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
 
@@ -824,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		return 0;
 	}
-	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-				  vma, haddr, numa_node_id(), 0);
+	gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1113,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_unlock(ptl);
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
-	    !transparent_hugepage_debug_cow())
-		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					      vma, haddr, numa_node_id(), 0);
-	else
+	    !transparent_hugepage_debug_cow()) {
+		gfp_t gfp;
+
+		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+	} else
 		new_page = NULL;
 
 	if (unlikely(!new_page)) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b8c39c..8a32873fdbf7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2030,6 +2030,78 @@ retry_cpuset:
 	return page;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/**
+ * alloc_hugepage_vma: Allocate a hugepage for a VMA
+ * @gfp:
+ *   %GFP_USER    user allocation.
+ *   %GFP_KERNEL  kernel allocations,
+ *   %GFP_HIGHMEM highmem/user allocations,
+ *   %GFP_FS      allocation should not call back into a file system.
+ *   %GFP_ATOMIC  don't sleep.
+ *
+ * @vma: Pointer to VMA or NULL if not available.
+ * @addr: Virtual address of the allocation. Must be inside the VMA.
+ * @order: Order of the hugepage for gfp allocation.
+ *
+ * This function allocates a huge page from the kernel page pool and applies
+ * the NUMA policy associated with the VMA or the current process.
+ * For policies other than %MPOL_INTERLEAVE, we make sure we allocate the hugepage
+ * only from the current node if the current node is part of the node mask.
+ * If we can't allocate a hugepage we fail the allocation and don't try to fall back
+ * to other nodes in the node mask. If the current node is not part of the node mask
+ * or if the NUMA policy is MPOL_INTERLEAVE we use the allocator that can
+ * fall back to nodes in the policy node mask.
+ *
+ * When VMA is not NULL the caller must hold down_read on the mmap_sem of the
+ * mm_struct of the VMA to prevent it from going away. Should be used for
+ * all allocations for pages that will be mapped into
+ * user space. Returns NULL when no page can be allocated.
+ *
+ * Should be called with vma->vm_mm->mmap_sem held.
+ *
+ */
+struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma,
+				unsigned long addr, int order)
+{
+	struct page *page;
+	nodemask_t *nmask;
+	struct mempolicy *pol;
+	int node = numa_node_id();
+	unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+	pol = get_vma_policy(vma, addr);
+	cpuset_mems_cookie = read_mems_allowed_begin();
+	/*
+	 * For interleave policy, we don't worry about
+	 * the current node. Otherwise, if the current node is
+	 * in the nodemask, try to allocate the hugepage from
+	 * the current node. Don't fall back to other nodes
+	 * for THP.
+	 */
+	if (unlikely(pol->mode == MPOL_INTERLEAVE))
+		goto alloc_with_fallback;
+	nmask = policy_nodemask(gfp, pol);
+	if (!nmask || node_isset(node, *nmask)) {
+		mpol_cond_put(pol);
+		page = alloc_pages_exact_node(node, gfp, order);
+		if (unlikely(!page &&
+			     read_mems_allowed_retry(cpuset_mems_cookie)))
+			goto retry_cpuset;
+		return page;
+	}
+alloc_with_fallback:
+	mpol_cond_put(pol);
+	/*
+	 * If the current node is not part of the node mask, try
+	 * the allocation from any node, and we can do the retry
+	 * in that case.
+	 */
+	return alloc_pages_vma(gfp, order, vma, addr, node);
+}
+#endif
+
 /**
  * alloc_pages_current - Allocate pages.
  *