author     Nishanth Aravamudan <nacc@us.ibm.com>    2007-10-16 04:26:24 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>    2007-10-16 12:43:03 -0400
commit     63b4613c3f0d4b724ba259dc6c201bb68b884e1a
tree       878818a47052fd204aa0a5d34e592967732d59f9 /mm/hugetlb.c
parent     6b0c880dfefecedb9ad353014ed41505c32aca82
hugetlb: fix hugepage allocation with memoryless nodes
Anton found a problem with the hugetlb pool allocation when some nodes have no memory (http://marc.info/?l=linux-mm&m=118133042025995&w=2). Lee worked on versions that tried to fix it, but none were accepted. Christoph has created a set of patches which allow for GFP_THISNODE allocations to fail if the node has no memory.

Currently, alloc_fresh_huge_page() returns NULL when it is not able to allocate a huge page on the current node, as specified by its custom interleave variable. The callers of this function, though, assume that a failure in alloc_fresh_huge_page() indicates no hugepages can be allocated on the system, period. This might not be the case, for instance, if we have an uneven NUMA system, and we happen to try to allocate a hugepage on a node with less memory and fail, while there is still plenty of free memory on the other nodes.

To correct this, make alloc_fresh_huge_page() search through all online nodes before deciding no hugepages can be allocated. Add a helper function for actually allocating the hugepage. Use a new global nid iterator to control which nid to allocate on.

Note: we expect particular semantics for __GFP_THISNODE, which are now enforced even for memoryless nodes. That is, there is no fallback to other nodes. Therefore, we rely on the nid passed into alloc_pages_node() to be the nid the page comes from. If this is incorrect, accounting will break.

Tested on x86 !NUMA, x86 NUMA, x86_64 NUMA and ppc64 NUMA (with 2 memoryless nodes).

Before, on the ppc64 box:

Trying to clear the hugetlb pool
Done. 0 free

Trying to resize the pool to 100
Node 0 HugePages_Free:  25
Node 1 HugePages_Free:  75
Node 2 HugePages_Free:   0
Node 3 HugePages_Free:   0
Done. Initially 100 free

Trying to resize the pool to 200
Node 0 HugePages_Free:  50
Node 1 HugePages_Free: 150
Node 2 HugePages_Free:   0
Node 3 HugePages_Free:   0
Done. 200 free

After:

Trying to clear the hugetlb pool
Done. 0 free

Trying to resize the pool to 100
Node 0 HugePages_Free:  50
Node 1 HugePages_Free:  50
Node 2 HugePages_Free:   0
Node 3 HugePages_Free:   0
Done. Initially 100 free

Trying to resize the pool to 200
Node 0 HugePages_Free: 100
Node 1 HugePages_Free: 100
Node 2 HugePages_Free:   0
Node 3 HugePages_Free:   0
Done. 200 free

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <hermes@gibson.dropbear.id.au>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Ken Chen <kenchen@google.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
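The core of the fix is the round-robin search over online nodes described above. As a rough illustration only (the actual patch follows below), here is a minimal user-space C sketch of that idea; the online[] array, try_alloc_on_node() and next_online_node() are made-up stand-ins for the kernel's node_online_map, alloc_fresh_huge_page_node() and next_node()/first_node() helpers, and they are not part of the patch.

/*
 * Minimal user-space sketch of the round-robin node search.
 * Assumes at least one node is marked online.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

static bool online[MAX_NODES] = { true, true, false, true };
static int next_nid;    /* persists across calls, like hugetlb_next_nid */

/* Stand-in for the per-node allocation; pretend only node 3 has memory. */
static bool try_alloc_on_node(int nid)
{
        return nid == 3;
}

/* Advance to the next online node, wrapping around at the end. */
static int next_online_node(int nid)
{
        do {
                nid = (nid + 1) % MAX_NODES;
        } while (!online[nid]);
        return nid;
}

/* Returns 1 if any online node could satisfy the allocation. */
static int alloc_fresh(void)
{
        int start_nid = next_nid;
        int ret = 0;
        bool got;

        do {
                got = try_alloc_on_node(next_nid);
                if (got)
                        ret = 1;
                /* Always move on so the next caller hits the next node. */
                next_nid = next_online_node(next_nid);
        } while (!got && next_nid != start_nid);

        return ret;
}

int main(void)
{
        next_nid = 0;   /* start at the first online node, as hugetlb_init() does */
        printf("allocation %s\n", alloc_fresh() ? "succeeded" : "failed");
        printf("next attempt starts at node %d\n", next_nid);
        return 0;
}

The point the sketch makes is the same one the commit message argues: a single failed node no longer ends the search, and the iterator always advances so successive callers spread their allocations across nodes.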
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  63
1 file changed, 43 insertions(+), 20 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8fb86ba452b0..82efecbab96f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -32,6 +32,7 @@ static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
 int hugetlb_dynamic_pool;
+static int hugetlb_next_nid;
 
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
@@ -165,36 +166,56 @@ static int adjust_pool_surplus(int delta)
         return ret;
 }
 
-static int alloc_fresh_huge_page(void)
+static struct page *alloc_fresh_huge_page_node(int nid)
 {
-        static int prev_nid;
         struct page *page;
-        int nid;
-
-        /*
-         * Copy static prev_nid to local nid, work on that, then copy it
-         * back to prev_nid afterwards: otherwise there's a window in which
-         * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
-         * But we don't need to use a spin_lock here: it really doesn't
-         * matter if occasionally a racer chooses the same nid as we do.
-         */
-        nid = next_node(prev_nid, node_online_map);
-        if (nid == MAX_NUMNODES)
-                nid = first_node(node_online_map);
-        prev_nid = nid;
 
-        page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
-                                HUGETLB_PAGE_ORDER);
+        page = alloc_pages_node(nid,
+                htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
+                HUGETLB_PAGE_ORDER);
         if (page) {
                 set_compound_page_dtor(page, free_huge_page);
                 spin_lock(&hugetlb_lock);
                 nr_huge_pages++;
-                nr_huge_pages_node[page_to_nid(page)]++;
+                nr_huge_pages_node[nid]++;
                 spin_unlock(&hugetlb_lock);
                 put_page(page); /* free it into the hugepage allocator */
-                return 1;
         }
-        return 0;
+
+        return page;
+}
+
+static int alloc_fresh_huge_page(void)
+{
+        struct page *page;
+        int start_nid;
+        int next_nid;
+        int ret = 0;
+
+        start_nid = hugetlb_next_nid;
+
+        do {
+                page = alloc_fresh_huge_page_node(hugetlb_next_nid);
+                if (page)
+                        ret = 1;
+                /*
+                 * Use a helper variable to find the next node and then
+                 * copy it back to hugetlb_next_nid afterwards:
+                 * otherwise there's a window in which a racer might
+                 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+                 * But we don't need to use a spin_lock here: it really
+                 * doesn't matter if occasionally a racer chooses the
+                 * same nid as we do.  Move nid forward in the mask even
+                 * if we just successfully allocated a hugepage so that
+                 * the next caller gets hugepages on the next node.
+                 */
+                next_nid = next_node(hugetlb_next_nid, node_online_map);
+                if (next_nid == MAX_NUMNODES)
+                        next_nid = first_node(node_online_map);
+                hugetlb_next_nid = next_nid;
+        } while (!page && hugetlb_next_nid != start_nid);
+
+        return ret;
 }
 
 static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
@@ -365,6 +386,8 @@ static int __init hugetlb_init(void)
         for (i = 0; i < MAX_NUMNODES; ++i)
                 INIT_LIST_HEAD(&hugepage_freelists[i]);
 
+        hugetlb_next_nid = first_node(node_online_map);
+
         for (i = 0; i < max_huge_pages; ++i) {
                 if (!alloc_fresh_huge_page())
                         break;