aboutsummaryrefslogtreecommitdiffstats
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
author Nishanth Aravamudan <nacc@us.ibm.com> 2007-10-16 04:26:24 -0400
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-16 12:43:03 -0400
commit 63b4613c3f0d4b724ba259dc6c201bb68b884e1a (patch)
tree 878818a47052fd204aa0a5d34e592967732d59f9 /mm/hugetlb.c
parent 6b0c880dfefecedb9ad353014ed41505c32aca82 (diff)
hugetlb: fix hugepage allocation with memoryless nodes
Anton found a problem with the hugetlb pool allocation when some nodes have no memory (http://marc.info/?l=linux-mm&m=118133042025995&w=2). Lee worked on versions that tried to fix it, but none were accepted. Christoph has created a set of patches which allow for GFP_THISNODE allocations to fail if the node has no memory. Currently, alloc_fresh_huge_page() returns NULL when it is not able to allocate a huge page on the current node, as specified by its custom interleave variable. The callers of this function, though, assume that a failure in alloc_fresh_huge_page() indicates no hugepages can be allocated on the system, period. This might not be the case, for instance, if we have an uneven NUMA system, and we happen to try to allocate a hugepage on a node with less memory and fail, while there is still plenty of free memory on the other nodes. To correct this, make alloc_fresh_huge_page() search through all online nodes before deciding no hugepages can be allocated. Add a helper function for actually allocating the hugepage. Use a new global nid iterator to control which nid to allocate on. Note: we expect particular semantics for __GFP_THISNODE, which are now enforced even for memoryless nodes. That is, there should be no fallback to other nodes. Therefore, we rely on the nid passed into alloc_pages_node() to be the nid the page comes from. If this is incorrect, accounting will break. Tested on x86 !NUMA, x86 NUMA, x86_64 NUMA and ppc64 NUMA (with 2 memoryless nodes). Before on the ppc64 box: Trying to clear the hugetlb pool Done. 0 free Trying to resize the pool to 100 Node 0 HugePages_Free: 25 Node 1 HugePages_Free: 75 Node 2 HugePages_Free: 0 Node 3 HugePages_Free: 0 Done. Initially 100 free Trying to resize the pool to 200 Node 0 HugePages_Free: 50 Node 1 HugePages_Free: 150 Node 2 HugePages_Free: 0 Node 3 HugePages_Free: 0 Done. 200 free After: Trying to clear the hugetlb pool Done. 
0 free Trying to resize the pool to 100 Node 0 HugePages_Free: 50 Node 1 HugePages_Free: 50 Node 2 HugePages_Free: 0 Node 3 HugePages_Free: 0 Done. Initially 100 free Trying to resize the pool to 200 Node 0 HugePages_Free: 100 Node 1 HugePages_Free: 100 Node 2 HugePages_Free: 0 Node 3 HugePages_Free: 0 Done. 200 free Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> Acked-by: Christoph Lameter <clameter@sgi.com> Cc: Adam Litke <agl@us.ibm.com> Cc: David Gibson <hermes@gibson.dropbear.id.au> Cc: Badari Pulavarty <pbadari@us.ibm.com> Cc: Ken Chen <kenchen@google.com> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- mm/hugetlb.c 63
1 files changed, 43 insertions, 20 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8fb86ba452b..82efecbab96 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -32,6 +32,7 @@ static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
32static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 32static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
33unsigned long hugepages_treat_as_movable; 33unsigned long hugepages_treat_as_movable;
34int hugetlb_dynamic_pool; 34int hugetlb_dynamic_pool;
35static int hugetlb_next_nid;
35 36
36/* 37/*
37 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 38 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
@@ -165,36 +166,56 @@ static int adjust_pool_surplus(int delta)
165 return ret; 166 return ret;
166} 167}
167 168
168static int alloc_fresh_huge_page(void) 169static struct page *alloc_fresh_huge_page_node(int nid)
169{ 170{
170 static int prev_nid;
171 struct page *page; 171 struct page *page;
172 int nid;
173
174 /*
175 * Copy static prev_nid to local nid, work on that, then copy it
176 * back to prev_nid afterwards: otherwise there's a window in which
177 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
178 * But we don't need to use a spin_lock here: it really doesn't
179 * matter if occasionally a racer chooses the same nid as we do.
180 */
181 nid = next_node(prev_nid, node_online_map);
182 if (nid == MAX_NUMNODES)
183 nid = first_node(node_online_map);
184 prev_nid = nid;
185 172
186 page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, 173 page = alloc_pages_node(nid,
187 HUGETLB_PAGE_ORDER); 174 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
175 HUGETLB_PAGE_ORDER);
188 if (page) { 176 if (page) {
189 set_compound_page_dtor(page, free_huge_page); 177 set_compound_page_dtor(page, free_huge_page);
190 spin_lock(&hugetlb_lock); 178 spin_lock(&hugetlb_lock);
191 nr_huge_pages++; 179 nr_huge_pages++;
192 nr_huge_pages_node[page_to_nid(page)]++; 180 nr_huge_pages_node[nid]++;
193 spin_unlock(&hugetlb_lock); 181 spin_unlock(&hugetlb_lock);
194 put_page(page); /* free it into the hugepage allocator */ 182 put_page(page); /* free it into the hugepage allocator */
195 return 1;
196 } 183 }
197 return 0; 184
185 return page;
186}
187
188static int alloc_fresh_huge_page(void)
189{
190 struct page *page;
191 int start_nid;
192 int next_nid;
193 int ret = 0;
194
195 start_nid = hugetlb_next_nid;
196
197 do {
198 page = alloc_fresh_huge_page_node(hugetlb_next_nid);
199 if (page)
200 ret = 1;
201 /*
202 * Use a helper variable to find the next node and then
203 * copy it back to hugetlb_next_nid afterwards:
204 * otherwise there's a window in which a racer might
205 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
206 * But we don't need to use a spin_lock here: it really
207 * doesn't matter if occasionally a racer chooses the
208 * same nid as we do. Move nid forward in the mask even
209 * if we just successfully allocated a hugepage so that
210 * the next caller gets hugepages on the next node.
211 */
212 next_nid = next_node(hugetlb_next_nid, node_online_map);
213 if (next_nid == MAX_NUMNODES)
214 next_nid = first_node(node_online_map);
215 hugetlb_next_nid = next_nid;
216 } while (!page && hugetlb_next_nid != start_nid);
217
218 return ret;
198} 219}
199 220
200static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 221static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
@@ -365,6 +386,8 @@ static int __init hugetlb_init(void)
365 for (i = 0; i < MAX_NUMNODES; ++i) 386 for (i = 0; i < MAX_NUMNODES; ++i)
366 INIT_LIST_HEAD(&hugepage_freelists[i]); 387 INIT_LIST_HEAD(&hugepage_freelists[i]);
367 388
389 hugetlb_next_nid = first_node(node_online_map);
390
368 for (i = 0; i < max_huge_pages; ++i) { 391 for (i = 0; i < max_huge_pages; ++i) {
369 if (!alloc_fresh_huge_page()) 392 if (!alloc_fresh_huge_page())
370 break; 393 break;