diff options
author | Adam Litke <agl@us.ibm.com> | 2007-10-16 04:26:18 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-16 12:43:02 -0400 |
commit | 7893d1d505d59db9d4f35165c8b6d3c6dff40a32 (patch) | |
tree | 6bea3b41e111b1d1774980296a032012a3926e9c | |
parent | 6af2acb6619688046039234f716fd003e6ed2b3f (diff) |
hugetlb: Try to grow hugetlb pool for MAP_PRIVATE mappings
Because we overcommit hugepages for MAP_PRIVATE mappings, it is possible that
the hugetlb pool will be exhausted or completely reserved when a hugepage is
needed to satisfy a page fault. Before killing the process in this situation,
try to allocate a hugepage directly from the buddy allocator.
The explicitly configured pool size becomes a low watermark. When dynamically
grown, the allocated huge pages are accounted as a surplus over the watermark.
As huge pages are freed on a node, surplus pages are released to the buddy
allocator so that the pool will shrink back to the watermark.
Surplus accounting also allows for friendlier explicit pool resizing. When
shrinking a pool that is fully in use, increase the surplus so pages will be
returned to the buddy allocator as soon as they are freed. When growing a
pool that has a surplus, consume the surplus first and then allocate new
pages.
Signed-off-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Dave McCracken <dave.mccracken@oracle.com>
Cc: William Irwin <bill.irwin@oracle.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Ken Chen <kenchen@google.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/hugetlb.c | 139 |
1 files changed, 125 insertions, 14 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ba029d640740..8768e5250323 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,10 +23,12 @@ | |||
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; |
26 | static unsigned long surplus_huge_pages; | ||
26 | unsigned long max_huge_pages; | 27 | unsigned long max_huge_pages; |
27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 28 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 29 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
29 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | 30 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; |
31 | static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | ||
30 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 32 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
31 | unsigned long hugepages_treat_as_movable; | 33 | unsigned long hugepages_treat_as_movable; |
32 | 34 | ||
@@ -109,15 +111,57 @@ static void update_and_free_page(struct page *page) | |||
109 | 111 | ||
110 | static void free_huge_page(struct page *page) | 112 | static void free_huge_page(struct page *page) |
111 | { | 113 | { |
112 | BUG_ON(page_count(page)); | 114 | int nid = page_to_nid(page); |
113 | 115 | ||
116 | BUG_ON(page_count(page)); | ||
114 | INIT_LIST_HEAD(&page->lru); | 117 | INIT_LIST_HEAD(&page->lru); |
115 | 118 | ||
116 | spin_lock(&hugetlb_lock); | 119 | spin_lock(&hugetlb_lock); |
117 | enqueue_huge_page(page); | 120 | if (surplus_huge_pages_node[nid]) { |
121 | update_and_free_page(page); | ||
122 | surplus_huge_pages--; | ||
123 | surplus_huge_pages_node[nid]--; | ||
124 | } else { | ||
125 | enqueue_huge_page(page); | ||
126 | } | ||
118 | spin_unlock(&hugetlb_lock); | 127 | spin_unlock(&hugetlb_lock); |
119 | } | 128 | } |
120 | 129 | ||
130 | /* | ||
131 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
132 | * balanced by operating on them in a round-robin fashion. | ||
133 | * Returns 1 if an adjustment was made. | ||
134 | */ | ||
135 | static int adjust_pool_surplus(int delta) | ||
136 | { | ||
137 | static int prev_nid; | ||
138 | int nid = prev_nid; | ||
139 | int ret = 0; | ||
140 | |||
141 | VM_BUG_ON(delta != -1 && delta != 1); | ||
142 | do { | ||
143 | nid = next_node(nid, node_online_map); | ||
144 | if (nid == MAX_NUMNODES) | ||
145 | nid = first_node(node_online_map); | ||
146 | |||
147 | /* To shrink on this node, there must be a surplus page */ | ||
148 | if (delta < 0 && !surplus_huge_pages_node[nid]) | ||
149 | continue; | ||
150 | /* Surplus cannot exceed the total number of pages */ | ||
151 | if (delta > 0 && surplus_huge_pages_node[nid] >= | ||
152 | nr_huge_pages_node[nid]) | ||
153 | continue; | ||
154 | |||
155 | surplus_huge_pages += delta; | ||
156 | surplus_huge_pages_node[nid] += delta; | ||
157 | ret = 1; | ||
158 | break; | ||
159 | } while (nid != prev_nid); | ||
160 | |||
161 | prev_nid = nid; | ||
162 | return ret; | ||
163 | } | ||
164 | |||
121 | static int alloc_fresh_huge_page(void) | 165 | static int alloc_fresh_huge_page(void) |
122 | { | 166 | { |
123 | static int prev_nid; | 167 | static int prev_nid; |
@@ -150,10 +194,30 @@ static int alloc_fresh_huge_page(void) | |||
150 | return 0; | 194 | return 0; |
151 | } | 195 | } |
152 | 196 | ||
197 | static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | ||
198 | unsigned long address) | ||
199 | { | ||
200 | struct page *page; | ||
201 | |||
202 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, | ||
203 | HUGETLB_PAGE_ORDER); | ||
204 | if (page) { | ||
205 | set_compound_page_dtor(page, free_huge_page); | ||
206 | spin_lock(&hugetlb_lock); | ||
207 | nr_huge_pages++; | ||
208 | nr_huge_pages_node[page_to_nid(page)]++; | ||
209 | surplus_huge_pages++; | ||
210 | surplus_huge_pages_node[page_to_nid(page)]++; | ||
211 | spin_unlock(&hugetlb_lock); | ||
212 | } | ||
213 | |||
214 | return page; | ||
215 | } | ||
216 | |||
153 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 217 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
154 | unsigned long addr) | 218 | unsigned long addr) |
155 | { | 219 | { |
156 | struct page *page; | 220 | struct page *page = NULL; |
157 | 221 | ||
158 | spin_lock(&hugetlb_lock); | 222 | spin_lock(&hugetlb_lock); |
159 | if (vma->vm_flags & VM_MAYSHARE) | 223 | if (vma->vm_flags & VM_MAYSHARE) |
@@ -173,7 +237,16 @@ fail: | |||
173 | if (vma->vm_flags & VM_MAYSHARE) | 237 | if (vma->vm_flags & VM_MAYSHARE) |
174 | resv_huge_pages++; | 238 | resv_huge_pages++; |
175 | spin_unlock(&hugetlb_lock); | 239 | spin_unlock(&hugetlb_lock); |
176 | return NULL; | 240 | |
241 | /* | ||
242 | * Private mappings do not use reserved huge pages so the allocation | ||
243 | * may have failed due to an undersized hugetlb pool. Try to grab a | ||
244 | * surplus huge page from the buddy allocator. | ||
245 | */ | ||
246 | if (!(vma->vm_flags & VM_MAYSHARE)) | ||
247 | page = alloc_buddy_huge_page(vma, addr); | ||
248 | |||
249 | return page; | ||
177 | } | 250 | } |
178 | 251 | ||
179 | static int __init hugetlb_init(void) | 252 | static int __init hugetlb_init(void) |
@@ -241,26 +314,62 @@ static inline void try_to_free_low(unsigned long count) | |||
241 | } | 314 | } |
242 | #endif | 315 | #endif |
243 | 316 | ||
317 | #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) | ||
244 | static unsigned long set_max_huge_pages(unsigned long count) | 318 | static unsigned long set_max_huge_pages(unsigned long count) |
245 | { | 319 | { |
246 | while (count > nr_huge_pages) { | 320 | unsigned long min_count, ret; |
247 | if (!alloc_fresh_huge_page()) | ||
248 | return nr_huge_pages; | ||
249 | } | ||
250 | if (count >= nr_huge_pages) | ||
251 | return nr_huge_pages; | ||
252 | 321 | ||
322 | /* | ||
323 | * Increase the pool size | ||
324 | * First take pages out of surplus state. Then make up the | ||
325 | * remaining difference by allocating fresh huge pages. | ||
326 | */ | ||
253 | spin_lock(&hugetlb_lock); | 327 | spin_lock(&hugetlb_lock); |
254 | count = max(count, resv_huge_pages); | 328 | while (surplus_huge_pages && count > persistent_huge_pages) { |
255 | try_to_free_low(count); | 329 | if (!adjust_pool_surplus(-1)) |
256 | while (count < nr_huge_pages) { | 330 | break; |
331 | } | ||
332 | |||
333 | while (count > persistent_huge_pages) { | ||
334 | int ret; | ||
335 | /* | ||
336 | * If this allocation races such that we no longer need the | ||
337 | * page, free_huge_page will handle it by freeing the page | ||
338 | * and reducing the surplus. | ||
339 | */ | ||
340 | spin_unlock(&hugetlb_lock); | ||
341 | ret = alloc_fresh_huge_page(); | ||
342 | spin_lock(&hugetlb_lock); | ||
343 | if (!ret) | ||
344 | goto out; | ||
345 | |||
346 | } | ||
347 | if (count >= persistent_huge_pages) | ||
348 | goto out; | ||
349 | |||
350 | /* | ||
351 | * Decrease the pool size | ||
352 | * First return free pages to the buddy allocator (being careful | ||
353 | * to keep enough around to satisfy reservations). Then place | ||
354 | * pages into surplus state as needed so the pool will shrink | ||
355 | * to the desired size as pages become free. | ||
356 | */ | ||
357 | min_count = max(count, resv_huge_pages); | ||
358 | try_to_free_low(min_count); | ||
359 | while (min_count < persistent_huge_pages) { | ||
257 | struct page *page = dequeue_huge_page(NULL, 0); | 360 | struct page *page = dequeue_huge_page(NULL, 0); |
258 | if (!page) | 361 | if (!page) |
259 | break; | 362 | break; |
260 | update_and_free_page(page); | 363 | update_and_free_page(page); |
261 | } | 364 | } |
365 | while (count < persistent_huge_pages) { | ||
366 | if (!adjust_pool_surplus(1)) | ||
367 | break; | ||
368 | } | ||
369 | out: | ||
370 | ret = persistent_huge_pages; | ||
262 | spin_unlock(&hugetlb_lock); | 371 | spin_unlock(&hugetlb_lock); |
263 | return nr_huge_pages; | 372 | return ret; |
264 | } | 373 | } |
265 | 374 | ||
266 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 375 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
@@ -292,10 +401,12 @@ int hugetlb_report_meminfo(char *buf) | |||
292 | "HugePages_Total: %5lu\n" | 401 | "HugePages_Total: %5lu\n" |
293 | "HugePages_Free: %5lu\n" | 402 | "HugePages_Free: %5lu\n" |
294 | "HugePages_Rsvd: %5lu\n" | 403 | "HugePages_Rsvd: %5lu\n" |
404 | "HugePages_Surp: %5lu\n" | ||
295 | "Hugepagesize: %5lu kB\n", | 405 | "Hugepagesize: %5lu kB\n", |
296 | nr_huge_pages, | 406 | nr_huge_pages, |
297 | free_huge_pages, | 407 | free_huge_pages, |
298 | resv_huge_pages, | 408 | resv_huge_pages, |
409 | surplus_huge_pages, | ||
299 | HPAGE_SIZE/1024); | 410 | HPAGE_SIZE/1024); |
300 | } | 411 | } |
301 | 412 | ||