 include/linux/hugetlb.h |  1
 kernel/sysctl.c         |  8
 mm/hugetlb.c            | 67
 3 files changed, 70 insertions(+), 6 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 24968790bc3e..f7bc869a29b8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -34,6 +34,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 extern unsigned long max_huge_pages;
 extern unsigned long hugepages_treat_as_movable;
 extern int hugetlb_dynamic_pool;
+extern unsigned long nr_overcommit_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
 extern int sysctl_hugetlb_shm_group;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8ac51714b08c..b85a1282605d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -912,6 +912,14 @@ static struct ctl_table vm_table[] = {
 		.mode = 0644,
 		.proc_handler = &proc_dointvec,
 	},
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "nr_overcommit_hugepages",
+		.data = &nr_overcommit_huge_pages,
+		.maxlen = sizeof(nr_overcommit_huge_pages),
+		.mode = 0644,
+		.proc_handler = &proc_doulongvec_minmax,
+	},
 #endif
 	{
 		.ctl_name = VM_LOWMEM_RESERVE_RATIO,
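The new vm_table entry surfaces the limit as /proc/sys/vm/nr_overcommit_hugepages; proc_doulongvec_minmax() parses reads and writes of the unsigned long. A minimal userspace sketch of driving the knob (hypothetical example code, not part of the patch; requires root, and the limit of 64 is arbitrary):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Path comes from the .procname field added above. */
	FILE *f = fopen("/proc/sys/vm/nr_overcommit_hugepages", "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	/* Allow up to 64 surplus huge pages beyond the static pool. */
	fprintf(f, "64\n");
	return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
}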
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6f978218c2c8..3a790651475a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -32,6 +32,7 @@ static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
 int hugetlb_dynamic_pool;
+unsigned long nr_overcommit_huge_pages;
 static int hugetlb_next_nid;
 
 /*
@@ -227,22 +228,62 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 						unsigned long address)
 {
 	struct page *page;
+	unsigned int nid;
 
 	/* Check if the dynamic pool is enabled */
 	if (!hugetlb_dynamic_pool)
 		return NULL;
 
+	/*
+	 * Assume we will successfully allocate the surplus page to
+	 * prevent racing processes from causing the surplus to exceed
+	 * overcommit.
+	 *
+	 * This, however, introduces a different race, where a process B
+	 * tries to grow the static hugepage pool while alloc_pages() is
+	 * called by process A. B will only examine the per-node
+	 * counters in determining if surplus huge pages can be
+	 * converted to normal huge pages in adjust_pool_surplus(). A
+	 * won't be able to increment the per-node counter until the
+	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
+	 * no more huge pages can be converted from surplus to normal
+	 * state (and doesn't try to convert again). Thus, we have a
+	 * case where a surplus huge page exists, the pool is grown, and
+	 * the surplus huge page still exists after, even though it
+	 * should just have been converted to a normal huge page. This
+	 * does not leak memory, though, as the hugepage will be freed
+	 * once it is out of use. It also does not allow the counters to
+	 * go out of whack in adjust_pool_surplus(), as we don't modify
+	 * the node values until we've gotten the hugepage and only the
+	 * per-node value is checked there.
+	 */
+	spin_lock(&hugetlb_lock);
+	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+		spin_unlock(&hugetlb_lock);
+		return NULL;
+	} else {
+		nr_huge_pages++;
+		surplus_huge_pages++;
+	}
+	spin_unlock(&hugetlb_lock);
+
 	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
 					HUGETLB_PAGE_ORDER);
+
+	spin_lock(&hugetlb_lock);
 	if (page) {
+		nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
-		spin_lock(&hugetlb_lock);
-		nr_huge_pages++;
-		nr_huge_pages_node[page_to_nid(page)]++;
-		surplus_huge_pages++;
-		surplus_huge_pages_node[page_to_nid(page)]++;
-		spin_unlock(&hugetlb_lock);
+		/*
+		 * We incremented the global counters already
+		 */
+		nr_huge_pages_node[nid]++;
+		surplus_huge_pages_node[nid]++;
+	} else {
+		nr_huge_pages--;
+		surplus_huge_pages--;
 	}
+	spin_unlock(&hugetlb_lock);
 
 	return page;
 }
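The block comment in this hunk describes an optimistic charge-then-roll-back pattern: the global counters are charged under hugetlb_lock before the potentially slow alloc_pages() call, so racing allocators can never collectively push surplus_huge_pages past the overcommit limit, and the charge is undone if the allocation fails. A standalone sketch of the pattern (hypothetical names throughout; pool_lock, pool_pages and surplus_pages stand in for the kernel's counters, and malloc() stands in for alloc_pages()):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long pool_pages, surplus_pages;
static unsigned long overcommit_limit = 64;	/* arbitrary */

static void *alloc_surplus(void)
{
	void *page;

	/* Charge the counters first, so concurrent callers see the
	 * pending allocation and the limit check stays race-free. */
	pthread_mutex_lock(&pool_lock);
	if (surplus_pages >= overcommit_limit) {
		pthread_mutex_unlock(&pool_lock);
		return NULL;
	}
	pool_pages++;
	surplus_pages++;
	pthread_mutex_unlock(&pool_lock);

	/* The slow operation runs outside the lock. */
	page = malloc(4096);

	/* Roll back the optimistic charge on failure. */
	if (!page) {
		pthread_mutex_lock(&pool_lock);
		pool_pages--;
		surplus_pages--;
		pthread_mutex_unlock(&pool_lock);
	}
	return page;
}

int main(void)
{
	void *p = alloc_surplus();

	free(p);
	return 0;
}

The point the comment is defending is that the limit check and the charge happen atomically, while the expensive allocation never holds the spinlock.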
@@ -481,6 +522,12 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * Increase the pool size
 	 * First take pages out of surplus state. Then make up the
 	 * remaining difference by allocating fresh huge pages.
+	 *
+	 * We might race with alloc_buddy_huge_page() here and be unable
+	 * to convert a surplus huge page to a normal huge page. That is
+	 * not critical, though; it just means the overall size of the
+	 * pool might be one hugepage larger than it needs to be, but
+	 * within all the constraints specified by the sysctls.
 	 */
 	spin_lock(&hugetlb_lock);
 	while (surplus_huge_pages && count > persistent_huge_pages) {
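For context, the grow path annotated here first re-labels existing surplus pages as persistent, then allocates fresh pages for the remainder. Continuing the hypothetical sketch above with global counters only (the real code converts via per-node counters in adjust_pool_surplus(), which is exactly where the benign race against alloc_buddy_huge_page() comes from):

static void grow_pool(unsigned long count)
{
	pthread_mutex_lock(&pool_lock);
	/*
	 * Take pages out of surplus state first. The conversion is
	 * pure re-labeling: the surplus counter drops while the total
	 * pool count stays put.
	 */
	while (surplus_pages && count > pool_pages - surplus_pages)
		surplus_pages--;
	pthread_mutex_unlock(&pool_lock);

	/*
	 * Fresh pages for any remaining difference would be allocated
	 * here; the sketch stops at the conversion loop the comment
	 * above is about.
	 */
}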
@@ -509,6 +556,14 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * to keep enough around to satisfy reservations). Then place
 	 * pages into surplus state as needed so the pool will shrink
 	 * to the desired size as pages become free.
+	 *
+	 * By placing pages into the surplus state independent of the
+	 * overcommit value, we are allowing the surplus pool size to
+	 * exceed overcommit. There are few sane options here. Since
+	 * alloc_buddy_huge_page() is checking the global counter,
+	 * though, we'll note that we're not allowed to exceed surplus
+	 * and won't grow the pool anywhere else. Not until one of the
+	 * sysctls is changed, or the surplus pages go out of use.
 	 */
 	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
 	min_count = max(count, min_count);
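A worked example for the min_count arithmetic above, with made-up numbers: if nr_huge_pages = 10, free_huge_pages = 4 and resv_huge_pages = 2, then six pages are in use and min_count = 2 + 10 - 4 = 8. Even if count is set lower, the pool cannot shrink below the six in-use pages plus the two reserved ones; max(count, min_count) pins the target at 8, and the excess is parked in surplus state to be freed as pages go out of use.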