Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--   mm/hugetlb.c   70
1 file changed, 60 insertions(+), 10 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6121b57bbe96..7224a4f07106 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,7 +31,7 @@ static unsigned int free_huge_pages_node[MAX_NUMNODES];
 static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
-int hugetlb_dynamic_pool;
+unsigned long nr_overcommit_huge_pages;
 static int hugetlb_next_nid;
 
 /*
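
The on/off hugetlb_dynamic_pool switch becomes a counter. The sysctl plumbing is outside this file (the diffstat above is limited to mm/hugetlb.c), but the new counter is the one exposed as /proc/sys/vm/nr_overcommit_huge_pages, so an administrator would, for example, allow up to 8 surplus huge pages with "echo 8 > /proc/sys/vm/nr_overcommit_huge_pages" rather than enabling an unbounded dynamic pool.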
@@ -227,22 +227,58 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
                                                 unsigned long address)
 {
         struct page *page;
+        unsigned int nid;
 
-        /* Check if the dynamic pool is enabled */
-        if (!hugetlb_dynamic_pool)
+        /*
+         * Assume we will successfully allocate the surplus page to
+         * prevent racing processes from causing the surplus to exceed
+         * overcommit
+         *
+         * This however introduces a different race, where a process B
+         * tries to grow the static hugepage pool while alloc_pages() is
+         * called by process A. B will only examine the per-node
+         * counters in determining if surplus huge pages can be
+         * converted to normal huge pages in adjust_pool_surplus(). A
+         * won't be able to increment the per-node counter, until the
+         * lock is dropped by B, but B doesn't drop hugetlb_lock until
+         * no more huge pages can be converted from surplus to normal
+         * state (and doesn't try to convert again). Thus, we have a
+         * case where a surplus huge page exists, the pool is grown, and
+         * the surplus huge page still exists after, even though it
+         * should just have been converted to a normal huge page. This
+         * does not leak memory, though, as the hugepage will be freed
+         * once it is out of use. It also does not allow the counters to
+         * go out of whack in adjust_pool_surplus() as we don't modify
+         * the node values until we've gotten the hugepage and only the
+         * per-node value is checked there.
+         */
+        spin_lock(&hugetlb_lock);
+        if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+                spin_unlock(&hugetlb_lock);
                 return NULL;
+        } else {
+                nr_huge_pages++;
+                surplus_huge_pages++;
+        }
+        spin_unlock(&hugetlb_lock);
 
         page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
                                         HUGETLB_PAGE_ORDER);
+
+        spin_lock(&hugetlb_lock);
         if (page) {
+                nid = page_to_nid(page);
                 set_compound_page_dtor(page, free_huge_page);
-                spin_lock(&hugetlb_lock);
-                nr_huge_pages++;
-                nr_huge_pages_node[page_to_nid(page)]++;
-                surplus_huge_pages++;
-                surplus_huge_pages_node[page_to_nid(page)]++;
-                spin_unlock(&hugetlb_lock);
+                /*
+                 * We incremented the global counters already
+                 */
+                nr_huge_pages_node[nid]++;
+                surplus_huge_pages_node[nid]++;
+        } else {
+                nr_huge_pages--;
+                surplus_huge_pages--;
         }
+        spin_unlock(&hugetlb_lock);
 
         return page;
 }
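
The hunk above reserves the accounting before calling alloc_pages() and rolls it back on failure, so racing allocators can never push the surplus past the overcommit limit. Below is a minimal userspace sketch of that reserve-then-roll-back pattern, assuming a pthread mutex in place of hugetlb_lock and malloc() in place of alloc_pages(); the names (try_alloc_surplus, pool_lock, overcommit_limit) are illustrative, not kernel symbols.

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long nr_pages, surplus_pages;
static unsigned long overcommit_limit = 4;

static void *try_alloc_surplus(size_t size)
{
        void *p;

        /* Reserve the accounting first, so racing callers see the bump. */
        pthread_mutex_lock(&pool_lock);
        if (surplus_pages >= overcommit_limit) {
                pthread_mutex_unlock(&pool_lock);
                return NULL;
        }
        nr_pages++;
        surplus_pages++;
        pthread_mutex_unlock(&pool_lock);

        /* Allocate outside the lock, as the patch does with alloc_pages(). */
        p = malloc(size);

        /* On failure, roll the optimistic accounting back. */
        if (!p) {
                pthread_mutex_lock(&pool_lock);
                nr_pages--;
                surplus_pages--;
                pthread_mutex_unlock(&pool_lock);
        }
        return p;
}

The per-node counters are deliberately left out of the sketch; as the comment in the hunk explains, the patch only touches them once the page is in hand, which is what keeps adjust_pool_surplus() consistent.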
@@ -481,6 +517,12 @@ static unsigned long set_max_huge_pages(unsigned long count)
         * Increase the pool size
         * First take pages out of surplus state. Then make up the
         * remaining difference by allocating fresh huge pages.
+        *
+        * We might race with alloc_buddy_huge_page() here and be unable
+        * to convert a surplus huge page to a normal huge page. That is
+        * not critical, though, it just means the overall size of the
+        * pool might be one hugepage larger than it needs to be, but
+        * within all the constraints specified by the sysctls.
         */
        spin_lock(&hugetlb_lock);
        while (surplus_huge_pages && count > persistent_huge_pages) {
@@ -509,6 +551,14 @@ static unsigned long set_max_huge_pages(unsigned long count)
         * to keep enough around to satisfy reservations). Then place
         * pages into surplus state as needed so the pool will shrink
         * to the desired size as pages become free.
+        *
+        * By placing pages into the surplus state independent of the
+        * overcommit value, we are allowing the surplus pool size to
+        * exceed overcommit. There are few sane options here. Since
+        * alloc_buddy_huge_page() is checking the global counter,
+        * though, we'll note that we're not allowed to exceed surplus
+        * and won't grow the pool anywhere else. Not until one of the
+        * sysctls are changed, or the surplus pages go out of use.
         */
        min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
        min_count = max(count, min_count);
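
A worked example of the situation this comment describes, with illustrative numbers not taken from the patch: suppose nr_huge_pages = 10, nr_overcommit_huge_pages = 0, 8 pages are in use and 2 are free, and the administrator lowers nr_hugepages to 5. Then min_count = 0 + 10 - 2 = 8, so only the 2 free pages can be released immediately; the remaining 3 pages above the requested count of 5 are moved into surplus state even though overcommit is 0. They are freed as their users release them instead of returning to the pool, and in the meantime alloc_buddy_huge_page() refuses to allocate further surplus pages because surplus_huge_pages already exceeds nr_overcommit_huge_pages.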
@@ -907,7 +957,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-               if (!pte || pte_none(*pte)) {
+               if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
                        int ret;
 
                        spin_unlock(&mm->page_table_lock);
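
The added (write && !pte_write(*pte)) test means a write access through follow_hugetlb_page() to a present but read-only huge PTE no longer returns the page directly; it now drops into the fault branch below, which releases the page table lock and faults the page in, so the write is resolved (for a private mapping, typically by copy-on-write) before the page is handed back to the caller.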