Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c | 88
1 file changed, 75 insertions(+), 13 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6121b57bbe96..db861d8b6c28 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,7 +31,7 @@ static unsigned int free_huge_pages_node[MAX_NUMNODES];
 static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
-int hugetlb_dynamic_pool;
+unsigned long nr_overcommit_huge_pages;
 static int hugetlb_next_nid;
 
 /*
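
The hunk above replaces the boolean hugetlb_dynamic_pool switch with a counted overcommit limit, nr_overcommit_huge_pages. The sysctl plumbing that exposes the new counter lives outside mm/hugetlb.c and is not part of this diff; a minimal sketch of how such an entry is typically wired into the VM sysctl table (the procname, table placement, and handler here are assumptions, not taken from this patch) looks roughly like:

	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "nr_overcommit_hugepages",	/* assumed name */
		.data		= &nr_overcommit_huge_pages,
		.maxlen		= sizeof(nr_overcommit_huge_pages),
		.mode		= 0644,
		.proc_handler	= &proc_doulongvec_minmax,	/* plain unsigned long read/write */
	},

Writing a value into such a file would raise or lower the ceiling that alloc_buddy_huge_page() checks in the next hunk.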
@@ -227,22 +227,58 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
                                        unsigned long address)
 {
        struct page *page;
+       unsigned int nid;
 
-       /* Check if the dynamic pool is enabled */
-       if (!hugetlb_dynamic_pool)
+       /*
+        * Assume we will successfully allocate the surplus page to
+        * prevent racing processes from causing the surplus to exceed
+        * overcommit
+        *
+        * This however introduces a different race, where a process B
+        * tries to grow the static hugepage pool while alloc_pages() is
+        * called by process A. B will only examine the per-node
+        * counters in determining if surplus huge pages can be
+        * converted to normal huge pages in adjust_pool_surplus(). A
+        * won't be able to increment the per-node counter, until the
+        * lock is dropped by B, but B doesn't drop hugetlb_lock until
+        * no more huge pages can be converted from surplus to normal
+        * state (and doesn't try to convert again). Thus, we have a
+        * case where a surplus huge page exists, the pool is grown, and
+        * the surplus huge page still exists after, even though it
+        * should just have been converted to a normal huge page. This
+        * does not leak memory, though, as the hugepage will be freed
+        * once it is out of use. It also does not allow the counters to
+        * go out of whack in adjust_pool_surplus() as we don't modify
+        * the node values until we've gotten the hugepage and only the
+        * per-node value is checked there.
+        */
+       spin_lock(&hugetlb_lock);
+       if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+               spin_unlock(&hugetlb_lock);
                return NULL;
+       } else {
+               nr_huge_pages++;
+               surplus_huge_pages++;
+       }
+       spin_unlock(&hugetlb_lock);
 
        page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
                                HUGETLB_PAGE_ORDER);
+
+       spin_lock(&hugetlb_lock);
        if (page) {
+               nid = page_to_nid(page);
                set_compound_page_dtor(page, free_huge_page);
-               spin_lock(&hugetlb_lock);
-               nr_huge_pages++;
-               nr_huge_pages_node[page_to_nid(page)]++;
-               surplus_huge_pages++;
-               surplus_huge_pages_node[page_to_nid(page)]++;
-               spin_unlock(&hugetlb_lock);
+               /*
+                * We incremented the global counters already
+                */
+               nr_huge_pages_node[nid]++;
+               surplus_huge_pages_node[nid]++;
+       } else {
+               nr_huge_pages--;
+               surplus_huge_pages--;
        }
+       spin_unlock(&hugetlb_lock);
 
        return page;
 }
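
The long comment justifies a charge-first, roll-back-on-failure scheme: the global counters are bumped under hugetlb_lock before the possibly slow, unlocked alloc_pages() call, so concurrent allocators can never collectively race past nr_overcommit_huge_pages; only the per-node counters are touched once the page is actually in hand. Below is a self-contained userspace sketch of the same pattern, where pthreads and malloc() stand in for hugetlb_lock and alloc_pages(); all names are illustrative, none of this is kernel code.

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long surplus_count;
static unsigned long overcommit_limit = 8;	/* stand-in for nr_overcommit_huge_pages */

static void *alloc_surplus_object(size_t size)
{
	void *obj;

	pthread_mutex_lock(&lock);
	if (surplus_count >= overcommit_limit) {
		pthread_mutex_unlock(&lock);
		return NULL;			/* would exceed the overcommit limit */
	}
	surplus_count++;			/* charge optimistically, under the lock */
	pthread_mutex_unlock(&lock);

	obj = malloc(size);			/* slow path runs without the lock held */

	if (!obj) {
		pthread_mutex_lock(&lock);
		surplus_count--;		/* allocation failed: roll the charge back */
		pthread_mutex_unlock(&lock);
	}
	return obj;
}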
@@ -382,9 +418,14 @@ static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
        if (free_huge_pages > resv_huge_pages)
                page = dequeue_huge_page(vma, addr);
        spin_unlock(&hugetlb_lock);
-       if (!page)
+       if (!page) {
                page = alloc_buddy_huge_page(vma, addr);
-       return page ? page : ERR_PTR(-VM_FAULT_OOM);
+               if (!page) {
+                       hugetlb_put_quota(vma->vm_file->f_mapping, 1);
+                       return ERR_PTR(-VM_FAULT_OOM);
+               }
+       }
+       return page;
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
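
alloc_huge_page_private() now drops the quota it charged before reporting failure, and failure is still signalled with ERR_PTR(-VM_FAULT_OOM) rather than a bare NULL. A minimal sketch of how a caller consumes that convention, using the standard IS_ERR()/PTR_ERR() helpers from include/linux/err.h (the caller function here is illustrative, not taken from this patch):

static int example_fault_path(struct vm_area_struct *vma, unsigned long address)
{
	struct page *page = alloc_huge_page_private(vma, address);

	if (IS_ERR(page))
		return -PTR_ERR(page);	/* -(-VM_FAULT_OOM) == VM_FAULT_OOM */
	/* ... insert the huge page into the page tables ... */
	return 0;
}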
@@ -481,6 +522,12 @@ static unsigned long set_max_huge_pages(unsigned long count)
         * Increase the pool size
         * First take pages out of surplus state.  Then make up the
         * remaining difference by allocating fresh huge pages.
+        *
+        * We might race with alloc_buddy_huge_page() here and be unable
+        * to convert a surplus huge page to a normal huge page. That is
+        * not critical, though, it just means the overall size of the
+        * pool might be one hugepage larger than it needs to be, but
+        * within all the constraints specified by the sysctls.
         */
        spin_lock(&hugetlb_lock);
        while (surplus_huge_pages && count > persistent_huge_pages) {
@@ -509,6 +556,14 @@ static unsigned long set_max_huge_pages(unsigned long count)
         * to keep enough around to satisfy reservations).  Then place
         * pages into surplus state as needed so the pool will shrink
         * to the desired size as pages become free.
+        *
+        * By placing pages into the surplus state independent of the
+        * overcommit value, we are allowing the surplus pool size to
+        * exceed overcommit. There are few sane options here. Since
+        * alloc_buddy_huge_page() is checking the global counter,
+        * though, we'll note that we're not allowed to exceed surplus
+        * and won't grow the pool anywhere else. Not until one of the
+        * sysctls are changed, or the surplus pages go out of use.
         */
        min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
        min_count = max(count, min_count);
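
The floor computed in the last two context lines keeps the pool from shrinking below what is reserved or currently in use. A worked example with made-up numbers, assuming resv_huge_pages = 2, nr_huge_pages = 10, free_huge_pages = 5, and an administrator request of count = 4:

	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages
	          = 2 + 10 - 5
	          = 7
	min_count = max(count, min_count) = max(4, 7) = 7

The pool therefore shrinks to 7 pages immediately; the 3 in-use pages above the requested count of 4 are moved into surplus state and leave the pool only when their users free them, which is exactly the lazy shrink the comment describes.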
@@ -644,6 +699,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                dst_pte = huge_pte_alloc(dst, addr);
                if (!dst_pte)
                        goto nomem;
+
+               /* If the pagetables are shared don't copy or take references */
+               if (dst_pte == src_pte)
+                       continue;
+
                spin_lock(&dst->page_table_lock);
                spin_lock(&src->page_table_lock);
                if (!pte_none(*src_pte)) {
@@ -907,7 +967,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-               if (!pte || pte_none(*pte)) {
+               if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) {
                        int ret;
 
                        spin_unlock(&mm->page_table_lock);
@@ -1156,8 +1216,10 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
        if (hugetlb_get_quota(inode->i_mapping, chg))
                return -ENOSPC;
        ret = hugetlb_acct_memory(chg);
-       if (ret < 0)
+       if (ret < 0) {
+               hugetlb_put_quota(inode->i_mapping, chg);
                return ret;
+       }
        region_add(&inode->i_mapping->private_list, from, to);
        return 0;
 }
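
The final hunk fixes a quota leak: when hugetlb_acct_memory() fails, the quota charged a few lines earlier by hugetlb_get_quota() must be returned before propagating the error. A self-contained userspace illustration of that unwind rule follows; every name in it is invented for the example and none of it comes from the patch.

#include <errno.h>

static long quota_left = 100;
static long memory_left = 10;

static int charge(long *pool, long amount)
{
	if (*pool < amount)
		return -ENOSPC;
	*pool -= amount;
	return 0;
}

static void uncharge(long *pool, long amount)
{
	*pool += amount;
}

static int reserve(long amount)
{
	int ret = charge(&quota_left, amount);

	if (ret)
		return ret;

	ret = charge(&memory_left, amount);
	if (ret) {
		uncharge(&quota_left, amount);	/* undo the first charge on the error path */
		return ret;
	}
	return 0;
}

Omitting the uncharge() call on the failure path would leak quota on every failed reservation, which is the bug this hunk closes.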