author		Joonsoo Kim <iamjoonsoo.kim@lge.com>	2013-09-11 17:21:18 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-11 18:57:28 -0400
commit		af0ed73e699bb0453603b1d1a4727377641b2096 (patch)
tree		84b29594887255cb9e6eb68239c845ba892903f0	/mm/hugetlb.c
parent		a63884e921cb33a6beb260fa88bcbf1712d98a9a (diff)
mm, hugetlb: decrement reserve count if VM_NORESERVE alloc page cache
If a vma with VM_NORESERVE allocates a new page for the page cache, we should
check whether this area is reserved or not. If the address is already
reserved by another process (the chg == 0 case), we should decrement the
reserve count, because the allocated page will go into the page cache and,
currently, there is no way to know at inode-release time whether that page
came from the reserved pool. This can over-count the reserve count. The
following example code easily reproduces the situation.

Assume 2MB huge pages and nr_hugepages = 100:

	size = 20 * MB;
	flag = MAP_SHARED;
	p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}

	flag = MAP_SHARED | MAP_NORESERVE;
	q = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0);
	if (q == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
	}
	q[0] = 'c';

After the program finishes, run 'cat /proc/meminfo'. You will see the
following result:

	HugePages_Free:      100
	HugePages_Rsvd:        1

To fix this, we should check the mapping type and the tracked region. If the
mapping is VM_NORESERVE and VM_MAYSHARE and chg is 0, the newly allocated page
will go into a page-cache region that was already reserved when the mapping
was created. In that case, we should decrease the reserve count. By
implementing the above, this patch solves the problem.

[akpm@linux-foundation.org: fix spelling in comment]
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Hillf Danton <dhillf@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
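For reference, here is a self-contained version of the reproducer sketched
above. This is a sketch, not part of the commit: the hugetlbfs path
/dev/hugepages/repro, the file setup, and the cleanup at exit are assumptions
added for illustration; only the two mmap() calls and the q[0] write follow
the commit message.

/*
 * Standalone sketch of the reproducer. Assumes hugetlbfs is mounted at
 * /dev/hugepages (adjust HUGETLB_FILE as needed) and nr_hugepages is at
 * least 10 pages of 2MB.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define MB (1024UL * 1024UL)
#define HUGETLB_FILE "/dev/hugepages/repro"	/* assumed mount point and name */

int main(void)
{
	size_t size = 20 * MB;
	char *p, *q;
	int fd = open(HUGETLB_FILE, O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		fprintf(stderr, "open() failed: %s\n", strerror(errno));
		return -1;
	}

	/* First mapping reserves the whole range (no MAP_NORESERVE). */
	p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}

	/* Second mapping of the same range opts out of reservations. */
	q = mmap(NULL, size, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_NORESERVE, fd, 0);
	if (q == MAP_FAILED) {
		fprintf(stderr, "mmap() failed: %s\n", strerror(errno));
		return -1;
	}

	/*
	 * Fault one page through the VM_NORESERVE mapping; without the fix
	 * this leaves HugePages_Rsvd over-counted after the inode is released.
	 */
	q[0] = 'c';

	/* Cleanup (assumed; the commit message does not show this part). */
	munmap(q, size);
	munmap(p, size);
	close(fd);
	unlink(HUGETLB_FILE);
	return 0;
}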
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	34
1 file changed, 26 insertions(+), 8 deletions(-)
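Before the diff itself, a condensed userspace restatement of the rule the
first hunk adds to vma_has_reserves(). The flag values below are illustrative
placeholders, not the kernel's, only the control flow mirrors the patch, and
the non-VM_NORESERVE cases are elided.

#include <stdbool.h>

#define VM_MAYSHARE  0x1UL	/* placeholder value, not the kernel's */
#define VM_NORESERVE 0x2UL	/* placeholder value, not the kernel's */

/*
 * chg == 0 means the faulting range was already reserved when some other
 * mapping of the file was created; chg > 0 means this fault would need a
 * fresh reservation of its own.
 */
static bool noreserve_vma_uses_reserve(unsigned long vm_flags, long chg)
{
	if (!(vm_flags & VM_NORESERVE))
		return false;	/* handled by the unchanged code paths */

	/*
	 * Shared mapping over an already-reserved region: consume the
	 * reservation so it is not left behind at inode release time.
	 */
	return (vm_flags & VM_MAYSHARE) && chg == 0;
}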
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dacf0d2256d9..5b084c7b34c6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -443,10 +443,23 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 }
 
 /* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma)
+static int vma_has_reserves(struct vm_area_struct *vma, long chg)
 {
-	if (vma->vm_flags & VM_NORESERVE)
-		return 0;
+	if (vma->vm_flags & VM_NORESERVE) {
+		/*
+		 * This address is already reserved by other process(chg == 0),
+		 * so, we should decrement reserved count. Without decrementing,
+		 * reserve count remains after releasing inode, because this
+		 * allocated page will go into page cache and is regarded as
+		 * coming from reserved pool in releasing step.  Currently, we
+		 * don't have any other solution to deal with this situation
+		 * properly, so add work-around here.
+		 */
+		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
+			return 1;
+		else
+			return 0;
+	}
 
 	/* Shared mappings always use reserves */
 	if (vma->vm_flags & VM_MAYSHARE)
@@ -520,7 +533,8 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
-				unsigned long address, int avoid_reserve)
+				unsigned long address, int avoid_reserve,
+				long chg)
 {
 	struct page *page = NULL;
 	struct mempolicy *mpol;
@@ -535,7 +549,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	 * have no page reserves. This check ensures that reservations are
 	 * not "stolen". The child may still get SIGKILLed
 	 */
-	if (!vma_has_reserves(vma) &&
+	if (!vma_has_reserves(vma, chg) &&
 			h->free_huge_pages - h->resv_huge_pages == 0)
 		goto err;
 
@@ -553,8 +567,12 @@ retry_cpuset:
 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
 			page = dequeue_huge_page_node(h, zone_to_nid(zone));
 			if (page) {
-				if (!avoid_reserve && vma_has_reserves(vma))
-					h->resv_huge_pages--;
+				if (avoid_reserve)
+					break;
+				if (!vma_has_reserves(vma, chg))
+					break;
+
+				h->resv_huge_pages--;
 				break;
 			}
 		}
@@ -1155,7 +1173,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 		return ERR_PTR(-ENOSPC);
 	}
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
+	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);