aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: David Rientjes <rientjes@google.com> 2012-10-08 19:34:03 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-10-09 03:23:02 -0400
commit: b676b293fb48672904ee1b9828cb50b4eed01717 (patch)
tree: 22b2dcc1623da40a5ddfaf6db2bc5ab1c2476ddb
parent: e90bdb7f52f94204c78fb40b0804645defdebd71 (diff)
mm, thp: fix mapped pages avoiding unevictable list on mlock
When a transparent hugepage is mapped and it is included in an mlock()
range, follow_page() incorrectly avoids setting the page's mlock bit and
moving it to the unevictable lru.

This is evident if you try to mlock(), munlock(), and then mlock() a
range again. Currently:

    #define MAP_SIZE (4 << 30) /* 4GB */

    void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
    mlock(ptr, MAP_SIZE);

        $ grep -E "Unevictable|Inactive\(anon" /proc/meminfo
        Inactive(anon): 6304 kB
        Unevictable: 4213924 kB

    munlock(ptr, MAP_SIZE);

        Inactive(anon): 4186252 kB
        Unevictable: 19652 kB

    mlock(ptr, MAP_SIZE);

        Inactive(anon): 4198556 kB
        Unevictable: 21684 kB

Notice that less than 2MB was added to the unevictable list; this is
because these pages in the range are not transparent hugepages since the
4GB range was allocated with mmap() and has no specific alignment. If
posix_memalign() were used instead, unevictable would not have grown at
all on the second mlock().

The fix is to call mlock_vma_page() so that the mlock bit is set and the
page is added to the unevictable list. With this patch:

    mlock(ptr, MAP_SIZE);

        Inactive(anon): 4056 kB
        Unevictable: 4213940 kB

    munlock(ptr, MAP_SIZE);

        Inactive(anon): 4198268 kB
        Unevictable: 19636 kB

    mlock(ptr, MAP_SIZE);

        Inactive(anon): 4008 kB
        Unevictable: 4213940 kB

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/huge_mm.h 2
-rw-r--r-- mm/huge_memory.c 11
-rw-r--r-- mm/memory.c 2
3 files changed, 12 insertions, 3 deletions
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6ab47af5a84..b31cb7da034 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,7 +11,7 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
11extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 11extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
12 unsigned long address, pmd_t *pmd, 12 unsigned long address, pmd_t *pmd,
13 pmd_t orig_pmd); 13 pmd_t orig_pmd);
14extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, 14extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
15 unsigned long addr, 15 unsigned long addr,
16 pmd_t *pmd, 16 pmd_t *pmd,
17 unsigned int flags); 17 unsigned int flags);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08a943b9cf9..3a8d6b7d95d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -971,11 +971,12 @@ out_unlock:
971 return ret; 971 return ret;
972} 972}
973 973
974struct page *follow_trans_huge_pmd(struct mm_struct *mm, 974struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
975 unsigned long addr, 975 unsigned long addr,
976 pmd_t *pmd, 976 pmd_t *pmd,
977 unsigned int flags) 977 unsigned int flags)
978{ 978{
979 struct mm_struct *mm = vma->vm_mm;
979 struct page *page = NULL; 980 struct page *page = NULL;
980 981
981 assert_spin_locked(&mm->page_table_lock); 982 assert_spin_locked(&mm->page_table_lock);
@@ -998,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
998 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 999 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
999 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); 1000 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
1000 } 1001 }
1002 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1003 if (page->mapping && trylock_page(page)) {
1004 lru_add_drain();
1005 if (page->mapping)
1006 mlock_vma_page(page);
1007 unlock_page(page);
1008 }
1009 }
1001 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1010 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1002 VM_BUG_ON(!PageCompound(page)); 1011 VM_BUG_ON(!PageCompound(page));
1003 if (flags & FOLL_GET) 1012 if (flags & FOLL_GET)
diff --git a/mm/memory.c b/mm/memory.c
index 45bb6d296b6..fb135ba4aba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1528,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1528 spin_unlock(&mm->page_table_lock); 1528 spin_unlock(&mm->page_table_lock);
1529 wait_split_huge_page(vma->anon_vma, pmd); 1529 wait_split_huge_page(vma->anon_vma, pmd);
1530 } else { 1530 } else {
1531 page = follow_trans_huge_pmd(mm, address, 1531 page = follow_trans_huge_pmd(vma, address,
1532 pmd, flags); 1532 pmd, flags);
1533 spin_unlock(&mm->page_table_lock); 1533 spin_unlock(&mm->page_table_lock);
1534 goto out; 1534 goto out;