diff options
author | David Rientjes <rientjes@google.com> | 2012-10-08 19:34:03 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-09 03:23:02 -0400 |
commit | b676b293fb48672904ee1b9828cb50b4eed01717 (patch) | |
tree | 22b2dcc1623da40a5ddfaf6db2bc5ab1c2476ddb | |
parent | e90bdb7f52f94204c78fb40b0804645defdebd71 (diff) |
mm, thp: fix mapped pages avoiding unevictable list on mlock
When a transparent hugepage is mapped and it is included in an mlock()
range, follow_page() incorrectly avoids setting the page's mlock bit and
moving it to the unevictable lru.
This is evident if you try to mlock(), munlock(), and then mlock() a
range again. Currently:

    #define MAP_SIZE (4 << 30) /* 4GB */

    void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
    mlock(ptr, MAP_SIZE);

        $ grep -E "Unevictable|Inactive\(anon" /proc/meminfo
        Inactive(anon):     6304 kB
        Unevictable:     4213924 kB

    munlock(ptr, MAP_SIZE);

        Inactive(anon):  4186252 kB
        Unevictable:       19652 kB

    mlock(ptr, MAP_SIZE);

        Inactive(anon):  4198556 kB
        Unevictable:       21684 kB
Notice that the second mlock() added less than 2MB to the unevictable
list. Those are the only pages in the range that are not transparent
hugepages: the 4GB range was allocated with mmap() and has no specific
alignment, so presumably its unaligned ends are mapped with regular pages,
which are handled correctly. If posix_memalign() were used instead,
unevictable would not have grown at all on the second mlock().
The fix is to call mlock_vma_page() so that the mlock bit is set and the
page is added to the unevictable list. With this patch:

    mlock(ptr, MAP_SIZE);

        Inactive(anon):     4056 kB
        Unevictable:     4213940 kB

    munlock(ptr, MAP_SIZE);

        Inactive(anon):  4198268 kB
        Unevictable:       19636 kB

    mlock(ptr, MAP_SIZE);

        Inactive(anon):     4008 kB
        Unevictable:     4213940 kB
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/huge_mm.h | 2 |
-rw-r--r-- | mm/huge_memory.c | 11 |
-rw-r--r-- | mm/memory.c | 2 |

3 files changed, 12 insertions, 3 deletions
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6ab47af5a849..b31cb7da0346 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -11,7 +11,7 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
11 | extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 11 | extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
12 | unsigned long address, pmd_t *pmd, | 12 | unsigned long address, pmd_t *pmd, |
13 | pmd_t orig_pmd); | 13 | pmd_t orig_pmd); |
14 | extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, | 14 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
15 | unsigned long addr, | 15 | unsigned long addr, |
16 | pmd_t *pmd, | 16 | pmd_t *pmd, |
17 | unsigned int flags); | 17 | unsigned int flags); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 08a943b9cf95..3a8d6b7d95db 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -971,11 +971,12 @@ out_unlock: | |||
971 | return ret; | 971 | return ret; |
972 | } | 972 | } |
973 | 973 | ||
974 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, | 974 | struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
975 | unsigned long addr, | 975 | unsigned long addr, |
976 | pmd_t *pmd, | 976 | pmd_t *pmd, |
977 | unsigned int flags) | 977 | unsigned int flags) |
978 | { | 978 | { |
979 | struct mm_struct *mm = vma->vm_mm; | ||
979 | struct page *page = NULL; | 980 | struct page *page = NULL; |
980 | 981 | ||
981 | assert_spin_locked(&mm->page_table_lock); | 982 | assert_spin_locked(&mm->page_table_lock); |
@@ -998,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
998 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | 999 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); |
999 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | 1000 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); |
1000 | } | 1001 | } |
1002 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1003 | if (page->mapping && trylock_page(page)) { | ||
1004 | lru_add_drain(); | ||
1005 | if (page->mapping) | ||
1006 | mlock_vma_page(page); | ||
1007 | unlock_page(page); | ||
1008 | } | ||
1009 | } | ||
1001 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1010 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1002 | VM_BUG_ON(!PageCompound(page)); | 1011 | VM_BUG_ON(!PageCompound(page)); |
1003 | if (flags & FOLL_GET) | 1012 | if (flags & FOLL_GET) |
diff --git a/mm/memory.c b/mm/memory.c index 45bb6d296b6f..fb135ba4aba9 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1528,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1528 | spin_unlock(&mm->page_table_lock); | 1528 | spin_unlock(&mm->page_table_lock); |
1529 | wait_split_huge_page(vma->anon_vma, pmd); | 1529 | wait_split_huge_page(vma->anon_vma, pmd); |
1530 | } else { | 1530 | } else { |
1531 | page = follow_trans_huge_pmd(mm, address, | 1531 | page = follow_trans_huge_pmd(vma, address, |
1532 | pmd, flags); | 1532 | pmd, flags); |
1533 | spin_unlock(&mm->page_table_lock); | 1533 | spin_unlock(&mm->page_table_lock); |
1534 | goto out; | 1534 | goto out; |