aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorAndrea Arcangeli <aarcange@redhat.com>2011-11-02 16:36:59 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-11-02 19:06:57 -0400
commit70b50f94f1644e2aa7cb374819cfd93f3c28d725 (patch)
tree79198cd9a92600140827a670d1ed5eefdcd23d79 /include
parent994c0e992522c123298b4a91b72f5e67ba2d1123 (diff)
mm: thp: tail page refcounting fix
Michel while working on the working set estimation code, noticed that calling get_page_unless_zero() on a random pfn_to_page(random_pfn) wasn't safe, if the pfn ended up being a tail page of a transparent hugepage under splitting by __split_huge_page_refcount(). He then found the problem could also theoretically materialize with page_cache_get_speculative() during the speculative radix tree lookups that uses get_page_unless_zero() in SMP if the radix tree page is freed and reallocated and get_user_pages is called on it before page_cache_get_speculative has a chance to call get_page_unless_zero(). So the best way to fix the problem is to keep page_tail->_count zero at all times. This will guarantee that get_page_unless_zero() can never succeed on any tail page. page_tail->_mapcount is guaranteed zero and is unused for all tail pages of a compound page, so we can simply account the tail page references there and transfer them to tail_page->_count in __split_huge_page_refcount() (in addition to the head_page->_mapcount). While debugging this s/_count/_mapcount/ change I also noticed get_page is called by direct-io.c on pages returned by get_user_pages. That wasn't entirely safe because the two atomic_inc in get_page weren't atomic. As opposed to other get_user_page users like secondary-MMU page fault to establish the shadow pagetables would never call any superflous get_page after get_user_page returns. It's safer to make get_page universally safe for tail pages and to use get_page_foll() within follow_page (inside get_user_pages()). get_page_foll() is safe to do the refcounting for tail pages without taking any locks because it is run within PT lock protected critical sections (PT lock for pte and page_table_lock for pmd_trans_huge). The standard get_page() as invoked by direct-io instead will now take the compound_lock but still only for tail pages. The direct-io paths are usually I/O bound and the compound_lock is per THP so very finegrined, so there's no risk of scalability issues with it. A simple direct-io benchmarks with all lockdep prove locking and spinlock debugging infrastructure enabled shows identical performance and no overhead. So it's worth it. Ideally direct-io should stop calling get_page() on pages returned by get_user_pages(). The spinlock in get_page() is already optimized away for no-THP builds but doing get_page() on tail pages returned by GUP is generally a rare operation and usually only run in I/O paths. This new refcounting on page_tail->_mapcount in addition to avoiding new RCU critical sections will also allow the working set estimation code to work without any further complexity associated to the tail page refcounting with THP. Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Michel Lespinasse <walken@google.com> Reviewed-by: Michel Lespinasse <walken@google.com> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Gibson <david@gibson.dropbear.id.au> Cc: <stable@kernel.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/mm.h56
-rw-r--r--include/linux/mm_types.h21
2 files changed, 39 insertions, 38 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3b3e3b8bb706..f81b7b41930c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -356,36 +356,39 @@ static inline struct page *compound_head(struct page *page)
356 return page; 356 return page;
357} 357}
358 358
359/*
360 * The atomic page->_mapcount, starts from -1: so that transitions
361 * both from it and to it can be tracked, using atomic_inc_and_test
362 * and atomic_add_negative(-1).
363 */
364static inline void reset_page_mapcount(struct page *page)
365{
366 atomic_set(&(page)->_mapcount, -1);
367}
368
369static inline int page_mapcount(struct page *page)
370{
371 return atomic_read(&(page)->_mapcount) + 1;
372}
373
359static inline int page_count(struct page *page) 374static inline int page_count(struct page *page)
360{ 375{
361 return atomic_read(&compound_head(page)->_count); 376 return atomic_read(&compound_head(page)->_count);
362} 377}
363 378
379extern bool __get_page_tail(struct page *page);
380
364static inline void get_page(struct page *page) 381static inline void get_page(struct page *page)
365{ 382{
383 if (unlikely(PageTail(page)))
384 if (likely(__get_page_tail(page)))
385 return;
366 /* 386 /*
367 * Getting a normal page or the head of a compound page 387 * Getting a normal page or the head of a compound page
368 * requires to already have an elevated page->_count. Only if 388 * requires to already have an elevated page->_count.
369 * we're getting a tail page, the elevated page->_count is
370 * required only in the head page, so for tail pages the
371 * bugcheck only verifies that the page->_count isn't
372 * negative.
373 */ 389 */
374 VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); 390 VM_BUG_ON(atomic_read(&page->_count) <= 0);
375 atomic_inc(&page->_count); 391 atomic_inc(&page->_count);
376 /*
377 * Getting a tail page will elevate both the head and tail
378 * page->_count(s).
379 */
380 if (unlikely(PageTail(page))) {
381 /*
382 * This is safe only because
383 * __split_huge_page_refcount can't run under
384 * get_page().
385 */
386 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
387 atomic_inc(&page->first_page->_count);
388 }
389} 392}
390 393
391static inline struct page *virt_to_head_page(const void *x) 394static inline struct page *virt_to_head_page(const void *x)
@@ -804,21 +807,6 @@ static inline pgoff_t page_index(struct page *page)
804} 807}
805 808
806/* 809/*
807 * The atomic page->_mapcount, like _count, starts from -1:
808 * so that transitions both from it and to it can be tracked,
809 * using atomic_inc_and_test and atomic_add_negative(-1).
810 */
811static inline void reset_page_mapcount(struct page *page)
812{
813 atomic_set(&(page)->_mapcount, -1);
814}
815
816static inline int page_mapcount(struct page *page)
817{
818 return atomic_read(&(page)->_mapcount) + 1;
819}
820
821/*
822 * Return true if this page is mapped into pagetables. 810 * Return true if this page is mapped into pagetables.
823 */ 811 */
824static inline int page_mapped(struct page *page) 812static inline int page_mapped(struct page *page)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3e01a19a91e8..5b42f1b34eb7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -62,10 +62,23 @@ struct page {
62 struct { 62 struct {
63 63
64 union { 64 union {
65 atomic_t _mapcount; /* Count of ptes mapped in mms, 65 /*
66 * to show when page is mapped 66 * Count of ptes mapped in
67 * & limit reverse map searches. 67 * mms, to show when page is
68 */ 68 * mapped & limit reverse map
69 * searches.
70 *
71 * Used also for tail pages
72 * refcounting instead of
73 * _count. Tail pages cannot
74 * be mapped and keeping the
75 * tail page _count zero at
76 * all times guarantees
77 * get_page_unless_zero() will
78 * never succeed on tail
79 * pages.
80 */
81 atomic_t _mapcount;
69 82
70 struct { 83 struct {
71 unsigned inuse:16; 84 unsigned inuse:16;