diff options
author | Andrea Arcangeli <aarcange@redhat.com> | 2011-11-02 16:36:59 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-11-02 19:06:57 -0400 |
commit | 70b50f94f1644e2aa7cb374819cfd93f3c28d725 (patch) | |
tree | 79198cd9a92600140827a670d1ed5eefdcd23d79 /mm/swap.c | |
parent | 994c0e992522c123298b4a91b72f5e67ba2d1123 (diff) |
mm: thp: tail page refcounting fix
Michel while working on the working set estimation code, noticed that
calling get_page_unless_zero() on a random pfn_to_page(random_pfn)
wasn't safe, if the pfn ended up being a tail page of a transparent
hugepage under splitting by __split_huge_page_refcount().
He then found the problem could also theoretically materialize with
page_cache_get_speculative() during the speculative radix tree lookups
that uses get_page_unless_zero() in SMP if the radix tree page is freed
and reallocated and get_user_pages is called on it before
page_cache_get_speculative has a chance to call get_page_unless_zero().
So the best way to fix the problem is to keep page_tail->_count zero at
all times. This will guarantee that get_page_unless_zero() can never
succeed on any tail page. page_tail->_mapcount is guaranteed zero and
is unused for all tail pages of a compound page, so we can simply
account the tail page references there and transfer them to
tail_page->_count in __split_huge_page_refcount() (in addition to the
head_page->_mapcount).
While debugging this s/_count/_mapcount/ change I also noticed get_page is
called by direct-io.c on pages returned by get_user_pages. That wasn't
entirely safe because the two atomic_inc in get_page weren't atomic. As
opposed to other get_user_page users like secondary-MMU page fault to
establish the shadow pagetables would never call any superflous get_page
after get_user_page returns. It's safer to make get_page universally safe
for tail pages and to use get_page_foll() within follow_page (inside
get_user_pages()). get_page_foll() is safe to do the refcounting for tail
pages without taking any locks because it is run within PT lock protected
critical sections (PT lock for pte and page_table_lock for
pmd_trans_huge).
The standard get_page() as invoked by direct-io instead will now take
the compound_lock but still only for tail pages. The direct-io paths
are usually I/O bound and the compound_lock is per THP so very
finegrined, so there's no risk of scalability issues with it. A simple
direct-io benchmarks with all lockdep prove locking and spinlock
debugging infrastructure enabled shows identical performance and no
overhead. So it's worth it. Ideally direct-io should stop calling
get_page() on pages returned by get_user_pages(). The spinlock in
get_page() is already optimized away for no-THP builds but doing
get_page() on tail pages returned by GUP is generally a rare operation
and usually only run in I/O paths.
This new refcounting on page_tail->_mapcount in addition to avoiding new
RCU critical sections will also allow the working set estimation code to
work without any further complexity associated to the tail page
refcounting with THP.
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: <stable@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swap.c')
-rw-r--r-- | mm/swap.c | 83 |
1 files changed, 53 insertions, 30 deletions
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) | |||
78 | { | 78 | { |
79 | if (unlikely(PageTail(page))) { | 79 | if (unlikely(PageTail(page))) { |
80 | /* __split_huge_page_refcount can run under us */ | 80 | /* __split_huge_page_refcount can run under us */ |
81 | struct page *page_head = page->first_page; | 81 | struct page *page_head = compound_trans_head(page); |
82 | smp_rmb(); | 82 | |
83 | /* | 83 | if (likely(page != page_head && |
84 | * If PageTail is still set after smp_rmb() we can be sure | 84 | get_page_unless_zero(page_head))) { |
85 | * that the page->first_page we read wasn't a dangling pointer. | ||
86 | * See __split_huge_page_refcount() smp_wmb(). | ||
87 | */ | ||
88 | if (likely(PageTail(page) && get_page_unless_zero(page_head))) { | ||
89 | unsigned long flags; | 85 | unsigned long flags; |
90 | /* | 86 | /* |
91 | * Verify that our page_head wasn't converted | 87 | * page_head wasn't a dangling pointer but it |
92 | * to a a regular page before we got a | 88 | * may not be a head page anymore by the time |
93 | * reference on it. | 89 | * we obtain the lock. That is ok as long as it |
90 | * can't be freed from under us. | ||
94 | */ | 91 | */ |
95 | if (unlikely(!PageHead(page_head))) { | ||
96 | /* PageHead is cleared after PageTail */ | ||
97 | smp_rmb(); | ||
98 | VM_BUG_ON(PageTail(page)); | ||
99 | goto out_put_head; | ||
100 | } | ||
101 | /* | ||
102 | * Only run compound_lock on a valid PageHead, | ||
103 | * after having it pinned with | ||
104 | * get_page_unless_zero() above. | ||
105 | */ | ||
106 | smp_mb(); | ||
107 | /* page_head wasn't a dangling pointer */ | ||
108 | flags = compound_lock_irqsave(page_head); | 92 | flags = compound_lock_irqsave(page_head); |
109 | if (unlikely(!PageTail(page))) { | 93 | if (unlikely(!PageTail(page))) { |
110 | /* __split_huge_page_refcount run before us */ | 94 | /* __split_huge_page_refcount run before us */ |
111 | compound_unlock_irqrestore(page_head, flags); | 95 | compound_unlock_irqrestore(page_head, flags); |
112 | VM_BUG_ON(PageHead(page_head)); | 96 | VM_BUG_ON(PageHead(page_head)); |
113 | out_put_head: | ||
114 | if (put_page_testzero(page_head)) | 97 | if (put_page_testzero(page_head)) |
115 | __put_single_page(page_head); | 98 | __put_single_page(page_head); |
116 | out_put_single: | 99 | out_put_single: |
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) | |||
121 | VM_BUG_ON(page_head != page->first_page); | 104 | VM_BUG_ON(page_head != page->first_page); |
122 | /* | 105 | /* |
123 | * We can release the refcount taken by | 106 | * We can release the refcount taken by |
124 | * get_page_unless_zero now that | 107 | * get_page_unless_zero() now that |
125 | * split_huge_page_refcount is blocked on the | 108 | * __split_huge_page_refcount() is blocked on |
126 | * compound_lock. | 109 | * the compound_lock. |
127 | */ | 110 | */ |
128 | if (put_page_testzero(page_head)) | 111 | if (put_page_testzero(page_head)) |
129 | VM_BUG_ON(1); | 112 | VM_BUG_ON(1); |
130 | /* __split_huge_page_refcount will wait now */ | 113 | /* __split_huge_page_refcount will wait now */ |
131 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | 114 | VM_BUG_ON(page_mapcount(page) <= 0); |
132 | atomic_dec(&page->_count); | 115 | atomic_dec(&page->_mapcount); |
133 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | 116 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); |
117 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
134 | compound_unlock_irqrestore(page_head, flags); | 118 | compound_unlock_irqrestore(page_head, flags); |
135 | if (put_page_testzero(page_head)) { | 119 | if (put_page_testzero(page_head)) { |
136 | if (PageHead(page_head)) | 120 | if (PageHead(page_head)) |
@@ -160,6 +144,45 @@ void put_page(struct page *page) | |||
160 | } | 144 | } |
161 | EXPORT_SYMBOL(put_page); | 145 | EXPORT_SYMBOL(put_page); |
162 | 146 | ||
147 | /* | ||
148 | * This function is exported but must not be called by anything other | ||
149 | * than get_page(). It implements the slow path of get_page(). | ||
150 | */ | ||
151 | bool __get_page_tail(struct page *page) | ||
152 | { | ||
153 | /* | ||
154 | * This takes care of get_page() if run on a tail page | ||
155 | * returned by one of the get_user_pages/follow_page variants. | ||
156 | * get_user_pages/follow_page itself doesn't need the compound | ||
157 | * lock because it runs __get_page_tail_foll() under the | ||
158 | * proper PT lock that already serializes against | ||
159 | * split_huge_page(). | ||
160 | */ | ||
161 | unsigned long flags; | ||
162 | bool got = false; | ||
163 | struct page *page_head = compound_trans_head(page); | ||
164 | |||
165 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
166 | /* | ||
167 | * page_head wasn't a dangling pointer but it | ||
168 | * may not be a head page anymore by the time | ||
169 | * we obtain the lock. That is ok as long as it | ||
170 | * can't be freed from under us. | ||
171 | */ | ||
172 | flags = compound_lock_irqsave(page_head); | ||
173 | /* here __split_huge_page_refcount won't run anymore */ | ||
174 | if (likely(PageTail(page))) { | ||
175 | __get_page_tail_foll(page, false); | ||
176 | got = true; | ||
177 | } | ||
178 | compound_unlock_irqrestore(page_head, flags); | ||
179 | if (unlikely(!got)) | ||
180 | put_page(page_head); | ||
181 | } | ||
182 | return got; | ||
183 | } | ||
184 | EXPORT_SYMBOL(__get_page_tail); | ||
185 | |||
163 | /** | 186 | /** |
164 | * put_pages_list() - release a list of pages | 187 | * put_pages_list() - release a list of pages |
165 | * @pages: list of pages threaded on page->lru | 188 | * @pages: list of pages threaded on page->lru |