about summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
author	Andrea Arcangeli <aarcange@redhat.com>	2013-11-21 17:32:02 -0500
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2014-02-06 14:08:12 -0500
commit	17b6ada0567b5a9b837d37ad007c6da36dd759c0 (patch)
tree	7ae8b3f2adbba0a5846cf2b216d609dadfb62818 /mm
parent	c18e49ad50903819b947aea33c1cdcef724f7c35 (diff)
mm: hugetlbfs: fix hugetlbfs optimization
commit 27c73ae759774e63313c1fbfeb17ba076cea64c5 upstream. Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database caused by THP") can cause dereference of a dangling pointer if split_huge_page runs during PageHuge() if there are updates to the tail_page->private field. Also it is repeating compound_head twice for hugetlbfs and it is running compound_head+compound_trans_head for THP when a single one is needed in both cases. The new code within the PageSlab() check doesn't need to verify that the THP page size is never bigger than the smallest hugetlbfs page size, to avoid memory corruption. A longstanding theoretical race condition was found while fixing the above (see the change right after the skip_lock label, that is relevant for the compound_lock path too). By re-establishing the _mapcount tail refcounting for all compound pages, this also fixes the below problem: echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages BUG: Bad page state in process bash pfn:59a01 page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0 page flags: 0x1c00000000008000(tail) Modules linked in: CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x55/0x76 bad_page+0xd5/0x130 free_pages_prepare+0x213/0x280 __free_pages+0x36/0x80 update_and_free_page+0xc1/0xd0 free_pool_huge_page+0xc2/0xe0 set_max_huge_pages.part.58+0x14c/0x220 nr_hugepages_store_common.isra.60+0xd0/0xf0 nr_hugepages_store+0x13/0x20 kobj_attr_store+0xf/0x20 sysfs_write_file+0x189/0x1e0 vfs_write+0xc5/0x1f0 SyS_write+0x55/0xb0 system_call_fastpath+0x16/0x1b Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Tested-by: Khalid Aziz <khalid.aziz@oracle.com> Cc: Pravin Shelar <pshelar@nicira.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Ben Hutchings <bhutchings@solarflare.com> Cc: Christoph Lameter <cl@linux.com> Cc: Johannes Weiner 
<jweiner@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Rik van Riel <riel@redhat.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Minchan Kim <minchan@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Guillaume Morin <guillaume@morinfr.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/hugetlb.c	17
-rw-r--r--	mm/swap.c	143
2 files changed, 100 insertions, 60 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c5eb85ec645..40ad2c6e0ca9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,6 +690,23 @@ int PageHuge(struct page *page)
690} 690}
691EXPORT_SYMBOL_GPL(PageHuge); 691EXPORT_SYMBOL_GPL(PageHuge);
692 692
693/*
694 * PageHeadHuge() only returns true for hugetlbfs head page, but not for
695 * normal or transparent huge pages.
696 */
697int PageHeadHuge(struct page *page_head)
698{
699 compound_page_dtor *dtor;
700
701 if (!PageHead(page_head))
702 return 0;
703
704 dtor = get_compound_page_dtor(page_head);
705
706 return dtor == free_huge_page;
707}
708EXPORT_SYMBOL_GPL(PageHeadHuge);
709
693pgoff_t __basepage_index(struct page *page) 710pgoff_t __basepage_index(struct page *page)
694{ 711{
695 struct page *page_head = compound_head(page); 712 struct page *page_head = compound_head(page);
diff --git a/mm/swap.c b/mm/swap.c
index 9f2225f2b5b0..ea58dbde788e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -79,19 +79,6 @@ static void __put_compound_page(struct page *page)
79 79
80static void put_compound_page(struct page *page) 80static void put_compound_page(struct page *page)
81{ 81{
82 /*
83 * hugetlbfs pages cannot be split from under us. If this is a
84 * hugetlbfs page, check refcount on head page and release the page if
85 * the refcount becomes zero.
86 */
87 if (PageHuge(page)) {
88 page = compound_head(page);
89 if (put_page_testzero(page))
90 __put_compound_page(page);
91
92 return;
93 }
94
95 if (unlikely(PageTail(page))) { 82 if (unlikely(PageTail(page))) {
96 /* __split_huge_page_refcount can run under us */ 83 /* __split_huge_page_refcount can run under us */
97 struct page *page_head = compound_trans_head(page); 84 struct page *page_head = compound_trans_head(page);
@@ -108,14 +95,31 @@ static void put_compound_page(struct page *page)
108 * still hot on arches that do not support 95 * still hot on arches that do not support
109 * this_cpu_cmpxchg_double(). 96 * this_cpu_cmpxchg_double().
110 */ 97 */
111 if (PageSlab(page_head)) { 98 if (PageSlab(page_head) || PageHeadHuge(page_head)) {
112 if (PageTail(page)) { 99 if (likely(PageTail(page))) {
100 /*
101 * __split_huge_page_refcount
102 * cannot race here.
103 */
104 VM_BUG_ON(!PageHead(page_head));
105 atomic_dec(&page->_mapcount);
113 if (put_page_testzero(page_head)) 106 if (put_page_testzero(page_head))
114 VM_BUG_ON(1); 107 VM_BUG_ON(1);
115 108 if (put_page_testzero(page_head))
116 atomic_dec(&page->_mapcount); 109 __put_compound_page(page_head);
117 goto skip_lock_tail; 110 return;
118 } else 111 } else
112 /*
113 * __split_huge_page_refcount
114 * run before us, "page" was a
115 * THP tail. The split
116 * page_head has been freed
117 * and reallocated as slab or
118 * hugetlbfs page of smaller
119 * order (only possible if
120 * reallocated as slab on
121 * x86).
122 */
119 goto skip_lock; 123 goto skip_lock;
120 } 124 }
121 /* 125 /*
@@ -129,8 +133,27 @@ static void put_compound_page(struct page *page)
129 /* __split_huge_page_refcount run before us */ 133 /* __split_huge_page_refcount run before us */
130 compound_unlock_irqrestore(page_head, flags); 134 compound_unlock_irqrestore(page_head, flags);
131skip_lock: 135skip_lock:
132 if (put_page_testzero(page_head)) 136 if (put_page_testzero(page_head)) {
133 __put_single_page(page_head); 137 /*
138 * The head page may have been
139 * freed and reallocated as a
140 * compound page of smaller
141 * order and then freed again.
142 * All we know is that it
143 * cannot have become: a THP
144 * page, a compound page of
145 * higher order, a tail page.
146 * That is because we still
147 * hold the refcount of the
148 * split THP tail and
149 * page_head was the THP head
150 * before the split.
151 */
152 if (PageHead(page_head))
153 __put_compound_page(page_head);
154 else
155 __put_single_page(page_head);
156 }
134out_put_single: 157out_put_single:
135 if (put_page_testzero(page)) 158 if (put_page_testzero(page))
136 __put_single_page(page); 159 __put_single_page(page);
@@ -152,7 +175,6 @@ out_put_single:
152 VM_BUG_ON(atomic_read(&page->_count) != 0); 175 VM_BUG_ON(atomic_read(&page->_count) != 0);
153 compound_unlock_irqrestore(page_head, flags); 176 compound_unlock_irqrestore(page_head, flags);
154 177
155skip_lock_tail:
156 if (put_page_testzero(page_head)) { 178 if (put_page_testzero(page_head)) {
157 if (PageHead(page_head)) 179 if (PageHead(page_head))
158 __put_compound_page(page_head); 180 __put_compound_page(page_head);
@@ -195,51 +217,52 @@ bool __get_page_tail(struct page *page)
195 * proper PT lock that already serializes against 217 * proper PT lock that already serializes against
196 * split_huge_page(). 218 * split_huge_page().
197 */ 219 */
220 unsigned long flags;
198 bool got = false; 221 bool got = false;
199 struct page *page_head; 222 struct page *page_head = compound_trans_head(page);
200
201 /*
202 * If this is a hugetlbfs page it cannot be split under us. Simply
203 * increment refcount for the head page.
204 */
205 if (PageHuge(page)) {
206 page_head = compound_head(page);
207 atomic_inc(&page_head->_count);
208 got = true;
209 } else {
210 unsigned long flags;
211 223
212 page_head = compound_trans_head(page); 224 if (likely(page != page_head && get_page_unless_zero(page_head))) {
213 if (likely(page != page_head && 225 /* Ref to put_compound_page() comment. */
214 get_page_unless_zero(page_head))) { 226 if (PageSlab(page_head) || PageHeadHuge(page_head)) {
215
216 /* Ref to put_compound_page() comment. */
217 if (PageSlab(page_head)) {
218 if (likely(PageTail(page))) {
219 __get_page_tail_foll(page, false);
220 return true;
221 } else {
222 put_page(page_head);
223 return false;
224 }
225 }
226
227 /*
228 * page_head wasn't a dangling pointer but it
229 * may not be a head page anymore by the time
230 * we obtain the lock. That is ok as long as it
231 * can't be freed from under us.
232 */
233 flags = compound_lock_irqsave(page_head);
234 /* here __split_huge_page_refcount won't run anymore */
235 if (likely(PageTail(page))) { 227 if (likely(PageTail(page))) {
228 /*
229 * This is a hugetlbfs page or a slab
230 * page. __split_huge_page_refcount
231 * cannot race here.
232 */
233 VM_BUG_ON(!PageHead(page_head));
236 __get_page_tail_foll(page, false); 234 __get_page_tail_foll(page, false);
237 got = true; 235 return true;
238 } 236 } else {
239 compound_unlock_irqrestore(page_head, flags); 237 /*
240 if (unlikely(!got)) 238 * __split_huge_page_refcount run
239 * before us, "page" was a THP
240 * tail. The split page_head has been
241 * freed and reallocated as slab or
242 * hugetlbfs page of smaller order
243 * (only possible if reallocated as
244 * slab on x86).
245 */
241 put_page(page_head); 246 put_page(page_head);
247 return false;
248 }
249 }
250
251 /*
252 * page_head wasn't a dangling pointer but it
253 * may not be a head page anymore by the time
254 * we obtain the lock. That is ok as long as it
255 * can't be freed from under us.
256 */
257 flags = compound_lock_irqsave(page_head);
258 /* here __split_huge_page_refcount won't run anymore */
259 if (likely(PageTail(page))) {
260 __get_page_tail_foll(page, false);
261 got = true;
242 } 262 }
263 compound_unlock_irqrestore(page_head, flags);
264 if (unlikely(!got))
265 put_page(page_head);
243 } 266 }
244 return got; 267 return got;
245} 268}