about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author    Andrea Arcangeli <aarcange@redhat.com>  2013-11-21 17:32:02 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-11-21 19:42:27 -0500
commit    27c73ae759774e63313c1fbfeb17ba076cea64c5 (patch)
tree      13d013902e8784cf3f94f522e8366f79ef6b2c5e
parent    044c8d4b15743f6e0a4cb6f2aeb32745a116ebff (diff)
mm: hugetlbfs: fix hugetlbfs optimization
Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database caused by THP") can cause dereference of a dangling pointer if split_huge_page runs during PageHuge() if there are updates to the tail_page->private field.

Also it is repeating compound_head twice for hugetlbfs and it is running compound_head+compound_trans_head for THP when a single one is needed in both cases.

The new code within the PageSlab() check doesn't need to verify that the THP page size is never bigger than the smallest hugetlbfs page size, to avoid memory corruption.

A longstanding theoretical race condition was found while fixing the above (see the change right after the skip_unlock label, that is relevant for the compound_lock path too).

By re-establishing the _mapcount tail refcounting for all compound pages, this also fixes the below problem:

  echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages

  BUG: Bad page state in process bash  pfn:59a01
  page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0
  page flags: 0x1c00000000008000(tail)
  Modules linked in:
  CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25
  Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  Call Trace:
    dump_stack+0x55/0x76
    bad_page+0xd5/0x130
    free_pages_prepare+0x213/0x280
    __free_pages+0x36/0x80
    update_and_free_page+0xc1/0xd0
    free_pool_huge_page+0xc2/0xe0
    set_max_huge_pages.part.58+0x14c/0x220
    nr_hugepages_store_common.isra.60+0xd0/0xf0
    nr_hugepages_store+0x13/0x20
    kobj_attr_store+0xf/0x20
    sysfs_write_file+0x189/0x1e0
    vfs_write+0xc5/0x1f0
    SyS_write+0x55/0xb0
    system_call_fastpath+0x16/0x1b

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Tested-by: Khalid Aziz <khalid.aziz@oracle.com>
Cc: Pravin Shelar <pshelar@nicira.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/linux/hugetlb.h    6
-rw-r--r--  mm/hugetlb.c              17
-rw-r--r--  mm/swap.c                143
3 files changed, 106 insertions, 60 deletions
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 85e0c58bdfdf..9649ff0c63f8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -31,6 +31,7 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
31void hugepage_put_subpool(struct hugepage_subpool *spool); 31void hugepage_put_subpool(struct hugepage_subpool *spool);
32 32
33int PageHuge(struct page *page); 33int PageHuge(struct page *page);
34int PageHeadHuge(struct page *page_head);
34 35
35void reset_vma_resv_huge_pages(struct vm_area_struct *vma); 36void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
36int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 37int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -103,6 +104,11 @@ static inline int PageHuge(struct page *page)
103 return 0; 104 return 0;
104} 105}
105 106
107static inline int PageHeadHuge(struct page *page_head)
108{
109 return 0;
110}
111
106static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 112static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
107{ 113{
108} 114}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2130365d387d..dee6cf4e6d34 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -702,6 +702,23 @@ int PageHuge(struct page *page)
702} 702}
703EXPORT_SYMBOL_GPL(PageHuge); 703EXPORT_SYMBOL_GPL(PageHuge);
704 704
705/*
706 * PageHeadHuge() only returns true for hugetlbfs head page, but not for
707 * normal or transparent huge pages.
708 */
709int PageHeadHuge(struct page *page_head)
710{
711 compound_page_dtor *dtor;
712
713 if (!PageHead(page_head))
714 return 0;
715
716 dtor = get_compound_page_dtor(page_head);
717
718 return dtor == free_huge_page;
719}
720EXPORT_SYMBOL_GPL(PageHeadHuge);
721
705pgoff_t __basepage_index(struct page *page) 722pgoff_t __basepage_index(struct page *page)
706{ 723{
707 struct page *page_head = compound_head(page); 724 struct page *page_head = compound_head(page);
diff --git a/mm/swap.c b/mm/swap.c
index 7a9f80d451f5..84b26aaabd03 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -82,19 +82,6 @@ static void __put_compound_page(struct page *page)
82 82
83static void put_compound_page(struct page *page) 83static void put_compound_page(struct page *page)
84{ 84{
85 /*
86 * hugetlbfs pages cannot be split from under us. If this is a
87 * hugetlbfs page, check refcount on head page and release the page if
88 * the refcount becomes zero.
89 */
90 if (PageHuge(page)) {
91 page = compound_head(page);
92 if (put_page_testzero(page))
93 __put_compound_page(page);
94
95 return;
96 }
97
98 if (unlikely(PageTail(page))) { 85 if (unlikely(PageTail(page))) {
99 /* __split_huge_page_refcount can run under us */ 86 /* __split_huge_page_refcount can run under us */
100 struct page *page_head = compound_trans_head(page); 87 struct page *page_head = compound_trans_head(page);
@@ -111,14 +98,31 @@ static void put_compound_page(struct page *page)
111 * still hot on arches that do not support 98 * still hot on arches that do not support
112 * this_cpu_cmpxchg_double(). 99 * this_cpu_cmpxchg_double().
113 */ 100 */
114 if (PageSlab(page_head)) { 101 if (PageSlab(page_head) || PageHeadHuge(page_head)) {
115 if (PageTail(page)) { 102 if (likely(PageTail(page))) {
103 /*
104 * __split_huge_page_refcount
105 * cannot race here.
106 */
107 VM_BUG_ON(!PageHead(page_head));
108 atomic_dec(&page->_mapcount);
116 if (put_page_testzero(page_head)) 109 if (put_page_testzero(page_head))
117 VM_BUG_ON(1); 110 VM_BUG_ON(1);
118 111 if (put_page_testzero(page_head))
119 atomic_dec(&page->_mapcount); 112 __put_compound_page(page_head);
120 goto skip_lock_tail; 113 return;
121 } else 114 } else
115 /*
116 * __split_huge_page_refcount
117 * run before us, "page" was a
118 * THP tail. The split
119 * page_head has been freed
120 * and reallocated as slab or
121 * hugetlbfs page of smaller
122 * order (only possible if
123 * reallocated as slab on
124 * x86).
125 */
122 goto skip_lock; 126 goto skip_lock;
123 } 127 }
124 /* 128 /*
@@ -132,8 +136,27 @@ static void put_compound_page(struct page *page)
132 /* __split_huge_page_refcount run before us */ 136 /* __split_huge_page_refcount run before us */
133 compound_unlock_irqrestore(page_head, flags); 137 compound_unlock_irqrestore(page_head, flags);
134skip_lock: 138skip_lock:
135 if (put_page_testzero(page_head)) 139 if (put_page_testzero(page_head)) {
136 __put_single_page(page_head); 140 /*
141 * The head page may have been
142 * freed and reallocated as a
143 * compound page of smaller
144 * order and then freed again.
145 * All we know is that it
146 * cannot have become: a THP
147 * page, a compound page of
148 * higher order, a tail page.
149 * That is because we still
150 * hold the refcount of the
151 * split THP tail and
152 * page_head was the THP head
153 * before the split.
154 */
155 if (PageHead(page_head))
156 __put_compound_page(page_head);
157 else
158 __put_single_page(page_head);
159 }
137out_put_single: 160out_put_single:
138 if (put_page_testzero(page)) 161 if (put_page_testzero(page))
139 __put_single_page(page); 162 __put_single_page(page);
@@ -155,7 +178,6 @@ out_put_single:
155 VM_BUG_ON(atomic_read(&page->_count) != 0); 178 VM_BUG_ON(atomic_read(&page->_count) != 0);
156 compound_unlock_irqrestore(page_head, flags); 179 compound_unlock_irqrestore(page_head, flags);
157 180
158skip_lock_tail:
159 if (put_page_testzero(page_head)) { 181 if (put_page_testzero(page_head)) {
160 if (PageHead(page_head)) 182 if (PageHead(page_head))
161 __put_compound_page(page_head); 183 __put_compound_page(page_head);
@@ -198,51 +220,52 @@ bool __get_page_tail(struct page *page)
198 * proper PT lock that already serializes against 220 * proper PT lock that already serializes against
199 * split_huge_page(). 221 * split_huge_page().
200 */ 222 */
223 unsigned long flags;
201 bool got = false; 224 bool got = false;
202 struct page *page_head; 225 struct page *page_head = compound_trans_head(page);
203
204 /*
205 * If this is a hugetlbfs page it cannot be split under us. Simply
206 * increment refcount for the head page.
207 */
208 if (PageHuge(page)) {
209 page_head = compound_head(page);
210 atomic_inc(&page_head->_count);
211 got = true;
212 } else {
213 unsigned long flags;
214 226
215 page_head = compound_trans_head(page); 227 if (likely(page != page_head && get_page_unless_zero(page_head))) {
216 if (likely(page != page_head && 228 /* Ref to put_compound_page() comment. */
217 get_page_unless_zero(page_head))) { 229 if (PageSlab(page_head) || PageHeadHuge(page_head)) {
218
219 /* Ref to put_compound_page() comment. */
220 if (PageSlab(page_head)) {
221 if (likely(PageTail(page))) {
222 __get_page_tail_foll(page, false);
223 return true;
224 } else {
225 put_page(page_head);
226 return false;
227 }
228 }
229
230 /*
231 * page_head wasn't a dangling pointer but it
232 * may not be a head page anymore by the time
233 * we obtain the lock. That is ok as long as it
234 * can't be freed from under us.
235 */
236 flags = compound_lock_irqsave(page_head);
237 /* here __split_huge_page_refcount won't run anymore */
238 if (likely(PageTail(page))) { 230 if (likely(PageTail(page))) {
231 /*
232 * This is a hugetlbfs page or a slab
233 * page. __split_huge_page_refcount
234 * cannot race here.
235 */
236 VM_BUG_ON(!PageHead(page_head));
239 __get_page_tail_foll(page, false); 237 __get_page_tail_foll(page, false);
240 got = true; 238 return true;
241 } 239 } else {
242 compound_unlock_irqrestore(page_head, flags); 240 /*
243 if (unlikely(!got)) 241 * __split_huge_page_refcount run
242 * before us, "page" was a THP
243 * tail. The split page_head has been
244 * freed and reallocated as slab or
245 * hugetlbfs page of smaller order
246 * (only possible if reallocated as
247 * slab on x86).
248 */
244 put_page(page_head); 249 put_page(page_head);
250 return false;
251 }
252 }
253
254 /*
255 * page_head wasn't a dangling pointer but it
256 * may not be a head page anymore by the time
257 * we obtain the lock. That is ok as long as it
258 * can't be freed from under us.
259 */
260 flags = compound_lock_irqsave(page_head);
261 /* here __split_huge_page_refcount won't run anymore */
262 if (likely(PageTail(page))) {
263 __get_page_tail_foll(page, false);
264 got = true;
245 } 265 }
266 compound_unlock_irqrestore(page_head, flags);
267 if (unlikely(!got))
268 put_page(page_head);
246 } 269 }
247 return got; 270 return got;
248} 271}