aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Rientjes <rientjes@google.com>2014-03-03 18:38:18 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2014-03-04 10:55:47 -0500
commit668f9abbd4334e6c29fa8acd71635c4f9101caa7 (patch)
tree7060fc64f0164c4936b92f60569e63fcddfdfc51
parentaa15aa0e869cb218f68faff8f229fa6b6b5cdf08 (diff)
mm: close PageTail race
Commit bf6bddf1924e ("mm: introduce compaction and migration for ballooned pages") introduces page_count(page) into memory compaction which dereferences page->first_page if PageTail(page). This results in a very rare NULL pointer dereference on the aforementioned page_count(page). Indeed, anything that does compound_head(), including page_count() is susceptible to racing with prep_compound_page() and seeing a NULL or dangling page->first_page pointer. This patch uses Andrea's implementation of compound_trans_head() that deals with such a race and makes it the default compound_head() implementation. This includes a read memory barrier that ensures that if PageTail(head) is true that we return a head page that is neither NULL nor dangling. The patch then adds a store memory barrier to prep_compound_page() to ensure page->first_page is set. This is the safest way to ensure we see the head page that we are expecting, PageTail(page) is already in the unlikely() path and the memory barriers are unfortunately required. Hugetlbfs is the exception, we don't enforce a store memory barrier during init since no race is possible. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Holger Kiehl <Holger.Kiehl@dwd.de> Cc: Christoph Lameter <cl@linux.com> Cc: Rafael Aquini <aquini@redhat.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.cz> Cc: Mel Gorman <mgorman@suse.de> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--drivers/block/aoe/aoecmd.c4
-rw-r--r--drivers/vfio/vfio_iommu_type1.c4
-rw-r--r--fs/proc/page.c5
-rw-r--r--include/linux/huge_mm.h41
-rw-r--r--include/linux/mm.h14
-rw-r--r--mm/ksm.c2
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/page_alloc.c4
-rw-r--r--mm/swap.c4
9 files changed, 25 insertions, 55 deletions
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 8184451b57c0..422b7d84f686 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -874,7 +874,7 @@ bio_pageinc(struct bio *bio)
874 /* Non-zero page count for non-head members of 874 /* Non-zero page count for non-head members of
875 * compound pages is no longer allowed by the kernel. 875 * compound pages is no longer allowed by the kernel.
876 */ 876 */
877 page = compound_trans_head(bv.bv_page); 877 page = compound_head(bv.bv_page);
878 atomic_inc(&page->_count); 878 atomic_inc(&page->_count);
879 } 879 }
880} 880}
@@ -887,7 +887,7 @@ bio_pagedec(struct bio *bio)
887 struct bvec_iter iter; 887 struct bvec_iter iter;
888 888
889 bio_for_each_segment(bv, bio, iter) { 889 bio_for_each_segment(bv, bio, iter) {
890 page = compound_trans_head(bv.bv_page); 890 page = compound_head(bv.bv_page);
891 atomic_dec(&page->_count); 891 atomic_dec(&page->_count);
892 } 892 }
893} 893}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4fb7a8f83c8a..54af4e933695 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -186,12 +186,12 @@ static bool is_invalid_reserved_pfn(unsigned long pfn)
186 if (pfn_valid(pfn)) { 186 if (pfn_valid(pfn)) {
187 bool reserved; 187 bool reserved;
188 struct page *tail = pfn_to_page(pfn); 188 struct page *tail = pfn_to_page(pfn);
189 struct page *head = compound_trans_head(tail); 189 struct page *head = compound_head(tail);
190 reserved = !!(PageReserved(head)); 190 reserved = !!(PageReserved(head));
191 if (head != tail) { 191 if (head != tail) {
192 /* 192 /*
193 * "head" is not a dangling pointer 193 * "head" is not a dangling pointer
194 * (compound_trans_head takes care of that) 194 * (compound_head takes care of that)
195 * but the hugepage may have been split 195 * but the hugepage may have been split
196 * from under us (and we may not hold a 196 * from under us (and we may not hold a
197 * reference count on the head page so it can 197 * reference count on the head page so it can
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 02174a610315..e647c55275d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -121,9 +121,8 @@ u64 stable_page_flags(struct page *page)
121 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon 121 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
122 * to make sure a given page is a thp, not a non-huge compound page. 122 * to make sure a given page is a thp, not a non-huge compound page.
123 */ 123 */
124 else if (PageTransCompound(page) && 124 else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
125 (PageLRU(compound_trans_head(page)) || 125 PageAnon(compound_head(page))))
126 PageAnon(compound_trans_head(page))))
127 u |= 1 << KPF_THP; 126 u |= 1 << KPF_THP;
128 127
129 /* 128 /*
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index db512014e061..b826239bdce0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -157,46 +157,6 @@ static inline int hpage_nr_pages(struct page *page)
157 return HPAGE_PMD_NR; 157 return HPAGE_PMD_NR;
158 return 1; 158 return 1;
159} 159}
160/*
161 * compound_trans_head() should be used instead of compound_head(),
162 * whenever the "page" passed as parameter could be the tail of a
163 * transparent hugepage that could be undergoing a
164 * __split_huge_page_refcount(). The page structure layout often
165 * changes across releases and it makes extensive use of unions. So if
166 * the page structure layout will change in a way that
167 * page->first_page gets clobbered by __split_huge_page_refcount, the
168 * implementation making use of smp_rmb() will be required.
169 *
170 * Currently we define compound_trans_head as compound_head, because
171 * page->private is in the same union with page->first_page, and
172 * page->private isn't clobbered. However this also means we're
173 * currently leaving dirt into the page->private field of anonymous
174 * pages resulting from a THP split, instead of setting page->private
175 * to zero like for every other page that has PG_private not set. But
176 * anonymous pages don't use page->private so this is not a problem.
177 */
178#if 0
179/* This will be needed if page->private will be clobbered in split_huge_page */
180static inline struct page *compound_trans_head(struct page *page)
181{
182 if (PageTail(page)) {
183 struct page *head;
184 head = page->first_page;
185 smp_rmb();
186 /*
187 * head may be a dangling pointer.
188 * __split_huge_page_refcount clears PageTail before
189 * overwriting first_page, so if PageTail is still
190 * there it means the head pointer isn't dangling.
191 */
192 if (PageTail(page))
193 return head;
194 }
195 return page;
196}
197#else
198#define compound_trans_head(page) compound_head(page)
199#endif
200 160
201extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 161extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
202 unsigned long addr, pmd_t pmd, pmd_t *pmdp); 162 unsigned long addr, pmd_t pmd, pmd_t *pmdp);
@@ -226,7 +186,6 @@ static inline int split_huge_page(struct page *page)
226 do { } while (0) 186 do { } while (0)
227#define split_huge_page_pmd_mm(__mm, __address, __pmd) \ 187#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
228 do { } while (0) 188 do { } while (0)
229#define compound_trans_head(page) compound_head(page)
230static inline int hugepage_madvise(struct vm_area_struct *vma, 189static inline int hugepage_madvise(struct vm_area_struct *vma,
231 unsigned long *vm_flags, int advice) 190 unsigned long *vm_flags, int advice)
232{ 191{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f28f46eade6a..03ab3e58f511 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -399,8 +399,18 @@ static inline void compound_unlock_irqrestore(struct page *page,
399 399
400static inline struct page *compound_head(struct page *page) 400static inline struct page *compound_head(struct page *page)
401{ 401{
402 if (unlikely(PageTail(page))) 402 if (unlikely(PageTail(page))) {
403 return page->first_page; 403 struct page *head = page->first_page;
404
405 /*
406 * page->first_page may be a dangling pointer to an old
407 * compound page, so recheck that it is still a tail
408 * page before returning.
409 */
410 smp_rmb();
411 if (likely(PageTail(page)))
412 return head;
413 }
404 return page; 414 return page;
405} 415}
406 416
diff --git a/mm/ksm.c b/mm/ksm.c
index aa4c7c7250c1..68710e80994a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item)
444static struct page *page_trans_compound_anon(struct page *page) 444static struct page *page_trans_compound_anon(struct page *page)
445{ 445{
446 if (PageTransCompound(page)) { 446 if (PageTransCompound(page)) {
447 struct page *head = compound_trans_head(page); 447 struct page *head = compound_head(page);
448 /* 448 /*
449 * head may actually be splitted and freed from under 449 * head may actually be splitted and freed from under
450 * us but it's ok here. 450 * us but it's ok here.
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2f2f34a4e77d..90002ea43638 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1651,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags)
1651{ 1651{
1652 int ret; 1652 int ret;
1653 unsigned long pfn = page_to_pfn(page); 1653 unsigned long pfn = page_to_pfn(page);
1654 struct page *hpage = compound_trans_head(page); 1654 struct page *hpage = compound_head(page);
1655 1655
1656 if (PageHWPoison(page)) { 1656 if (PageHWPoison(page)) {
1657 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1657 pr_info("soft offline: %#lx page already poisoned\n", pfn);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e3758a09a009..3d1bf889465a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order)
369 __SetPageHead(page); 369 __SetPageHead(page);
370 for (i = 1; i < nr_pages; i++) { 370 for (i = 1; i < nr_pages; i++) {
371 struct page *p = page + i; 371 struct page *p = page + i;
372 __SetPageTail(p);
373 set_page_count(p, 0); 372 set_page_count(p, 0);
374 p->first_page = page; 373 p->first_page = page;
374 /* Make sure p->first_page is always valid for PageTail() */
375 smp_wmb();
376 __SetPageTail(p);
375 } 377 }
376} 378}
377 379
diff --git a/mm/swap.c b/mm/swap.c
index b31ba67d440a..0092097b3f4c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -98,7 +98,7 @@ static void put_compound_page(struct page *page)
98 } 98 }
99 99
100 /* __split_huge_page_refcount can run under us */ 100 /* __split_huge_page_refcount can run under us */
101 page_head = compound_trans_head(page); 101 page_head = compound_head(page);
102 102
103 /* 103 /*
104 * THP can not break up slab pages so avoid taking 104 * THP can not break up slab pages so avoid taking
@@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page)
253 */ 253 */
254 unsigned long flags; 254 unsigned long flags;
255 bool got; 255 bool got;
256 struct page *page_head = compound_trans_head(page); 256 struct page *page_head = compound_head(page);
257 257
258 /* Ref to put_compound_page() comment. */ 258 /* Ref to put_compound_page() comment. */
259 if (!__compound_tail_refcounted(page_head)) { 259 if (!__compound_tail_refcounted(page_head)) {