aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorKirill A. Shutemov <kirill.shutemov@linux.intel.com>2015-11-06 19:29:54 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-06 20:50:42 -0500
commit1d798ca3f16437c71ff63e36597ff07f9c12e4d6 (patch)
tree4b70d32439fb18ef699175413e4b82c4af206f81 /mm/page_alloc.c
parentf1e61557f0230d51a3df8d825f2c156e75563bff (diff)
mm: make compound_head() robust
Hugh has pointed out that a compound_head() call can be unsafe in some contexts. There's one example: CPU0 CPU1 isolate_migratepages_block() page_count() compound_head() !!PageTail() == true put_page() tail->first_page = NULL head = tail->first_page alloc_pages(__GFP_COMP) prep_compound_page() tail->first_page = head __SetPageTail(p); !!PageTail() == true <head == NULL dereferencing> The race is purely theoretical. I don't think it's possible to trigger it in practice. But who knows. We can fix the race by changing how we encode PageTail() and compound_head() within struct page, to be able to update them in one shot. The patch introduces page->compound_head into the third double word block, in front of compound_dtor and compound_order. Bit 0 encodes PageTail() and the remaining bits are a pointer to the head page if bit zero is set. The patch moves page->pmd_huge_pte out of the word, just in case an architecture defines pgtable_t as something that can have bit 0 set. hugetlb_cgroup uses page->lru.next in the second tail page to store a pointer to struct hugetlb_cgroup. The patch switches it to use page->private in the second tail page instead. The space is free since ->first_page is removed from the union. The patch also opens the possibility of removing the HUGETLB_CGROUP_MIN_ORDER limitation, since there's now space in the first tail page to store a struct hugetlb_cgroup pointer. But that's out of the scope of this patch. That means page->compound_head shares storage space with: - page->lru.next; - page->next; - page->rcu_head.next; That's too long a list to be absolutely sure, but it looks like nobody uses bit 0 of the word. page->rcu_head.next is guaranteed[1] to have bit 0 clear as long as we use call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(). But a future call_rcu_lazy() is not allowed, as it makes use of the bit and we could get a false-positive PageTail(). [1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: David Rientjes <rientjes@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c48
1 files changed, 31 insertions, 17 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fae1bd6f9f37..e361001519d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -445,15 +445,15 @@ out:
445/* 445/*
446 * Higher-order pages are called "compound pages". They are structured thusly: 446 * Higher-order pages are called "compound pages". They are structured thusly:
447 * 447 *
448 * The first PAGE_SIZE page is called the "head page". 448 * The first PAGE_SIZE page is called the "head page" and have PG_head set.
449 * 449 *
450 * The remaining PAGE_SIZE pages are called "tail pages". 450 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
451 * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
451 * 452 *
452 * All pages have PG_compound set. All tail pages have their ->first_page 453 * The first tail page's ->compound_dtor holds the offset in array of compound
453 * pointing at the head page. 454 * page destructors. See compound_page_dtors.
454 * 455 *
455 * The first tail page's ->lru.next holds the address of the compound page's 456 * The first tail page's ->compound_order holds the order of allocation.
456 * put_page() function. Its ->lru.prev holds the order of allocation.
457 * This usage means that zero-order pages may not be compound. 457 * This usage means that zero-order pages may not be compound.
458 */ 458 */
459 459
@@ -473,10 +473,7 @@ void prep_compound_page(struct page *page, unsigned long order)
473 for (i = 1; i < nr_pages; i++) { 473 for (i = 1; i < nr_pages; i++) {
474 struct page *p = page + i; 474 struct page *p = page + i;
475 set_page_count(p, 0); 475 set_page_count(p, 0);
476 p->first_page = page; 476 set_compound_head(p, page);
477 /* Make sure p->first_page is always valid for PageTail() */
478 smp_wmb();
479 __SetPageTail(p);
480 } 477 }
481} 478}
482 479
@@ -854,17 +851,30 @@ static void free_one_page(struct zone *zone,
854 851
855static int free_tail_pages_check(struct page *head_page, struct page *page) 852static int free_tail_pages_check(struct page *head_page, struct page *page)
856{ 853{
857 if (!IS_ENABLED(CONFIG_DEBUG_VM)) 854 int ret = 1;
858 return 0; 855
856 /*
857 * We rely page->lru.next never has bit 0 set, unless the page
858 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
859 */
860 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
861
862 if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
863 ret = 0;
864 goto out;
865 }
859 if (unlikely(!PageTail(page))) { 866 if (unlikely(!PageTail(page))) {
860 bad_page(page, "PageTail not set", 0); 867 bad_page(page, "PageTail not set", 0);
861 return 1; 868 goto out;
862 } 869 }
863 if (unlikely(page->first_page != head_page)) { 870 if (unlikely(compound_head(page) != head_page)) {
864 bad_page(page, "first_page not consistent", 0); 871 bad_page(page, "compound_head not consistent", 0);
865 return 1; 872 goto out;
866 } 873 }
867 return 0; 874 ret = 0;
875out:
876 clear_compound_head(page);
877 return ret;
868} 878}
869 879
870static void __meminit __init_single_page(struct page *page, unsigned long pfn, 880static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -931,6 +941,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
931 struct page *page = pfn_to_page(start_pfn); 941 struct page *page = pfn_to_page(start_pfn);
932 942
933 init_reserved_page(start_pfn); 943 init_reserved_page(start_pfn);
944
945 /* Avoid false-positive PageTail() */
946 INIT_LIST_HEAD(&page->lru);
947
934 SetPageReserved(page); 948 SetPageReserved(page);
935 } 949 }
936 } 950 }