diff options
author | Kirill A. Shutemov <kirill.shutemov@linux.intel.com> | 2015-11-06 19:29:54 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-06 20:50:42 -0500 |
commit | 1d798ca3f16437c71ff63e36597ff07f9c12e4d6 (patch) | |
tree | 4b70d32439fb18ef699175413e4b82c4af206f81 /mm/page_alloc.c | |
parent | f1e61557f0230d51a3df8d825f2c156e75563bff (diff) |
mm: make compound_head() robust
Hugh has pointed out that a compound_head() call can be unsafe in some
contexts. Here's one example:
        CPU0                                    CPU1

isolate_migratepages_block()
  page_count()
    compound_head()
      !!PageTail() == true
                                        put_page()
                                          tail->first_page = NULL
      head = tail->first_page
                                        alloc_pages(__GFP_COMP)
                                          prep_compound_page()
                                            tail->first_page = head
                                            __SetPageTail(p);
      !!PageTail() == true
    <head == NULL dereferencing>
The race is purely theoretical. I don't think it's possible to trigger it in
practice. But who knows.
We can fix the race by changing how we encode PageTail() and compound_head()
within struct page so that both can be updated in one shot.
The patch introduces page->compound_head into the third double-word block, in
front of compound_dtor and compound_order. Bit 0 encodes PageTail(), and
the remaining bits are a pointer to the head page if bit 0 is set.
The patch moves page->pmd_huge_pte out of that word, just in case an
architecture defines pgtable_t as something that can have bit 0
set.
hugetlb_cgroup uses page->lru.next in the second tail page to store a
pointer to struct hugetlb_cgroup. The patch switches it to use page->private
in the second tail page instead. The space is free since ->first_page is
removed from the union.
The patch also opens the possibility of removing the HUGETLB_CGROUP_MIN_ORDER
limitation, since there's now space in the first tail page to store the struct
hugetlb_cgroup pointer. But that's out of scope for this patch.
That means page->compound_head shares storage space with:
- page->lru.next;
- page->next;
- page->rcu_head.next;
That's too long a list to be absolutely sure, but it looks like nobody uses
bit 0 of the word.
page->rcu_head.next is guaranteed[1] to have bit 0 clear as long as we use
call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(). But a future
call_rcu_lazy() is not allowed, as it makes use of the bit and we could
get a false-positive PageTail().
[1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 48 |
1 files changed, 31 insertions, 17 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fae1bd6f9f37..e361001519d3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -445,15 +445,15 @@ out: | |||
445 | /* | 445 | /* |
446 | * Higher-order pages are called "compound pages". They are structured thusly: | 446 | * Higher-order pages are called "compound pages". They are structured thusly: |
447 | * | 447 | * |
448 | * The first PAGE_SIZE page is called the "head page". | 448 | * The first PAGE_SIZE page is called the "head page" and have PG_head set. |
449 | * | 449 | * |
450 | * The remaining PAGE_SIZE pages are called "tail pages". | 450 | * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded |
451 | * in bit 0 of page->compound_head. The rest of bits is pointer to head page. | ||
451 | * | 452 | * |
452 | * All pages have PG_compound set. All tail pages have their ->first_page | 453 | * The first tail page's ->compound_dtor holds the offset in array of compound |
453 | * pointing at the head page. | 454 | * page destructors. See compound_page_dtors. |
454 | * | 455 | * |
455 | * The first tail page's ->lru.next holds the address of the compound page's | 456 | * The first tail page's ->compound_order holds the order of allocation. |
456 | * put_page() function. Its ->lru.prev holds the order of allocation. | ||
457 | * This usage means that zero-order pages may not be compound. | 457 | * This usage means that zero-order pages may not be compound. |
458 | */ | 458 | */ |
459 | 459 | ||
@@ -473,10 +473,7 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
473 | for (i = 1; i < nr_pages; i++) { | 473 | for (i = 1; i < nr_pages; i++) { |
474 | struct page *p = page + i; | 474 | struct page *p = page + i; |
475 | set_page_count(p, 0); | 475 | set_page_count(p, 0); |
476 | p->first_page = page; | 476 | set_compound_head(p, page); |
477 | /* Make sure p->first_page is always valid for PageTail() */ | ||
478 | smp_wmb(); | ||
479 | __SetPageTail(p); | ||
480 | } | 477 | } |
481 | } | 478 | } |
482 | 479 | ||
@@ -854,17 +851,30 @@ static void free_one_page(struct zone *zone, | |||
854 | 851 | ||
855 | static int free_tail_pages_check(struct page *head_page, struct page *page) | 852 | static int free_tail_pages_check(struct page *head_page, struct page *page) |
856 | { | 853 | { |
857 | if (!IS_ENABLED(CONFIG_DEBUG_VM)) | 854 | int ret = 1; |
858 | return 0; | 855 | |
856 | /* | ||
857 | * We rely page->lru.next never has bit 0 set, unless the page | ||
858 | * is PageTail(). Let's make sure that's true even for poisoned ->lru. | ||
859 | */ | ||
860 | BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); | ||
861 | |||
862 | if (!IS_ENABLED(CONFIG_DEBUG_VM)) { | ||
863 | ret = 0; | ||
864 | goto out; | ||
865 | } | ||
859 | if (unlikely(!PageTail(page))) { | 866 | if (unlikely(!PageTail(page))) { |
860 | bad_page(page, "PageTail not set", 0); | 867 | bad_page(page, "PageTail not set", 0); |
861 | return 1; | 868 | goto out; |
862 | } | 869 | } |
863 | if (unlikely(page->first_page != head_page)) { | 870 | if (unlikely(compound_head(page) != head_page)) { |
864 | bad_page(page, "first_page not consistent", 0); | 871 | bad_page(page, "compound_head not consistent", 0); |
865 | return 1; | 872 | goto out; |
866 | } | 873 | } |
867 | return 0; | 874 | ret = 0; |
875 | out: | ||
876 | clear_compound_head(page); | ||
877 | return ret; | ||
868 | } | 878 | } |
869 | 879 | ||
870 | static void __meminit __init_single_page(struct page *page, unsigned long pfn, | 880 | static void __meminit __init_single_page(struct page *page, unsigned long pfn, |
@@ -931,6 +941,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) | |||
931 | struct page *page = pfn_to_page(start_pfn); | 941 | struct page *page = pfn_to_page(start_pfn); |
932 | 942 | ||
933 | init_reserved_page(start_pfn); | 943 | init_reserved_page(start_pfn); |
944 | |||
945 | /* Avoid false-positive PageTail() */ | ||
946 | INIT_LIST_HEAD(&page->lru); | ||
947 | |||
934 | SetPageReserved(page); | 948 | SetPageReserved(page); |
935 | } | 949 | } |
936 | } | 950 | } |