author     Andrea Arcangeli <aarcange@redhat.com>       2011-11-02 16:36:59 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-11-02 19:06:57 -0400
commit     70b50f94f1644e2aa7cb374819cfd93f3c28d725
tree       79198cd9a92600140827a670d1ed5eefdcd23d79
parent     994c0e992522c123298b4a91b72f5e67ba2d1123
mm: thp: tail page refcounting fix
Michel, while working on the working set estimation code, noticed that calling
get_page_unless_zero() on a random pfn_to_page(random_pfn) wasn't safe if the
pfn ended up being a tail page of a transparent hugepage under splitting by
__split_huge_page_refcount().  He then found that the problem could also
theoretically materialize with page_cache_get_speculative() during the
speculative radix tree lookups that use get_page_unless_zero() in SMP, if the
radix tree page is freed and reallocated and get_user_pages is called on it
before page_cache_get_speculative has a chance to call get_page_unless_zero().

So the best way to fix the problem is to keep page_tail->_count zero at all
times.  This guarantees that get_page_unless_zero() can never succeed on any
tail page.  page_tail->_mapcount is guaranteed zero and is unused for all tail
pages of a compound page, so we can simply account the tail page references
there and transfer them to tail_page->_count in __split_huge_page_refcount()
(in addition to the head_page->_mapcount).

While debugging this s/_count/_mapcount/ change I also noticed that get_page
is called by direct-io.c on pages returned by get_user_pages.  That wasn't
entirely safe because the two atomic_inc in get_page weren't atomic.  Other
get_user_pages users, like a secondary-MMU page fault establishing the shadow
pagetables, would never call a superfluous get_page after get_user_pages
returns.  It's safer to make get_page universally safe for tail pages and to
use get_page_foll() within follow_page (inside get_user_pages()).
get_page_foll() is safe to do the refcounting for tail pages without taking
any locks because it is run within PT lock protected critical sections (PT
lock for pte and page_table_lock for pmd_trans_huge).

The standard get_page() as invoked by direct-io instead will now take the
compound_lock, but still only for tail pages.  The direct-io paths are usually
I/O bound and the compound_lock is per THP, so it is very fine-grained and
there's no risk of scalability issues with it.  A simple direct-io benchmark
with all lockdep prove-locking and spinlock debugging infrastructure enabled
shows identical performance and no overhead, so it's worth it.  Ideally
direct-io should stop calling get_page() on pages returned by
get_user_pages().  The spinlock in get_page() is already optimized away for
no-THP builds, but doing get_page() on tail pages returned by GUP is generally
a rare operation and usually only run in I/O paths.

This new refcounting on page_tail->_mapcount, in addition to avoiding new RCU
critical sections, will also allow the working set estimation code to work
without any further complexity associated with the tail page refcounting of
THP.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: <stable@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
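To illustrate the invariant the patch enforces, here is a minimal standalone
userspace model written with C11 atomics.  It is not kernel code: struct page,
get_page_unless_zero() and the GUP-style pin below are simplified stand-ins
named after their kernel counterparts.  It shows that once tail-page pins are
accounted in _mapcount (and in the head's _count) while the tail's _count
stays zero, a speculative get_page_unless_zero() on a tail page can never
succeed.

/* Userspace sketch of the tail-page refcounting invariant; the types
 * and helpers are simplified stand-ins, not the kernel implementation. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
	atomic_int _count;		/* kept at zero for tail pages */
	atomic_int _mapcount;		/* starts at -1; tail pins land here */
	struct page *first_page;	/* head page pointer for tail pages */
};

/* Speculative reference: succeeds only if _count was already > 0. */
static bool get_page_unless_zero(struct page *page)
{
	int c = atomic_load(&page->_count);

	while (c > 0)
		if (atomic_compare_exchange_weak(&page->_count, &c, c + 1))
			return true;
	return false;
}

/* GUP-style pin of a tail page: elevate the head's _count and the tail's
 * _mapcount, never the tail's _count (models __get_page_tail_foll()). */
static void get_tail_page_foll(struct page *tail)
{
	atomic_fetch_add(&tail->first_page->_count, 1);
	atomic_fetch_add(&tail->_mapcount, 1);
}

int main(void)
{
	struct page head = { ._count = 1, ._mapcount = -1 };
	struct page tail = { ._count = 0, ._mapcount = -1, .first_page = &head };

	head.first_page = &head;

	get_tail_page_foll(&tail);	/* one pin taken on the tail page */

	/* A racing speculative lookup must always fail on the tail page. */
	printf("speculative get on tail: %s\n",
	       get_page_unless_zero(&tail) ? "succeeded (broken)" : "failed (correct)");
	printf("head _count=%d tail pins=%d tail _count=%d\n",
	       atomic_load(&head._count),
	       atomic_load(&tail._mapcount) + 1,
	       atomic_load(&tail._count));
	return 0;
}

Built with e.g. cc -std=c11, the sketch ends with head _count == 2 (its own
reference plus the tail pin), one tail pin recorded in _mapcount, and the
tail's _count still at zero, which is exactly why the speculative get fails.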
-rw-r--r--  arch/powerpc/mm/gup.c     |  5
-rw-r--r--  arch/x86/mm/gup.c         |  5
-rw-r--r--  include/linux/mm.h        | 56
-rw-r--r--  include/linux/mm_types.h  | 21
-rw-r--r--  mm/huge_memory.c          | 37
-rw-r--r--  mm/internal.h             | 46
-rw-r--r--  mm/memory.c               |  2
-rw-r--r--  mm/swap.c                 | 83
8 files changed, 171 insertions(+), 84 deletions(-)
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index fec13200868f..b9e1c7ff5f6d 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -22,8 +22,9 @@ static inline void get_huge_page_tail(struct page *page)
 	 * __split_huge_page_refcount() cannot run
 	 * from under us.
 	 */
-	VM_BUG_ON(atomic_read(&page->_count) < 0);
-	atomic_inc(&page->_count);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	atomic_inc(&page->_mapcount);
 }
 
 /*
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dbe34b931374..3b5032a62b0f 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -114,8 +114,9 @@ static inline void get_huge_page_tail(struct page *page)
 	 * __split_huge_page_refcount() cannot run
 	 * from under us.
 	 */
-	VM_BUG_ON(atomic_read(&page->_count) < 0);
-	atomic_inc(&page->_count);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	atomic_inc(&page->_mapcount);
 }
 
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3b3e3b8bb706..f81b7b41930c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -356,36 +356,39 @@ static inline struct page *compound_head(struct page *page)
 	return page;
 }
 
+/*
+ * The atomic page->_mapcount, starts from -1: so that transitions
+ * both from it and to it can be tracked, using atomic_inc_and_test
+ * and atomic_add_negative(-1).
+ */
+static inline void reset_page_mapcount(struct page *page)
+{
+	atomic_set(&(page)->_mapcount, -1);
+}
+
+static inline int page_mapcount(struct page *page)
+{
+	return atomic_read(&(page)->_mapcount) + 1;
+}
+
 static inline int page_count(struct page *page)
 {
 	return atomic_read(&compound_head(page)->_count);
 }
 
+extern bool __get_page_tail(struct page *page);
+
 static inline void get_page(struct page *page)
 {
+	if (unlikely(PageTail(page)))
+		if (likely(__get_page_tail(page)))
+			return;
 	/*
 	 * Getting a normal page or the head of a compound page
-	 * requires to already have an elevated page->_count. Only if
-	 * we're getting a tail page, the elevated page->_count is
-	 * required only in the head page, so for tail pages the
-	 * bugcheck only verifies that the page->_count isn't
-	 * negative.
+	 * requires to already have an elevated page->_count.
 	 */
-	VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
+	VM_BUG_ON(atomic_read(&page->_count) <= 0);
 	atomic_inc(&page->_count);
-	/*
-	 * Getting a tail page will elevate both the head and tail
-	 * page->_count(s).
-	 */
-	if (unlikely(PageTail(page))) {
-		/*
-		 * This is safe only because
-		 * __split_huge_page_refcount can't run under
-		 * get_page().
-		 */
-		VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
-		atomic_inc(&page->first_page->_count);
-	}
 }
 
 static inline struct page *virt_to_head_page(const void *x)
@@ -804,21 +807,6 @@ static inline pgoff_t page_index(struct page *page)
 }
 
 /*
- * The atomic page->_mapcount, like _count, starts from -1:
- * so that transitions both from it and to it can be tracked,
- * using atomic_inc_and_test and atomic_add_negative(-1).
- */
-static inline void reset_page_mapcount(struct page *page)
-{
-	atomic_set(&(page)->_mapcount, -1);
-}
-
-static inline int page_mapcount(struct page *page)
-{
-	return atomic_read(&(page)->_mapcount) + 1;
-}
-
-/*
  * Return true if this page is mapped into pagetables.
  */
 static inline int page_mapped(struct page *page)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3e01a19a91e8..5b42f1b34eb7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -62,10 +62,23 @@ struct page {
 	struct {
 
 		union {
-			atomic_t _mapcount;	/* Count of ptes mapped in mms,
-						 * to show when page is mapped
-						 * & limit reverse map searches.
-						 */
+			/*
+			 * Count of ptes mapped in
+			 * mms, to show when page is
+			 * mapped & limit reverse map
+			 * searches.
+			 *
+			 * Used also for tail pages
+			 * refcounting instead of
+			 * _count. Tail pages cannot
+			 * be mapped and keeping the
+			 * tail page _count zero at
+			 * all times guarantees
+			 * get_page_unless_zero() will
+			 * never succeed on tail
+			 * pages.
+			 */
+			atomic_t _mapcount;
 
 			struct {
 				unsigned inuse:16;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 860ec211ddd6..4298abaae153 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -990,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON(!PageCompound(page));
 	if (flags & FOLL_GET)
-		get_page(page);
+		get_page_foll(page);
 
 out:
 	return page;
@@ -1202,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page)
 	unsigned long head_index = page->index;
 	struct zone *zone = page_zone(page);
 	int zonestat;
+	int tail_count = 0;
 
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
 	spin_lock_irq(&zone->lru_lock);
@@ -1210,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page)
 	for (i = 1; i < HPAGE_PMD_NR; i++) {
 		struct page *page_tail = page + i;
 
-		/* tail_page->_count cannot change */
-		atomic_sub(atomic_read(&page_tail->_count), &page->_count);
-		BUG_ON(page_count(page) <= 0);
-		atomic_add(page_mapcount(page) + 1, &page_tail->_count);
-		BUG_ON(atomic_read(&page_tail->_count) <= 0);
+		/* tail_page->_mapcount cannot change */
+		BUG_ON(page_mapcount(page_tail) < 0);
+		tail_count += page_mapcount(page_tail);
+		/* check for overflow */
+		BUG_ON(tail_count < 0);
+		BUG_ON(atomic_read(&page_tail->_count) != 0);
+		/*
+		 * tail_page->_count is zero and not changing from
+		 * under us. But get_page_unless_zero() may be running
+		 * from under us on the tail_page. If we used
+		 * atomic_set() below instead of atomic_add(), we
+		 * would then run atomic_set() concurrently with
+		 * get_page_unless_zero(), and atomic_set() is
+		 * implemented in C not using locked ops. spin_unlock
+		 * on x86 sometime uses locked ops because of PPro
+		 * errata 66, 92, so unless somebody can guarantee
+		 * atomic_set() here would be safe on all archs (and
+		 * not only on x86), it's safer to use atomic_add().
+		 */
+		atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+			   &page_tail->_count);
 
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb();
@@ -1232,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page)
 				      (1L << PG_uptodate)));
 		page_tail->flags |= (1L << PG_dirty);
 
-		/*
-		 * 1) clear PageTail before overwriting first_page
-		 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
-		 */
+		/* clear PageTail before overwriting first_page */
 		smp_wmb();
 
 		/*
@@ -1252,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page)
 		 * status is achieved setting a reserved bit in the
 		 * pmd, not by clearing the present bit.
 		 */
-		BUG_ON(page_mapcount(page_tail));
 		page_tail->_mapcount = page->_mapcount;
 
 		BUG_ON(page_tail->mapping);
@@ -1269,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page)
 
 		lru_add_page_tail(zone, page, page_tail);
 	}
+	atomic_sub(tail_count, &page->_count);
+	BUG_ON(atomic_read(&page->_count) <= 0);
 
 	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
 	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb49..2189af491783 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+static inline void __get_page_tail_foll(struct page *page,
+					bool get_page_head)
+{
+	/*
+	 * If we're getting a tail page, the elevated page->_count is
+	 * required only in the head page and we will elevate the head
+	 * page->_count and tail page->_mapcount.
+	 *
+	 * We elevate page_tail->_mapcount for tail pages to force
+	 * page_tail->_count to be zero at all times to avoid getting
+	 * false positives from get_page_unless_zero() with
+	 * speculative page access (like in
+	 * page_cache_get_speculative()) on tail pages.
+	 */
+	VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	if (get_page_head)
+		atomic_inc(&page->first_page->_count);
+	atomic_inc(&page->_mapcount);
+}
+
+/*
+ * This is meant to be called as the FOLL_GET operation of
+ * follow_page() and it must be called while holding the proper PT
+ * lock while the pte (or pmd_trans_huge) is still mapping the page.
+ */
+static inline void get_page_foll(struct page *page)
+{
+	if (unlikely(PageTail(page)))
+		/*
+		 * This is safe only because
+		 * __split_huge_page_refcount() can't run under
+		 * get_page_foll() because we hold the proper PT lock.
+		 */
+		__get_page_tail_foll(page, true);
+	else {
+		/*
+		 * Getting a normal page or the head of a compound page
+		 * requires to already have an elevated page->_count.
+		 */
+		VM_BUG_ON(atomic_read(&page->_count) <= 0);
+		atomic_inc(&page->_count);
+	}
+}
+
 extern unsigned long highest_memmap_pfn;
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index a56e3ba816b2..b2b87315cdc6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1503,7 +1503,7 @@ split_fallthrough:
 	}
 
 	if (flags & FOLL_GET)
-		get_page(page);
+		get_page_foll(page);
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b3..87627f181c3f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
 {
 	if (unlikely(PageTail(page))) {
 		/* __split_huge_page_refcount can run under us */
-		struct page *page_head = page->first_page;
-		smp_rmb();
-		/*
-		 * If PageTail is still set after smp_rmb() we can be sure
-		 * that the page->first_page we read wasn't a dangling pointer.
-		 * See __split_huge_page_refcount() smp_wmb().
-		 */
-		if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+		struct page *page_head = compound_trans_head(page);
+
+		if (likely(page != page_head &&
+			   get_page_unless_zero(page_head))) {
 			unsigned long flags;
 			/*
-			 * Verify that our page_head wasn't converted
-			 * to a a regular page before we got a
-			 * reference on it.
+			 * page_head wasn't a dangling pointer but it
+			 * may not be a head page anymore by the time
+			 * we obtain the lock. That is ok as long as it
+			 * can't be freed from under us.
 			 */
-			if (unlikely(!PageHead(page_head))) {
-				/* PageHead is cleared after PageTail */
-				smp_rmb();
-				VM_BUG_ON(PageTail(page));
-				goto out_put_head;
-			}
-			/*
-			 * Only run compound_lock on a valid PageHead,
-			 * after having it pinned with
-			 * get_page_unless_zero() above.
-			 */
-			smp_mb();
-			/* page_head wasn't a dangling pointer */
 			flags = compound_lock_irqsave(page_head);
 			if (unlikely(!PageTail(page))) {
 				/* __split_huge_page_refcount run before us */
 				compound_unlock_irqrestore(page_head, flags);
 				VM_BUG_ON(PageHead(page_head));
-			out_put_head:
 				if (put_page_testzero(page_head))
 					__put_single_page(page_head);
 			out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
 			VM_BUG_ON(page_head != page->first_page);
 			/*
 			 * We can release the refcount taken by
-			 * get_page_unless_zero now that
-			 * split_huge_page_refcount is blocked on the
-			 * compound_lock.
+			 * get_page_unless_zero() now that
+			 * __split_huge_page_refcount() is blocked on
+			 * the compound_lock.
 			 */
 			if (put_page_testzero(page_head))
 				VM_BUG_ON(1);
 			/* __split_huge_page_refcount will wait now */
-			VM_BUG_ON(atomic_read(&page->_count) <= 0);
-			atomic_dec(&page->_count);
+			VM_BUG_ON(page_mapcount(page) <= 0);
+			atomic_dec(&page->_mapcount);
 			VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+			VM_BUG_ON(atomic_read(&page->_count) != 0);
 			compound_unlock_irqrestore(page_head, flags);
 			if (put_page_testzero(page_head)) {
 				if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
 }
 EXPORT_SYMBOL(put_page);
 
+/*
+ * This function is exported but must not be called by anything other
+ * than get_page(). It implements the slow path of get_page().
+ */
+bool __get_page_tail(struct page *page)
+{
+	/*
+	 * This takes care of get_page() if run on a tail page
+	 * returned by one of the get_user_pages/follow_page variants.
+	 * get_user_pages/follow_page itself doesn't need the compound
+	 * lock because it runs __get_page_tail_foll() under the
+	 * proper PT lock that already serializes against
+	 * split_huge_page().
+	 */
+	unsigned long flags;
+	bool got = false;
+	struct page *page_head = compound_trans_head(page);
+
+	if (likely(page != page_head && get_page_unless_zero(page_head))) {
+		/*
+		 * page_head wasn't a dangling pointer but it
+		 * may not be a head page anymore by the time
+		 * we obtain the lock. That is ok as long as it
+		 * can't be freed from under us.
+		 */
+		flags = compound_lock_irqsave(page_head);
+		/* here __split_huge_page_refcount won't run anymore */
+		if (likely(PageTail(page))) {
+			__get_page_tail_foll(page, false);
+			got = true;
+		}
+		compound_unlock_irqrestore(page_head, flags);
+		if (unlikely(!got))
+			put_page(page_head);
+	}
+	return got;
+}
+EXPORT_SYMBOL(__get_page_tail);
+
 /**
  * put_pages_list() - release a list of pages
  * @pages: list of pages threaded on page->lru