aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorAndrea Arcangeli <aarcange@redhat.com>2011-11-02 16:36:59 -0400
committerGreg Kroah-Hartman <gregkh@suse.de>2011-11-11 12:36:29 -0500
commit68fe9d9c796303de600dbc622086768ca4d8408b (patch)
tree4d7c0a4d52960df61e3c61e4a894a182de5ae839 /mm
parenta00fb1451d630898c1380349ac3776688e58010f (diff)
mm: thp: tail page refcounting fix
commit 70b50f94f1644e2aa7cb374819cfd93f3c28d725 upstream. Michel while working on the working set estimation code, noticed that calling get_page_unless_zero() on a random pfn_to_page(random_pfn) wasn't safe, if the pfn ended up being a tail page of a transparent hugepage under splitting by __split_huge_page_refcount(). He then found the problem could also theoretically materialize with page_cache_get_speculative() during the speculative radix tree lookups that uses get_page_unless_zero() in SMP if the radix tree page is freed and reallocated and get_user_pages is called on it before page_cache_get_speculative has a chance to call get_page_unless_zero(). So the best way to fix the problem is to keep page_tail->_count zero at all times. This will guarantee that get_page_unless_zero() can never succeed on any tail page. page_tail->_mapcount is guaranteed zero and is unused for all tail pages of a compound page, so we can simply account the tail page references there and transfer them to tail_page->_count in __split_huge_page_refcount() (in addition to the head_page->_mapcount). While debugging this s/_count/_mapcount/ change I also noticed get_page is called by direct-io.c on pages returned by get_user_pages. That wasn't entirely safe because the two atomic_inc in get_page weren't atomic. As opposed to other get_user_page users like secondary-MMU page fault to establish the shadow pagetables would never call any superflous get_page after get_user_page returns. It's safer to make get_page universally safe for tail pages and to use get_page_foll() within follow_page (inside get_user_pages()). get_page_foll() is safe to do the refcounting for tail pages without taking any locks because it is run within PT lock protected critical sections (PT lock for pte and page_table_lock for pmd_trans_huge). The standard get_page() as invoked by direct-io instead will now take the compound_lock but still only for tail pages. The direct-io paths are usually I/O bound and the compound_lock is per THP so very finegrined, so there's no risk of scalability issues with it. A simple direct-io benchmarks with all lockdep prove locking and spinlock debugging infrastructure enabled shows identical performance and no overhead. So it's worth it. Ideally direct-io should stop calling get_page() on pages returned by get_user_pages(). The spinlock in get_page() is already optimized away for no-THP builds but doing get_page() on tail pages returned by GUP is generally a rare operation and usually only run in I/O paths. This new refcounting on page_tail->_mapcount in addition to avoiding new RCU critical sections will also allow the working set estimation code to work without any further complexity associated to the tail page refcounting with THP. Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Michel Lespinasse <walken@google.com> Reviewed-by: Michel Lespinasse <walken@google.com> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Diffstat (limited to 'mm')
-rw-r--r--mm/huge_memory.c37
-rw-r--r--mm/internal.h46
-rw-r--r--mm/memory.c2
-rw-r--r--mm/swap.c83
4 files changed, 126 insertions, 42 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd..cc5acf9998b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -989,7 +989,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
990 VM_BUG_ON(!PageCompound(page)); 990 VM_BUG_ON(!PageCompound(page));
991 if (flags & FOLL_GET) 991 if (flags & FOLL_GET)
992 get_page(page); 992 get_page_foll(page);
993 993
994out: 994out:
995 return page; 995 return page;
@@ -1156,6 +1156,7 @@ static void __split_huge_page_refcount(struct page *page)
1156 unsigned long head_index = page->index; 1156 unsigned long head_index = page->index;
1157 struct zone *zone = page_zone(page); 1157 struct zone *zone = page_zone(page);
1158 int zonestat; 1158 int zonestat;
1159 int tail_count = 0;
1159 1160
1160 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1161 /* prevent PageLRU to go away from under us, and freeze lru stats */
1161 spin_lock_irq(&zone->lru_lock); 1162 spin_lock_irq(&zone->lru_lock);
@@ -1164,11 +1165,27 @@ static void __split_huge_page_refcount(struct page *page)
1164 for (i = 1; i < HPAGE_PMD_NR; i++) { 1165 for (i = 1; i < HPAGE_PMD_NR; i++) {
1165 struct page *page_tail = page + i; 1166 struct page *page_tail = page + i;
1166 1167
1167 /* tail_page->_count cannot change */ 1168 /* tail_page->_mapcount cannot change */
1168 atomic_sub(atomic_read(&page_tail->_count), &page->_count); 1169 BUG_ON(page_mapcount(page_tail) < 0);
1169 BUG_ON(page_count(page) <= 0); 1170 tail_count += page_mapcount(page_tail);
1170 atomic_add(page_mapcount(page) + 1, &page_tail->_count); 1171 /* check for overflow */
1171 BUG_ON(atomic_read(&page_tail->_count) <= 0); 1172 BUG_ON(tail_count < 0);
1173 BUG_ON(atomic_read(&page_tail->_count) != 0);
1174 /*
1175 * tail_page->_count is zero and not changing from
1176 * under us. But get_page_unless_zero() may be running
1177 * from under us on the tail_page. If we used
1178 * atomic_set() below instead of atomic_add(), we
1179 * would then run atomic_set() concurrently with
1180 * get_page_unless_zero(), and atomic_set() is
1181 * implemented in C not using locked ops. spin_unlock
1182 * on x86 sometime uses locked ops because of PPro
1183 * errata 66, 92, so unless somebody can guarantee
1184 * atomic_set() here would be safe on all archs (and
1185 * not only on x86), it's safer to use atomic_add().
1186 */
1187 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1188 &page_tail->_count);
1172 1189
1173 /* after clearing PageTail the gup refcount can be released */ 1190 /* after clearing PageTail the gup refcount can be released */
1174 smp_mb(); 1191 smp_mb();
@@ -1186,10 +1203,7 @@ static void __split_huge_page_refcount(struct page *page)
1186 (1L << PG_uptodate))); 1203 (1L << PG_uptodate)));
1187 page_tail->flags |= (1L << PG_dirty); 1204 page_tail->flags |= (1L << PG_dirty);
1188 1205
1189 /* 1206 /* clear PageTail before overwriting first_page */
1190 * 1) clear PageTail before overwriting first_page
1191 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1192 */
1193 smp_wmb(); 1207 smp_wmb();
1194 1208
1195 /* 1209 /*
@@ -1206,7 +1220,6 @@ static void __split_huge_page_refcount(struct page *page)
1206 * status is achieved setting a reserved bit in the 1220 * status is achieved setting a reserved bit in the
1207 * pmd, not by clearing the present bit. 1221 * pmd, not by clearing the present bit.
1208 */ 1222 */
1209 BUG_ON(page_mapcount(page_tail));
1210 page_tail->_mapcount = page->_mapcount; 1223 page_tail->_mapcount = page->_mapcount;
1211 1224
1212 BUG_ON(page_tail->mapping); 1225 BUG_ON(page_tail->mapping);
@@ -1223,6 +1236,8 @@ static void __split_huge_page_refcount(struct page *page)
1223 1236
1224 lru_add_page_tail(zone, page, page_tail); 1237 lru_add_page_tail(zone, page, page_tail);
1225 } 1238 }
1239 atomic_sub(tail_count, &page->_count);
1240 BUG_ON(atomic_read(&page->_count) <= 0);
1226 1241
1227 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1242 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1228 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1243 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb4..2189af49178 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38} 38}
39 39
40static inline void __get_page_tail_foll(struct page *page,
41 bool get_page_head)
42{
43 /*
44 * If we're getting a tail page, the elevated page->_count is
45 * required only in the head page and we will elevate the head
46 * page->_count and tail page->_mapcount.
47 *
48 * We elevate page_tail->_mapcount for tail pages to force
49 * page_tail->_count to be zero at all times to avoid getting
50 * false positives from get_page_unless_zero() with
51 * speculative page access (like in
52 * page_cache_get_speculative()) on tail pages.
53 */
54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
55 VM_BUG_ON(atomic_read(&page->_count) != 0);
56 VM_BUG_ON(page_mapcount(page) < 0);
57 if (get_page_head)
58 atomic_inc(&page->first_page->_count);
59 atomic_inc(&page->_mapcount);
60}
61
62/*
63 * This is meant to be called as the FOLL_GET operation of
64 * follow_page() and it must be called while holding the proper PT
65 * lock while the pte (or pmd_trans_huge) is still mapping the page.
66 */
67static inline void get_page_foll(struct page *page)
68{
69 if (unlikely(PageTail(page)))
70 /*
71 * This is safe only because
72 * __split_huge_page_refcount() can't run under
73 * get_page_foll() because we hold the proper PT lock.
74 */
75 __get_page_tail_foll(page, true);
76 else {
77 /*
78 * Getting a normal page or the head of a compound page
79 * requires to already have an elevated page->_count.
80 */
81 VM_BUG_ON(atomic_read(&page->_count) <= 0);
82 atomic_inc(&page->_count);
83 }
84}
85
40extern unsigned long highest_memmap_pfn; 86extern unsigned long highest_memmap_pfn;
41 87
42/* 88/*
diff --git a/mm/memory.c b/mm/memory.c
index d961e1914d1..95a77998ab5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1514,7 +1514,7 @@ split_fallthrough:
1514 } 1514 }
1515 1515
1516 if (flags & FOLL_GET) 1516 if (flags & FOLL_GET)
1517 get_page(page); 1517 get_page_foll(page);
1518 if (flags & FOLL_TOUCH) { 1518 if (flags & FOLL_TOUCH) {
1519 if ((flags & FOLL_WRITE) && 1519 if ((flags & FOLL_WRITE) &&
1520 !pte_dirty(pte) && !PageDirty(page)) 1520 !pte_dirty(pte) && !PageDirty(page))
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b..87627f181c3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
78{ 78{
79 if (unlikely(PageTail(page))) { 79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */ 80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page; 81 struct page *page_head = compound_trans_head(page);
82 smp_rmb(); 82
83 /* 83 if (likely(page != page_head &&
84 * If PageTail is still set after smp_rmb() we can be sure 84 get_page_unless_zero(page_head))) {
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags; 85 unsigned long flags;
90 /* 86 /*
91 * Verify that our page_head wasn't converted 87 * page_head wasn't a dangling pointer but it
92 * to a a regular page before we got a 88 * may not be a head page anymore by the time
93 * reference on it. 89 * we obtain the lock. That is ok as long as it
90 * can't be freed from under us.
94 */ 91 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head); 92 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) { 93 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */ 94 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags); 95 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head)); 96 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head)) 97 if (put_page_testzero(page_head))
115 __put_single_page(page_head); 98 __put_single_page(page_head);
116 out_put_single: 99 out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
121 VM_BUG_ON(page_head != page->first_page); 104 VM_BUG_ON(page_head != page->first_page);
122 /* 105 /*
123 * We can release the refcount taken by 106 * We can release the refcount taken by
124 * get_page_unless_zero now that 107 * get_page_unless_zero() now that
125 * split_huge_page_refcount is blocked on the 108 * __split_huge_page_refcount() is blocked on
126 * compound_lock. 109 * the compound_lock.
127 */ 110 */
128 if (put_page_testzero(page_head)) 111 if (put_page_testzero(page_head))
129 VM_BUG_ON(1); 112 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */ 113 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0); 114 VM_BUG_ON(page_mapcount(page) <= 0);
132 atomic_dec(&page->_count); 115 atomic_dec(&page->_mapcount);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 116 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
117 VM_BUG_ON(atomic_read(&page->_count) != 0);
134 compound_unlock_irqrestore(page_head, flags); 118 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) { 119 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head)) 120 if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
160} 144}
161EXPORT_SYMBOL(put_page); 145EXPORT_SYMBOL(put_page);
162 146
147/*
148 * This function is exported but must not be called by anything other
149 * than get_page(). It implements the slow path of get_page().
150 */
151bool __get_page_tail(struct page *page)
152{
153 /*
154 * This takes care of get_page() if run on a tail page
155 * returned by one of the get_user_pages/follow_page variants.
156 * get_user_pages/follow_page itself doesn't need the compound
157 * lock because it runs __get_page_tail_foll() under the
158 * proper PT lock that already serializes against
159 * split_huge_page().
160 */
161 unsigned long flags;
162 bool got = false;
163 struct page *page_head = compound_trans_head(page);
164
165 if (likely(page != page_head && get_page_unless_zero(page_head))) {
166 /*
167 * page_head wasn't a dangling pointer but it
168 * may not be a head page anymore by the time
169 * we obtain the lock. That is ok as long as it
170 * can't be freed from under us.
171 */
172 flags = compound_lock_irqsave(page_head);
173 /* here __split_huge_page_refcount won't run anymore */
174 if (likely(PageTail(page))) {
175 __get_page_tail_foll(page, false);
176 got = true;
177 }
178 compound_unlock_irqrestore(page_head, flags);
179 if (unlikely(!got))
180 put_page(page_head);
181 }
182 return got;
183}
184EXPORT_SYMBOL(__get_page_tail);
185
163/** 186/**
164 * put_pages_list() - release a list of pages 187 * put_pages_list() - release a list of pages
165 * @pages: list of pages threaded on page->lru 188 * @pages: list of pages threaded on page->lru