aboutsummaryrefslogtreecommitdiffstats
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
authorAndrea Arcangeli <aarcange@redhat.com>2011-11-02 16:36:59 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-11-02 19:06:57 -0400
commit70b50f94f1644e2aa7cb374819cfd93f3c28d725 (patch)
tree79198cd9a92600140827a670d1ed5eefdcd23d79 /mm/huge_memory.c
parent994c0e992522c123298b4a91b72f5e67ba2d1123 (diff)
mm: thp: tail page refcounting fix
Michel while working on the working set estimation code, noticed that calling get_page_unless_zero() on a random pfn_to_page(random_pfn) wasn't safe, if the pfn ended up being a tail page of a transparent hugepage under splitting by __split_huge_page_refcount(). He then found the problem could also theoretically materialize with page_cache_get_speculative() during the speculative radix tree lookups that uses get_page_unless_zero() in SMP if the radix tree page is freed and reallocated and get_user_pages is called on it before page_cache_get_speculative has a chance to call get_page_unless_zero(). So the best way to fix the problem is to keep page_tail->_count zero at all times. This will guarantee that get_page_unless_zero() can never succeed on any tail page. page_tail->_mapcount is guaranteed zero and is unused for all tail pages of a compound page, so we can simply account the tail page references there and transfer them to tail_page->_count in __split_huge_page_refcount() (in addition to the head_page->_mapcount). While debugging this s/_count/_mapcount/ change I also noticed get_page is called by direct-io.c on pages returned by get_user_pages. That wasn't entirely safe because the two atomic_inc in get_page weren't atomic. As opposed to other get_user_page users like secondary-MMU page fault to establish the shadow pagetables would never call any superflous get_page after get_user_page returns. It's safer to make get_page universally safe for tail pages and to use get_page_foll() within follow_page (inside get_user_pages()). get_page_foll() is safe to do the refcounting for tail pages without taking any locks because it is run within PT lock protected critical sections (PT lock for pte and page_table_lock for pmd_trans_huge). The standard get_page() as invoked by direct-io instead will now take the compound_lock but still only for tail pages. The direct-io paths are usually I/O bound and the compound_lock is per THP so very finegrined, so there's no risk of scalability issues with it. A simple direct-io benchmarks with all lockdep prove locking and spinlock debugging infrastructure enabled shows identical performance and no overhead. So it's worth it. Ideally direct-io should stop calling get_page() on pages returned by get_user_pages(). The spinlock in get_page() is already optimized away for no-THP builds but doing get_page() on tail pages returned by GUP is generally a rare operation and usually only run in I/O paths. This new refcounting on page_tail->_mapcount in addition to avoiding new RCU critical sections will also allow the working set estimation code to work without any further complexity associated to the tail page refcounting with THP. Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Michel Lespinasse <walken@google.com> Reviewed-by: Michel Lespinasse <walken@google.com> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Gibson <david@gibson.dropbear.id.au> Cc: <stable@kernel.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c37
1 files changed, 26 insertions, 11 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 860ec211ddd6..4298abaae153 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -990,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
990 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 990 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
991 VM_BUG_ON(!PageCompound(page)); 991 VM_BUG_ON(!PageCompound(page));
992 if (flags & FOLL_GET) 992 if (flags & FOLL_GET)
993 get_page(page); 993 get_page_foll(page);
994 994
995out: 995out:
996 return page; 996 return page;
@@ -1202,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page)
1202 unsigned long head_index = page->index; 1202 unsigned long head_index = page->index;
1203 struct zone *zone = page_zone(page); 1203 struct zone *zone = page_zone(page);
1204 int zonestat; 1204 int zonestat;
1205 int tail_count = 0;
1205 1206
1206 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1207 /* prevent PageLRU to go away from under us, and freeze lru stats */
1207 spin_lock_irq(&zone->lru_lock); 1208 spin_lock_irq(&zone->lru_lock);
@@ -1210,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page)
1210 for (i = 1; i < HPAGE_PMD_NR; i++) { 1211 for (i = 1; i < HPAGE_PMD_NR; i++) {
1211 struct page *page_tail = page + i; 1212 struct page *page_tail = page + i;
1212 1213
1213 /* tail_page->_count cannot change */ 1214 /* tail_page->_mapcount cannot change */
1214 atomic_sub(atomic_read(&page_tail->_count), &page->_count); 1215 BUG_ON(page_mapcount(page_tail) < 0);
1215 BUG_ON(page_count(page) <= 0); 1216 tail_count += page_mapcount(page_tail);
1216 atomic_add(page_mapcount(page) + 1, &page_tail->_count); 1217 /* check for overflow */
1217 BUG_ON(atomic_read(&page_tail->_count) <= 0); 1218 BUG_ON(tail_count < 0);
1219 BUG_ON(atomic_read(&page_tail->_count) != 0);
1220 /*
1221 * tail_page->_count is zero and not changing from
1222 * under us. But get_page_unless_zero() may be running
1223 * from under us on the tail_page. If we used
1224 * atomic_set() below instead of atomic_add(), we
1225 * would then run atomic_set() concurrently with
1226 * get_page_unless_zero(), and atomic_set() is
1227 * implemented in C not using locked ops. spin_unlock
1228 * on x86 sometime uses locked ops because of PPro
1229 * errata 66, 92, so unless somebody can guarantee
1230 * atomic_set() here would be safe on all archs (and
1231 * not only on x86), it's safer to use atomic_add().
1232 */
1233 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1234 &page_tail->_count);
1218 1235
1219 /* after clearing PageTail the gup refcount can be released */ 1236 /* after clearing PageTail the gup refcount can be released */
1220 smp_mb(); 1237 smp_mb();
@@ -1232,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page)
1232 (1L << PG_uptodate))); 1249 (1L << PG_uptodate)));
1233 page_tail->flags |= (1L << PG_dirty); 1250 page_tail->flags |= (1L << PG_dirty);
1234 1251
1235 /* 1252 /* clear PageTail before overwriting first_page */
1236 * 1) clear PageTail before overwriting first_page
1237 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1238 */
1239 smp_wmb(); 1253 smp_wmb();
1240 1254
1241 /* 1255 /*
@@ -1252,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page)
1252 * status is achieved setting a reserved bit in the 1266 * status is achieved setting a reserved bit in the
1253 * pmd, not by clearing the present bit. 1267 * pmd, not by clearing the present bit.
1254 */ 1268 */
1255 BUG_ON(page_mapcount(page_tail));
1256 page_tail->_mapcount = page->_mapcount; 1269 page_tail->_mapcount = page->_mapcount;
1257 1270
1258 BUG_ON(page_tail->mapping); 1271 BUG_ON(page_tail->mapping);
@@ -1269,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page)
1269 1282
1270 lru_add_page_tail(zone, page, page_tail); 1283 lru_add_page_tail(zone, page, page_tail);
1271 } 1284 }
1285 atomic_sub(tail_count, &page->_count);
1286 BUG_ON(atomic_read(&page->_count) <= 0);
1272 1287
1273 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1288 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1274 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1289 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);