diff options
author | Andrea Arcangeli <aarcange@redhat.com> | 2011-01-13 18:46:34 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-01-13 20:32:39 -0500 |
commit | a5b338f2b0b1ff73ae20c66ab831201549eaec01 (patch) | |
tree | a89787ea6d932d66651c4621ed70f855e596d1ff /kernel | |
parent | a95a82e96c48270980dd248ccd5546f1b49e6f8a (diff) |
thp: update futex compound knowledge
Futex code is smarter than most other gup_fast O_DIRECT code and knows
about the compound internals. However now doing a put_page(head_page)
will not release the pin on the tail page taken by gup-fast, leading to
all sorts of refcounting bugchecks. Getting a stable head_page is a little
tricky.
page_head = page is there because if this is not a tail page it's also the
page_head. Only in case this is a tail page, compound_head is called,
otherwise it's guaranteed unnecessary. And if it's a tail page
compound_head has to run atomically inside the same irq-disabled section as
__get_user_pages_fast, before irqs are re-enabled. Otherwise ->first_page won't be a
stable pointer.
Disabling irqs before __get_user_pages_fast and re-enabling irqs after running
compound_head is needed because if __get_user_pages_fast returns 1, it
means the huge pmd is established and cannot go away from under us.
pmdp_splitting_flush_notify in __split_huge_page_splitting will have to
wait for local_irq_enable before the IPI delivery can return. This means
__split_huge_page_refcount can't be running from under us, and in turn
when we run compound_head(page) we're not reading a dangling pointer from
tailpage->first_page. Then after we get a stable head page, we are
always safe to call compound_lock and after taking the compound lock on
head page we can finally re-check if the page returned by gup-fast is
still a tail page, in which case we're set and we didn't need to split
the hugepage in order to take a futex on it.
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/futex.c | 55 |
1 files changed, 45 insertions, 10 deletions
diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92e6917..52075633373f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) | |||
233 | { | 233 | { |
234 | unsigned long address = (unsigned long)uaddr; | 234 | unsigned long address = (unsigned long)uaddr; |
235 | struct mm_struct *mm = current->mm; | 235 | struct mm_struct *mm = current->mm; |
236 | struct page *page; | 236 | struct page *page, *page_head; |
237 | int err; | 237 | int err; |
238 | 238 | ||
239 | /* | 239 | /* |
@@ -265,11 +265,46 @@ again: | |||
265 | if (err < 0) | 265 | if (err < 0) |
266 | return err; | 266 | return err; |
267 | 267 | ||
268 | page = compound_head(page); | 268 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
269 | lock_page(page); | 269 | page_head = page; |
270 | if (!page->mapping) { | 270 | if (unlikely(PageTail(page))) { |
271 | unlock_page(page); | ||
272 | put_page(page); | 271 | put_page(page); |
272 | /* serialize against __split_huge_page_splitting() */ | ||
273 | local_irq_disable(); | ||
274 | if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { | ||
275 | page_head = compound_head(page); | ||
276 | /* | ||
277 | * page_head is valid pointer but we must pin | ||
278 | * it before taking the PG_lock and/or | ||
279 | * PG_compound_lock. The moment we re-enable | ||
280 | * irqs __split_huge_page_splitting() can | ||
281 | * return and the head page can be freed from | ||
282 | * under us. We can't take the PG_lock and/or | ||
283 | * PG_compound_lock on a page that could be | ||
284 | * freed from under us. | ||
285 | */ | ||
286 | if (page != page_head) { | ||
287 | get_page(page_head); | ||
288 | put_page(page); | ||
289 | } | ||
290 | local_irq_enable(); | ||
291 | } else { | ||
292 | local_irq_enable(); | ||
293 | goto again; | ||
294 | } | ||
295 | } | ||
296 | #else | ||
297 | page_head = compound_head(page); | ||
298 | if (page != page_head) { | ||
299 | get_page(page_head); | ||
300 | put_page(page); | ||
301 | } | ||
302 | #endif | ||
303 | |||
304 | lock_page(page_head); | ||
305 | if (!page_head->mapping) { | ||
306 | unlock_page(page_head); | ||
307 | put_page(page_head); | ||
273 | goto again; | 308 | goto again; |
274 | } | 309 | } |
275 | 310 | ||
@@ -280,20 +315,20 @@ again: | |||
280 | * it's a read-only handle, it's expected that futexes attach to | 315 | * it's a read-only handle, it's expected that futexes attach to |
281 | * the object not the particular process. | 316 | * the object not the particular process. |
282 | */ | 317 | */ |
283 | if (PageAnon(page)) { | 318 | if (PageAnon(page_head)) { |
284 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ | 319 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ |
285 | key->private.mm = mm; | 320 | key->private.mm = mm; |
286 | key->private.address = address; | 321 | key->private.address = address; |
287 | } else { | 322 | } else { |
288 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ | 323 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
289 | key->shared.inode = page->mapping->host; | 324 | key->shared.inode = page_head->mapping->host; |
290 | key->shared.pgoff = page->index; | 325 | key->shared.pgoff = page_head->index; |
291 | } | 326 | } |
292 | 327 | ||
293 | get_futex_key_refs(key); | 328 | get_futex_key_refs(key); |
294 | 329 | ||
295 | unlock_page(page); | 330 | unlock_page(page_head); |
296 | put_page(page); | 331 | put_page(page_head); |
297 | return 0; | 332 | return 0; |
298 | } | 333 | } |
299 | 334 | ||