author     Mel Gorman <mgorman@suse.de>      2016-02-09 14:15:14 -0500
committer  Ingo Molnar <mingo@kernel.org>    2016-02-17 04:42:17 -0500
commit     65d8fc777f6dcfee12785c057a6b57f679641c90 (patch)
tree       bfc75439ad649f9d7215a8296c1e9918ae553331 /kernel/futex.c
parent     8ad7b378d0d016309014cae0f640434bca7b5e11 (diff)
futex: Remove requirement for lock_page() in get_futex_key()
When dealing with key handling for shared futexes, we can drastically reduce
the usage/need of the page lock:

 1) For anonymous pages, the associated futex object is the mm_struct, which
    does not require the page lock.

 2) For inode-based keys, we can check under the RCU read lock whether the
    page mapping is still valid and take a reference to the inode.

This leaves just one rare race that requires the page lock in the slow path,
when examining the swapcache.

Additionally, realtime users currently have a problem with the page lock
being contended for unbounded periods of time during futex operations:

    Task A
        get_futex_key()
        lock_page()
       ---> preempted

Now any other task trying to lock that page has to wait until task A gets
scheduled back in, which is an unbounded amount of time.

With this patch, we pretty much have a lockless get_futex_key().

Experiments show that this patch can speed up the hashing of shared futexes
in the perf futex benchmarks (a good measure of such a change) by up to 45%
with high (> 100) thread counts on a 60-core Westmere. Lower counts are
pretty much in the noise range or below 10%, but the mid range can be seen
at over 30% overall throughput (hash ops/sec). This brings anon-mem shared
futexes much closer to their private counterpart.

Signed-off-by: Mel Gorman <mgorman@suse.de>
[ Ported on top of thp refcount rework, changelog, comments, fixes. ]
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Mason <clm@fb.com>
Cc: Darren Hart <dvhart@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/1455045314-8305-3-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
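[ Editor's note: the heart of the new inode-based path is pinning the futex
object with atomic_inc_not_zero() under the RCU read lock instead of holding
the page lock. Below is a minimal userspace sketch of that "take a reference
unless the object is already dying" pattern, written with C11 atomics. The
names struct obj, obj_tryget() and obj_put() are illustrative only, not
kernel API. ]

/*
 * Userspace sketch of the refcount pattern the patch relies on
 * (atomic_inc_not_zero() on inode->i_count under rcu_read_lock()).
 * struct obj, obj_tryget() and obj_put() are hypothetical names.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        atomic_int refcount;
};

/* Like atomic_inc_not_zero(): refuse to take the count from 0 to 1,
 * so a racing reader can never resurrect an object whose last
 * reference has already been dropped. */
static bool obj_tryget(struct obj *o)
{
        int old = atomic_load(&o->refcount);

        do {
                if (old == 0)
                        return false;   /* already dying, do not pin */
        } while (!atomic_compare_exchange_weak(&o->refcount, &old, old + 1));

        return true;
}

static void obj_put(struct obj *o)
{
        /* Last reference frees the object (done via RCU in the kernel,
         * which is what makes the lockless obj_tryget() safe). */
        if (atomic_fetch_sub(&o->refcount, 1) == 1)
                free(o);
}

int main(void)
{
        struct obj *o = malloc(sizeof(*o));

        atomic_init(&o->refcount, 1);

        if (obj_tryget(o))
                printf("pinned, refcount is now 2\n");

        obj_put(o);     /* drop our pin */
        obj_put(o);     /* drop the original reference: frees o */
        return 0;
}

The companion guarantee in the kernel is that the inode is finally freed
under RCU (as the patch's own comment notes), so a reader inside
rcu_read_lock() can safely attempt the increment even while racing against
the final iput().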
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--   kernel/futex.c   99
1 file changed, 91 insertions(+), 8 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 08ac7009488b..bae542e4b2e9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -520,7 +520,20 @@ again:
 	else
 		err = 0;
 
-	lock_page(page);
+	/*
+	 * The treatment of mapping from this point on is critical. The page
+	 * lock protects many things but in this context the page lock
+	 * stabilizes mapping, prevents inode freeing in the shared
+	 * file-backed region case and guards against movement to swap cache.
+	 *
+	 * Strictly speaking the page lock is not needed in all cases being
+	 * considered here and page lock forces unnecessarily serialization
+	 * From this point on, mapping will be re-verified if necessary and
+	 * page lock will be acquired only if it is unavoidable
+	 */
+	page = compound_head(page);
+	mapping = READ_ONCE(page->mapping);
+
 	/*
 	 * If page->mapping is NULL, then it cannot be a PageAnon
 	 * page; but it might be the ZERO_PAGE or in the gate area or
@@ -536,19 +549,31 @@ again:
 	 * shmem_writepage move it from filecache to swapcache beneath us:
 	 * an unlikely race, but we do need to retry for page->mapping.
 	 */
-	mapping = compound_head(page)->mapping;
-	if (!mapping) {
-		int shmem_swizzled = PageSwapCache(page);
+	if (unlikely(!mapping)) {
+		int shmem_swizzled;
+
+		/*
+		 * Page lock is required to identify which special case above
+		 * applies. If this is really a shmem page then the page lock
+		 * will prevent unexpected transitions.
+		 */
+		lock_page(page);
+		shmem_swizzled = PageSwapCache(page) || page->mapping;
 		unlock_page(page);
 		put_page(page);
+
 		if (shmem_swizzled)
 			goto again;
+
 		return -EFAULT;
 	}
 
 	/*
 	 * Private mappings are handled in a simple way.
 	 *
+	 * If the futex key is stored on an anonymous page, then the associated
+	 * object is the mm which is implicitly pinned by the calling process.
+	 *
 	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 	 * it's a read-only handle, it's expected that futexes attach to
 	 * the object not the particular process.
@@ -566,16 +591,74 @@ again:
 		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
+
+		get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
 	} else {
+		struct inode *inode;
+
+		/*
+		 * The associated futex object in this case is the inode and
+		 * the page->mapping must be traversed. Ordinarily this should
+		 * be stabilised under page lock but it's not strictly
+		 * necessary in this case as we just want to pin the inode, not
+		 * update the radix tree or anything like that.
+		 *
+		 * The RCU read lock is taken as the inode is finally freed
+		 * under RCU. If the mapping still matches expectations then the
+		 * mapping->host can be safely accessed as being a valid inode.
+		 */
+		rcu_read_lock();
+
+		if (READ_ONCE(page->mapping) != mapping) {
+			rcu_read_unlock();
+			put_page(page);
+
+			goto again;
+		}
+
+		inode = READ_ONCE(mapping->host);
+		if (!inode) {
+			rcu_read_unlock();
+			put_page(page);
+
+			goto again;
+		}
+
+		/*
+		 * Take a reference unless it is about to be freed. Previously
+		 * this reference was taken by ihold under the page lock
+		 * pinning the inode in place so i_lock was unnecessary. The
+		 * only way for this check to fail is if the inode was
+		 * truncated in parallel so warn for now if this happens.
+		 *
+		 * We are not calling into get_futex_key_refs() in file-backed
+		 * cases, therefore a successful atomic_inc return below will
+		 * guarantee that get_futex_key() will still imply smp_mb(); (B).
+		 */
+		if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+			rcu_read_unlock();
+			put_page(page);
+
+			goto again;
+		}
+
+		/* Should be impossible but lets be paranoid for now */
+		if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
+			err = -EFAULT;
+			rcu_read_unlock();
+			iput(inode);
+
+			goto out;
+		}
+
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-		key->shared.inode = mapping->host;
+		key->shared.inode = inode;
 		key->shared.pgoff = basepage_index(page);
+		rcu_read_unlock();
 	}
 
-	get_futex_key_refs(key); /* implies smp_mb(); (B) */
-
 out:
-	unlock_page(page);
 	put_page(page);
 	return err;
 }
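[ Editor's note: for context, the shared-key path rewritten above is only
entered for futex operations issued without FUTEX_PRIVATE_FLAG. Below is a
minimal userspace sketch of such a shared futex: a futex word in a
MAP_SHARED mapping, waited on in a child and woken by the parent, which
forces the kernel through get_futex_key(). Error handling is omitted; this
is illustrative, not a robust synchronization scheme. ]

#include <linux/futex.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

static long futex(uint32_t *uaddr, int op, uint32_t val)
{
        /* No FUTEX_PRIVATE_FLAG: the kernel must build a shared key. */
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

int main(void)
{
        /* A futex word in a shared mapping, visible across fork(). */
        uint32_t *f = mmap(NULL, sizeof(*f), PROT_READ | PROT_WRITE,
                           MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        *f = 0;

        if (fork() == 0) {
                /* Child: block while *f is still 0, or return at once
                 * with EAGAIN if the parent has already stored 1. */
                futex(f, FUTEX_WAIT, 0);
                _exit(0);
        }

        *f = 1;                         /* publish the new value ... */
        futex(f, FUTEX_WAKE, 1);        /* ... and wake one waiter */
        wait(NULL);
        return 0;
}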