path: root/mm
author	Andrea Arcangeli <aarcange@redhat.com>	2015-02-11 18:27:17 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-11 20:06:05 -0500
commit	f0818f472d8d527a96ec9cc2c3a56223497f9dd3 (patch)
tree	1e7348f75cfa526fdc808fe48963220d03891b1f /mm
parent	be97a41b291e495d6cb767b3ee0f84ed05804892 (diff)
mm: gup: add get_user_pages_locked and get_user_pages_unlocked
FAULT_FLAG_ALLOW_RETRY allows the page fault to drop the mmap_sem for
reading, to reduce the mmap_sem contention (for writing), for example
while waiting for I/O completion.  The problem is that right now
practically no get_user_pages call uses FAULT_FLAG_ALLOW_RETRY, so we're
not leveraging that nifty feature.

Andres fixed it for the KVM page fault.  However get_user_pages_fast
remains uncovered, and 99% of other get_user_pages callers aren't using
it either (the only exception being FOLL_NOWAIT in KVM, which is really
nonblocking and in fact doesn't even release the mmap_sem).

So this patchset extends the optimization Andres did in the KVM page
fault to the whole kernel.  It makes the most important places
(including gup_fast) use FAULT_FLAG_ALLOW_RETRY to reduce the mmap_sem
hold times during I/O.

The few places that remain uncovered are drivers like v4l and other
exceptions that tend to work on their own memory rather than on random
user memory (unlike, for example, O_DIRECT, which uses gup_fast and is
fully covered by this patch).

A follow-up patch should probably also add a printk_once warning to
get_user_pages, which should go obsolete and be phased out eventually.
The "vmas" parameter of get_user_pages makes it fundamentally
incompatible with FAULT_FLAG_ALLOW_RETRY (the vmas array becomes
meaningless the moment the mmap_sem is released).

While this is just an optimization, it becomes an absolute requirement
for the userfaultfd feature http://lwn.net/Articles/615086/ .

Userfaultfd allows blocking the page fault, and in order to do so it
needs to drop the mmap_sem first.  So this patch also ensures that, for
all memory where userfaultfd could be registered by KVM, the very first
fault (no matter if it is a regular page fault or a get_user_pages)
always has FAULT_FLAG_ALLOW_RETRY set.  Then userfaultfd blocks and is
woken only when the pagetable is already mapped.  The second fault
attempt after the wakeup doesn't need FAULT_FLAG_ALLOW_RETRY, so it's ok
to retry without it.

This patch (of 5):

We can leverage the VM_FAULT_RETRY functionality in the page fault paths
better by using either get_user_pages_locked or get_user_pages_unlocked.

The former allows converting get_user_pages invocations; such callers
pass a "&locked" parameter to know whether the mmap_sem was dropped
during the call.  Example, from:

    down_read(&mm->mmap_sem);
    do_something()
    get_user_pages(tsk, mm, ..., pages, NULL);
    up_read(&mm->mmap_sem);

to:

    int locked = 1;
    down_read(&mm->mmap_sem);
    do_something()
    get_user_pages_locked(tsk, mm, ..., pages, &locked);
    if (locked)
        up_read(&mm->mmap_sem);

The latter is suitable only as a drop-in replacement of the form:

    down_read(&mm->mmap_sem);
    get_user_pages(tsk, mm, ..., pages, NULL);
    up_read(&mm->mmap_sem);

into:

    get_user_pages_unlocked(tsk, mm, ..., pages);

where tsk, mm, the intermediate "..." parameters and "pages" can be any
value as before.  Only the last parameter of get_user_pages (vmas) must
be NULL for get_user_pages_locked|unlocked to be usable (the latter
original form wouldn't have been safe anyway if vmas wasn't NULL; for
the former we just make it explicit by dropping the parameter).  If vmas
is not NULL, these two methods cannot be used.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Reviewed-by: Peter Feiner <pfeiner@google.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
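To make the conversion pattern above concrete, here is a minimal,
hypothetical sketch of a caller switched over to get_user_pages_locked();
the helper pin_user_buffer() and its parameters are illustrative only and
are not part of this patch:

    #include <linux/mm.h>
    #include <linux/sched.h>

    /*
     * Hypothetical caller (not part of this patch): pin nr_pages of user
     * memory starting at "start", while allowing the fault handler to
     * drop mmap_sem (FAULT_FLAG_ALLOW_RETRY) while it waits for I/O.
     */
    static long pin_user_buffer(struct mm_struct *mm, unsigned long start,
                                unsigned long nr_pages, struct page **pages)
    {
            int locked = 1;
            long ret;

            down_read(&mm->mmap_sem);
            ret = get_user_pages_locked(current, mm, start, nr_pages,
                                        1 /* write */, 0 /* force */,
                                        pages, &locked);
            /*
             * If the fault path dropped mmap_sem, "locked" is now 0 and
             * any state that was protected by the read lock must be
             * revalidated by the caller.
             */
            if (locked)
                    up_read(&mm->mmap_sem);
            return ret;
    }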
Diffstat (limited to 'mm')
-rw-r--r--	mm/gup.c	177
-rw-r--r--	mm/nommu.c	23
2 files changed, 189 insertions, 11 deletions
diff --git a/mm/gup.c b/mm/gup.c
index 1a8ab05918e0..71a37738a326 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -575,6 +575,165 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 	return 0;
 }
 
+static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
+						struct mm_struct *mm,
+						unsigned long start,
+						unsigned long nr_pages,
+						int write, int force,
+						struct page **pages,
+						struct vm_area_struct **vmas,
+						int *locked, bool notify_drop)
+{
+	int flags = FOLL_TOUCH;
+	long ret, pages_done;
+	bool lock_dropped;
+
+	if (locked) {
+		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
+		BUG_ON(vmas);
+		/* check caller initialized locked */
+		BUG_ON(*locked != 1);
+	}
+
+	if (pages)
+		flags |= FOLL_GET;
+	if (write)
+		flags |= FOLL_WRITE;
+	if (force)
+		flags |= FOLL_FORCE;
+
+	pages_done = 0;
+	lock_dropped = false;
+	for (;;) {
+		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+				       vmas, locked);
+		if (!locked)
+			/* VM_FAULT_RETRY couldn't trigger, bypass */
+			return ret;
+
+		/* VM_FAULT_RETRY cannot return errors */
+		if (!*locked) {
+			BUG_ON(ret < 0);
+			BUG_ON(ret >= nr_pages);
+		}
+
+		if (!pages)
+			/* If it's a prefault don't insist harder */
+			return ret;
+
+		if (ret > 0) {
+			nr_pages -= ret;
+			pages_done += ret;
+			if (!nr_pages)
+				break;
+		}
+		if (*locked) {
+			/* VM_FAULT_RETRY didn't trigger */
+			if (!pages_done)
+				pages_done = ret;
+			break;
+		}
+		/* VM_FAULT_RETRY triggered, so seek to the faulting offset */
+		pages += ret;
+		start += ret << PAGE_SHIFT;
+
+		/*
+		 * Repeat on the address that fired VM_FAULT_RETRY
+		 * without FAULT_FLAG_ALLOW_RETRY but with
+		 * FAULT_FLAG_TRIED.
+		 */
+		*locked = 1;
+		lock_dropped = true;
+		down_read(&mm->mmap_sem);
+		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+				       pages, NULL, NULL);
+		if (ret != 1) {
+			BUG_ON(ret > 1);
+			if (!pages_done)
+				pages_done = ret;
+			break;
+		}
+		nr_pages--;
+		pages_done++;
+		if (!nr_pages)
+			break;
+		pages++;
+		start += PAGE_SIZE;
+	}
+	if (notify_drop && lock_dropped && *locked) {
+		/*
+		 * We must let the caller know we temporarily dropped the lock
+		 * and so the critical section protected by it was lost.
+		 */
+		up_read(&mm->mmap_sem);
+		*locked = 0;
+	}
+	return pages_done;
+}
+
+/*
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
+ * get_user_pages_locked() is suitable to replace the form:
+ *
+ *      down_read(&mm->mmap_sem);
+ *      do_something()
+ *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      up_read(&mm->mmap_sem);
+ *
+ *  to:
+ *
+ *      int locked = 1;
+ *      down_read(&mm->mmap_sem);
+ *      do_something()
+ *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ *      if (locked)
+ *          up_read(&mm->mmap_sem);
+ */
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+			   unsigned long start, unsigned long nr_pages,
+			   int write, int force, struct page **pages,
+			   int *locked)
+{
+	return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+				       pages, NULL, locked, true);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+/*
+ * get_user_pages_unlocked() is suitable to replace the form:
+ *
+ *      down_read(&mm->mmap_sem);
+ *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      up_read(&mm->mmap_sem);
+ *
+ *  with:
+ *
+ *      get_user_pages_unlocked(tsk, mm, ..., pages);
+ *
+ * It is functionally equivalent to get_user_pages_fast so
+ * get_user_pages_fast should be used instead, if the two parameters
+ * "tsk" and "mm" are respectively equal to current and current->mm,
+ * or if "force" shall be set to 1 (get_user_pages_fast misses the
+ * "force" parameter).
+ */
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+			     unsigned long start, unsigned long nr_pages,
+			     int write, int force, struct page **pages)
+{
+	long ret;
+	int locked = 1;
+	down_read(&mm->mmap_sem);
+	ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+				      pages, NULL, &locked, false);
+	if (locked)
+		up_read(&mm->mmap_sem);
+	return ret;
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
 /*
  * get_user_pages() - pin user pages in memory
  * @tsk:	the task_struct to use for page fault accounting, or
@@ -624,22 +783,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
  * use the correct cache flushing APIs.
  *
  * See also get_user_pages_fast, for performance critical applications.
+ *
+ * get_user_pages should be phased out in favor of
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
+ * should use get_user_pages because it cannot pass
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
  */
 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, unsigned long nr_pages, int write,
 		int force, struct page **pages, struct vm_area_struct **vmas)
 {
-	int flags = FOLL_TOUCH;
-
-	if (pages)
-		flags |= FOLL_GET;
-	if (write)
-		flags |= FOLL_WRITE;
-	if (force)
-		flags |= FOLL_FORCE;
-
-	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
-			NULL);
+	return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+				       pages, vmas, NULL, false);
 }
 EXPORT_SYMBOL(get_user_pages);
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 541bed64e348..bfb690b0f986 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -214,6 +214,29 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 }
 EXPORT_SYMBOL(get_user_pages);
 
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+			   unsigned long start, unsigned long nr_pages,
+			   int write, int force, struct page **pages,
+			   int *locked)
+{
+	return get_user_pages(tsk, mm, start, nr_pages, write, force,
+			      pages, NULL);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+			     unsigned long start, unsigned long nr_pages,
+			     int write, int force, struct page **pages)
+{
+	long ret;
+	down_read(&mm->mmap_sem);
+	ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
+			     pages, NULL);
+	up_read(&mm->mmap_sem);
+	return ret;
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
 /**
  * follow_pfn - look up PFN at a user virtual address
  * @vma: memory mapping
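As a closing illustration, a hypothetical caller of the new
get_user_pages_unlocked() helper needs no explicit mmap_sem handling at
all; pin_user_buffer_simple() is not part of this patch and merely
matches the signature added above:

    #include <linux/mm.h>
    #include <linux/sched.h>

    /*
     * Hypothetical caller (not part of this patch): when nothing else
     * must run under mmap_sem, the down_read()/up_read() pair disappears
     * and the fault handler is still free to drop the lock during I/O.
     */
    static long pin_user_buffer_simple(struct mm_struct *mm,
                                       unsigned long start,
                                       unsigned long nr_pages,
                                       struct page **pages)
    {
            return get_user_pages_unlocked(current, mm, start, nr_pages,
                                           1 /* write */, 0 /* force */,
                                           pages);
    }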