Diffstat (limited to 'mm/gup.c')

 mm/gup.c | 246 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 205 insertions(+), 41 deletions(-)
@@ -55,7 +55,7 @@ retry:
 		 */
 		if (likely(!(flags & FOLL_MIGRATION)))
 			goto no_page;
-		if (pte_none(pte) || pte_file(pte))
+		if (pte_none(pte))
 			goto no_page;
 		entry = pte_to_swp_entry(pte);
 		if (!is_migration_entry(entry))
@@ -64,7 +64,7 @@ retry:
 		migration_entry_wait(mm, pmd, address);
 		goto retry;
 	}
-	if ((flags & FOLL_NUMA) && pte_numa(pte))
+	if ((flags & FOLL_NUMA) && pte_protnone(pte))
 		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte)) {
 		pte_unmap_unlock(ptep, ptl);
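
[Note: the pte_file() test in the previous hunk goes away with the rest
of the non-linear mapping code, and pte_numa() disappears here because
automatic NUMA hinting is now implemented with PROT_NONE-style
protections, so the generic pte_protnone()/pmd_protnone() predicates
replace the old *_numa() tests throughout this file. As an
illustration, the x86 variant amounts to roughly the following sketch
(assuming the usual _PAGE_PROTNONE/_PAGE_PRESENT encoding; the
authoritative definition lives in arch/x86/include/asm/pgtable.h):

	static inline int pte_protnone(pte_t pte)
	{
		/* protnone and not present: PROT_NONE or a NUMA-hinting pte */
		return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
			== _PAGE_PROTNONE;
	}

With FOLL_NUMA set, such a pte sends follow_page_pte() to no_page, so
the caller goes through the fault path and the hinting fault is taken
on the slow path.]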
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	if (pud_none(*pud))
 		return no_page_table(vma, flags);
 	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
-		if (flags & FOLL_GET)
-			return NULL;
-		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
-		return page;
+		page = follow_huge_pud(mm, address, pud, flags);
+		if (page)
+			return page;
+		return no_page_table(vma, flags);
 	}
 	if (unlikely(pud_bad(*pud)))
 		return no_page_table(vma, flags);
@@ -179,21 +179,12 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	if (pmd_none(*pmd))
 		return no_page_table(vma, flags);
 	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
-		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
-		if (flags & FOLL_GET) {
-			/*
-			 * Refcount on tail pages are not well-defined and
-			 * shouldn't be taken. The caller should handle a NULL
-			 * return when trying to follow tail pages.
-			 */
-			if (PageHead(page))
-				get_page(page);
-			else
-				page = NULL;
-		}
-		return page;
+		page = follow_huge_pmd(mm, address, pmd, flags);
+		if (page)
+			return page;
+		return no_page_table(vma, flags);
 	}
-	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
 		return no_page_table(vma, flags);
 	if (pmd_trans_huge(*pmd)) {
 		if (flags & FOLL_SPLIT) {
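
[Note: both hugetlb hunks (PUD and PMD) move the FOLL_GET refcounting,
including the deleted tail-page special case, out of follow_page_mask()
and into follow_huge_pud()/follow_huge_pmd(), which now take the full
gup flags and return NULL on failure so the caller falls back to
no_page_table(). A minimal sketch of the new contract, assuming a
simplified shape for the reworked PMD helper (the real one in
mm/hugetlb.c also waits on hugetlb migration entries):

	struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
				     pmd_t *pmd, int flags)
	{
		struct page *page = NULL;
		spinlock_t *ptl = pmd_lockptr(mm, pmd);

		spin_lock(ptl);		/* pmd test and refcount are now atomic */
		if (pmd_present(*pmd)) {
			page = pmd_page(*pmd) +
			       ((address & ~PMD_MASK) >> PAGE_SHIFT);
			if (flags & FOLL_GET)
				get_page(page);
		}
		spin_unlock(ptl);
		return page;	/* NULL sends the caller to no_page_table() */
	}
]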
@@ -296,7 +287,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 			return -ENOMEM;
 		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
 			return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
-		if (ret & VM_FAULT_SIGBUS)
+		if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
 			return -EFAULT;
 		BUG();
 	}
@@ -571,7 +562,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 			return -ENOMEM;
 		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
 			return -EHWPOISON;
-		if (ret & VM_FAULT_SIGBUS)
+		if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
 			return -EFAULT;
 		BUG();
 	}
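
[Note: both VM_FAULT_SIGSEGV hunks make gup treat the new signal code
exactly like VM_FAULT_SIGBUS. The resulting translation of
handle_mm_fault() error bits, as read from the two hunks:

	VM_FAULT_OOM                        -> -ENOMEM
	VM_FAULT_HWPOISON|HWPOISON_LARGE    -> -EHWPOISON (faultin_page
	                                       returns -EFAULT instead unless
	                                       FOLL_HWPOISON was passed)
	VM_FAULT_SIGBUS|VM_FAULT_SIGSEGV    -> -EFAULT
]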
@@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 	return 0;
 }
 
+static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
+						struct mm_struct *mm,
+						unsigned long start,
+						unsigned long nr_pages,
+						int write, int force,
+						struct page **pages,
+						struct vm_area_struct **vmas,
+						int *locked, bool notify_drop,
+						unsigned int flags)
+{
+	long ret, pages_done;
+	bool lock_dropped;
+
+	if (locked) {
+		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
+		BUG_ON(vmas);
+		/* check caller initialized locked */
+		BUG_ON(*locked != 1);
+	}
+
+	if (pages)
+		flags |= FOLL_GET;
+	if (write)
+		flags |= FOLL_WRITE;
+	if (force)
+		flags |= FOLL_FORCE;
+
+	pages_done = 0;
+	lock_dropped = false;
+	for (;;) {
+		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+				       vmas, locked);
+		if (!locked)
+			/* VM_FAULT_RETRY couldn't trigger, bypass */
+			return ret;
+
+		/* VM_FAULT_RETRY cannot return errors */
+		if (!*locked) {
+			BUG_ON(ret < 0);
+			BUG_ON(ret >= nr_pages);
+		}
+
+		if (!pages)
+			/* If it's a prefault don't insist harder */
+			return ret;
+
+		if (ret > 0) {
+			nr_pages -= ret;
+			pages_done += ret;
+			if (!nr_pages)
+				break;
+		}
+		if (*locked) {
+			/* VM_FAULT_RETRY didn't trigger */
+			if (!pages_done)
+				pages_done = ret;
+			break;
+		}
+		/* VM_FAULT_RETRY triggered, so seek to the faulting offset */
+		pages += ret;
+		start += ret << PAGE_SHIFT;
+
+		/*
+		 * Repeat on the address that fired VM_FAULT_RETRY
+		 * without FAULT_FLAG_ALLOW_RETRY but with
+		 * FAULT_FLAG_TRIED.
+		 */
+		*locked = 1;
+		lock_dropped = true;
+		down_read(&mm->mmap_sem);
+		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+				       pages, NULL, NULL);
+		if (ret != 1) {
+			BUG_ON(ret > 1);
+			if (!pages_done)
+				pages_done = ret;
+			break;
+		}
+		nr_pages--;
+		pages_done++;
+		if (!nr_pages)
+			break;
+		pages++;
+		start += PAGE_SIZE;
+	}
+	if (notify_drop && lock_dropped && *locked) {
+		/*
+		 * We must let the caller know we temporarily dropped the lock
+		 * and so the critical section protected by it was lost.
+		 */
+		up_read(&mm->mmap_sem);
+		*locked = 0;
+	}
+	return pages_done;
+}
+
+/*
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
+ * get_user_pages_locked() is suitable to replace the form:
+ *
+ *      down_read(&mm->mmap_sem);
+ *      do_something()
+ *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      up_read(&mm->mmap_sem);
+ *
+ *  to:
+ *
+ *      int locked = 1;
+ *      down_read(&mm->mmap_sem);
+ *      do_something()
+ *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ *      if (locked)
+ *          up_read(&mm->mmap_sem);
+ */
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+			   unsigned long start, unsigned long nr_pages,
+			   int write, int force, struct page **pages,
+			   int *locked)
+{
+	return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+				       pages, NULL, locked, true, FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+/*
+ * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
+ * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
+ *
+ * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
+ * caller if required (just like with __get_user_pages). "FOLL_GET",
+ * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
+ * according to the parameters "pages", "write", "force"
+ * respectively.
+ */
+__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+					       unsigned long start, unsigned long nr_pages,
+					       int write, int force, struct page **pages,
+					       unsigned int gup_flags)
+{
+	long ret;
+	int locked = 1;
+	down_read(&mm->mmap_sem);
+	ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+				      pages, NULL, &locked, false, gup_flags);
+	if (locked)
+		up_read(&mm->mmap_sem);
+	return ret;
+}
+EXPORT_SYMBOL(__get_user_pages_unlocked);
+
+/*
+ * get_user_pages_unlocked() is suitable to replace the form:
+ *
+ *      down_read(&mm->mmap_sem);
+ *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      up_read(&mm->mmap_sem);
+ *
+ *  with:
+ *
+ *      get_user_pages_unlocked(tsk, mm, ..., pages);
+ *
+ * It is functionally equivalent to get_user_pages_fast so
+ * get_user_pages_fast should be used instead, if the two parameters
+ * "tsk" and "mm" are respectively equal to current and current->mm,
+ * or if "force" shall be set to 1 (get_user_pages_fast misses the
+ * "force" parameter).
+ */
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+			     unsigned long start, unsigned long nr_pages,
+			     int write, int force, struct page **pages)
+{
+	return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
+					 force, pages, FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
 /*
  * get_user_pages() - pin user pages in memory
  * @tsk:	the task_struct to use for page fault accounting, or
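
[Note: the conversion described in the comment above, spelled out as a
caller-side sketch (tsk, mm, start, nr_pages and pages stand for
whatever the call site already has in scope):

	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	/* ... work that needs mmap_sem held ... */
	ret = get_user_pages_locked(tsk, mm, start, nr_pages, 1 /* write */,
				    0 /* force */, pages, &locked);
	/*
	 * If a fault had to block, the fault handler used VM_FAULT_RETRY:
	 * mmap_sem was dropped, "locked" is now 0, and anything computed
	 * under the lock before the call must be revalidated.
	 */
	if (locked)
		up_read(&mm->mmap_sem);
]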
@@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
  * use the correct cache flushing APIs.
  *
  * See also get_user_pages_fast, for performance critical applications.
+ *
+ * get_user_pages should be phased out in favor of
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
+ * should use get_user_pages because it cannot pass
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
  */
 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, unsigned long nr_pages, int write,
 		int force, struct page **pages, struct vm_area_struct **vmas)
 {
-	int flags = FOLL_TOUCH;
-
-	if (pages)
-		flags |= FOLL_GET;
-	if (write)
-		flags |= FOLL_WRITE;
-	if (force)
-		flags |= FOLL_FORCE;
-
-	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
-			NULL);
+	return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+				       pages, vmas, NULL, false, FOLL_TOUCH);
 }
 EXPORT_SYMBOL(get_user_pages);
 
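[Note: given the deprecation paragraph added to the kerneldoc, call
sites that take mmap_sem only around the gup call convert mechanically
(sketch; identifiers stand for caller state):

	/* before */
	down_read(&mm->mmap_sem);
	ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
			     pages, NULL);
	up_read(&mm->mmap_sem);

	/* after: same result, but faults may drop mmap_sem and retry
	 * (VM_FAULT_RETRY) instead of blocking with the lock held */
	ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
				      force, pages);
]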
@@ -740,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 
 		/*
 		 * Similar to the PMD case below, NUMA hinting must take slow
-		 * path
+		 * path using the pte_protnone check.
 		 */
 		if (!pte_present(pte) || pte_special(pte) ||
-		    pte_numa(pte) || (write && !pte_write(pte)))
+		    pte_protnone(pte) || (write && !pte_write(pte)))
 			goto pte_unmap;
 
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -926,7 +1092,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 
 	pmdp = pmd_offset(&pud, addr);
 	do {
-		pmd_t pmd = ACCESS_ONCE(*pmdp);
+		pmd_t pmd = READ_ONCE(*pmdp);
 
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
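
[Note: ACCESS_ONCE() works by casting through volatile, which the
compiler may discard when the operand is not a scalar, and pmd_t is a
structure on some configurations; READ_ONCE() also handles aggregate
types. Either way, the point is one consistent snapshot (illustration,
not code from this file):

	/* racy: *pmdp can change between the two reads */
	if (pmd_none(*pmdp) || pmd_trans_splitting(*pmdp))
		return 0;

	/* safe: every test runs on the same local copy */
	pmd_t pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd) || pmd_trans_splitting(pmd))
		return 0;
]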
@@ -938,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		 * slowpath for accounting purposes and so that they
 		 * can be serialised against THP migration.
 		 */
-		if (pmd_numa(pmd))
+		if (pmd_protnone(pmd))
 			return 0;
 
 		if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
@@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 		start += nr << PAGE_SHIFT;
 		pages += nr;
 
-		down_read(&mm->mmap_sem);
-		ret = get_user_pages(current, mm, start,
-				     nr_pages - nr, write, 0, pages, NULL);
-		up_read(&mm->mmap_sem);
+		ret = get_user_pages_unlocked(current, mm, start,
+					      nr_pages - nr, write, 0, pages);
 
 		/* Have to be a bit careful with return values */
 		if (nr > 0) {
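
[Note: with this hunk the fast path's fallback also benefits from
VM_FAULT_RETRY: whatever gup_pmd_range() refused (protnone entries, a
splitting THP, ...) is retried through get_user_pages_unlocked() rather
than a hand-rolled lock/gup/unlock sequence. A hedged usage sketch of
the exported fast API (pin_user_buffer and its parameters are
hypothetical):

	/*
	 * Pin "npages" pages of a user buffer for write access; on
	 * success every slot of "pages" holds a referenced page that
	 * must later be released with put_page().
	 */
	static int pin_user_buffer(unsigned long uaddr, int npages,
				   struct page **pages)
	{
		int got = get_user_pages_fast(uaddr & PAGE_MASK, npages,
					      1 /* write */, pages);

		if (got < npages) {
			while (got-- > 0)	/* drop any partial pin */
				put_page(pages[got]);
			return -EFAULT;
		}
		return 0;
	}
]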