author		Steve Capper <steve.capper@linaro.org>		2014-10-09 18:29:14 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-10-09 22:26:00 -0400
commit		2667f50e8b81457fcb4a3dbe6aff3e81ea009e13 (patch)
tree		1b8aa815ef85ed7034e6cb63c0837ff75db28fc5 /mm
parent		baa2ef83981c71ceb00f68fbdac323253c2c3e42 (diff)
mm: introduce a general RCU get_user_pages_fast()
This series implements general forms of get_user_pages_fast and __get_user_pages_fast in core code and activates them for arm and arm64. These are required for Transparent HugePages to function correctly, as a futex on a THP tail will otherwise result in an infinite loop (due to the core implementation of __get_user_pages_fast always returning 0). Unfortunately, a futex on a THP tail can be quite common for certain workloads; thus THP is unreliable without a __get_user_pages_fast implementation.

This series may also be beneficial for direct-IO heavy workloads and certain KVM workloads.

This patch (of 6):

get_user_pages_fast() attempts to pin user pages by walking the page tables directly and avoids taking locks. Thus the walker needs to be protected from page table pages being freed from under it, and needs to block any THP splits.

One way to achieve this is to have the walker disable interrupts, and rely on IPIs from the TLB flushing code blocking before the page table pages are freed.

On some platforms we have hardware broadcast of TLB invalidations, thus the TLB flushing code doesn't necessarily need to broadcast IPIs; and spuriously broadcasting IPIs can hurt system performance if done too often.

This problem has been solved on PowerPC and Sparc by batching up page table pages belonging to more than one mm_user, then scheduling an rcu_sched callback to free the pages. This RCU page table free logic has been promoted to core code and is activated when one enables HAVE_RCU_TABLE_FREE. Unfortunately, these architectures implement their own get_user_pages_fast routines.

The RCU page table free logic, coupled with an IPI broadcast on THP split (which is a rare event), allows one to protect a page table walker by merely disabling interrupts during the walk.

This patch provides a general RCU implementation of get_user_pages_fast that can be used by architectures that perform hardware broadcast of TLB invalidations. It is based heavily on the PowerPC implementation by Nick Piggin.

[akpm@linux-foundation.org: various comment fixes]
Signed-off-by: Steve Capper <steve.capper@linaro.org>
Tested-by: Dann Frazier <dann.frazier@canonical.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
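For context, the sketch below shows the calling pattern the new interface supports: pin a user buffer with get_user_pages_fast(), operate on the pages, then drop the references with put_page(). This is an illustrative sketch only, not part of the patch; the function name example_pin_user_buffer and its error handling are hypothetical.

/*
 * Illustrative sketch only (not part of this patch): a hypothetical
 * caller that pins nr_pages of a user buffer and releases them again.
 */
static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
{
	int i, pinned;

	/* Lockless fast path; falls back to get_user_pages() internally. */
	pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
	if (pinned < 0)
		return pinned;		/* no pages pinned, -errno */

	/* ... access pages[0..pinned-1] here ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);

	return pinned;
}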
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig	  3
-rw-r--r--	mm/gup.c	354
2 files changed, 357 insertions, 0 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 886db2158538..0ceb8a567dab 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP
 config HAVE_MEMBLOCK_PHYS_MAP
 	boolean
 
+config HAVE_GENERIC_RCU_GUP
+	boolean
+
 config ARCH_DISCARD_MEMBLOCK
 	boolean
 
diff --git a/mm/gup.c b/mm/gup.c
index af7ea3e0826b..cd62c8c90d4a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,10 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
 #include "internal.h"
 
 static struct page *no_page_table(struct vm_area_struct *vma,
@@ -676,3 +680,353 @@ struct page *get_dump_page(unsigned long addr)
 	return page;
 }
 #endif /* CONFIG_ELF_CORE */
+
+/*
+ * Generic RCU Fast GUP
+ *
+ * get_user_pages_fast attempts to pin user pages by walking the page
+ * tables directly and avoids taking locks. Thus the walker needs to be
+ * protected from page table pages being freed from under it, and should
+ * block any THP splits.
+ *
+ * One way to achieve this is to have the walker disable interrupts, and
+ * rely on IPIs from the TLB flushing code blocking before the page table
+ * pages are freed. This is unsuitable for architectures that do not need
+ * to broadcast an IPI when invalidating TLBs.
+ *
+ * Another way to achieve this is to batch up page table containing pages
+ * belonging to more than one mm_user, then rcu_sched a callback to free those
+ * pages. Disabling interrupts will allow the fast_gup walker to both block
+ * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
+ * (which is a relatively rare event). The code below adopts this strategy.
+ *
+ * Before activating this code, please be aware that the following assumptions
+ * are currently made:
+ *
+ *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
+ *      pages containing page tables.
+ *
+ *  *) THP splits will broadcast an IPI, this can be achieved by overriding
+ *      pmdp_splitting_flush.
+ *
+ *  *) ptes can be read atomically by the architecture.
+ *
+ *  *) access_ok is sufficient to validate userspace address ranges.
+ *
+ * The last two assumptions can be relaxed by the addition of helper functions.
+ *
+ * This code is based heavily on the PowerPC implementation by Nick Piggin.
+ */
+#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	pte_t *ptep, *ptem;
+	int ret = 0;
+
+	ptem = ptep = pte_offset_map(&pmd, addr);
+	do {
+		/*
+		 * In the line below we are assuming that the pte can be read
+		 * atomically. If this is not the case for your architecture,
+		 * please wrap this in a helper function!
+		 *
+		 * for an example see gup_get_pte in arch/x86/mm/gup.c
+		 */
+		pte_t pte = ACCESS_ONCE(*ptep);
+		struct page *page;
+
+		/*
+		 * Similar to the PMD case below, NUMA hinting must take slow
+		 * path
+		 */
+		if (!pte_present(pte) || pte_special(pte) ||
+		    pte_numa(pte) || (write && !pte_write(pte)))
+			goto pte_unmap;
+
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+
+		if (!page_cache_get_speculative(page))
+			goto pte_unmap;
+
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(page);
+			goto pte_unmap;
+		}
+
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	ret = 1;
+
+pte_unmap:
+	pte_unmap(ptem);
+	return ret;
+}
+#else
+
+/*
+ * If we can't determine whether or not a pte is special, then fail immediately
+ * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
+ * to be special.
+ *
+ * For a futex to be placed on a THP tail page, get_futex_key requires a
+ * __get_user_pages_fast implementation that can pin pages. Thus it's still
+ * useful to have gup_huge_pmd even if we can't operate on ptes.
+ */
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	return 0;
+}
+#endif /* __HAVE_ARCH_PTE_SPECIAL */
+
+static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	struct page *head, *page, *tail;
+	int refs;
+
+	if (write && !pmd_write(orig))
+		return 0;
+
+	refs = 0;
+	head = pmd_page(orig);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	tail = page;
+	do {
+		VM_BUG_ON_PAGE(compound_head(page) != head, page);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	/*
+	 * Any tail pages need their mapcount reference taken before we
+	 * return. (This allows the THP code to bump their ref count when
+	 * they are split into base pages).
+	 */
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
+	}
+
+	return 1;
+}
+
+static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	struct page *head, *page, *tail;
+	int refs;
+
+	if (write && !pud_write(orig))
+		return 0;
+
+	refs = 0;
+	head = pud_page(orig);
+	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	tail = page;
+	do {
+		VM_BUG_ON_PAGE(compound_head(page) != head, page);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
+	}
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = ACCESS_ONCE(*pmdp);
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+			return 0;
+
+		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+			/*
+			 * NUMA hinting faults need to be handled in the GUP
+			 * slowpath for accounting purposes and so that they
+			 * can be serialised against THP migration.
+			 */
+			if (pmd_numa(pmd))
+				return 0;
+
+			if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+				pages, nr))
+				return 0;
+
+		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(pgdp, addr);
+	do {
+		pud_t pud = ACCESS_ONCE(*pudp);
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (pud_huge(pud)) {
+			if (!gup_huge_pud(pud, pudp, addr, next, write,
+					  pages, nr))
+				return 0;
+		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
+ * the regular GUP. It will only return non-negative values.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next, flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, len)))
+		return 0;
+
+	/*
+	 * Disable interrupts. We use the nested form as we can already have
+	 * interrupts disabled by get_futex_key.
+	 *
+	 * With interrupts disabled, we block page table pages from being
+	 * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
+	 * for more details.
+	 *
+	 * We do not adopt an rcu_read_lock(.) here as we also want to
+	 * block IPIs that come from THPs splitting.
+	 */
+
+	local_irq_save(flags);
+	pgdp = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(*pgdp))
+			break;
+		else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
+			break;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_restore(flags);
+
+	return nr;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	int nr, ret;
+
+	start &= PAGE_MASK;
+	nr = __get_user_pages_fast(start, nr_pages, write, pages);
+	ret = nr;
+
+	if (nr < nr_pages) {
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+				     nr_pages - nr, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+	}
+
+	return ret;
+}
+
+#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
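
As the comment inside gup_pte_range() notes, an architecture whose ptes cannot be read atomically is expected to wrap the pte load in a helper, with gup_get_pte in arch/x86/mm/gup.c given as the reference. The sketch below shows the retry pattern such a helper uses when a pte is wider than the native word (two 32-bit halves, field names following the x86 PAE pte_t layout). It is a hedged illustration, not part of this patch; the name example_gup_get_pte is hypothetical and per-architecture details differ.

/*
 * Hedged sketch of an architecture-specific pte read helper, modelled
 * loosely on gup_get_pte() in arch/x86/mm/gup.c for the PAE case.
 * Not part of this patch.
 */
static inline pte_t example_gup_get_pte(pte_t *ptep)
{
	pte_t pte;

retry:
	pte.pte_low = ptep->pte_low;
	smp_rmb();
	pte.pte_high = ptep->pte_high;
	smp_rmb();
	/* If the low half changed while reading the high half, retry. */
	if (unlikely(pte.pte_low != ptep->pte_low))
		goto retry;

	return pte;
}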