author		Steve Capper <steve.capper@linaro.org>		2014-10-09 18:29:14 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-10-09 22:26:00 -0400
commit		2667f50e8b81457fcb4a3dbe6aff3e81ea009e13 (patch)
tree		1b8aa815ef85ed7034e6cb63c0837ff75db28fc5 /mm
parent		baa2ef83981c71ceb00f68fbdac323253c2c3e42 (diff)
mm: introduce a general RCU get_user_pages_fast()
This series implements general forms of get_user_pages_fast and __get_user_pages_fast in core code and activates them for arm and arm64. These are required for Transparent HugePages to function correctly, as a futex on a THP tail will otherwise result in an infinite loop (due to the core implementation of __get_user_pages_fast always returning 0). Unfortunately, a futex on a THP tail can be quite common for certain workloads; thus THP is unreliable without a __get_user_pages_fast implementation.

This series may also be beneficial for direct-IO heavy workloads and certain KVM workloads.

This patch (of 6):

get_user_pages_fast() attempts to pin user pages by walking the page tables directly and avoids taking locks. Thus the walker needs to be protected from page table pages being freed from under it, and needs to block any THP splits.

One way to achieve this is to have the walker disable interrupts, and rely on IPIs from the TLB flushing code blocking before the page table pages are freed.

On some platforms we have hardware broadcast of TLB invalidations, thus the TLB flushing code doesn't necessarily need to broadcast IPIs; and spuriously broadcasting IPIs can hurt system performance if done too often.

This problem has been solved on PowerPC and Sparc by batching up page table pages belonging to more than one mm_user, then scheduling an rcu_sched callback to free the pages. This RCU page table free logic has been promoted to core code and is activated when one enables HAVE_RCU_TABLE_FREE. Unfortunately, these architectures implement their own get_user_pages_fast routines.

The RCU page table free logic, coupled with an IPI broadcast on THP split (which is a rare event), allows one to protect a page table walker by merely disabling interrupts during the walk.

This patch provides a general RCU implementation of get_user_pages_fast that can be used by architectures that perform hardware broadcast of TLB invalidations. It is based heavily on the PowerPC implementation by Nick Piggin.

[akpm@linux-foundation.org: various comment fixes]
Signed-off-by: Steve Capper <steve.capper@linaro.org>
Tested-by: Dann Frazier <dann.frazier@canonical.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Christoffer Dall <christoffer.dall@linaro.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
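For context, the sketch below shows the calling pattern the new interface supports: pin a user buffer with get_user_pages_fast(), operate on the pages, then drop the references with put_page(). This is an illustrative sketch only, not part of the patch; the function name example_pin_user_buffer and its error handling are hypothetical.

/*
 * Illustrative sketch only (not part of this patch): a hypothetical
 * caller that pins nr_pages of a user buffer and releases them again.
 */
static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
{
	int i, pinned;

	/* Lockless fast path; falls back to get_user_pages() internally. */
	pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
	if (pinned < 0)
		return pinned;		/* no pages pinned, -errno */

	/* ... access pages[0..pinned-1] here ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);

	return pinned;
}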
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig	  3
-rw-r--r--	mm/gup.c	354
2 files changed, 357 insertions, 0 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 886db2158538..0ceb8a567dab 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP
 config HAVE_MEMBLOCK_PHYS_MAP
 	boolean
 
+config HAVE_GENERIC_RCU_GUP
+	boolean
+
 config ARCH_DISCARD_MEMBLOCK
 	boolean
 
diff --git a/mm/gup.c b/mm/gup.c
index af7ea3e0826b..cd62c8c90d4a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,10 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
 #include "internal.h"
 
 static struct page *no_page_table(struct vm_area_struct *vma,
@@ -676,3 +680,353 @@ struct page *get_dump_page(unsigned long addr)
 	return page;
 }
 #endif /* CONFIG_ELF_CORE */
+
+/*
+ * Generic RCU Fast GUP
+ *
+ * get_user_pages_fast attempts to pin user pages by walking the page
+ * tables directly and avoids taking locks. Thus the walker needs to be
+ * protected from page table pages being freed from under it, and should
+ * block any THP splits.
+ *
+ * One way to achieve this is to have the walker disable interrupts, and
+ * rely on IPIs from the TLB flushing code blocking before the page table
+ * pages are freed. This is unsuitable for architectures that do not need
+ * to broadcast an IPI when invalidating TLBs.
+ *
+ * Another way to achieve this is to batch up page table containing pages
+ * belonging to more than one mm_user, then rcu_sched a callback to free those
+ * pages. Disabling interrupts will allow the fast_gup walker to both block
+ * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
+ * (which is a relatively rare event). The code below adopts this strategy.
+ *
+ * Before activating this code, please be aware that the following assumptions
+ * are currently made:
+ *
+ *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
+ *      pages containing page tables.
+ *
+ *  *) THP splits will broadcast an IPI, this can be achieved by overriding
+ *      pmdp_splitting_flush.
+ *
+ *  *) ptes can be read atomically by the architecture.
+ *
+ *  *) access_ok is sufficient to validate userspace address ranges.
+ *
+ * The last two assumptions can be relaxed by the addition of helper functions.
+ *
+ * This code is based heavily on the PowerPC implementation by Nick Piggin.
+ */
+#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	pte_t *ptep, *ptem;
+	int ret = 0;
+
+	ptem = ptep = pte_offset_map(&pmd, addr);
+	do {
+		/*
+		 * In the line below we are assuming that the pte can be read
+		 * atomically. If this is not the case for your architecture,
+		 * please wrap this in a helper function!
+		 *
+		 * for an example see gup_get_pte in arch/x86/mm/gup.c
+		 */
+		pte_t pte = ACCESS_ONCE(*ptep);
+		struct page *page;
+
+		/*
+		 * Similar to the PMD case below, NUMA hinting must take slow
+		 * path
+		 */
+		if (!pte_present(pte) || pte_special(pte) ||
+		    pte_numa(pte) || (write && !pte_write(pte)))
+			goto pte_unmap;
+
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+
+		if (!page_cache_get_speculative(page))
+			goto pte_unmap;
+
+		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+			put_page(page);
+			goto pte_unmap;
+		}
+
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	ret = 1;
+
+pte_unmap:
+	pte_unmap(ptem);
+	return ret;
+}
+#else
+
+/*
+ * If we can't determine whether or not a pte is special, then fail immediately
+ * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
+ * to be special.
+ *
+ * For a futex to be placed on a THP tail page, get_futex_key requires a
+ * __get_user_pages_fast implementation that can pin pages. Thus it's still
+ * useful to have gup_huge_pmd even if we can't operate on ptes.
+ */
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	return 0;
+}
+#endif /* __HAVE_ARCH_PTE_SPECIAL */
+
+static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	struct page *head, *page, *tail;
+	int refs;
+
+	if (write && !pmd_write(orig))
+		return 0;
+
+	refs = 0;
+	head = pmd_page(orig);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	tail = page;
+	do {
+		VM_BUG_ON_PAGE(compound_head(page) != head, page);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	/*
+	 * Any tail pages need their mapcount reference taken before we
+	 * return. (This allows the THP code to bump their ref count when
+	 * they are split into base pages).
+	 */
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
+	}
+
+	return 1;
+}
+
+static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	struct page *head, *page, *tail;
+	int refs;
+
+	if (write && !pud_write(orig))
+		return 0;
+
+	refs = 0;
+	head = pud_page(orig);
+	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	tail = page;
+	do {
+		VM_BUG_ON_PAGE(compound_head(page) != head, page);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	if (!page_cache_add_speculative(head, refs)) {
+		*nr -= refs;
+		return 0;
+	}
+
+	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
+	}
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = ACCESS_ONCE(*pmdp);
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+			return 0;
+
+		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+			/*
+			 * NUMA hinting faults need to be handled in the GUP
+			 * slowpath for accounting purposes and so that they
+			 * can be serialised against THP migration.
+			 */
+			if (pmd_numa(pmd))
+				return 0;
+
+			if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+				pages, nr))
+				return 0;
+
+		} else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end,
+			 int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(pgdp, addr);
+	do {
+		pud_t pud = ACCESS_ONCE(*pudp);
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (pud_huge(pud)) {
+			if (!gup_huge_pud(pud, pudp, addr, next, write,
+					  pages, nr))
+				return 0;
+		} else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
+ * the regular GUP. It will only return non-negative values.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next, flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, len)))
+		return 0;
+
+	/*
+	 * Disable interrupts. We use the nested form as we can already have
+	 * interrupts disabled by get_futex_key.
+	 *
+	 * With interrupts disabled, we block page table pages from being
+	 * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
+	 * for more details.
+	 *
+	 * We do not adopt an rcu_read_lock(.) here as we also want to
+	 * block IPIs that come from THPs splitting.
+	 */
+
+	local_irq_save(flags);
+	pgdp = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(*pgdp))
+			break;
+		else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
+			break;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_restore(flags);
+
+	return nr;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	int nr, ret;
+
+	start &= PAGE_MASK;
+	nr = __get_user_pages_fast(start, nr_pages, write, pages);
+	ret = nr;
+
+	if (nr < nr_pages) {
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+				     nr_pages - nr, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+	}
+
+	return ret;
+}
+
+#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
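
As the comment inside gup_pte_range() notes, an architecture whose ptes cannot be read atomically is expected to wrap the pte load in a helper, with gup_get_pte in arch/x86/mm/gup.c given as the reference. The sketch below shows the retry pattern such a helper uses when a pte is wider than the native word (two 32-bit halves, field names following the x86 PAE pte_t layout). It is a hedged illustration, not part of this patch; the name example_gup_get_pte is hypothetical and per-architecture details differ.

/*
 * Hedged sketch of an architecture-specific pte read helper, modelled
 * loosely on gup_get_pte() in arch/x86/mm/gup.c for the PAE case.
 * Not part of this patch.
 */
static inline pte_t example_gup_get_pte(pte_t *ptep)
{
	pte_t pte;

retry:
	pte.pte_low = ptep->pte_low;
	smp_rmb();
	pte.pte_high = ptep->pte_high;
	smp_rmb();
	/* If the low half changed while reading the high half, retry. */
	if (unlikely(pte.pte_low != ptep->pte_low))
		goto retry;

	return pte;
}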