 arch/x86/Kconfig          |   1
 arch/x86/mm/Makefile      |   1
 arch/x86/mm/gup.c         | 258
 include/asm-x86/uaccess.h |   1
 mm/Kconfig                |   3
 5 files changed, 264 insertions, 0 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6b2debfabddc..6bdde845818e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,7 @@ config X86
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_IOREMAP_PROT
+	select HAVE_GET_USER_PAGES_FAST
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB if !X86_RDC321X
 	select HAVE_KRETPROBES
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1fbb844c3d7a..2977ea37791f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,7 @@
 obj-y	:= init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	   pat.o pgtable.o
 
+obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST)	+= gup.o
 obj-$(CONFIG_X86_32)		+= pgtable_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..6f733121f32e
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,258 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+	return *ptep;
+#else
+	/*
+	 * With get_user_pages_fast, we walk down the pagetables without taking
+	 * any locks. For this we would like to load the pointers atomically,
+	 * but that is not possible (without expensive cmpxchg8b) on PAE. What
+	 * we do have is the guarantee that a pte will only either go from not
+	 * present to present, or present to not present, or both -- it will not
+	 * switch to a completely different present page without a TLB flush in
+	 * between; something that we are blocking by holding interrupts off.
+	 *
+	 * Setting ptes from not present to present goes:
+	 * ptep->pte_high = h;
+	 * smp_wmb();
+	 * ptep->pte_low = l;
+	 *
+	 * And present to not present goes:
+	 * ptep->pte_low = 0;
+	 * smp_wmb();
+	 * ptep->pte_high = 0;
+	 *
+	 * We must ensure here that the load of pte_low sees l iff pte_high
+	 * sees h. We load pte_high *after* loading pte_low, which ensures we
+	 * don't see an older value of pte_high. *Then* we recheck pte_low,
+	 * which ensures that we haven't picked up a changed pte high. We might
+	 * have got rubbish values from pte_low and pte_high, but we are
+	 * guaranteed that pte_low will not have the present bit set *unless*
+	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+	 * we're safe.
+	 *
+	 * gup_get_pte should not be used or copied outside gup.c without being
+	 * very careful -- it does not atomically load the pte or anything that
+	 * is likely to be useful for you.
+	 */
+	pte_t pte;
+
+retry:
+	pte.pte_low = ptep->pte_low;
+	smp_rmb();
+	pte.pte_high = ptep->pte_high;
+	smp_rmb();
+	if (unlikely(pte.pte_low != ptep->pte_low))
+		goto retry;
+
+	return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t *ptep;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+
+	ptep = pte_offset_map(&pmd, addr);
+	do {
+		pte_t pte = gup_get_pte(ptep);
+		struct page *page;
+
+		if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+			pte_unmap(ptep);
+			return 0;
+		}
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep - 1);
+
+	return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+	VM_BUG_ON(page != compound_head(page));
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t pte = *(pte_t *)&pmd;
+	struct page *head, *page;
+	int refs;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_large(pmd))) {
+			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long end = start + (nr_pages << PAGE_SHIFT);
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, nr_pages*PAGE_SIZE)))
+		goto slow_irqon;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the page and take a ref on it.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+
+slow:
+		local_irq_enable();
+slow_irqon:
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}
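
The low/high/recheck sequence in gup_get_pte() is the subtle heart of this patch. Below is a stand-alone sketch of the same torn-read protocol lifted out of the pte context; the struct, the function name and the header choice are illustrative assumptions rather than anything in the patch, and it presumes a writer that stores the two halves in the opposite order with smp_wmb() between them, exactly as the comment in gup.c describes:

	/*
	 * Hypothetical analogue of gup_get_pte()'s retry protocol; "low"
	 * carries the validity ("present") bit, as pte_low does on PAE.
	 */
	#include <linux/types.h>
	#include <asm/system.h>		/* smp_rmb() on kernels of this era */

	struct split_word {
		volatile u32 low;	/* holds the valid bit */
		volatile u32 high;
	};

	static u64 read_split_word(struct split_word *w)
	{
		u32 low, high;

	retry:
		low = w->low;
		smp_rmb();		/* order: load low before high */
		high = w->high;
		smp_rmb();		/* order: load high before rechecking low */
		if (low != w->low)	/* a writer moved low under us: retry */
			goto retry;

		/*
		 * low and high may still come from different writes, but such
		 * a torn pair can never have the valid bit set in low, so any
		 * "valid" result is a consistent pair -- the same argument
		 * that makes gup_get_pte() safe for present ptes.
		 */
		return ((u64)high << 32) | low;
	}
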
diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h
index f6fa4d841bbc..5f702d1d5218 100644
--- a/include/asm-x86/uaccess.h
+++ b/include/asm-x86/uaccess.h
@@ -451,3 +451,4 @@ extern struct movsl_mask {
 #endif
 
 #endif
+
diff --git a/mm/Kconfig b/mm/Kconfig
index aa799007a11b..efee5d379df4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -77,6 +77,9 @@ config FLAT_NODE_MEM_MAP
 	def_bool y
 	depends on !SPARSEMEM
 
+config HAVE_GET_USER_PAGES_FAST
+	bool
+
 #
 # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
 # to represent different areas of memory. This variable allows
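
For completeness, here is one way a kernel-side caller might use the new entry point. The function below is hypothetical and exists only to illustrate the calling convention: the return value is the number of pages pinned (possibly fewer than requested), and each pinned page must eventually be released with put_page():

	#include <linux/mm.h>
	#include <linux/errno.h>

	/* Hypothetical example caller, not part of this patch. */
	static int pin_touch_release(unsigned long uaddr, int nr_pages,
				     struct page **pages)
	{
		int i, pinned;

		/* Fast path: no mmap_sem taken; falls back internally. */
		pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */,
					     pages);
		if (pinned < 0)
			return pinned;	/* nothing pinned, slow path errored */

		/* ... access pages[0 .. pinned-1] here, e.g. set up DMA ... */

		for (i = 0; i < pinned; i++)
			put_page(pages[i]);	/* drop the references gup took */

		return pinned == nr_pages ? 0 : -EFAULT;
	}

Note the convention implemented at the end of get_user_pages_fast() above: once at least one page has been pinned, a later slow-path failure is folded into a short count rather than an error, so callers must check for partial success.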