1 files changed, 280 insertions, 0 deletions
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
new file mode 100644
index 000000000000..9fdf4d6335e4
--- /dev/null
+++ b/arch/powerpc/mm/gup.c
@@ -0,0 +1,280 @@
+/*
+ * Lockless get_user_pages_fast for powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#undef DEBUG
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+
+/*
+ * gup_pte_range - lockless walk of the PTEs found under one pmd entry.
+ * @pmd:   local copy of the pmd entry that maps [addr, end)
+ * @addr:  first address to look up
+ * @end:   address at which to stop (exclusive); the loop terminates when
+ *         addr becomes exactly equal to it
+ * @write: non-zero if each PTE must also have _PAGE_RW set
+ * @pages: output array; each referenced page is stored at pages[*nr]
+ * @nr:    running count of pages stored so far, advanced once per page
+ *
+ * Returns 1 when every PTE in the range yielded a referenced page.
+ * Returns 0 at the first PTE that cannot be handled here; pages stored
+ * before that point keep their reference and remain counted in *nr.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+                unsigned long end, int write, struct page **pages, int *nr)
+{
+        unsigned long mask, result;
+        pte_t *ptep;
+
+        /*
+         * Bits that must be set: present, user and (for writes) RW.
+         * _PAGE_SPECIAL is part of the mask but not of the expected
+         * result, so it must be clear.
+         */
+        result = _PAGE_PRESENT|_PAGE_USER;
+        if (write)
+                result |= _PAGE_RW;
+        mask = result | _PAGE_SPECIAL;
+
+        ptep = pte_offset_kernel(&pmd, addr);
+        do {
+                /* Work on a snapshot; it is re-validated further down. */
+                pte_t pte = *ptep;
+                struct page *page;
+
+                if ((pte_val(pte) & mask) != result)
+                        return 0;
+                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+                page = pte_page(pte);
+
+                /* Try to take a reference; give up if that fails. */
+                if (!page_cache_get_speculative(page))
+                        return 0;
+
+                /*
+                 * Re-read the PTE now that the reference is held.  If it
+                 * changed in the meantime, drop the reference and bail out.
+                 * The comparison goes through pte_val() so that it also
+                 * builds when pte_t is a structure type.
+                 */
+                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+                        put_page(page);
+                        return 0;
+                }
+                pages[*nr] = page;
+                (*nr)++;
+        } while (ptep++, addr += PAGE_SIZE, addr != end);
+
+        return 1;
+}
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * gup_huge_pte - collect the subpages of one huge page mapped by @ptep.
+ * @ptep:   huge PTE covering *addr
+ * @hstate: huge page state supplying the page size and mask
+ * @addr:   in/out cursor; advanced by PAGE_SIZE per collected subpage
+ * @end:    end of the request (exclusive); clamped below to the end of
+ *          this huge page
+ * @write:  non-zero if the PTE must also have _PAGE_RW set
+ * @pages:  output array; subpages are stored starting at pages[*nr]
+ * @nr:     running count of pages stored so far
+ *
+ * Returns 1 on success, with *nr advanced by the number of subpages
+ * collected.  Returns 0 on failure, with *nr restored to its value on
+ * entry.  Note that *addr may already have been advanced when 0 is
+ * returned.
+ */
+static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
+                                 unsigned long *addr, unsigned long end,
+                                 int write, struct page **pages, int *nr)
+{
+        unsigned long mask;
+        unsigned long pte_end;
+        struct page *head, *page;
+        pte_t pte;
+        int refs;
+
+        /* Do not walk past the huge page that contains *addr. */
+        pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
+        if (pte_end < end)
+                end = pte_end;
+
+        pte = *ptep;
+        mask = _PAGE_PRESENT|_PAGE_USER;
+        if (write)
+                mask |= _PAGE_RW;
+        if ((pte_val(pte) & mask) != mask)
+                return 0;
+        /* hugepages are never "special" */
+        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+        /* Record every subpage first; references are taken in one go. */
+        refs = 0;
+        head = pte_page(pte);
+        page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
+        do {
+                VM_BUG_ON(compound_head(page) != head);
+                pages[*nr] = page;
+                (*nr)++;
+                page++;
+                refs++;
+        } while (*addr += PAGE_SIZE, *addr != end);
+
+        if (!page_cache_add_speculative(head, refs)) {
+                *nr -= refs;
+                return 0;
+        }
+
+        /*
+         * The PTE changed after it was sampled: undo exactly what this
+         * call did.  The references were taken on the head page, so they
+         * are dropped on the head page, and only the 'refs' entries added
+         * above are removed from the count -- entries collected by
+         * earlier calls must stay.  Report failure to the caller.
+         */
+        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+                /* Could be optimized better */
+                *nr -= refs;
+                while (refs--)
+                        put_page(head);
+                return 0;
+        }
+
+        return 1;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+/*
+ * Walk the pmd entries under @pud that cover [addr, end) and hand each
+ * populated one to gup_pte_range().
+ *
+ * Returns 1 if the whole range was processed, 0 as soon as an entry is
+ * empty or the PTE level walk reports failure.
+ */
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+                int write, struct page **pages, int *nr)
+{
+        pmd_t *entry = pmd_offset(&pud, addr);
+        unsigned long boundary;
+
+        for (;;) {
+                /* Operate on a local copy of the entry. */
+                pmd_t pmd = *entry;
+
+                boundary = pmd_addr_end(addr, end);
+                if (pmd_none(pmd) ||
+                    !gup_pte_range(pmd, addr, boundary, write, pages, nr))
+                        return 0;
+
+                /* Stop once the chunk just handled reached the end. */
+                if (boundary == end)
+                        break;
+                entry++;
+                addr = boundary;
+        }
+
+        return 1;
+}
+/*
+ * Walk the pud entries under @pgd that cover [addr, end) and hand each
+ * populated one to gup_pmd_range().
+ *
+ * Returns 1 if the whole range was processed, 0 as soon as an entry is
+ * empty or the pmd level walk reports failure.
+ */
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+                int write, struct page **pages, int *nr)
+{
+        pud_t *slot = pud_offset(&pgd, addr);
+        unsigned long stop;
+
+        for (;;) {
+                /* Operate on a local copy of the entry. */
+                pud_t pud = *slot;
+
+                stop = pud_addr_end(addr, end);
+                if (pud_none(pud))
+                        return 0;
+                if (!gup_pmd_range(pud, addr, stop, write, pages, nr))
+                        return 0;
+
+                /* Stop once the chunk just handled reached the end. */
+                if (stop == end)
+                        break;
+                slot++;
+                addr = stop;
+        }
+
+        return 1;
+}
+/*
+ * get_user_pages_fast - pin user pages, avoiding mmap_sem when possible.
+ * @start:    starting user address (rounded down to a page boundary here)
+ * @nr_pages: number of pages requested
+ * @write:    non-zero if the pages must be writable
+ * @pages:    output array receiving the referenced pages
+ *
+ * A lockless page table walk is attempted first with interrupts disabled.
+ * Whatever that walk could not handle is handed to get_user_pages() under
+ * mmap_sem.
+ *
+ * Returns the number of pages stored in @pages.  If the lockless walk
+ * stored nothing, the return value of get_user_pages() is passed through
+ * unchanged.  An empty or negative request returns 0.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+                        struct page **pages)
+{
+        struct mm_struct *mm = current->mm;
+        unsigned long addr, len, end;
+        unsigned long next;
+        pgd_t *pgdp;
+        int psize, nr = 0;
+        unsigned int shift;
+        int ret;
+
+        pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
+
+        /*
+         * The walkers below are do/while loops that always process at
+         * least one entry and stop only when the cursor equals 'end'.
+         * An empty request must therefore never reach them.
+         */
+        if (unlikely(nr_pages <= 0))
+                return 0;
+
+        start &= PAGE_MASK;
+        addr = start;
+        len = (unsigned long) nr_pages << PAGE_SHIFT;
+        end = start + len;
+
+        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+                                        start, len)))
+                goto slow_irqon;
+
+        pr_debug("  aligned: %lx .. %lx\n", start, end);
+
+#ifdef CONFIG_HUGETLB_PAGE
+        /* We bail out on slice boundary crossing when hugetlb is
+         * enabled in order to not have to deal with two different
+         * page table formats
+         */
+        if (addr < SLICE_LOW_TOP) {
+                if (end > SLICE_LOW_TOP)
+                        goto slow_irqon;
+
+                if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
+                             GET_LOW_SLICE_INDEX(end - 1)))
+                        goto slow_irqon;
+        } else {
+                if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
+                             GET_HIGH_SLICE_INDEX(end - 1)))
+                        goto slow_irqon;
+        }
+#endif /* CONFIG_HUGETLB_PAGE */
+
+        /*
+         * XXX: batch / limit 'nr', to avoid large irq off latency
+         * needs some instrumenting to determine the common sizes used by
+         * important workloads (eg. DB2), and whether limiting the batch size
+         * will decrease performance.
+         *
+         * It seems like we're in the clear for the moment. Direct-IO is
+         * the main guy that batches up lots of get_user_pages, and even
+         * they are limited to 64-at-a-time which is not so many.
+         */
+        /*
+         * This doesn't prevent pagetable teardown, but does prevent
+         * the pagetables from being freed on powerpc.
+         *
+         * So long as we atomically load page table pointers versus teardown,
+         * we can follow the address down to the page and take a ref on it.
+         */
+        local_irq_disable();
+
+        psize = get_slice_psize(mm, addr);
+        shift = mmu_psize_defs[psize].shift;
+
+#ifdef CONFIG_HUGETLB_PAGE
+        if (unlikely(mmu_huge_psizes[psize])) {
+                pte_t *ptep;
+                unsigned long a = addr;
+                unsigned long sz = ((1UL) << shift);
+                struct hstate *hstate = size_to_hstate(sz);
+
+                BUG_ON(!hstate);
+                /*
+                 * XXX: could be optimized to avoid hstate
+                 * lookup entirely (just use shift)
+                 */
+
+                /* gup_huge_pte() advances 'a' through its pointer argument. */
+                do {
+                        VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
+                        ptep = huge_pte_offset(mm, a);
+                        pr_debug(" %016lx: huge ptep %p\n", a, ptep);
+                        if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
+                                                   &nr))
+                                goto slow;
+                } while (a != end);
+        } else
+#endif /* CONFIG_HUGETLB_PAGE */
+        {
+                pgdp = pgd_offset(mm, addr);
+                do {
+                        /* Work on a local copy of the top level entry. */
+                        pgd_t pgd = *pgdp;
+
+                        VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
+                        /* Go through pgd_val(): pgd_t may be a structure. */
+                        pr_debug("  %016lx: normal pgd %p\n", addr,
+                                 (void *)pgd_val(pgd));
+                        next = pgd_addr_end(addr, end);
+                        if (pgd_none(pgd))
+                                goto slow;
+                        if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+                                goto slow;
+                } while (pgdp++, addr = next, addr != end);
+        }
+        local_irq_enable();
+
+        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+        return nr;
+
+slow:
+        local_irq_enable();
+slow_irqon:
+        pr_debug("  slow path ! nr = %d\n", nr);
+
+        /*
+         * Try to get the remaining pages with get_user_pages.  Widen 'nr'
+         * before shifting so the byte offset is not computed in int.
+         */
+        start += (unsigned long) nr << PAGE_SHIFT;
+        pages += nr;
+
+        down_read(&mm->mmap_sem);
+        ret = get_user_pages(current, mm, start,
+                (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+        up_read(&mm->mmap_sem);
+
+        /*
+         * Have to be a bit careful with return values: pages already
+         * stored by the lockless walk must be reported even if the slow
+         * path failed.
+         */
+        if (nr > 0) {
+                if (ret < 0)
+                        ret = nr;
+                else
+                        ret += nr;
+        }
+
+        return ret;
+}

diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c new file mode 100644 index 000000000000..9fdf4d6335e4 --- /dev/null +++ b/arch/powerpc/mm/gup.c
@@ -0,0 +1,280 @@
	1	/*
	2	* Lockless get_user_pages_fast for powerpc
	3	*
	4	* Copyright (C) 2008 Nick Piggin
	5	* Copyright (C) 2008 Novell Inc.
	6	*/
	7	#undef DEBUG
	8
	9	#include <linux/sched.h>
	10	#include <linux/mm.h>
	11	#include <linux/hugetlb.h>
	12	#include <linux/vmstat.h>
	13	#include <linux/pagemap.h>
	14	#include <linux/rwsem.h>
	15	#include <asm/pgtable.h>
	16
	17	/*
	18	* The performance critical leaf functions are made noinline otherwise gcc
	19	* inlines everything into a single function which results in too much
	20	* register pressure.
	21	*/
	22	static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
	23	unsigned long end, int write, struct page **pages, int *nr)
	24	{
	25	unsigned long mask, result;
	26	pte_t *ptep;
	27
	28	result = _PAGE_PRESENT|_PAGE_USER;
	29	if (write)
	30	result |= _PAGE_RW;
	31	mask = result | _PAGE_SPECIAL;
	32
	33	ptep = pte_offset_kernel(&pmd, addr);
	34	do {
	35	pte_t pte = *ptep;
	36	struct page *page;
	37
	38	if ((pte_val(pte) & mask) != result)
	39	return 0;
	40	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
	41	page = pte_page(pte);
	42	if (!page_cache_get_speculative(page))
	43	return 0;
	44	if (unlikely(pte != *ptep)) {
	45	put_page(page);
	46	return 0;
	47	}
	48	pages[*nr] = page;
	49	(*nr)++;
	50
	51	} while (ptep++, addr += PAGE_SIZE, addr != end);
	52
	53	return 1;
	54	}
	55
	56	#ifdef CONFIG_HUGETLB_PAGE
	57	static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
	58	unsigned long *addr, unsigned long end,
	59	int write, struct page **pages, int *nr)
	60	{
	61	unsigned long mask;
	62	unsigned long pte_end;
	63	struct page *head, *page;
	64	pte_t pte;
	65	int refs;
	66
	67	pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
	68	if (pte_end < end)
	69	end = pte_end;
	70
	71	pte = *ptep;
	72	mask = _PAGE_PRESENT|_PAGE_USER;
	73	if (write)
	74	mask |= _PAGE_RW;
	75	if ((pte_val(pte) & mask) != mask)
	76	return 0;
	77	/* hugepages are never "special" */
	78	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
	79
	80	refs = 0;
	81	head = pte_page(pte);
	82	page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
	83	do {
	84	VM_BUG_ON(compound_head(page) != head);
	85	pages[*nr] = page;
	86	(*nr)++;
	87	page++;
	88	refs++;
	89	} while (*addr += PAGE_SIZE, *addr != end);
	90
	91	if (!page_cache_add_speculative(head, refs)) {
	92	*nr -= refs;
	93	return 0;
	94	}
	95	if (unlikely(pte != *ptep)) {
	96	/* Could be optimized better */
	97	while (*nr) {
	98	put_page(page);
	99	(*nr)--;
	100	}
	101	}
	102
	103	return 1;
	104	}
	105	#endif /* CONFIG_HUGETLB_PAGE */
	106
	107	static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
	108	int write, struct page **pages, int *nr)
	109	{
	110	unsigned long next;
	111	pmd_t *pmdp;
	112
	113	pmdp = pmd_offset(&pud, addr);
	114	do {
	115	pmd_t pmd = *pmdp;
	116
	117	next = pmd_addr_end(addr, end);
	118	if (pmd_none(pmd))
	119	return 0;
	120	if (!gup_pte_range(pmd, addr, next, write, pages, nr))
	121	return 0;
	122	} while (pmdp++, addr = next, addr != end);
	123
	124	return 1;
	125	}
	126
	127	static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
	128	int write, struct page **pages, int *nr)
	129	{
	130	unsigned long next;
	131	pud_t *pudp;
	132
	133	pudp = pud_offset(&pgd, addr);
	134	do {
	135	pud_t pud = *pudp;
	136
	137	next = pud_addr_end(addr, end);
	138	if (pud_none(pud))
	139	return 0;
	140	if (!gup_pmd_range(pud, addr, next, write, pages, nr))
	141	return 0;
	142	} while (pudp++, addr = next, addr != end);
	143
	144	return 1;
	145	}
	146
	147	int get_user_pages_fast(unsigned long start, int nr_pages, int write,
	148	struct page **pages)
	149	{
	150	struct mm_struct *mm = current->mm;
	151	unsigned long addr, len, end;
	152	unsigned long next;
	153	pgd_t *pgdp;
	154	int psize, nr = 0;
	155	unsigned int shift;
	156
	157	pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
	158
	159	start &= PAGE_MASK;
	160	addr = start;
	161	len = (unsigned long) nr_pages << PAGE_SHIFT;
	162	end = start + len;
	163
	164	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
	165	start, len)))
	166	goto slow_irqon;
	167
	168	pr_debug(" aligned: %lx .. %lx\n", start, end);
	169
	170	#ifdef CONFIG_HUGETLB_PAGE
	171	/* We bail out on slice boundary crossing when hugetlb is
	172	* enabled in order to not have to deal with two different
	173	* page table formats
	174	*/
	175	if (addr < SLICE_LOW_TOP) {
	176	if (end > SLICE_LOW_TOP)
	177	goto slow_irqon;
	178
	179	if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
	180	GET_LOW_SLICE_INDEX(end - 1)))
	181	goto slow_irqon;
	182	} else {
	183	if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
	184	GET_HIGH_SLICE_INDEX(end - 1)))
	185	goto slow_irqon;
	186	}
	187	#endif /* CONFIG_HUGETLB_PAGE */
	188
	189	/*
	190	* XXX: batch / limit 'nr', to avoid large irq off latency
	191	* needs some instrumenting to determine the common sizes used by
	192	* important workloads (eg. DB2), and whether limiting the batch size
	193	* will decrease performance.
	194	*
	195	* It seems like we're in the clear for the moment. Direct-IO is
	196	* the main guy that batches up lots of get_user_pages, and even
	197	* they are limited to 64-at-a-time which is not so many.
	198	*/
	199	/*
	200	* This doesn't prevent pagetable teardown, but does prevent
	201	* the pagetables from being freed on powerpc.
	202	*
	203	* So long as we atomically load page table pointers versus teardown,
	204	* we can follow the address down to the the page and take a ref on it.
	205	*/
	206	local_irq_disable();
	207
	208	psize = get_slice_psize(mm, addr);
	209	shift = mmu_psize_defs[psize].shift;
	210
	211	#ifdef CONFIG_HUGETLB_PAGE
	212	if (unlikely(mmu_huge_psizes[psize])) {
	213	pte_t *ptep;
	214	unsigned long a = addr;
	215	unsigned long sz = ((1UL) << shift);
	216	struct hstate *hstate = size_to_hstate(sz);
	217
	218	BUG_ON(!hstate);
	219	/*
	220	* XXX: could be optimized to avoid hstate
	221	* lookup entirely (just use shift)
	222	*/
	223
	224	do {
	225	VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
	226	ptep = huge_pte_offset(mm, a);
	227	pr_debug(" %016lx: huge ptep %p\n", a, ptep);
	228	if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
	229	&nr))
	230	goto slow;
	231	} while (a != end);
	232	} else
	233	#endif /* CONFIG_HUGETLB_PAGE */
	234	{
	235	pgdp = pgd_offset(mm, addr);
	236	do {
	237	pgd_t pgd = *pgdp;
	238
	239	VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
	240	pr_debug(" %016lx: normal pgd %p\n", addr, (void *)pgd);
	241	next = pgd_addr_end(addr, end);
	242	if (pgd_none(pgd))
	243	goto slow;
	244	if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
	245	goto slow;
	246	} while (pgdp++, addr = next, addr != end);
	247	}
	248	local_irq_enable();
	249
	250	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
	251	return nr;
	252
	253	{
	254	int ret;
	255
	256	slow:
	257	local_irq_enable();
	258	slow_irqon:
	259	pr_debug(" slow path ! nr = %d\n", nr);
	260
	261	/* Try to get the remaining pages with get_user_pages */
	262	start += nr << PAGE_SHIFT;
	263	pages += nr;
	264
	265	down_read(&mm->mmap_sem);
	266	ret = get_user_pages(current, mm, start,
	267	(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
	268	up_read(&mm->mmap_sem);
	269
	270	/* Have to be a bit careful with return values */
	271	if (nr > 0) {
	272	if (ret < 0)
	273	ret = nr;
	274	else
	275	ret += nr;
	276	}
	277
	278	return ret;
	279	}
	280	}