diff options
author | Hillf Danton <dhillf@gmail.com> | 2011-11-22 09:38:03 -0500 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 2011-12-07 17:03:45 -0500 |
commit | b1c10bea620f79109b5cc9935267bea4f6f29ac6 (patch) | |
tree | 2b529b61862b6f5b3834a174c246a18b0255f28a /arch/mips/mm | |
parent | 5639bc4a64786c94eba3d2ba6a4ff4b290da1fb1 (diff) |
MIPS: Add fast get_user_pages
get_user_pages_fast (gup) is used in a few places, for example by futex.
This work is derived from the x86 version; the pte and pmd operations are
adapted to the MIPS definitions in a straightforward manner.
[ralf@linux-mips.org: Fixed up reject in arch/mips/mm/Makefile due to
whitespace formatting differences. Fixed build error in gup.c due to
conflicting changes elsewhere in the kernel.]
Signed-off-by: Hillf Danton <dhillf@gmail.com>
Cc: David Daney <david.daney@cavium.com>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/2859/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Diffstat (limited to 'arch/mips/mm')
-rw-r--r-- | arch/mips/mm/Makefile | 4 | ||||
-rw-r--r-- | arch/mips/mm/gup.c | 315 |
2 files changed, 317 insertions, 2 deletions
diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile index 4d8c1623eee2..3ca2a065cf76 100644 --- a/arch/mips/mm/Makefile +++ b/arch/mips/mm/Makefile | |||
@@ -3,8 +3,8 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y += cache.o dma-default.o extable.o fault.o \ | 5 | obj-y += cache.o dma-default.o extable.o fault.o \ |
6 | init.o mmap.o tlbex.o tlbex-fault.o uasm.o \ | 6 | gup.o init.o mmap.o page.o tlbex.o \ |
7 | page.o | 7 | tlbex-fault.o uasm.o |
8 | 8 | ||
9 | obj-$(CONFIG_32BIT) += ioremap.o pgtable-32.o | 9 | obj-$(CONFIG_32BIT) += ioremap.o pgtable-32.o |
10 | obj-$(CONFIG_64BIT) += pgtable-64.o | 10 | obj-$(CONFIG_64BIT) += pgtable-64.o |
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c new file mode 100644 index 000000000000..33aadbcf170b --- /dev/null +++ b/arch/mips/mm/gup.c | |||
@@ -0,0 +1,315 @@ | |||
1 | /* | ||
2 | * Lockless get_user_pages_fast for MIPS | ||
3 | * | ||
4 | * Copyright (C) 2008 Nick Piggin | ||
5 | * Copyright (C) 2008 Novell Inc. | ||
6 | * Copyright (C) 2011 Ralf Baechle | ||
7 | */ | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/vmstat.h> | ||
11 | #include <linux/highmem.h> | ||
12 | #include <linux/swap.h> | ||
13 | #include <linux/hugetlb.h> | ||
14 | |||
15 | #include <asm/pgtable.h> | ||
16 | |||
17 | static inline pte_t gup_get_pte(pte_t *ptep) | ||
18 | { | ||
19 | #if defined(CONFIG_64BIT_PHYS_ADDR) && defined(CONFIG_CPU_MIPS32) | ||
20 | pte_t pte; | ||
21 | |||
22 | retry: | ||
23 | pte.pte_low = ptep->pte_low; | ||
24 | smp_rmb(); | ||
25 | pte.pte_high = ptep->pte_high; | ||
26 | smp_rmb(); | ||
27 | if (unlikely(pte.pte_low != ptep->pte_low)) | ||
28 | goto retry; | ||
29 | |||
30 | return pte; | ||
31 | #else | ||
32 | return ACCESS_ONCE(*ptep); | ||
33 | #endif | ||
34 | } | ||
35 | |||
36 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | ||
37 | int write, struct page **pages, int *nr) | ||
38 | { | ||
39 | pte_t *ptep = pte_offset_map(&pmd, addr); | ||
40 | do { | ||
41 | pte_t pte = gup_get_pte(ptep); | ||
42 | struct page *page; | ||
43 | |||
44 | if (!pte_present(pte) || | ||
45 | pte_special(pte) || (write && !pte_write(pte))) { | ||
46 | pte_unmap(ptep); | ||
47 | return 0; | ||
48 | } | ||
49 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
50 | page = pte_page(pte); | ||
51 | get_page(page); | ||
52 | SetPageReferenced(page); | ||
53 | pages[*nr] = page; | ||
54 | (*nr)++; | ||
55 | |||
56 | } while (ptep++, addr += PAGE_SIZE, addr != end); | ||
57 | |||
58 | pte_unmap(ptep - 1); | ||
59 | return 1; | ||
60 | } | ||
61 | |||
62 | static inline void get_head_page_multiple(struct page *page, int nr) | ||
63 | { | ||
64 | VM_BUG_ON(page != compound_head(page)); | ||
65 | VM_BUG_ON(page_count(page) == 0); | ||
66 | atomic_add(nr, &page->_count); | ||
67 | SetPageReferenced(page); | ||
68 | } | ||
69 | |||
70 | static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, | ||
71 | int write, struct page **pages, int *nr) | ||
72 | { | ||
73 | pte_t pte = *(pte_t *)&pmd; | ||
74 | struct page *head, *page; | ||
75 | int refs; | ||
76 | |||
77 | if (write && !pte_write(pte)) | ||
78 | return 0; | ||
79 | /* hugepages are never "special" */ | ||
80 | VM_BUG_ON(pte_special(pte)); | ||
81 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
82 | |||
83 | refs = 0; | ||
84 | head = pte_page(pte); | ||
85 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
86 | do { | ||
87 | VM_BUG_ON(compound_head(page) != head); | ||
88 | pages[*nr] = page; | ||
89 | if (PageTail(page)) | ||
90 | get_huge_page_tail(page); | ||
91 | (*nr)++; | ||
92 | page++; | ||
93 | refs++; | ||
94 | } while (addr += PAGE_SIZE, addr != end); | ||
95 | |||
96 | get_head_page_multiple(head, refs); | ||
97 | return 1; | ||
98 | } | ||
99 | |||
100 | static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | ||
101 | int write, struct page **pages, int *nr) | ||
102 | { | ||
103 | unsigned long next; | ||
104 | pmd_t *pmdp; | ||
105 | |||
106 | pmdp = pmd_offset(&pud, addr); | ||
107 | do { | ||
108 | pmd_t pmd = *pmdp; | ||
109 | |||
110 | next = pmd_addr_end(addr, end); | ||
111 | /* | ||
112 | * The pmd_trans_splitting() check below explains why | ||
113 | * pmdp_splitting_flush has to flush the tlb, to stop | ||
114 | * this gup-fast code from running while we set the | ||
115 | * splitting bit in the pmd. Returning zero will take | ||
116 | * the slow path that will call wait_split_huge_page() | ||
117 | * if the pmd is still in splitting state. gup-fast | ||
118 | * can't because it has irq disabled and | ||
119 | * wait_split_huge_page() would never return as the | ||
120 | * tlb flush IPI wouldn't run. | ||
121 | */ | ||
122 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
123 | return 0; | ||
124 | if (unlikely(pmd_huge(pmd))) { | ||
125 | if (!gup_huge_pmd(pmd, addr, next, write, pages,nr)) | ||
126 | return 0; | ||
127 | } else { | ||
128 | if (!gup_pte_range(pmd, addr, next, write, pages,nr)) | ||
129 | return 0; | ||
130 | } | ||
131 | } while (pmdp++, addr = next, addr != end); | ||
132 | |||
133 | return 1; | ||
134 | } | ||
135 | |||
136 | static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, | ||
137 | int write, struct page **pages, int *nr) | ||
138 | { | ||
139 | pte_t pte = *(pte_t *)&pud; | ||
140 | struct page *head, *page; | ||
141 | int refs; | ||
142 | |||
143 | if (write && !pte_write(pte)) | ||
144 | return 0; | ||
145 | /* hugepages are never "special" */ | ||
146 | VM_BUG_ON(pte_special(pte)); | ||
147 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
148 | |||
149 | refs = 0; | ||
150 | head = pte_page(pte); | ||
151 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | ||
152 | do { | ||
153 | VM_BUG_ON(compound_head(page) != head); | ||
154 | pages[*nr] = page; | ||
155 | (*nr)++; | ||
156 | page++; | ||
157 | refs++; | ||
158 | } while (addr += PAGE_SIZE, addr != end); | ||
159 | |||
160 | get_head_page_multiple(head, refs); | ||
161 | return 1; | ||
162 | } | ||
163 | |||
164 | static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, | ||
165 | int write, struct page **pages, int *nr) | ||
166 | { | ||
167 | unsigned long next; | ||
168 | pud_t *pudp; | ||
169 | |||
170 | pudp = pud_offset(&pgd, addr); | ||
171 | do { | ||
172 | pud_t pud = *pudp; | ||
173 | |||
174 | next = pud_addr_end(addr, end); | ||
175 | if (pud_none(pud)) | ||
176 | return 0; | ||
177 | if (unlikely(pud_huge(pud))) { | ||
178 | if (!gup_huge_pud(pud, addr, next, write, pages,nr)) | ||
179 | return 0; | ||
180 | } else { | ||
181 | if (!gup_pmd_range(pud, addr, next, write, pages,nr)) | ||
182 | return 0; | ||
183 | } | ||
184 | } while (pudp++, addr = next, addr != end); | ||
185 | |||
186 | return 1; | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * Like get_user_pages_fast() except its IRQ-safe in that it won't fall | ||
191 | * back to the regular GUP. | ||
192 | */ | ||
193 | int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
194 | struct page **pages) | ||
195 | { | ||
196 | struct mm_struct *mm = current->mm; | ||
197 | unsigned long addr, len, end; | ||
198 | unsigned long next; | ||
199 | unsigned long flags; | ||
200 | pgd_t *pgdp; | ||
201 | int nr = 0; | ||
202 | |||
203 | start &= PAGE_MASK; | ||
204 | addr = start; | ||
205 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
206 | end = start + len; | ||
207 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, | ||
208 | (void __user *)start, len))) | ||
209 | return 0; | ||
210 | |||
211 | /* | ||
212 | * XXX: batch / limit 'nr', to avoid large irq off latency | ||
213 | * needs some instrumenting to determine the common sizes used by | ||
214 | * important workloads (eg. DB2), and whether limiting the batch | ||
215 | * size will decrease performance. | ||
216 | * | ||
217 | * It seems like we're in the clear for the moment. Direct-IO is | ||
218 | * the main guy that batches up lots of get_user_pages, and even | ||
219 | * they are limited to 64-at-a-time which is not so many. | ||
220 | */ | ||
221 | /* | ||
222 | * This doesn't prevent pagetable teardown, but does prevent | ||
223 | * the pagetables and pages from being freed. | ||
224 | * | ||
225 | * So long as we atomically load page table pointers versus teardown, | ||
226 | * we can follow the address down to the page and take a ref on it. | ||
227 | */ | ||
228 | local_irq_save(flags); | ||
229 | pgdp = pgd_offset(mm, addr); | ||
230 | do { | ||
231 | pgd_t pgd = *pgdp; | ||
232 | |||
233 | next = pgd_addr_end(addr, end); | ||
234 | if (pgd_none(pgd)) | ||
235 | break; | ||
236 | if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) | ||
237 | break; | ||
238 | } while (pgdp++, addr = next, addr != end); | ||
239 | local_irq_restore(flags); | ||
240 | |||
241 | return nr; | ||
242 | } | ||
243 | |||
244 | /** | ||
245 | * get_user_pages_fast() - pin user pages in memory | ||
246 | * @start: starting user address | ||
247 | * @nr_pages: number of pages from start to pin | ||
248 | * @write: whether pages will be written to | ||
249 | * @pages: array that receives pointers to the pages pinned. | ||
250 | * Should be at least nr_pages long. | ||
251 | * | ||
252 | * Attempt to pin user pages in memory without taking mm->mmap_sem. | ||
253 | * If not successful, it will fall back to taking the lock and | ||
254 | * calling get_user_pages(). | ||
255 | * | ||
256 | * Returns number of pages pinned. This may be fewer than the number | ||
257 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
258 | * were pinned, returns -errno. | ||
259 | */ | ||
260 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
261 | struct page **pages) | ||
262 | { | ||
263 | struct mm_struct *mm = current->mm; | ||
264 | unsigned long addr, len, end; | ||
265 | unsigned long next; | ||
266 | pgd_t *pgdp; | ||
267 | int ret, nr = 0; | ||
268 | |||
269 | start &= PAGE_MASK; | ||
270 | addr = start; | ||
271 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
272 | |||
273 | end = start + len; | ||
274 | if (end < start) | ||
275 | goto slow_irqon; | ||
276 | |||
277 | /* XXX: batch / limit 'nr' */ | ||
278 | local_irq_disable(); | ||
279 | pgdp = pgd_offset(mm, addr); | ||
280 | do { | ||
281 | pgd_t pgd = *pgdp; | ||
282 | |||
283 | next = pgd_addr_end(addr, end); | ||
284 | if (pgd_none(pgd)) | ||
285 | goto slow; | ||
286 | if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) | ||
287 | goto slow; | ||
288 | } while (pgdp++, addr = next, addr != end); | ||
289 | local_irq_enable(); | ||
290 | |||
291 | VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); | ||
292 | return nr; | ||
293 | slow: | ||
294 | local_irq_enable(); | ||
295 | |||
296 | slow_irqon: | ||
297 | /* Try to get the remaining pages with get_user_pages */ | ||
298 | start += nr << PAGE_SHIFT; | ||
299 | pages += nr; | ||
300 | |||
301 | down_read(&mm->mmap_sem); | ||
302 | ret = get_user_pages(current, mm, start, | ||
303 | (end - start) >> PAGE_SHIFT, | ||
304 | write, 0, pages, NULL); | ||
305 | up_read(&mm->mmap_sem); | ||
306 | |||
307 | /* Have to be a bit careful with return values */ | ||
308 | if (nr > 0) { | ||
309 | if (ret < 0) | ||
310 | ret = nr; | ||
311 | else | ||
312 | ret += nr; | ||
313 | } | ||
314 | return ret; | ||
315 | } | ||