diff options
author | Nick Piggin <npiggin@suse.de> | 2008-07-30 01:23:13 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2008-07-30 01:26:54 -0400 |
commit | ce0ad7f0952581ba75ab6aee55bb1ed9bb22cf4f (patch) | |
tree | bf2a8845a031cb685219db2ddcb3d296b4a9ffab /arch/powerpc/mm | |
parent | 7d2a175b9bf6e9422bebe95130a3c79a25ff4602 (diff) |
powerpc/mm: Lockless get_user_pages_fast() for 64-bit (v3)
Implement lockless get_user_pages_fast for 64-bit powerpc.
Page table existence is guaranteed with RCU, and speculative page references
are used to take a reference to the pages without having a prior existence
guarantee on them.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r-- | arch/powerpc/mm/Makefile | 3 | ||||
-rw-r--r-- | arch/powerpc/mm/gup.c | 280 |
2 files changed, 282 insertions, 1 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 1c00e0196f6c..e7392b45a5ef 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile | |||
@@ -12,7 +12,8 @@ obj-y := fault.o mem.o \ | |||
12 | mmu_context_$(CONFIG_WORD_SIZE).o | 12 | mmu_context_$(CONFIG_WORD_SIZE).o |
13 | hash-$(CONFIG_PPC_NATIVE) := hash_native_64.o | 13 | hash-$(CONFIG_PPC_NATIVE) := hash_native_64.o |
14 | obj-$(CONFIG_PPC64) += hash_utils_64.o \ | 14 | obj-$(CONFIG_PPC64) += hash_utils_64.o \ |
15 | slb_low.o slb.o stab.o mmap.o $(hash-y) | 15 | slb_low.o slb.o stab.o \ |
16 | gup.o mmap.o $(hash-y) | ||
16 | obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o | 17 | obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o |
17 | obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ | 18 | obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ |
18 | tlb_$(CONFIG_WORD_SIZE).o | 19 | tlb_$(CONFIG_WORD_SIZE).o |
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c new file mode 100644 index 000000000000..9fdf4d6335e4 --- /dev/null +++ b/arch/powerpc/mm/gup.c | |||
@@ -0,0 +1,280 @@ | |||
1 | /* | ||
2 | * Lockless get_user_pages_fast for powerpc | ||
3 | * | ||
4 | * Copyright (C) 2008 Nick Piggin | ||
5 | * Copyright (C) 2008 Novell Inc. | ||
6 | */ | ||
7 | #undef DEBUG | ||
8 | |||
9 | #include <linux/sched.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/hugetlb.h> | ||
12 | #include <linux/vmstat.h> | ||
13 | #include <linux/pagemap.h> | ||
14 | #include <linux/rwsem.h> | ||
15 | #include <asm/pgtable.h> | ||
16 | |||
17 | /* | ||
18 | * The performance critical leaf functions are made noinline otherwise gcc | ||
19 | * inlines everything into a single function which results in too much | ||
20 | * register pressure. | ||
21 | */ | ||
22 | static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | ||
23 | unsigned long end, int write, struct page **pages, int *nr) | ||
24 | { | ||
25 | unsigned long mask, result; | ||
26 | pte_t *ptep; | ||
27 | |||
28 | result = _PAGE_PRESENT|_PAGE_USER; | ||
29 | if (write) | ||
30 | result |= _PAGE_RW; | ||
31 | mask = result | _PAGE_SPECIAL; | ||
32 | |||
33 | ptep = pte_offset_kernel(&pmd, addr); | ||
34 | do { | ||
35 | pte_t pte = *ptep; | ||
36 | struct page *page; | ||
37 | |||
38 | if ((pte_val(pte) & mask) != result) | ||
39 | return 0; | ||
40 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
41 | page = pte_page(pte); | ||
42 | if (!page_cache_get_speculative(page)) | ||
43 | return 0; | ||
44 | if (unlikely(pte != *ptep)) { | ||
45 | put_page(page); | ||
46 | return 0; | ||
47 | } | ||
48 | pages[*nr] = page; | ||
49 | (*nr)++; | ||
50 | |||
51 | } while (ptep++, addr += PAGE_SIZE, addr != end); | ||
52 | |||
53 | return 1; | ||
54 | } | ||
55 | |||
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Pin the base pages of one huge page, starting at *addr and stopping at
 * @end or at the end of the huge page, whichever comes first.
 *
 * Each base page is stored at pages[*nr] (incrementing *nr) and *addr is
 * advanced by PAGE_SIZE per page.  All references are taken on the head
 * page in a single page_cache_add_speculative() call.
 *
 * @write: non-zero if the caller needs write access (_PAGE_RW is then
 *         required in the huge PTE).
 *
 * Returns 1 on success.  Returns 0 if the PTE does not allow the access,
 * if the speculative reference could not be taken, or if the PTE changed
 * while we were working; in every failure case *nr is left at the value
 * it had on entry and no reference taken by this call is kept.
 */
static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
				 unsigned long *addr, unsigned long end,
				 int write, struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	/* Never walk past the end of the huge page this PTE maps. */
	pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
	if (pte_end < end)
		end = pte_end;

	pte = *ptep;
	mask = _PAGE_PRESENT|_PAGE_USER;
	if (write)
		mask |= _PAGE_RW;
	if ((pte_val(pte) & mask) != mask)
		return 0;
	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);
	/* First base page inside the huge page that *addr falls on. */
	page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (*addr += PAGE_SIZE, *addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}
	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/*
		 * The PTE changed under us.  Undo exactly what this call
		 * did: the references were taken on the head page, so they
		 * must be dropped on the head page ('page' now points past
		 * the last recorded subpage), and only 'refs' entries of
		 * *nr belong to us.  Report failure to the caller.
		 */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}
#endif /* CONFIG_HUGETLB_PAGE */
106 | |||
107 | static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | ||
108 | int write, struct page **pages, int *nr) | ||
109 | { | ||
110 | unsigned long next; | ||
111 | pmd_t *pmdp; | ||
112 | |||
113 | pmdp = pmd_offset(&pud, addr); | ||
114 | do { | ||
115 | pmd_t pmd = *pmdp; | ||
116 | |||
117 | next = pmd_addr_end(addr, end); | ||
118 | if (pmd_none(pmd)) | ||
119 | return 0; | ||
120 | if (!gup_pte_range(pmd, addr, next, write, pages, nr)) | ||
121 | return 0; | ||
122 | } while (pmdp++, addr = next, addr != end); | ||
123 | |||
124 | return 1; | ||
125 | } | ||
126 | |||
127 | static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, | ||
128 | int write, struct page **pages, int *nr) | ||
129 | { | ||
130 | unsigned long next; | ||
131 | pud_t *pudp; | ||
132 | |||
133 | pudp = pud_offset(&pgd, addr); | ||
134 | do { | ||
135 | pud_t pud = *pudp; | ||
136 | |||
137 | next = pud_addr_end(addr, end); | ||
138 | if (pud_none(pud)) | ||
139 | return 0; | ||
140 | if (!gup_pmd_range(pud, addr, next, write, pages, nr)) | ||
141 | return 0; | ||
142 | } while (pudp++, addr = next, addr != end); | ||
143 | |||
144 | return 1; | ||
145 | } | ||
146 | |||
147 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
148 | struct page **pages) | ||
149 | { | ||
150 | struct mm_struct *mm = current->mm; | ||
151 | unsigned long addr, len, end; | ||
152 | unsigned long next; | ||
153 | pgd_t *pgdp; | ||
154 | int psize, nr = 0; | ||
155 | unsigned int shift; | ||
156 | |||
157 | pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); | ||
158 | |||
159 | start &= PAGE_MASK; | ||
160 | addr = start; | ||
161 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
162 | end = start + len; | ||
163 | |||
164 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, | ||
165 | start, len))) | ||
166 | goto slow_irqon; | ||
167 | |||
168 | pr_debug(" aligned: %lx .. %lx\n", start, end); | ||
169 | |||
170 | #ifdef CONFIG_HUGETLB_PAGE | ||
171 | /* We bail out on slice boundary crossing when hugetlb is | ||
172 | * enabled in order to not have to deal with two different | ||
173 | * page table formats | ||
174 | */ | ||
175 | if (addr < SLICE_LOW_TOP) { | ||
176 | if (end > SLICE_LOW_TOP) | ||
177 | goto slow_irqon; | ||
178 | |||
179 | if (unlikely(GET_LOW_SLICE_INDEX(addr) != | ||
180 | GET_LOW_SLICE_INDEX(end - 1))) | ||
181 | goto slow_irqon; | ||
182 | } else { | ||
183 | if (unlikely(GET_HIGH_SLICE_INDEX(addr) != | ||
184 | GET_HIGH_SLICE_INDEX(end - 1))) | ||
185 | goto slow_irqon; | ||
186 | } | ||
187 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
188 | |||
189 | /* | ||
190 | * XXX: batch / limit 'nr', to avoid large irq off latency | ||
191 | * needs some instrumenting to determine the common sizes used by | ||
192 | * important workloads (eg. DB2), and whether limiting the batch size | ||
193 | * will decrease performance. | ||
194 | * | ||
195 | * It seems like we're in the clear for the moment. Direct-IO is | ||
196 | * the main guy that batches up lots of get_user_pages, and even | ||
197 | * they are limited to 64-at-a-time which is not so many. | ||
198 | */ | ||
199 | /* | ||
200 | * This doesn't prevent pagetable teardown, but does prevent | ||
201 | * the pagetables from being freed on powerpc. | ||
202 | * | ||
203 | * So long as we atomically load page table pointers versus teardown, | ||
204 | * we can follow the address down to the the page and take a ref on it. | ||
205 | */ | ||
206 | local_irq_disable(); | ||
207 | |||
208 | psize = get_slice_psize(mm, addr); | ||
209 | shift = mmu_psize_defs[psize].shift; | ||
210 | |||
211 | #ifdef CONFIG_HUGETLB_PAGE | ||
212 | if (unlikely(mmu_huge_psizes[psize])) { | ||
213 | pte_t *ptep; | ||
214 | unsigned long a = addr; | ||
215 | unsigned long sz = ((1UL) << shift); | ||
216 | struct hstate *hstate = size_to_hstate(sz); | ||
217 | |||
218 | BUG_ON(!hstate); | ||
219 | /* | ||
220 | * XXX: could be optimized to avoid hstate | ||
221 | * lookup entirely (just use shift) | ||
222 | */ | ||
223 | |||
224 | do { | ||
225 | VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift); | ||
226 | ptep = huge_pte_offset(mm, a); | ||
227 | pr_debug(" %016lx: huge ptep %p\n", a, ptep); | ||
228 | if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages, | ||
229 | &nr)) | ||
230 | goto slow; | ||
231 | } while (a != end); | ||
232 | } else | ||
233 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
234 | { | ||
235 | pgdp = pgd_offset(mm, addr); | ||
236 | do { | ||
237 | pgd_t pgd = *pgdp; | ||
238 | |||
239 | VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift); | ||
240 | pr_debug(" %016lx: normal pgd %p\n", addr, (void *)pgd); | ||
241 | next = pgd_addr_end(addr, end); | ||
242 | if (pgd_none(pgd)) | ||
243 | goto slow; | ||
244 | if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) | ||
245 | goto slow; | ||
246 | } while (pgdp++, addr = next, addr != end); | ||
247 | } | ||
248 | local_irq_enable(); | ||
249 | |||
250 | VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); | ||
251 | return nr; | ||
252 | |||
253 | { | ||
254 | int ret; | ||
255 | |||
256 | slow: | ||
257 | local_irq_enable(); | ||
258 | slow_irqon: | ||
259 | pr_debug(" slow path ! nr = %d\n", nr); | ||
260 | |||
261 | /* Try to get the remaining pages with get_user_pages */ | ||
262 | start += nr << PAGE_SHIFT; | ||
263 | pages += nr; | ||
264 | |||
265 | down_read(&mm->mmap_sem); | ||
266 | ret = get_user_pages(current, mm, start, | ||
267 | (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); | ||
268 | up_read(&mm->mmap_sem); | ||
269 | |||
270 | /* Have to be a bit careful with return values */ | ||
271 | if (nr > 0) { | ||
272 | if (ret < 0) | ||
273 | ret = nr; | ||
274 | else | ||
275 | ret += nr; | ||
276 | } | ||
277 | |||
278 | return ret; | ||
279 | } | ||
280 | } | ||