diff options
author | Tejun Heo <tj@kernel.org> | 2010-04-09 05:57:01 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2010-05-01 02:30:50 -0400 |
commit | 9f6455325618821dcf6775d7972881fde32e77c5 (patch) | |
tree | 6031e6f28aaaa3bf8d8e08dd59031d94c19fa89e /mm/percpu-vm.c | |
parent | 88999a898b565960690f18e4a13a1e8a9fa4dfef (diff) |
percpu: move vmalloc based chunk management into percpu-vm.c
Separate out and move chunk management (creation/destruction and
[de]population) code into percpu-vm.c which is included by percpu.c
and compiled together. The interface for chunk management is defined
as follows.
* pcpu_populate_chunk - populate the specified range of a chunk
* pcpu_depopulate_chunk - depopulate the specified range of a chunk
* pcpu_create_chunk - create a new chunk
* pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
* pcpu_addr_to_page - translate address to the struct page backing it
* pcpu_verify_alloc_info - check alloc_info is acceptable during init
Other than wrapping vmalloc_to_page() inside pcpu_addr_to_page() and
dummy pcpu_verify_alloc_info() implementation, this patch only moves
code around. This separation is to allow alternate chunk management
implementation.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Howells <dhowells@redhat.com>
Cc: Graff Yang <graff.yang@gmail.com>
Cc: Sonic Zhang <sonic.adi@gmail.com>
Diffstat (limited to 'mm/percpu-vm.c')
-rw-r--r-- | mm/percpu-vm.c | 451 |
1 files changed, 451 insertions, 0 deletions
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c new file mode 100644 index 000000000000..7d9c1d0ebd3f --- /dev/null +++ b/mm/percpu-vm.c | |||
@@ -0,0 +1,451 @@ | |||
1 | /* | ||
2 | * mm/percpu-vm.c - vmalloc area based chunk allocation | ||
3 | * | ||
4 | * Copyright (C) 2010 SUSE Linux Products GmbH | ||
5 | * Copyright (C) 2010 Tejun Heo <tj@kernel.org> | ||
6 | * | ||
7 | * This file is released under the GPLv2. | ||
8 | * | ||
9 | * Chunks are mapped into vmalloc areas and populated page by page. | ||
10 | * This is the default chunk allocator. | ||
11 | */ | ||
12 | |||
13 | static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, | |
14 | unsigned int cpu, int page_idx) | |
15 | { | |
16 | /* looks up the page through the vmalloc mapping; must not be used on pre-mapped (immutable) chunk */ | |
17 | WARN_ON(chunk->immutable); | |
18 | |||
19 | return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); | |
20 | } | |
21 | |||
22 | /** | |
23 | * pcpu_get_pages_and_bitmap - get temp pages array and bitmap | |
24 | * @chunk: chunk of interest | |
25 | * @bitmapp: output parameter for bitmap | |
26 | * @may_alloc: may allocate the array and the bitmap | |
27 | * | |
28 | * Returns the temp array of struct page pointers, indexed with | |
29 | * pcpu_page_idx(), and stores in *@bitmapp a bitmap with one bit per | |
30 | * in-unit page. The array is cleared to zero and the bitmap is a | |
31 | * copy of @chunk->populated. Both are static and shared by all | |
32 | * callers, so access exclusion is the caller's responsibility. | |
33 | * | |
34 | * CONTEXT: | |
35 | * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. | |
36 | * Otherwise, don't care. | |
37 | * | |
38 | * RETURNS: | |
39 | * Pointer to temp pages array on success, NULL if either buffer is missing. | |
40 | */ | |
41 | static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | |
42 | unsigned long **bitmapp, | |
43 | bool may_alloc) | |
44 | { | |
45 | static struct page **pages; | |
46 | static unsigned long *bitmap; | |
47 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); | |
48 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * | |
49 | sizeof(unsigned long); | |
50 | |||
51 | if (!pages || !bitmap) { | |
52 | if (may_alloc && !pages) | |
53 | pages = pcpu_mem_alloc(pages_size); | |
54 | if (may_alloc && !bitmap) | |
55 | bitmap = pcpu_mem_alloc(bitmap_size); | |
56 | if (!pages || !bitmap) | |
57 | return NULL; | |
58 | } | |
59 | |||
60 | memset(pages, 0, pages_size); | |
61 | bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); | |
62 | |||
63 | *bitmapp = bitmap; | |
64 | return pages; | |
65 | } | |
66 | |||
67 | /** | |
68 | * pcpu_free_pages - free pages which were allocated for @chunk | |
69 | * @chunk: chunk pages were allocated for | |
70 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() | |
71 | * @populated: populated bitmap | |
72 | * @page_start: page index of the first page to be freed | |
73 | * @page_end: page index of the last page to be freed + 1 | |
74 | * | |
75 | * Free pages [@page_start,@page_end) in @pages for all units. | |
76 | * NULL entries are skipped. @chunk and @populated are not used here. | |
77 | */ | |
78 | static void pcpu_free_pages(struct pcpu_chunk *chunk, | |
79 | struct page **pages, unsigned long *populated, | |
80 | int page_start, int page_end) | |
81 | { | |
82 | unsigned int cpu; | |
83 | int i; | |
84 | |||
85 | for_each_possible_cpu(cpu) { | |
86 | for (i = page_start; i < page_end; i++) { | |
87 | struct page *page = pages[pcpu_page_idx(cpu, i)]; | |
88 | |||
89 | if (page) | |
90 | __free_page(page); | |
91 | } | |
92 | } | |
93 | } | |
94 | |||
95 | /** | |
96 | * pcpu_alloc_pages - allocates pages for @chunk | |
97 | * @chunk: target chunk | |
98 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() | |
99 | * @populated: populated bitmap | |
100 | * @page_start: page index of the first page to be allocated | |
101 | * @page_end: page index of the last page to be allocated + 1 | |
102 | * | |
103 | * Allocate pages [@page_start,@page_end) into @pages for all units, | |
104 | * each on its cpu's node. @pages is later passed verbatim to | |
105 | * pcpu_map_pages(). Returns 0, or -ENOMEM after freeing the range. | |
106 | */ | |
107 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, | |
108 | struct page **pages, unsigned long *populated, | |
109 | int page_start, int page_end) | |
110 | { | |
111 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; | |
112 | unsigned int cpu; | |
113 | int i; | |
114 | |||
115 | for_each_possible_cpu(cpu) { | |
116 | for (i = page_start; i < page_end; i++) { | |
117 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; | |
118 | |||
119 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); | |
120 | if (!*pagep) { | |
121 | pcpu_free_pages(chunk, pages, populated, | |
122 | page_start, page_end); | |
123 | return -ENOMEM; | |
124 | } | |
125 | } | |
126 | } | |
127 | return 0; | |
128 | } | |
129 | |||
130 | /** | |
131 | * pcpu_pre_unmap_flush - flush cache prior to unmapping | |
132 | * @chunk: chunk the regions to be flushed belongs to | |
133 | * @page_start: page index of the first page to be flushed | |
134 | * @page_end: page index of the last page to be flushed + 1 | |
135 | * | |
136 | * Pages in [@page_start,@page_end) of @chunk are about to be | |
137 | * unmapped. Flush cache. As each flush can be very expensive, | |
138 | * issue a single flush spanning the first unit's @page_start to the | |
139 | * last unit's @page_end rather than one per cpu. This could be | |
140 | * overkill but is more scalable. | |
141 | */ | |
142 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, | |
143 | int page_start, int page_end) | |
144 | { | |
145 | flush_cache_vunmap( | |
146 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | |
147 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | |
148 | } | |
149 | |||
150 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | |
151 | { /* unmap @nr_pages pages starting at @addr; flushing is left to the callers */ | |
152 | unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); | |
153 | } | |
154 | |||
155 | /** | |
156 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk | |
157 | * @chunk: chunk of interest | |
158 | * @pages: pages array which can be used to pass information to free | |
159 | * @populated: populated bitmap | |
160 | * @page_start: page index of the first page to unmap | |
161 | * @page_end: page index of the last page to unmap + 1 | |
162 | * | |
163 | * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. | |
164 | * Corresponding elements in @pages were cleared by the caller; each | |
165 | * unmapped page is recorded there for pcpu_free_pages(), which is | |
166 | * called after all unmaps are finished, and the matching bits in | |
167 | * @populated are cleared. The caller does the pre/post flushing. | |
168 | */ | |
169 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | |
170 | struct page **pages, unsigned long *populated, | |
171 | int page_start, int page_end) | |
172 | { | |
173 | unsigned int cpu; | |
174 | int i; | |
175 | |||
176 | for_each_possible_cpu(cpu) { | |
177 | for (i = page_start; i < page_end; i++) { | |
178 | struct page *page; | |
179 | |||
180 | page = pcpu_chunk_page(chunk, cpu, i); | |
181 | WARN_ON(!page); | |
182 | pages[pcpu_page_idx(cpu, i)] = page; | |
183 | } | |
184 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), | |
185 | page_end - page_start); | |
186 | } | |
187 | |||
188 | for (i = page_start; i < page_end; i++) | |
189 | __clear_bit(i, populated); | |
190 | } | |
191 | |||
192 | /** | |
193 | * pcpu_post_unmap_tlb_flush - flush TLB after unmapping | |
194 | * @chunk: pcpu_chunk the regions to be flushed belong to | |
195 | * @page_start: page index of the first page to be flushed | |
196 | * @page_end: page index of the last page to be flushed + 1 | |
197 | * | |
198 | * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush | |
199 | * TLB for the regions. This can be skipped if the area is to be | |
200 | * returned to vmalloc as vmalloc will handle TLB flushing lazily. | |
201 | * | |
202 | * As with pcpu_pre_unmap_flush(), the flush is issued once over the | |
203 | * range spanning the first unit to the last unit. | |
204 | */ | |
205 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, | |
206 | int page_start, int page_end) | |
207 | { | |
208 | flush_tlb_kernel_range( | |
209 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | |
210 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | |
211 | } | |
212 | |||
213 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, | |
214 | int nr_pages) | |
215 | { /* map @nr_pages pages from @pages at @addr with PAGE_KERNEL; flushing is left to the callers */ | |
216 | return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, | |
217 | PAGE_KERNEL, pages); | |
218 | } | |
219 | |||
220 | /** | |
221 | * pcpu_map_pages - map pages into a pcpu_chunk | |
222 | * @chunk: chunk of interest | |
223 | * @pages: pages array containing pages to be mapped | |
224 | * @populated: populated bitmap | |
225 | * @page_start: page index of the first page to map | |
226 | * @page_end: page index of the last page to map + 1 | |
227 | * | |
228 | * For each cpu, map pages [@page_start,@page_end) into @chunk. The | |
229 | * caller is responsible for calling pcpu_post_map_flush() after all | |
230 | * mappings are complete. Returns 0 on success, -errno on failure. | |
231 | * | |
232 | * On success, this function sets the corresponding bits in @populated | |
233 | * (not @chunk->populated, which the caller commits later) and links | |
234 | * each page to @chunk for reverse lookup (addr -> chunk). | |
235 | */ | |
236 | static int pcpu_map_pages(struct pcpu_chunk *chunk, | |
237 | struct page **pages, unsigned long *populated, | |
238 | int page_start, int page_end) | |
239 | { | |
240 | unsigned int cpu, tcpu; | |
241 | int i, err; | |
242 | |||
243 | for_each_possible_cpu(cpu) { | |
244 | err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), | |
245 | &pages[pcpu_page_idx(cpu, page_start)], | |
246 | page_end - page_start); | |
247 | if (err < 0) | |
248 | goto err; | |
249 | } | |
250 | |||
251 | /* mapping successful, link chunk and mark populated */ | |
252 | for (i = page_start; i < page_end; i++) { | |
253 | for_each_possible_cpu(cpu) | |
254 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], | |
255 | chunk); | |
256 | __set_bit(i, populated); | |
257 | } | |
258 | |||
259 | return 0; | |
260 | |||
261 | err: /* undo the mappings of the cpus handled before the failing @cpu */ | |
262 | for_each_possible_cpu(tcpu) { | |
263 | if (tcpu == cpu) | |
264 | break; | |
265 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), | |
266 | page_end - page_start); | |
267 | } | |
268 | return err; /* NOTE(review): no TLB flush follows the unmaps above - confirm this is covered elsewhere */ | |
269 | } | |
270 | |||
271 | /** | |
272 | * pcpu_post_map_flush - flush cache after mapping | |
273 | * @chunk: pcpu_chunk the regions to be flushed belong to | |
274 | * @page_start: page index of the first page to be flushed | |
275 | * @page_end: page index of the last page to be flushed + 1 | |
276 | * | |
277 | * Pages [@page_start,@page_end) of @chunk have been mapped. Flush | |
278 | * cache. | |
279 | * | |
280 | * As with pcpu_pre_unmap_flush(), cache flushing is done at once | |
281 | * for the whole region. | |
282 | */ | |
283 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | |
284 | int page_start, int page_end) | |
285 | { | |
286 | flush_cache_vmap( | |
287 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | |
288 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | |
289 | } | |
290 | |||
291 | /** | |
292 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk | |
293 | * @chunk: chunk of interest | |
294 | * @off: offset to the area to populate | |
295 | * @size: size of the area to populate in bytes | |
296 | * | |
297 | * For each cpu, populate and map the pages covering [@off,@off+@size) | |
298 | * into @chunk. The area is zeroed on success. Returns 0 or -errno. | |
299 | * | |
300 | * CONTEXT: | |
301 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. | |
302 | */ | |
303 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | |
304 | { | |
305 | int page_start = PFN_DOWN(off); | |
306 | int page_end = PFN_UP(off + size); | |
307 | int free_end = page_start, unmap_end = page_start; | |
308 | struct page **pages; | |
309 | unsigned long *populated; | |
310 | unsigned int cpu; | |
311 | int rs, re, rc; | |
312 | |||
313 | /* quick path, check whether all pages are already there */ | |
314 | rs = page_start; | |
315 | pcpu_next_pop(chunk, &rs, &re, page_end); | |
316 | if (rs == page_start && re == page_end) | |
317 | goto clear; | |
318 | |||
319 | /* need to allocate and map pages, this chunk can't be immutable */ | |
320 | WARN_ON(chunk->immutable); | |
321 | |||
322 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); | |
323 | if (!pages) | |
324 | return -ENOMEM; | |
325 | |||
326 | /* alloc all unpopulated regions first, then map them; free_end/unmap_end track how far each pass got */ | |
327 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | |
328 | rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); | |
329 | if (rc) | |
330 | goto err_free; | |
331 | free_end = re; | |
332 | } | |
333 | |||
334 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | |
335 | rc = pcpu_map_pages(chunk, pages, populated, rs, re); | |
336 | if (rc) | |
337 | goto err_unmap; | |
338 | unmap_end = re; | |
339 | } | |
340 | pcpu_post_map_flush(chunk, page_start, page_end); | |
341 | |||
342 | /* commit new bitmap; @populated was updated by pcpu_map_pages() */ | |
343 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | |
344 | clear: | |
345 | for_each_possible_cpu(cpu) | |
346 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | |
347 | return 0; | |
348 | |||
349 | err_unmap: | |
350 | pcpu_pre_unmap_flush(chunk, page_start, unmap_end); | |
351 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) | |
352 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | |
353 | pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); | |
354 | err_free: | |
355 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) | |
356 | pcpu_free_pages(chunk, pages, populated, rs, re); | |
357 | return rc; | |
358 | } | |
359 | |||
360 | /** | |
361 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk | |
362 | * @chunk: chunk to depopulate | |
363 | * @off: offset to the area to depopulate | |
364 | * @size: size of the area to depopulate in bytes | |
365 | * | |
366 | * For each cpu, depopulate and unmap the pages covering | |
367 | * [@off,@off+@size) from @chunk. Cache is flushed before unmapping; | |
368 | * TLB is not flushed here and is left to vmalloc's lazy flushing. | |
369 | * Returns right away if the whole range is already unpopulated. | |
370 | * | |
371 | * CONTEXT: | |
372 | * pcpu_alloc_mutex. | |
373 | */ | |
374 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | |
375 | { | |
376 | int page_start = PFN_DOWN(off); | |
377 | int page_end = PFN_UP(off + size); | |
378 | struct page **pages; | |
379 | unsigned long *populated; | |
380 | int rs, re; | |
381 | |||
382 | /* quick path, check whether it's empty already */ | |
383 | rs = page_start; | |
384 | pcpu_next_unpop(chunk, &rs, &re, page_end); | |
385 | if (rs == page_start && re == page_end) | |
386 | return; | |
387 | |||
388 | /* immutable chunks can't be depopulated */ | |
389 | WARN_ON(chunk->immutable); | |
390 | |||
391 | /* | |
392 | * If control reaches here, there must have been at least one | |
393 | * successful population attempt so the temp pages array must | |
394 | * be available now. | |
395 | */ | |
396 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); | |
397 | BUG_ON(!pages); | |
398 | |||
399 | /* unmap and free */ | |
400 | pcpu_pre_unmap_flush(chunk, page_start, page_end); | |
401 | |||
402 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | |
403 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | |
404 | |||
405 | /* no need to flush tlb, vmalloc will handle it lazily */ | |
406 | |||
407 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | |
408 | pcpu_free_pages(chunk, pages, populated, rs, re); | |
409 | |||
410 | /* commit new bitmap; @populated had the unmapped bits cleared by pcpu_unmap_pages() */ | |
411 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | |
412 | } | |
413 | |||
414 | static struct pcpu_chunk *pcpu_create_chunk(void) | |
415 | { /* create a chunk backed by freshly obtained vm areas; returns NULL on failure */ | |
416 | struct pcpu_chunk *chunk; | |
417 | struct vm_struct **vms; | |
418 | |||
419 | chunk = pcpu_alloc_chunk(); | |
420 | if (!chunk) | |
421 | return NULL; | |
422 | |||
423 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, | |
424 | pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); | |
425 | if (!vms) { /* no vm areas: release the chunk and fail */ | |
426 | pcpu_free_chunk(chunk); | |
427 | return NULL; | |
428 | } | |
429 | |||
430 | chunk->data = vms; /* handed back to pcpu_free_vm_areas() by pcpu_destroy_chunk() */ | |
431 | chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; /* first area's address minus the first group's offset */ | |
432 | return chunk; | |
433 | } | |
434 | |||
435 | static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) | |
436 | { /* release the chunk's vm areas, if any, then the chunk itself */ | |
437 | if (chunk && chunk->data) | |
438 | pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); | |
439 | pcpu_free_chunk(chunk); /* NOTE(review): reached even when @chunk is NULL - presumably pcpu_free_chunk() tolerates NULL; confirm */ | |
440 | } | |
441 | |||
442 | static struct page *pcpu_addr_to_page(void *addr) | |
443 | { /* chunks are mapped into vmalloc areas, so the lookup is a plain vmalloc_to_page() */ | |
444 | return vmalloc_to_page(addr); | |
445 | } | |
446 | |||
447 | static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) | |
448 | { | |
449 | /* the vmalloc based chunk allocator puts no extra restriction on @ai, always accept */ | |
450 | return 0; | |
451 | } | |