aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2009-02-20 02:29:08 -0500
committerTejun Heo <tj@kernel.org>2009-02-20 02:29:08 -0500
commitfbf59bc9d74d1fb30b8e0630743aff2806eafcea (patch)
tree3f0a7b7cf809a25e27b7a5ba0b16321fdb901801
parent8fc48985006da4ceba24508db64ec77fc0dfe3bb (diff)
percpu: implement new dynamic percpu allocator
Impact: new scalable dynamic percpu allocator which allows dynamic percpu areas to be accessed the same way as static ones Implement scalable dynamic percpu allocator which can be used for both static and dynamic percpu areas. This will allow static and dynamic areas to share faster direct access methods. This feature is optional and enabled only when CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is defined by arch. Please read comment on top of mm/percpu.c for details. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org>
-rw-r--r--include/linux/percpu.h22
-rw-r--r--kernel/module.c31
-rw-r--r--mm/Makefile4
-rw-r--r--mm/percpu.c890
4 files changed, 943 insertions, 4 deletions
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index d99e24ae1811..18080995ff3e 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -76,23 +76,37 @@
76 76
77#ifdef CONFIG_SMP 77#ifdef CONFIG_SMP
78 78
79struct percpu_data { 79#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
80 void *ptrs[1];
81};
82 80
83#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) 81extern void *pcpu_base_addr;
84 82
83typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
84
85extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
86 struct page **pages, size_t cpu_size);
85/* 87/*
86 * Use this to get to a cpu's version of the per-cpu object 88 * Use this to get to a cpu's version of the per-cpu object
87 * dynamically allocated. Non-atomic access to the current CPU's 89 * dynamically allocated. Non-atomic access to the current CPU's
88 * version should probably be combined with get_cpu()/put_cpu(). 90 * version should probably be combined with get_cpu()/put_cpu().
89 */ 91 */
92#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
93
94#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
95
96struct percpu_data {
97 void *ptrs[1];
98};
99
100#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
101
90#define per_cpu_ptr(ptr, cpu) \ 102#define per_cpu_ptr(ptr, cpu) \
91({ \ 103({ \
92 struct percpu_data *__p = __percpu_disguise(ptr); \ 104 struct percpu_data *__p = __percpu_disguise(ptr); \
93 (__typeof__(ptr))__p->ptrs[(cpu)]; \ 105 (__typeof__(ptr))__p->ptrs[(cpu)]; \
94}) 106})
95 107
108#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
109
96extern void *__alloc_percpu(size_t size, size_t align); 110extern void *__alloc_percpu(size_t size, size_t align);
97extern void free_percpu(void *__pdata); 111extern void free_percpu(void *__pdata);
98 112
diff --git a/kernel/module.c b/kernel/module.c
index 52b3497b8748..1f0657ae555b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -51,6 +51,7 @@
51#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
52#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h> 53#include <linux/async.h>
54#include <linux/percpu.h>
54 55
55#if 0 56#if 0
56#define DEBUGP printk 57#define DEBUGP printk
@@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
366} 367}
367 368
368#ifdef CONFIG_SMP 369#ifdef CONFIG_SMP
370
371#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
372
373static void *percpu_modalloc(unsigned long size, unsigned long align,
374 const char *name)
375{
376 void *ptr;
377
378 if (align > PAGE_SIZE) {
379 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
380 name, align, PAGE_SIZE);
381 align = PAGE_SIZE;
382 }
383
384 ptr = __alloc_percpu(size, align);
385 if (!ptr)
386 printk(KERN_WARNING
387 "Could not allocate %lu bytes percpu data\n", size);
388 return ptr;
389}
390
/* Hand @freeme back to the dynamic percpu allocator via free_percpu(). */
static void percpu_modfree(void *freeme)
{
	free_percpu(freeme);
}
395
396#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
397
369/* Number of blocks used and allocated. */ 398/* Number of blocks used and allocated. */
370static unsigned int pcpu_num_used, pcpu_num_allocated; 399static unsigned int pcpu_num_used, pcpu_num_allocated;
371/* Size of each block. -ve means used. */ 400/* Size of each block. -ve means used. */
@@ -499,6 +528,8 @@ static int percpu_modinit(void)
499} 528}
500__initcall(percpu_modinit); 529__initcall(percpu_modinit);
501 530
531#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
532
502static unsigned int find_pcpusec(Elf_Ehdr *hdr, 533static unsigned int find_pcpusec(Elf_Ehdr *hdr,
503 Elf_Shdr *sechdrs, 534 Elf_Shdr *sechdrs,
504 const char *secstrings) 535 const char *secstrings)
diff --git a/mm/Makefile b/mm/Makefile
index 72255be57f89..818569b68f46 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
31obj-$(CONFIG_FS_XIP) += filemap_xip.o 31obj-$(CONFIG_FS_XIP) += filemap_xip.o
32obj-$(CONFIG_MIGRATION) += migrate.o 32obj-$(CONFIG_MIGRATION) += migrate.o
33ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
34obj-$(CONFIG_SMP) += percpu.o
35else
33obj-$(CONFIG_SMP) += allocpercpu.o 36obj-$(CONFIG_SMP) += allocpercpu.o
37endif
34obj-$(CONFIG_QUICKLIST) += quicklist.o 38obj-$(CONFIG_QUICKLIST) += quicklist.o
35obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 39obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 000000000000..4617d97e877c
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,890 @@
1/*
2 * linux/mm/percpu.c - percpu memory allocator
3 *
4 * Copyright (C) 2009 SUSE Linux Products GmbH
5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk consists of num_possible_cpus() units and the first chunk
12 * is used for static percpu variables in the kernel image (special
13 * boot time alloc/init handling necessary as these areas need to be
14 * brought up before allocation services are running). Unit grows as
15 * necessary and all units grow or shrink in unison. When a chunk is
16 * filled up, another chunk is allocated. ie. in vmalloc area
17 *
18 * c0 c1 c2
19 * ------------------- ------------------- ------------
20 * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
21 * ------------------- ...... ------------------- .... ------------
22 *
23 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
26 * percpu base registers UNIT_SIZE apart.
27 *
28 * There are usually many small percpu allocations many of them as
29 * small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is
32 * guaranteed to be equal to or larger than the maximum contiguous
33 * area in the chunk. This helps the allocator not to iterate the
34 * chunk maps unnecessarily.
35 *
36 * Allocation state in each chunk is kept using an array of integers
37 * on chunk->map. A positive value in the map represents a free
38 * region and negative allocated. Allocation inside a chunk is done
39 * by scanning this map sequentially and serving the first matching
40 * entry. This is mostly copied from the percpu_modalloc() allocator.
41 * Chunks are also linked into a rb tree to ease address to chunk
42 * mapping during free.
43 *
44 * To use this allocator, arch code should do the following.
45 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
47 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back
50 *
51 * - use pcpu_setup_static() during percpu area initialization to
52 * setup kernel static percpu area
53 */
54
55#include <linux/bitmap.h>
56#include <linux/bootmem.h>
57#include <linux/list.h>
58#include <linux/mm.h>
59#include <linux/module.h>
60#include <linux/mutex.h>
61#include <linux/percpu.h>
62#include <linux/pfn.h>
63#include <linux/rbtree.h>
64#include <linux/slab.h>
65#include <linux/vmalloc.h>
66
67#include <asm/cacheflush.h>
68#include <asm/tlbflush.h>
69
70#define PCPU_MIN_UNIT_PAGES_SHIFT 4 /* also max alloc size */
71#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
72#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
73
74struct pcpu_chunk {
75 struct list_head list; /* linked to pcpu_slot lists */
76 struct rb_node rb_node; /* key is chunk->vm->addr */
77 int free_size; /* free bytes in the chunk */
78 int contig_hint; /* max contiguous size hint */
79 struct vm_struct *vm; /* mapped vmalloc region */
80 int map_used; /* # of map entries used */
81 int map_alloc; /* # of map entries allocated */
82 int *map; /* allocation map */
83 struct page *page[]; /* #cpus * UNIT_PAGES */
84};
85
86static int pcpu_unit_pages_shift;
87static int pcpu_unit_pages;
88static int pcpu_unit_shift;
89static int pcpu_unit_size;
90static int pcpu_chunk_size;
91static int pcpu_nr_slots;
92static size_t pcpu_chunk_struct_size;
93
94/* the address of the first chunk which starts with the kernel static area */
95void *pcpu_base_addr;
96EXPORT_SYMBOL_GPL(pcpu_base_addr);
97
98/* the size of kernel static area */
99static int pcpu_static_size;
100
101/*
102 * One mutex to rule them all.
103 *
104 * The following mutex is grabbed in the outermost public alloc/free
105 * interface functions and released only when the operation is
106 * complete. As such, every function in this file other than the
107 * outermost functions are called under pcpu_mutex.
108 *
109 * It can easily be switched to use spinlock such that only the area
110 * allocation and page population commit are protected with it doing
111 * actual [de]allocation without holding any lock. However, given
112 * what this allocator does, I think it's better to let them run
113 * sequentially.
114 */
115static DEFINE_MUTEX(pcpu_mutex);
116
117static struct list_head *pcpu_slot; /* chunk list slots */
118static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
119
120static int pcpu_size_to_slot(int size)
121{
122 int highbit = fls(size);
123 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
124}
125
126static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
127{
128 if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
129 return 0;
130
131 return pcpu_size_to_slot(chunk->free_size);
132}
133
134static int pcpu_page_idx(unsigned int cpu, int page_idx)
135{
136 return (cpu << pcpu_unit_pages_shift) + page_idx;
137}
138
139static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
140 unsigned int cpu, int page_idx)
141{
142 return &chunk->page[pcpu_page_idx(cpu, page_idx)];
143}
144
145static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
146 unsigned int cpu, int page_idx)
147{
148 return (unsigned long)chunk->vm->addr +
149 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
150}
151
152static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
153 int page_idx)
154{
155 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
156}
157
158/**
159 * pcpu_realloc - versatile realloc
160 * @p: the current pointer (can be NULL for new allocations)
161 * @size: the current size (can be 0 for new allocations)
162 * @new_size: the wanted new size (can be 0 for free)
163 *
164 * More robust realloc which can be used to allocate, resize or free a
165 * memory area of arbitrary size. If the needed size goes over
166 * PAGE_SIZE, kernel VM is used.
167 *
168 * RETURNS:
169 * The new pointer on success, NULL on failure.
170 */
171static void *pcpu_realloc(void *p, size_t size, size_t new_size)
172{
173 void *new;
174
175 if (new_size <= PAGE_SIZE)
176 new = kmalloc(new_size, GFP_KERNEL);
177 else
178 new = vmalloc(new_size);
179 if (new_size && !new)
180 return NULL;
181
182 memcpy(new, p, min(size, new_size));
183 if (new_size > size)
184 memset(new + size, 0, new_size - size);
185
186 if (size <= PAGE_SIZE)
187 kfree(p);
188 else
189 vfree(p);
190
191 return new;
192}
193
194/**
195 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
196 * @chunk: chunk of interest
197 * @oslot: the previous slot it was on
198 *
199 * This function is called after an allocation or free changed @chunk.
200 * New slot according to the changed state is determined and @chunk is
201 * moved to the slot.
202 */
203static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
204{
205 int nslot = pcpu_chunk_slot(chunk);
206
207 if (oslot != nslot) {
208 if (oslot < nslot)
209 list_move(&chunk->list, &pcpu_slot[nslot]);
210 else
211 list_move_tail(&chunk->list, &pcpu_slot[nslot]);
212 }
213}
214
215static struct rb_node **pcpu_chunk_rb_search(void *addr,
216 struct rb_node **parentp)
217{
218 struct rb_node **p = &pcpu_addr_root.rb_node;
219 struct rb_node *parent = NULL;
220 struct pcpu_chunk *chunk;
221
222 while (*p) {
223 parent = *p;
224 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
225
226 if (addr < chunk->vm->addr)
227 p = &(*p)->rb_left;
228 else if (addr > chunk->vm->addr)
229 p = &(*p)->rb_right;
230 else
231 break;
232 }
233
234 if (parentp)
235 *parentp = parent;
236 return p;
237}
238
239/**
240 * pcpu_chunk_addr_search - search for chunk containing specified address
241 * @addr: address to search for
242 *
243 * Look for chunk which might contain @addr. More specifically, it
244 * searchs for the chunk with the highest start address which isn't
245 * beyond @addr.
246 *
247 * RETURNS:
248 * The address of the found chunk.
249 */
250static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
251{
252 struct rb_node *n, *parent;
253 struct pcpu_chunk *chunk;
254
255 n = *pcpu_chunk_rb_search(addr, &parent);
256 if (!n) {
257 /* no exactly matching chunk, the parent is the closest */
258 n = parent;
259 BUG_ON(!n);
260 }
261 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
262
263 if (addr < chunk->vm->addr) {
264 /* the parent was the next one, look for the previous one */
265 n = rb_prev(n);
266 BUG_ON(!n);
267 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
268 }
269
270 return chunk;
271}
272
273/**
274 * pcpu_chunk_addr_insert - insert chunk into address rb tree
275 * @new: chunk to insert
276 *
277 * Insert @new into address rb tree.
278 */
279static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
280{
281 struct rb_node **p, *parent;
282
283 p = pcpu_chunk_rb_search(new->vm->addr, &parent);
284 BUG_ON(*p);
285 rb_link_node(&new->rb_node, parent, p);
286 rb_insert_color(&new->rb_node, &pcpu_addr_root);
287}
288
289/**
290 * pcpu_split_block - split a map block
291 * @chunk: chunk of interest
292 * @i: index of map block to split
293 * @head: head size (can be 0)
294 * @tail: tail size (can be 0)
295 *
296 * Split the @i'th map block into two or three blocks. If @head is
297 * non-zero, @head bytes block is inserted before block @i moving it
298 * to @i+1 and reducing its size by @head bytes.
299 *
300 * If @tail is non-zero, the target block, which can be @i or @i+1
301 * depending on @head, is reduced by @tail bytes and @tail byte block
302 * is inserted after the target block.
303 *
304 * RETURNS:
305 * 0 on success, -errno on failure.
306 */
307static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
308{
309 int nr_extra = !!head + !!tail;
310 int target = chunk->map_used + nr_extra;
311
312 /* reallocation required? */
313 if (chunk->map_alloc < target) {
314 int new_alloc = chunk->map_alloc;
315 int *new;
316
317 while (new_alloc < target)
318 new_alloc *= 2;
319
320 new = pcpu_realloc(chunk->map,
321 chunk->map_alloc * sizeof(new[0]),
322 new_alloc * sizeof(new[0]));
323 if (!new)
324 return -ENOMEM;
325
326 chunk->map_alloc = new_alloc;
327 chunk->map = new;
328 }
329
330 /* insert a new subblock */
331 memmove(&chunk->map[i + nr_extra], &chunk->map[i],
332 sizeof(chunk->map[0]) * (chunk->map_used - i));
333 chunk->map_used += nr_extra;
334
335 if (head) {
336 chunk->map[i + 1] = chunk->map[i] - head;
337 chunk->map[i++] = head;
338 }
339 if (tail) {
340 chunk->map[i++] -= tail;
341 chunk->map[i] = tail;
342 }
343 return 0;
344}
345
346/**
347 * pcpu_alloc_area - allocate area from a pcpu_chunk
348 * @chunk: chunk of interest
349 * @size: wanted size
350 * @align: wanted align
351 *
352 * Try to allocate @size bytes area aligned at @align from @chunk.
353 * Note that this function only allocates the offset. It doesn't
354 * populate or map the area.
355 *
356 * RETURNS:
357 * Allocated offset in @chunk on success, -errno on failure.
358 */
359static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
360{
361 int oslot = pcpu_chunk_slot(chunk);
362 int max_contig = 0;
363 int i, off;
364
365 /*
366 * The static chunk initially doesn't have map attached
367 * because kmalloc wasn't available during init. Give it one.
368 */
369 if (unlikely(!chunk->map)) {
370 chunk->map = pcpu_realloc(NULL, 0,
371 PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
372 if (!chunk->map)
373 return -ENOMEM;
374
375 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
376 chunk->map[chunk->map_used++] = -pcpu_static_size;
377 if (chunk->free_size)
378 chunk->map[chunk->map_used++] = chunk->free_size;
379 }
380
381 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
382 bool is_last = i + 1 == chunk->map_used;
383 int head, tail;
384
385 /* extra for alignment requirement */
386 head = ALIGN(off, align) - off;
387 BUG_ON(i == 0 && head != 0);
388
389 if (chunk->map[i] < 0)
390 continue;
391 if (chunk->map[i] < head + size) {
392 max_contig = max(chunk->map[i], max_contig);
393 continue;
394 }
395
396 /*
397 * If head is small or the previous block is free,
398 * merge'em. Note that 'small' is defined as smaller
399 * than sizeof(int), which is very small but isn't too
400 * uncommon for percpu allocations.
401 */
402 if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
403 if (chunk->map[i - 1] > 0)
404 chunk->map[i - 1] += head;
405 else {
406 chunk->map[i - 1] -= head;
407 chunk->free_size -= head;
408 }
409 chunk->map[i] -= head;
410 off += head;
411 head = 0;
412 }
413
414 /* if tail is small, just keep it around */
415 tail = chunk->map[i] - head - size;
416 if (tail < sizeof(int))
417 tail = 0;
418
419 /* split if warranted */
420 if (head || tail) {
421 if (pcpu_split_block(chunk, i, head, tail))
422 return -ENOMEM;
423 if (head) {
424 i++;
425 off += head;
426 max_contig = max(chunk->map[i - 1], max_contig);
427 }
428 if (tail)
429 max_contig = max(chunk->map[i + 1], max_contig);
430 }
431
432 /* update hint and mark allocated */
433 if (is_last)
434 chunk->contig_hint = max_contig; /* fully scanned */
435 else
436 chunk->contig_hint = max(chunk->contig_hint,
437 max_contig);
438
439 chunk->free_size -= chunk->map[i];
440 chunk->map[i] = -chunk->map[i];
441
442 pcpu_chunk_relocate(chunk, oslot);
443 return off;
444 }
445
446 chunk->contig_hint = max_contig; /* fully scanned */
447 pcpu_chunk_relocate(chunk, oslot);
448
449 /*
450 * Tell the upper layer that this chunk has no area left.
451 * Note that this is not an error condition but a notification
452 * to upper layer that it needs to look at other chunks.
453 * -ENOSPC is chosen as it isn't used in memory subsystem and
454 * matches the meaning in a way.
455 */
456 return -ENOSPC;
457}
458
459/**
460 * pcpu_free_area - free area to a pcpu_chunk
461 * @chunk: chunk of interest
462 * @freeme: offset of area to free
463 *
464 * Free area starting from @freeme to @chunk. Note that this function
465 * only modifies the allocation map. It doesn't depopulate or unmap
466 * the area.
467 */
468static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
469{
470 int oslot = pcpu_chunk_slot(chunk);
471 int i, off;
472
473 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
474 if (off == freeme)
475 break;
476 BUG_ON(off != freeme);
477 BUG_ON(chunk->map[i] > 0);
478
479 chunk->map[i] = -chunk->map[i];
480 chunk->free_size += chunk->map[i];
481
482 /* merge with previous? */
483 if (i > 0 && chunk->map[i - 1] >= 0) {
484 chunk->map[i - 1] += chunk->map[i];
485 chunk->map_used--;
486 memmove(&chunk->map[i], &chunk->map[i + 1],
487 (chunk->map_used - i) * sizeof(chunk->map[0]));
488 i--;
489 }
490 /* merge with next? */
491 if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
492 chunk->map[i] += chunk->map[i + 1];
493 chunk->map_used--;
494 memmove(&chunk->map[i + 1], &chunk->map[i + 2],
495 (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
496 }
497
498 chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
499 pcpu_chunk_relocate(chunk, oslot);
500}
501
502/**
503 * pcpu_unmap - unmap pages out of a pcpu_chunk
504 * @chunk: chunk of interest
505 * @page_start: page index of the first page to unmap
506 * @page_end: page index of the last page to unmap + 1
507 * @flush: whether to flush cache and tlb or not
508 *
509 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
510 * If @flush is true, vcache is flushed before unmapping and tlb
511 * after.
512 */
513static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
514 bool flush)
515{
516 unsigned int last = num_possible_cpus() - 1;
517 unsigned int cpu;
518
519 /*
520 * Each flushing trial can be very expensive, issue flush on
521 * the whole region at once rather than doing it for each cpu.
522 * This could be an overkill but is more scalable.
523 */
524 if (flush)
525 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
526 pcpu_chunk_addr(chunk, last, page_end));
527
528 for_each_possible_cpu(cpu)
529 unmap_kernel_range_noflush(
530 pcpu_chunk_addr(chunk, cpu, page_start),
531 (page_end - page_start) << PAGE_SHIFT);
532
533 /* ditto as flush_cache_vunmap() */
534 if (flush)
535 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
536 pcpu_chunk_addr(chunk, last, page_end));
537}
538
539/**
540 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
541 * @chunk: chunk to depopulate
542 * @off: offset to the area to depopulate
543 * @size: size of the area to depopulate
544 * @flush: whether to flush cache and tlb or not
545 *
546 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
547 * from @chunk. If @flush is true, vcache is flushed before unmapping
548 * and tlb after.
549 */
550static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off,
551 size_t size, bool flush)
552{
553 int page_start = PFN_DOWN(off);
554 int page_end = PFN_UP(off + size);
555 int unmap_start = -1;
556 int uninitialized_var(unmap_end);
557 unsigned int cpu;
558 int i;
559
560 for (i = page_start; i < page_end; i++) {
561 for_each_possible_cpu(cpu) {
562 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
563
564 if (!*pagep)
565 continue;
566
567 __free_page(*pagep);
568
569 /*
570 * If it's partial depopulation, it might get
571 * populated or depopulated again. Mark the
572 * page gone.
573 */
574 *pagep = NULL;
575
576 unmap_start = unmap_start < 0 ? i : unmap_start;
577 unmap_end = i + 1;
578 }
579 }
580
581 if (unmap_start >= 0)
582 pcpu_unmap(chunk, unmap_start, unmap_end, flush);
583}
584
585/**
586 * pcpu_map - map pages into a pcpu_chunk
587 * @chunk: chunk of interest
588 * @page_start: page index of the first page to map
589 * @page_end: page index of the last page to map + 1
590 *
591 * For each cpu, map pages [@page_start,@page_end) into @chunk.
592 * vcache is flushed afterwards.
593 */
594static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
595{
596 unsigned int last = num_possible_cpus() - 1;
597 unsigned int cpu;
598 int err;
599
600 for_each_possible_cpu(cpu) {
601 err = map_kernel_range_noflush(
602 pcpu_chunk_addr(chunk, cpu, page_start),
603 (page_end - page_start) << PAGE_SHIFT,
604 PAGE_KERNEL,
605 pcpu_chunk_pagep(chunk, cpu, page_start));
606 if (err < 0)
607 return err;
608 }
609
610 /* flush at once, please read comments in pcpu_unmap() */
611 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
612 pcpu_chunk_addr(chunk, last, page_end));
613 return 0;
614}
615
616/**
617 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
618 * @chunk: chunk of interest
619 * @off: offset to the area to populate
620 * @size: size of the area to populate
621 *
622 * For each cpu, populate and map pages [@page_start,@page_end) into
623 * @chunk. The area is cleared on return.
624 */
625static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
626{
627 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
628 int page_start = PFN_DOWN(off);
629 int page_end = PFN_UP(off + size);
630 int map_start = -1;
631 int map_end;
632 unsigned int cpu;
633 int i;
634
635 for (i = page_start; i < page_end; i++) {
636 if (pcpu_chunk_page_occupied(chunk, i)) {
637 if (map_start >= 0) {
638 if (pcpu_map(chunk, map_start, map_end))
639 goto err;
640 map_start = -1;
641 }
642 continue;
643 }
644
645 map_start = map_start < 0 ? i : map_start;
646 map_end = i + 1;
647
648 for_each_possible_cpu(cpu) {
649 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
650
651 *pagep = alloc_pages_node(cpu_to_node(cpu),
652 alloc_mask, 0);
653 if (!*pagep)
654 goto err;
655 }
656 }
657
658 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
659 goto err;
660
661 for_each_possible_cpu(cpu)
662 memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0,
663 size);
664
665 return 0;
666err:
667 /* likely under heavy memory pressure, give memory back */
668 pcpu_depopulate_chunk(chunk, off, size, true);
669 return -ENOMEM;
670}
671
672static void free_pcpu_chunk(struct pcpu_chunk *chunk)
673{
674 if (!chunk)
675 return;
676 if (chunk->vm)
677 free_vm_area(chunk->vm);
678 pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
679 kfree(chunk);
680}
681
682static struct pcpu_chunk *alloc_pcpu_chunk(void)
683{
684 struct pcpu_chunk *chunk;
685
686 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
687 if (!chunk)
688 return NULL;
689
690 chunk->map = pcpu_realloc(NULL, 0,
691 PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
692 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
693 chunk->map[chunk->map_used++] = pcpu_unit_size;
694
695 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
696 if (!chunk->vm) {
697 free_pcpu_chunk(chunk);
698 return NULL;
699 }
700
701 INIT_LIST_HEAD(&chunk->list);
702 chunk->free_size = pcpu_unit_size;
703 chunk->contig_hint = pcpu_unit_size;
704
705 return chunk;
706}
707
708/**
709 * __alloc_percpu - allocate percpu area
710 * @size: size of area to allocate
711 * @align: alignment of area (max PAGE_SIZE)
712 *
713 * Allocate percpu area of @size bytes aligned at @align. Might
714 * sleep. Might trigger writeouts.
715 *
716 * RETURNS:
717 * Percpu pointer to the allocated area on success, NULL on failure.
718 */
719void *__alloc_percpu(size_t size, size_t align)
720{
721 void *ptr = NULL;
722 struct pcpu_chunk *chunk;
723 int slot, off;
724
725 if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT ||
726 align > PAGE_SIZE)) {
727 WARN(true, "illegal size (%zu) or align (%zu) for "
728 "percpu allocation\n", size, align);
729 return NULL;
730 }
731
732 mutex_lock(&pcpu_mutex);
733
734 /* allocate area */
735 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
736 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
737 if (size > chunk->contig_hint)
738 continue;
739 off = pcpu_alloc_area(chunk, size, align);
740 if (off >= 0)
741 goto area_found;
742 if (off != -ENOSPC)
743 goto out_unlock;
744 }
745 }
746
747 /* hmmm... no space left, create a new chunk */
748 chunk = alloc_pcpu_chunk();
749 if (!chunk)
750 goto out_unlock;
751 pcpu_chunk_relocate(chunk, -1);
752 pcpu_chunk_addr_insert(chunk);
753
754 off = pcpu_alloc_area(chunk, size, align);
755 if (off < 0)
756 goto out_unlock;
757
758area_found:
759 /* populate, map and clear the area */
760 if (pcpu_populate_chunk(chunk, off, size)) {
761 pcpu_free_area(chunk, off);
762 goto out_unlock;
763 }
764
765 ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
766out_unlock:
767 mutex_unlock(&pcpu_mutex);
768 return ptr;
769}
770EXPORT_SYMBOL_GPL(__alloc_percpu);
771
772static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
773{
774 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
775 list_del(&chunk->list);
776 rb_erase(&chunk->rb_node, &pcpu_addr_root);
777 free_pcpu_chunk(chunk);
778}
779
780/**
781 * free_percpu - free percpu area
782 * @ptr: pointer to area to free
783 *
784 * Free percpu area @ptr. Might sleep.
785 */
786void free_percpu(void *ptr)
787{
788 void *addr = __pcpu_ptr_to_addr(ptr);
789 struct pcpu_chunk *chunk;
790 int off;
791
792 if (!ptr)
793 return;
794
795 mutex_lock(&pcpu_mutex);
796
797 chunk = pcpu_chunk_addr_search(addr);
798 off = addr - chunk->vm->addr;
799
800 pcpu_free_area(chunk, off);
801
802 /* the chunk became fully free, kill one if there are other free ones */
803 if (chunk->free_size == pcpu_unit_size) {
804 struct pcpu_chunk *pos;
805
806 list_for_each_entry(pos,
807 &pcpu_slot[pcpu_chunk_slot(chunk)], list)
808 if (pos != chunk) {
809 pcpu_kill_chunk(pos);
810 break;
811 }
812 }
813
814 mutex_unlock(&pcpu_mutex);
815}
816EXPORT_SYMBOL_GPL(free_percpu);
817
818/**
819 * pcpu_setup_static - initialize kernel static percpu area
820 * @populate_pte_fn: callback to allocate pagetable
821 * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages
822 *
823 * Initialize kernel static percpu area. The caller should allocate
824 * all the necessary pages and pass them in @pages.
825 * @populate_pte_fn() is called on each page to be used for percpu
826 * mapping and is responsible for making sure all the necessary page
827 * tables for the page is allocated.
828 *
829 * RETURNS:
830 * The determined pcpu_unit_size which can be used to initialize
831 * percpu access.
832 */
833size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
834 struct page **pages, size_t cpu_size)
835{
836 static struct vm_struct static_vm;
837 struct pcpu_chunk *static_chunk;
838 int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE);
839 unsigned int cpu;
840 int err, i;
841
842 pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT,
843 order_base_2(cpu_size) - PAGE_SHIFT);
844
845 pcpu_static_size = cpu_size;
846 pcpu_unit_pages = 1 << pcpu_unit_pages_shift;
847 pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift;
848 pcpu_unit_size = 1 << pcpu_unit_shift;
849 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
850 pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1;
851 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
852 + (1 << pcpu_unit_pages_shift) * sizeof(struct page *);
853
854 /* allocate chunk slots */
855 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
856 for (i = 0; i < pcpu_nr_slots; i++)
857 INIT_LIST_HEAD(&pcpu_slot[i]);
858
859 /* init and register vm area */
860 static_vm.flags = VM_ALLOC;
861 static_vm.size = pcpu_chunk_size;
862 vm_area_register_early(&static_vm);
863
864 /* init static_chunk */
865 static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
866 INIT_LIST_HEAD(&static_chunk->list);
867 static_chunk->vm = &static_vm;
868 static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
869 static_chunk->contig_hint = static_chunk->free_size;
870
871 /* assign pages and map them */
872 for_each_possible_cpu(cpu) {
873 for (i = 0; i < nr_cpu_pages; i++) {
874 *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++;
875 populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i));
876 }
877 }
878
879 err = pcpu_map(static_chunk, 0, nr_cpu_pages);
880 if (err)
881 panic("failed to setup static percpu area, err=%d\n", err);
882
883 /* link static_chunk in */
884 pcpu_chunk_relocate(static_chunk, -1);
885 pcpu_chunk_addr_insert(static_chunk);
886
887 /* we're done */
888 pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
889 return pcpu_unit_size;
890}