Diffstat (limited to 'lib/percpu_ida.c')
 -rw-r--r--  lib/percpu_ida.c  335
 1 file changed, 335 insertions, 0 deletions
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
new file mode 100644
index 000000000000..bab1ba2a4c71
--- /dev/null
+++ b/lib/percpu_ida.c
@@ -0,0 +1,335 @@
/*
 * Percpu IDA library
 *
 * Copyright (C) 2013 Datera, Inc. Kent Overstreet
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/err.h>
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/percpu_ida.h>

/*
 * Number of tags we move between the percpu freelist and the global freelist at
 * a time
 */
#define IDA_PCPU_BATCH_MOVE	32U

/* Max size of percpu freelist */
#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)
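/*
 * With the default batch of 32, this works out to 48: when a percpu
 * freelist fills up, percpu_ida_free() returns one batch of 32 tags to
 * the global freelist, leaving 16 behind for local allocation.
 */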

struct percpu_ida_cpu {
	/*
	 * Even though this is percpu, we need a lock for tag stealing by remote
	 * CPUs:
	 */
	spinlock_t			lock;

	/* nr_free/freelist form a stack of free IDs */
	unsigned			nr_free;
	unsigned			freelist[];
};
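
/*
 * freelist[] above is a flexible array member: percpu_ida_init() sizes
 * each percpu copy for IDA_PCPU_SIZE entries.
 */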

static inline void move_tags(unsigned *dst, unsigned *dst_nr,
			     unsigned *src, unsigned *src_nr,
			     unsigned nr)
{
	*src_nr -= nr;
	memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
	*dst_nr += nr;
}
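
/*
 * Worked example (hypothetical values): if src holds {5, 9, 2} with
 * *src_nr == 3 and *dst_nr == 1, then move_tags(dst, &dst_nr, src,
 * &src_nr, 2) pops {9, 2} off the top of src into dst[1] and dst[2],
 * leaving *src_nr == 1 and *dst_nr == 3.
 */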

/*
 * Try to steal tags from a remote cpu's percpu freelist.
 *
 * We first check how many percpu freelists have tags - we don't steal tags
 * unless enough percpu freelists have tags on them that it's possible more than
 * half the total tags could be stuck on remote percpu freelists.
 *
 * Then we iterate through the cpus until we find some tags - we don't attempt
 * to find the "best" cpu to steal from, to keep cacheline bouncing to a
 * minimum.
 */
static inline void steal_tags(struct percpu_ida *pool,
			      struct percpu_ida_cpu *tags)
{
	unsigned cpus_have_tags, cpu = pool->cpu_last_stolen;
	struct percpu_ida_cpu *remote;

	for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
	     cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
	     cpus_have_tags--) {
		cpu = cpumask_next(cpu, &pool->cpus_have_tags);

		if (cpu >= nr_cpu_ids) {
			cpu = cpumask_first(&pool->cpus_have_tags);
			if (cpu >= nr_cpu_ids)
				BUG();
		}

		pool->cpu_last_stolen = cpu;
		remote = per_cpu_ptr(pool->tag_cpu, cpu);

		cpumask_clear_cpu(cpu, &pool->cpus_have_tags);

		if (remote == tags)
			continue;

		spin_lock(&remote->lock);

		if (remote->nr_free) {
			memcpy(tags->freelist,
			       remote->freelist,
			       sizeof(unsigned) * remote->nr_free);

			tags->nr_free = remote->nr_free;
			remote->nr_free = 0;
		}

		spin_unlock(&remote->lock);

		if (tags->nr_free)
			break;
	}
}
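
/*
 * Worked example of the threshold above (hypothetical numbers): with
 * pool->nr_tags == 1024 and IDA_PCPU_SIZE == 48, stealing proceeds only
 * while cpus_have_tags * 48 > 512, i.e. while at least 11 cpus are
 * marked in pool->cpus_have_tags.
 */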

/*
 * Pop up to IDA_PCPU_BATCH_MOVE IDs off the global freelist, and push them onto
 * our percpu freelist:
 */
static inline void alloc_global_tags(struct percpu_ida *pool,
				     struct percpu_ida_cpu *tags)
{
	move_tags(tags->freelist, &tags->nr_free,
		  pool->freelist, &pool->nr_free,
		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
}

/*
 * Pop a tag off this cpu's freelist; returns -ENOSPC if the freelist
 * is empty. The return type must be int (not unsigned) so the error
 * value reaches the caller intact.
 */
static inline int alloc_local_tag(struct percpu_ida *pool,
				  struct percpu_ida_cpu *tags)
{
	int tag = -ENOSPC;

	spin_lock(&tags->lock);
	if (tags->nr_free)
		tag = tags->freelist[--tags->nr_free];
	spin_unlock(&tags->lock);

	return tag;
}

/**
 * percpu_ida_alloc - allocate a tag
 * @pool: pool to allocate from
 * @gfp: gfp flags
 *
 * Returns a tag - an integer in the range [0..nr_tags) (as passed to
 * percpu_ida_init()) - or -ENOSPC on allocation failure.
 *
 * Safe to be called from interrupt context (assuming it isn't passed
 * __GFP_WAIT, of course).
 *
 * @gfp indicates whether or not to wait until a free id is available (it's not
 * used for internal memory allocations); thus if passed __GFP_WAIT we may sleep
 * however long it takes until another thread frees an id (same semantics as a
 * mempool).
 *
 * Will not fail if passed __GFP_WAIT.
 */
int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp)
{
	DEFINE_WAIT(wait);
	struct percpu_ida_cpu *tags;
	unsigned long flags;
	int tag;

	local_irq_save(flags);
	tags = this_cpu_ptr(pool->tag_cpu);

	/* Fastpath */
	tag = alloc_local_tag(pool, tags);
	if (likely(tag >= 0)) {
		local_irq_restore(flags);
		return tag;
	}

	while (1) {
		spin_lock(&pool->lock);

		/*
		 * prepare_to_wait() must come before steal_tags(), in case
		 * percpu_ida_free() on another cpu flips a bit in
		 * cpus_have_tags
		 *
		 * global lock held and irqs disabled, don't need percpu lock
		 */
		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);

		if (!tags->nr_free)
			alloc_global_tags(pool, tags);
		if (!tags->nr_free)
			steal_tags(pool, tags);

		if (tags->nr_free) {
			tag = tags->freelist[--tags->nr_free];
			if (tags->nr_free)
				cpumask_set_cpu(smp_processor_id(),
						&pool->cpus_have_tags);
		}

		spin_unlock(&pool->lock);
		local_irq_restore(flags);

		if (tag >= 0 || !(gfp & __GFP_WAIT))
			break;

		schedule();

		local_irq_save(flags);
		tags = this_cpu_ptr(pool->tag_cpu);
	}

	finish_wait(&pool->wait, &wait);
	return tag;
}
EXPORT_SYMBOL_GPL(percpu_ida_alloc);
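
/*
 * Calling sketch (hypothetical caller, not part of this file):
 *
 *	tag = percpu_ida_alloc(&pool, GFP_NOWAIT);	no __GFP_WAIT: returns
 *							-ENOSPC if nothing free
 *	tag = percpu_ida_alloc(&pool, GFP_KERNEL);	__GFP_WAIT set: blocks
 *							until a tag is freed
 */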

/**
 * percpu_ida_free - free a tag
 * @pool: pool @tag was allocated from
 * @tag: a tag previously allocated with percpu_ida_alloc()
 *
 * Safe to be called from interrupt context.
 */
void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
{
	struct percpu_ida_cpu *tags;
	unsigned long flags;
	unsigned nr_free;

	BUG_ON(tag >= pool->nr_tags);

	local_irq_save(flags);
	tags = this_cpu_ptr(pool->tag_cpu);

	spin_lock(&tags->lock);
	tags->freelist[tags->nr_free++] = tag;

	nr_free = tags->nr_free;
	spin_unlock(&tags->lock);

	if (nr_free == 1) {
		cpumask_set_cpu(smp_processor_id(),
				&pool->cpus_have_tags);
		wake_up(&pool->wait);
	}

	if (nr_free == IDA_PCPU_SIZE) {
		spin_lock(&pool->lock);

		/*
		 * Global lock held and irqs disabled, don't need percpu
		 * lock
		 */
		if (tags->nr_free == IDA_PCPU_SIZE) {
			move_tags(pool->freelist, &pool->nr_free,
				  tags->freelist, &tags->nr_free,
				  IDA_PCPU_BATCH_MOVE);

			wake_up(&pool->wait);
		}
		spin_unlock(&pool->lock);
	}

	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(percpu_ida_free);
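
/*
 * Note the two wake_up() calls above: the nr_free == 1 case signals that
 * this cpu has tags again (after setting its bit in cpus_have_tags so
 * steal_tags() can find them); the nr_free == IDA_PCPU_SIZE case signals
 * that a batch was just returned to the global freelist.
 */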

/**
 * percpu_ida_destroy - release a tag pool's resources
 * @pool: pool to free
 *
 * Frees the resources allocated by percpu_ida_init().
 */
void percpu_ida_destroy(struct percpu_ida *pool)
{
	free_percpu(pool->tag_cpu);
	free_pages((unsigned long) pool->freelist,
		   get_order(pool->nr_tags * sizeof(unsigned)));
}
EXPORT_SYMBOL_GPL(percpu_ida_destroy);

/**
 * percpu_ida_init - initialize a percpu tag pool
 * @pool: pool to initialize
 * @nr_tags: number of tags that will be available for allocation
 *
 * Initializes @pool so that it can be used to allocate tags - integers in the
 * range [0, nr_tags). Typically, they'll be used by driver code to refer to a
 * preallocated array of tag structures.
 *
 * Allocation is percpu, but sharding is limited by nr_tags - for best
 * performance, the workload should not span more cpus than nr_tags / 128.
 */
int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
{
	unsigned i, cpu, order;

	memset(pool, 0, sizeof(*pool));

	init_waitqueue_head(&pool->wait);
	spin_lock_init(&pool->lock);
	pool->nr_tags = nr_tags;

	/* Guard against overflow */
	if (nr_tags > (unsigned) INT_MAX + 1) {
		pr_err("percpu_ida_init(): nr_tags too large\n");
		return -EINVAL;
	}

	order = get_order(nr_tags * sizeof(unsigned));
	pool->freelist = (void *) __get_free_pages(GFP_KERNEL, order);
	if (!pool->freelist)
		return -ENOMEM;

	for (i = 0; i < nr_tags; i++)
		pool->freelist[i] = i;

	pool->nr_free = nr_tags;

	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
				       IDA_PCPU_SIZE * sizeof(unsigned),
				       sizeof(unsigned));
	if (!pool->tag_cpu)
		goto err;

	for_each_possible_cpu(cpu)
		spin_lock_init(&per_cpu_ptr(pool->tag_cpu, cpu)->lock);

	return 0;
err:
	percpu_ida_destroy(pool);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(percpu_ida_init);
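
/*
 * Usage sketch (illustrative only, not part of the original file): a
 * hypothetical driver pairing a pool with a preallocated request array.
 * The names (struct my_dev, struct my_request, NR_REQUESTS) are invented
 * for the example.
 */
#if 0
#include <linux/gfp.h>
#include <linux/percpu_ida.h>

#define NR_REQUESTS	1024

struct my_request {
	void *data;
};

struct my_dev {
	struct percpu_ida	tag_pool;
	struct my_request	requests[NR_REQUESTS];
};

static int my_dev_init(struct my_dev *dev)
{
	/* Tags 0..NR_REQUESTS-1 index into dev->requests[] */
	return percpu_ida_init(&dev->tag_pool, NR_REQUESTS);
}

static struct my_request *my_dev_get_request(struct my_dev *dev)
{
	/* GFP_KERNEL includes __GFP_WAIT, so this blocks rather than fails */
	int tag = percpu_ida_alloc(&dev->tag_pool, GFP_KERNEL);

	return &dev->requests[tag];
}

static void my_dev_put_request(struct my_dev *dev, struct my_request *req)
{
	percpu_ida_free(&dev->tag_pool, req - dev->requests);
}

static void my_dev_exit(struct my_dev *dev)
{
	percpu_ida_destroy(&dev->tag_pool);
}
#endif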