author    Kent Overstreet <kmo@daterainc.com>      2013-08-16 18:04:37 -0400
committer Nicholas Bellinger <nab@linux-iscsi.org> 2013-09-09 17:29:15 -0400
commit    798ab48eecdf659df9ae0064ca5c62626c651827 (patch)
tree      1b78a050ec898f2647ad3c58c67a10462246740f
parent    6faaa85f375543ea0d49a27e953ed18aec05ae56 (diff)
idr: Percpu ida
Percpu frontend for allocating ids. With percpu allocation (that works), it's
impossible to guarantee it will always be possible to allocate all nr_tags -
typically, some will be stuck on a remote percpu freelist where the current
job can't get to them.

We do guarantee that it will always be possible to allocate at least
(nr_tags / 2) tags - this is done by keeping track of which and how many cpus
have tags on their percpu freelists. On allocation failure, if enough cpus
have tags that there could potentially be (nr_tags / 2) tags stuck on remote
percpu freelists, we pick a remote cpu at random to steal from.

Note that there's no cpu hotplug notifier - we don't care, because
steal_tags() will eventually get the down cpu's tags. We _could_ satisfy more
allocations if we had a notifier - but we'll still meet our guarantees and
it's absolutely not a correctness issue, so I don't think it's worth the
extra code.

From akpm: "It looks OK to me (that's as close as I get to an ack :))"

v6 changes:
- Add #include <linux/cpumask.h> to include/linux/percpu_ida.h to make
  alpha/arc builds happy (Fengguang)
- Move second (cpu >= nr_cpu_ids) check inside of first check scope in
  steal_tags() (akpm + nab)

v5 changes:
- Change percpu_ida->cpus_have_tags to cpumask_t (kmo + akpm)
- Add comment for percpu_ida_cpu->lock + ->nr_free (kmo + akpm)
- Convert steal_tags() to use cpumask_weight() + cpumask_next() +
  cpumask_first() + cpumask_clear_cpu() (kmo + akpm)
- Add comment for alloc_global_tags() (kmo + akpm)
- Convert percpu_ida_alloc() to use cpumask_set_cpu() (kmo + akpm)
- Convert percpu_ida_free() to use cpumask_set_cpu() (kmo + akpm)
- Drop percpu_ida->cpus_have_tags allocation in percpu_ida_init() (kmo + akpm)
- Drop percpu_ida->cpus_have_tags kfree in percpu_ida_destroy() (kmo + akpm)
- Add comment for percpu_ida_alloc @ gfp (kmo + akpm)
- Move to percpu_ida.c + percpu_ida.h (kmo + akpm + nab)

v4 changes:
- Fix tags.c reference in percpu_ida_init (akpm)

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
-rw-r--r--	include/linux/percpu_ida.h	 60
-rw-r--r--	lib/Makefile	  5
-rw-r--r--	lib/percpu_ida.c	335
3 files changed, 398 insertions(+), 2 deletions(-)
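
To see the (nr_tags / 2) guarantee concretely: steal_tags() in the patch only
gives up once the cpus flagged in cpus_have_tags can no longer be hiding more
than half the pool. A standalone sketch of that threshold arithmetic, using
the constants from lib/percpu_ida.c below and a made-up pool size:

#include <stdio.h>

/* Constants as defined in lib/percpu_ida.c in this patch: */
#define IDA_PCPU_BATCH_MOVE	32U
#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)	/* 48 */

int main(void)
{
	unsigned nr_tags = 256;	/* example pool size, not from the patch */
	unsigned cpus;

	/*
	 * steal_tags() keeps stealing while the flagged cpus could be
	 * hiding more than half the pool on their percpu freelists:
	 */
	for (cpus = 4; cpus > 0; cpus--)
		printf("%u flagged cpus (<= %u stranded tags): %s\n",
		       cpus, cpus * IDA_PCPU_SIZE,
		       cpus * IDA_PCPU_SIZE > nr_tags / 2 ?
		       "keep stealing" : "stop, guarantee already met");
	return 0;
}

With nr_tags = 256, stealing stops once two or fewer cpus are flagged: at most
2 * 48 = 96 < 128 tags can then be stranded remotely, so at least half the
pool remains allocatable.
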
diff --git a/include/linux/percpu_ida.h b/include/linux/percpu_ida.h
new file mode 100644
index 000000000000..0b23edbee309
--- /dev/null
+++ b/include/linux/percpu_ida.h
@@ -0,0 +1,60 @@
#ifndef __PERCPU_IDA_H__
#define __PERCPU_IDA_H__

#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/init.h>
#include <linux/spinlock_types.h>
#include <linux/wait.h>
#include <linux/cpumask.h>

struct percpu_ida_cpu;

struct percpu_ida {
	/*
	 * number of tags available to be allocated, as passed to
	 * percpu_ida_init()
	 */
	unsigned			nr_tags;

	struct percpu_ida_cpu __percpu	*tag_cpu;

	/*
	 * Bitmap of cpus that (may) have tags on their percpu freelists:
	 * steal_tags() uses this to decide when to steal tags, and which cpus
	 * to try stealing from.
	 *
	 * It's ok for a freelist to be empty when its bit is set - steal_tags()
	 * will just keep looking - but the bitmap _must_ be set whenever a
	 * percpu freelist does have tags.
	 */
	cpumask_t			cpus_have_tags;

	struct {
		spinlock_t		lock;
		/*
		 * When we go to steal tags from another cpu (see steal_tags()),
		 * we want to pick a cpu at random. Cycling through them every
		 * time we steal is a bit easier and more or less equivalent:
		 */
		unsigned		cpu_last_stolen;

		/* For sleeping on allocation failure */
		wait_queue_head_t	wait;

		/*
		 * Global freelist - it's a stack where nr_free points to the
		 * top
		 */
		unsigned		nr_free;
		unsigned		*freelist;
	} ____cacheline_aligned_in_smp;
};

int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp);
void percpu_ida_free(struct percpu_ida *pool, unsigned tag);

void percpu_ida_destroy(struct percpu_ida *pool);
int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags);

#endif /* __PERCPU_IDA_H__ */
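
The header above is the whole public API. A hedged usage sketch follows; the
driver-side names (NR_CMDS, struct my_cmd, the my_* functions) are invented
for illustration, and only the four percpu_ida_* calls come from this patch:

#include <linux/gfp.h>
#include <linux/percpu_ida.h>
#include <linux/slab.h>

#define NR_CMDS	128	/* hypothetical command-slot count */

struct my_cmd {		/* hypothetical per-tag state */
	int in_flight;
};

static struct percpu_ida cmd_tags;
static struct my_cmd *cmds;

static int my_driver_setup(void)
{
	int ret;

	cmds = kcalloc(NR_CMDS, sizeof(*cmds), GFP_KERNEL);
	if (!cmds)
		return -ENOMEM;

	ret = percpu_ida_init(&cmd_tags, NR_CMDS);
	if (ret)
		kfree(cmds);
	return ret;
}

static struct my_cmd *my_cmd_get(void)
{
	/* __GFP_WAIT (implied by GFP_KERNEL) means this cannot fail: */
	int tag = percpu_ida_alloc(&cmd_tags, GFP_KERNEL);

	return &cmds[tag];	/* the tag indexes the preallocated array */
}

static void my_cmd_put(struct my_cmd *cmd)
{
	percpu_ida_free(&cmd_tags, cmd - cmds);
}

static void my_driver_teardown(void)
{
	percpu_ida_destroy(&cmd_tags);
	kfree(cmds);
}
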
diff --git a/lib/Makefile b/lib/Makefile
index 7baccfd8a4e9..1cb83568c40f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o percpu-refcount.o
+	 earlycpio.o percpu-refcount.o percpu_ida.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
@@ -24,7 +24,8 @@ lib-y += kobject.o klist.o
 obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
-	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o
+	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
+	 percpu_ida.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += kstrtox.o
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
new file mode 100644
index 000000000000..bab1ba2a4c71
--- /dev/null
+++ b/lib/percpu_ida.c
@@ -0,0 +1,335 @@
/*
 * Percpu IDA library
 *
 * Copyright (C) 2013 Datera, Inc. Kent Overstreet
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/err.h>
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/percpu_ida.h>

/*
 * Number of tags we move between the percpu freelist and the global freelist
 * at a time
 */
#define IDA_PCPU_BATCH_MOVE	32U

/* Max size of percpu freelist */
#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)

struct percpu_ida_cpu {
	/*
	 * Even though this is percpu, we need a lock for tag stealing by remote
	 * CPUs:
	 */
	spinlock_t			lock;

	/* nr_free/freelist form a stack of free IDs */
	unsigned			nr_free;
	unsigned			freelist[];
};

static inline void move_tags(unsigned *dst, unsigned *dst_nr,
			     unsigned *src, unsigned *src_nr,
			     unsigned nr)
{
	*src_nr -= nr;
	memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
	*dst_nr += nr;
}
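
move_tags() transfers the top nr entries of one freelist stack onto another.
A minimal userspace sketch of its semantics (the arrays and counts are
made-up example data, not part of the patch):

#include <stdio.h>
#include <string.h>

/* Same logic as move_tags() above, in plain C for illustration. */
static void move_tags(unsigned *dst, unsigned *dst_nr,
		      unsigned *src, unsigned *src_nr,
		      unsigned nr)
{
	*src_nr -= nr;
	memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
	*dst_nr += nr;
}

int main(void)
{
	unsigned global[8] = { 0, 1, 2, 3, 4, 5 }, nr_global = 6;
	unsigned local[8], nr_local = 0;
	unsigned i;

	/* Pop the top two tags off the global stack onto the local one: */
	move_tags(local, &nr_local, global, &nr_global, 2);

	for (i = 0; i < nr_local; i++)
		printf("local[%u] = %u\n", i, local[i]);	/* 4, then 5 */
	printf("%u tags left on the global stack\n", nr_global);	/* 4 */
	return 0;
}
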

/*
 * Try to steal tags from a remote cpu's percpu freelist.
 *
 * We first check how many percpu freelists have tags - we don't steal tags
 * unless enough percpu freelists have tags on them that it's possible more than
 * half the total tags could be stuck on remote percpu freelists.
 *
 * Then we iterate through the cpus until we find some tags - we don't attempt
 * to find the "best" cpu to steal from, to keep cacheline bouncing to a
 * minimum.
 */
static inline void steal_tags(struct percpu_ida *pool,
			      struct percpu_ida_cpu *tags)
{
	unsigned cpus_have_tags, cpu = pool->cpu_last_stolen;
	struct percpu_ida_cpu *remote;

	for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
	     cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
	     cpus_have_tags--) {
		cpu = cpumask_next(cpu, &pool->cpus_have_tags);

		if (cpu >= nr_cpu_ids) {
			cpu = cpumask_first(&pool->cpus_have_tags);
			if (cpu >= nr_cpu_ids)
				BUG();
		}

		pool->cpu_last_stolen = cpu;
		remote = per_cpu_ptr(pool->tag_cpu, cpu);

		cpumask_clear_cpu(cpu, &pool->cpus_have_tags);

		if (remote == tags)
			continue;

		spin_lock(&remote->lock);

		if (remote->nr_free) {
			memcpy(tags->freelist,
			       remote->freelist,
			       sizeof(unsigned) * remote->nr_free);

			tags->nr_free = remote->nr_free;
			remote->nr_free = 0;
		}

		spin_unlock(&remote->lock);

		if (tags->nr_free)
			break;
	}
}

/*
 * Pop up to IDA_PCPU_BATCH_MOVE IDs off the global freelist, and push them onto
 * our percpu freelist:
 */
static inline void alloc_global_tags(struct percpu_ida *pool,
				     struct percpu_ida_cpu *tags)
{
	move_tags(tags->freelist, &tags->nr_free,
		  pool->freelist, &pool->nr_free,
		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
}

static inline unsigned alloc_local_tag(struct percpu_ida *pool,
				       struct percpu_ida_cpu *tags)
{
	int tag = -ENOSPC;

	spin_lock(&tags->lock);
	if (tags->nr_free)
		tag = tags->freelist[--tags->nr_free];
	spin_unlock(&tags->lock);

	return tag;
}

/**
 * percpu_ida_alloc - allocate a tag
 * @pool: pool to allocate from
 * @gfp: gfp flags
 *
 * Returns a tag - an integer in the range [0..nr_tags) (passed to
 * percpu_ida_init()), or -ENOSPC on allocation failure.
 *
 * Safe to be called from interrupt context (assuming it isn't passed
 * __GFP_WAIT, of course).
 *
 * @gfp indicates whether or not to wait until a free id is available (it's not
 * used for internal memory allocations); thus if passed __GFP_WAIT we may sleep
 * however long it takes until another thread frees an id (same semantics as a
 * mempool).
 *
 * Will not fail if passed __GFP_WAIT.
 */
int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp)
{
	DEFINE_WAIT(wait);
	struct percpu_ida_cpu *tags;
	unsigned long flags;
	int tag;

	local_irq_save(flags);
	tags = this_cpu_ptr(pool->tag_cpu);

	/* Fastpath */
	tag = alloc_local_tag(pool, tags);
	if (likely(tag >= 0)) {
		local_irq_restore(flags);
		return tag;
	}

	while (1) {
		spin_lock(&pool->lock);

		/*
		 * prepare_to_wait() must come before steal_tags(), in case
		 * percpu_ida_free() on another cpu flips a bit in
		 * cpus_have_tags
		 *
		 * global lock held and irqs disabled, don't need percpu lock
		 */
		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);

		if (!tags->nr_free)
			alloc_global_tags(pool, tags);
		if (!tags->nr_free)
			steal_tags(pool, tags);

		if (tags->nr_free) {
			tag = tags->freelist[--tags->nr_free];
			if (tags->nr_free)
				cpumask_set_cpu(smp_processor_id(),
						&pool->cpus_have_tags);
		}

		spin_unlock(&pool->lock);
		local_irq_restore(flags);

		if (tag >= 0 || !(gfp & __GFP_WAIT))
			break;

		schedule();

		local_irq_save(flags);
		tags = this_cpu_ptr(pool->tag_cpu);
	}

	finish_wait(&pool->wait, &wait);
	return tag;
}
EXPORT_SYMBOL_GPL(percpu_ida_alloc);
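
Because @gfp only controls whether the caller waits, the same function serves
atomic contexts; a minimal sketch, where my_pool is a hypothetical,
already-initialized pool:

#include <linux/gfp.h>
#include <linux/percpu_ida.h>

static struct percpu_ida my_pool;	/* assume percpu_ida_init() succeeded */

static int my_irq_path_get_tag(void)
{
	/* No __GFP_WAIT: safe in interrupt context, may return -ENOSPC. */
	return percpu_ida_alloc(&my_pool, GFP_NOWAIT);
}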

/**
 * percpu_ida_free - free a tag
 * @pool: pool @tag was allocated from
 * @tag: a tag previously allocated with percpu_ida_alloc()
 *
 * Safe to be called from interrupt context.
 */
void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
{
	struct percpu_ida_cpu *tags;
	unsigned long flags;
	unsigned nr_free;

	BUG_ON(tag >= pool->nr_tags);

	local_irq_save(flags);
	tags = this_cpu_ptr(pool->tag_cpu);

	spin_lock(&tags->lock);
	tags->freelist[tags->nr_free++] = tag;

	nr_free = tags->nr_free;
	spin_unlock(&tags->lock);

	if (nr_free == 1) {
		cpumask_set_cpu(smp_processor_id(),
				&pool->cpus_have_tags);
		wake_up(&pool->wait);
	}

	if (nr_free == IDA_PCPU_SIZE) {
		spin_lock(&pool->lock);

		/*
		 * Global lock held and irqs disabled, don't need percpu
		 * lock
		 */
		if (tags->nr_free == IDA_PCPU_SIZE) {
			move_tags(pool->freelist, &pool->nr_free,
				  tags->freelist, &tags->nr_free,
				  IDA_PCPU_BATCH_MOVE);

			wake_up(&pool->wait);
		}
		spin_unlock(&pool->lock);
	}

	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(percpu_ida_free);
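
Note the hysteresis in the free path: a percpu freelist that fills to
IDA_PCPU_SIZE (48) spills only IDA_PCPU_BATCH_MOVE (32) tags, keeping 16
cached locally so a free/alloc burst on one cpu doesn't bounce on the global
lock. A standalone sketch of that arithmetic (the 96-iteration loop is an
arbitrary example, not from the patch):

#include <stdio.h>

#define IDA_PCPU_BATCH_MOVE	32U
#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)

int main(void)
{
	unsigned nr_free = 0, global = 0, i;

	/* Simulate one cpu freeing 96 tags back-to-back: */
	for (i = 0; i < 96; i++) {
		nr_free++;
		if (nr_free == IDA_PCPU_SIZE) {
			/* percpu_ida_free() returns one batch globally */
			nr_free -= IDA_PCPU_BATCH_MOVE;
			global += IDA_PCPU_BATCH_MOVE;
		}
	}
	printf("local %u, global %u\n", nr_free, global);	/* 32, 64 */
	return 0;
}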

/**
 * percpu_ida_destroy - release a tag pool's resources
 * @pool: pool to free
 *
 * Frees the resources allocated by percpu_ida_init().
 */
void percpu_ida_destroy(struct percpu_ida *pool)
{
	free_percpu(pool->tag_cpu);
	free_pages((unsigned long) pool->freelist,
		   get_order(pool->nr_tags * sizeof(unsigned)));
}
EXPORT_SYMBOL_GPL(percpu_ida_destroy);

/**
 * percpu_ida_init - initialize a percpu tag pool
 * @pool: pool to initialize
 * @nr_tags: number of tags that will be available for allocation
 *
 * Initializes @pool so that it can be used to allocate tags - integers in the
 * range [0, nr_tags). Typically, they'll be used by driver code to refer to a
 * preallocated array of tag structures.
 *
 * Allocation is percpu, but sharding is limited by nr_tags - for best
 * performance, the workload should not span more cpus than nr_tags / 128.
 */
int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
{
	unsigned i, cpu, order;

	memset(pool, 0, sizeof(*pool));

	init_waitqueue_head(&pool->wait);
	spin_lock_init(&pool->lock);
	pool->nr_tags = nr_tags;

	/* Guard against overflow */
	if (nr_tags > (unsigned) INT_MAX + 1) {
		pr_err("percpu_ida_init(): nr_tags too large\n");
		return -EINVAL;
	}

	order = get_order(nr_tags * sizeof(unsigned));
	pool->freelist = (void *) __get_free_pages(GFP_KERNEL, order);
	if (!pool->freelist)
		return -ENOMEM;

	for (i = 0; i < nr_tags; i++)
		pool->freelist[i] = i;

	pool->nr_free = nr_tags;

	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
				       IDA_PCPU_SIZE * sizeof(unsigned),
				       sizeof(unsigned));
	if (!pool->tag_cpu)
		goto err;

	for_each_possible_cpu(cpu)
		spin_lock_init(&per_cpu_ptr(pool->tag_cpu, cpu)->lock);

	return 0;
err:
	percpu_ida_destroy(pool);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(percpu_ida_init);