diff options
author | Kent Overstreet <koverstreet@google.com> | 2013-05-31 18:26:45 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2013-06-03 18:36:41 -0400 |
commit | 215e262f2aeba378aa192da07c30770f9925a4bf (patch) | |
tree | c854461e40f3ce9dde45f7128679b20a362643f8 /lib | |
parent | 042dd60ca6dec9a02cefa8edd67de386e35755d6 (diff) |
percpu: implement generic percpu refcounting
This implements a refcount with similar semantics to
atomic_inc()/atomic_dec_and_test() - but percpu.
It also implements two stage shutdown, as we need it to tear down the
percpu counts. Before dropping the initial refcount, you must call
percpu_ref_kill(); this puts the refcount in "shutting down mode" and
switches back to a single atomic refcount with the appropriate
barriers (an RCU grace period, via call_rcu()).
Note that percpu_ref_kill() must be called exactly once: as implemented
here it returns void, and a second call triggers a WARN and returns without
doing anything, so callers must make sure it is only invoked once.
[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: coding-style tweak]
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Cc: Zach Brown <zab@redhat.com>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Asai Thambi S P <asamymuthupa@micron.com>
Cc: Selvan Mani <smani@micron.com>
Cc: Sam Bradshaw <sbradshaw@micron.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Reviewed-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Makefile | 2 | ||||
-rw-r--r-- | lib/percpu-refcount.c | 128 |
2 files changed, 129 insertions, 1 deletions
diff --git a/lib/Makefile b/lib/Makefile index c55a037a354e..386db4bbc265 100644 --- a/lib/Makefile +++ b/lib/Makefile | |||
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ | |||
13 | sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ | 13 | sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ |
14 | proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ | 14 | proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ |
15 | is_single_threaded.o plist.o decompress.o kobject_uevent.o \ | 15 | is_single_threaded.o plist.o decompress.o kobject_uevent.o \ |
16 | earlycpio.o | 16 | earlycpio.o percpu-refcount.o |
17 | 17 | ||
18 | obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o | 18 | obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o |
19 | lib-$(CONFIG_MMU) += ioremap.o | 19 | lib-$(CONFIG_MMU) += ioremap.o |
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c new file mode 100644 index 000000000000..6f0ffd702a09 --- /dev/null +++ b/lib/percpu-refcount.c | |||
@@ -0,0 +1,128 @@ | |||
1 | #define pr_fmt(fmt) "%s: " fmt "\n", __func__ | ||
2 | |||
3 | #include <linux/kernel.h> | ||
4 | #include <linux/percpu-refcount.h> | ||
5 | |||
6 | /* | ||
7 | * Initially, a percpu refcount is just a set of percpu counters. Initially, we | ||
8 | * don't try to detect the ref hitting 0 - which means that get/put can just | ||
9 | * increment or decrement the local counter. Note that the counter on a | ||
10 | * particular cpu can (and will) wrap - this is fine: when we go to shut down, the | |
11 | * percpu counters will all sum to the correct value. | |
12 | * | ||
13 | * (More precisely: because modular arithmetic is commutative, the sum of all the | |
14 | * pcpu_count vars will be equal to what it would have been if all the gets and | ||
15 | * puts were done to a single integer, even if some of the percpu integers | ||
16 | * overflow or underflow). | ||
17 | * | ||
18 | * The real trick to implementing percpu refcounts is shutdown. We can't detect | ||
19 | * the ref hitting 0 on every put - this would require global synchronization | ||
20 | * and defeat the whole purpose of using percpu refs. | ||
21 | * | ||
22 | * What we do is require the user to keep track of the initial refcount; we know | ||
23 | * the ref can't hit 0 before the user drops the initial ref, so as long as we | ||
24 | * convert to non percpu mode before the initial ref is dropped everything | ||
25 | * works. | ||
26 | * | ||
27 | * Converting to non percpu mode is done with some RCUish stuff in | ||
28 | * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t | ||
29 | * can't hit 0 before we've added up all the percpu refs. | ||
30 | */ | ||
31 | |||
32 | #define PCPU_COUNT_BIAS (1U << 31) /* added to ref->count in percpu_ref_init() and subtracted in percpu_ref_kill_rcu(), so the atomic can't reach 0 before the percpu counts have been folded in */ | |
33 | |||
34 | /** | |
35 | * percpu_ref_init - initialize a percpu refcount | |
36 | * @ref: ref to initialize | |
37 | * @release: function which will be called when refcount hits 0 | |
38 | * | |
39 | * Allocates the percpu counters and sets the refcount to 1 (the initial ref); analogous to atomic_set(ref, 1). | |
40 | * Returns 0 on success, or -ENOMEM if the percpu counters can't be allocated. | |
41 | * | |
42 | * Note that @release must not sleep - it may potentially be called from RCU | |
43 | * callback context by percpu_ref_kill(). | |
44 | */ | |
45 | int percpu_ref_init(struct percpu_ref *ref, percpu_ref_release *release) | |
46 | { | |
47 | atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); /* the initial ref, plus the bias that percpu_ref_kill_rcu() later subtracts */ | |
48 | |
49 | ref->pcpu_count = alloc_percpu(unsigned); | |
50 | if (!ref->pcpu_count) | |
51 | return -ENOMEM; /* note: ref->release has not been assigned on this path */ | |
52 | |
53 | ref->release = release; | |
54 | return 0; | |
55 | } | |
56 | |||
57 | static void percpu_ref_kill_rcu(struct rcu_head *rcu) | |
58 | { /* RCU callback queued by percpu_ref_kill(): folds the percpu counts into ref->count, removes the bias, then drops the initial ref */ | |
59 | struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); | |
60 | unsigned __percpu *pcpu_count; | |
61 | unsigned count = 0; /* running sum of every cpu's counter; unsigned, so per-cpu wraparound still sums correctly */ | |
62 | int cpu; | |
63 | |
64 | pcpu_count = ACCESS_ONCE(ref->pcpu_count); | |
65 | |
66 | /* Mask out the status bits (PCPU_REF_DEAD) kept in the pointer to recover the real percpu address */ | |
67 | pcpu_count = (unsigned __percpu *) | |
68 | (((unsigned long) pcpu_count) & ~PCPU_STATUS_MASK); | |
69 | |
70 | for_each_possible_cpu(cpu) | |
71 | count += *per_cpu_ptr(pcpu_count, cpu); | |
72 | |
73 | free_percpu(pcpu_count); /* the sum is now held in count; ref->pcpu_count itself is not rewritten here and keeps its DEAD-tagged value */ | |
74 | |
75 | pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count); | |
76 | |
77 | /* | |
78 | * It's crucial that we sum the percpu counters _before_ adding the sum | |
79 | * to &ref->count; since gets could be happening on one cpu while puts | |
80 | * happen on another, adding a single cpu's count could cause | |
81 | * @ref->count to hit 0 before we've got a consistent value - but the | |
82 | * sum of all the counts will be consistent and correct. | |
83 | * | |
84 | * Subtracting the bias value then has to happen _after_ adding count to | |
85 | * &ref->count; we need the bias value to prevent &ref->count from | |
86 | * reaching 0 before we add the percpu counts. But doing it at the same | |
87 | * time is equivalent and saves us atomic operations: | |
88 | */ | |
89 | |
90 | atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); | |
91 | |
92 | /* | |
93 | * Now we're in single atomic_t mode with a consistent refcount, so it's | |
94 | * safe to drop our initial ref: | |
95 | */ | |
96 | percpu_ref_put(ref); /* percpu_ref_put() is not defined in this file; presumably this is where @release gets called if no other refs remain - confirm against the header */ | |
97 | } | |
98 | |||
99 | /** | |
100 | * percpu_ref_kill - safely drop initial ref | |
101 | * | |
102 | * Must be used to drop the initial ref on the percpu refcount @ref; must be called | |
103 | * precisely once before shutdown. A second call triggers a WARN and returns without doing anything. | |
104 | * | |
105 | * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the | |
106 | * percpu counters and dropping the initial ref. | |
107 | */ | |
108 | void percpu_ref_kill(struct percpu_ref *ref) | |
109 | { | |
110 | unsigned __percpu *pcpu_count, *old, *new; | |
111 | |
112 | pcpu_count = ACCESS_ONCE(ref->pcpu_count); /* read the (possibly status-tagged) pointer once */ | |
113 | |
114 | do { | |
115 | if (REF_STATUS(pcpu_count) == PCPU_REF_DEAD) { /* already marked dead: this ref was killed before */ | |
116 | WARN(1, "percpu_ref_kill() called more than once!\n"); | |
117 | return; | |
118 | } | |
119 | |
120 | old = pcpu_count; | |
121 | new = (unsigned __percpu *) | |
122 | (((unsigned long) pcpu_count)|PCPU_REF_DEAD); /* same pointer value with the DEAD status bit or'ed in */ | |
123 | |
124 | pcpu_count = cmpxchg(&ref->pcpu_count, old, new); /* install the tagged pointer atomically; yields the value actually found */ | |
125 | } while (pcpu_count != old); /* pointer changed under us: re-check the status and retry */ | |
126 | |
127 | call_rcu(&ref->rcu, percpu_ref_kill_rcu); /* defer summing/freeing the percpu counters to an RCU callback; presumably so in-flight get/put users of the percpu counters finish first - confirm against the header */ | |
128 | } | |