-rw-r--r--   include/linux/percpu-refcount.h | 174
-rw-r--r--   lib/Makefile                    |   2
-rw-r--r--   lib/percpu-refcount.c           | 158
3 files changed, 333 insertions, 1 deletion
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
new file mode 100644
index 000000000000..95961f0bf62d
--- /dev/null
+++ b/include/linux/percpu-refcount.h
@@ -0,0 +1,174 @@
/*
 * Percpu refcounts:
 * (C) 2012 Google, Inc.
 * Author: Kent Overstreet <koverstreet@google.com>
 *
 * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
 * atomic_dec_and_test() - but percpu.
 *
 * There's one important difference between percpu refs and normal atomic_t
 * refcounts: you have to keep track of your initial refcount, and then when you
 * start shutting down you drop it with percpu_ref_kill() instead of a plain
 * percpu_ref_put().
 *
 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
 * than an atomic_t - this is because of the way shutdown works, see
 * percpu_ref_kill()/PCPU_COUNT_BIAS.
 *
 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
 * marks the ref as shutting down so that percpu_ref_put() will check for the
 * ref hitting 0, then, from an RCU callback, puts the ref back in single
 * atomic_t mode, collecting the per cpu refs and issuing the appropriate
 * barriers, and finally drops the initial ref.
 *
 * USAGE:
 *
 * See fs/aio.c for some example usage; it's used there for struct kioctx, which
 * is created when userspace calls io_setup(), and destroyed when userspace
 * calls io_destroy() or the process exits.
 *
 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
 * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
 * the kioctx from the process's list of kioctxs - after that, there can't be
 * any new users of the kioctx (from lookup_ioctx()), and percpu_ref_kill()
 * takes care of dropping the initial ref once all CPUs see the ref as dead.
 *
 * Code that does a two stage shutdown like this often needs some kind of
 * explicit synchronization to ensure the initial refcount can only be dropped
 * once - percpu_ref_kill() will WARN if it's called more than once, but it's
 * up to the caller's own synchronization to guarantee that it isn't.
 */
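
To make the usage pattern above concrete, here is a minimal sketch of a hypothetical user of this API. struct foo, foo_release(), foo_create(), foo_use() and foo_destroy() are invented for illustration; only the percpu_ref_* calls come from this patch.

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct foo {
	struct percpu_ref	ref;
	/* ... object state ... */
};

static void foo_release(struct percpu_ref *ref)
{
	struct foo *foo = container_of(ref, struct foo, ref);

	kfree(foo);	/* must not sleep, see percpu_ref_init() */
}

struct foo *foo_create(void)
{
	struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

	if (!foo)
		return NULL;

	/* starts with a refcount of 1: the initial ref */
	if (percpu_ref_init(&foo->ref, foo_release)) {
		kfree(foo);
		return NULL;
	}
	return foo;
}

void foo_use(struct foo *foo)
{
	percpu_ref_get(&foo->ref);
	/* ... use foo ... */
	percpu_ref_put(&foo->ref);
}

void foo_destroy(struct foo *foo)
{
	/* unpublish foo from any lookup structure first, then: */
	percpu_ref_kill(&foo->ref);
	/* the initial ref is dropped for us; foo_release() runs
	 * once the last remaining ref is put */
}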

#ifndef _LINUX_PERCPU_REFCOUNT_H
#define _LINUX_PERCPU_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>

struct percpu_ref;
typedef void (percpu_ref_func_t)(struct percpu_ref *);

struct percpu_ref {
	atomic_t		count;
	/*
	 * The low bit of the pointer indicates whether the ref is in percpu
	 * mode; if set, then get/put will manipulate the atomic_t (this is a
	 * hack because we need to keep the pointer around for
	 * percpu_ref_kill_rcu())
	 */
	unsigned __percpu	*pcpu_count;
	percpu_ref_func_t	*release;
	percpu_ref_func_t	*confirm_kill;
	struct rcu_head		rcu;
};

int __must_check percpu_ref_init(struct percpu_ref *ref,
				 percpu_ref_func_t *release);
void percpu_ref_cancel_init(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
				 percpu_ref_func_t *confirm_kill);

/**
 * percpu_ref_kill - drop the initial ref
 * @ref: percpu_ref to kill
 *
 * Must be used to drop the initial ref on a percpu refcount; must be called
 * precisely once before shutdown.
 *
 * Puts @ref in non-percpu mode, then does a call_rcu() before gathering up the
 * percpu counters and dropping the initial ref.
 */
static inline void percpu_ref_kill(struct percpu_ref *ref)
{
	percpu_ref_kill_and_confirm(ref, NULL);
}

#define PCPU_STATUS_BITS	2
#define PCPU_STATUS_MASK	((1 << PCPU_STATUS_BITS) - 1)
#define PCPU_REF_PTR		0
#define PCPU_REF_DEAD		1

#define REF_STATUS(count)	(((unsigned long) count) & PCPU_STATUS_MASK)

/**
 * percpu_ref_get - increment a percpu refcount
 * @ref: percpu_ref to get
 *
 * Analogous to atomic_inc().
 */
static inline void percpu_ref_get(struct percpu_ref *ref)
{
	unsigned __percpu *pcpu_count;

	rcu_read_lock_sched();

	pcpu_count = ACCESS_ONCE(ref->pcpu_count);

	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
		__this_cpu_inc(*pcpu_count);
	else
		atomic_inc(&ref->count);

	rcu_read_unlock_sched();
}

/**
 * percpu_ref_tryget - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless it has already been killed.  Returns
 * %true on success; %false on failure.
 *
 * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget
 * will fail.  For such a guarantee, percpu_ref_kill_and_confirm() should be
 * used.  After the confirm_kill callback is invoked, it's guaranteed that
 * no new reference will be given out by percpu_ref_tryget().
 */
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
	unsigned __percpu *pcpu_count;
	bool ret = false;

	rcu_read_lock_sched();

	pcpu_count = ACCESS_ONCE(ref->pcpu_count);

	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR)) {
		__this_cpu_inc(*pcpu_count);
		ret = true;
	}

	rcu_read_unlock_sched();

	return ret;
}
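
percpu_ref_tryget() is what RCU-protected lookup paths would use. Below is a sketch in the style of aio's lookup_ioctx(), assuming the hypothetical struct foo from the earlier sketch grows an hlist node and an id field:

#include <linux/rculist.h>

struct foo *foo_lookup(struct hlist_head *head, unsigned long id)
{
	struct foo *foo, *ret = NULL;

	rcu_read_lock();
	hlist_for_each_entry_rcu(foo, head, node) {
		if (foo->id == id && percpu_ref_tryget(&foo->ref)) {
			ret = foo;
			break;
		}
	}
	rcu_read_unlock();

	return ret;	/* caller does percpu_ref_put() when done */
}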

/**
 * percpu_ref_put - decrement a percpu refcount
 * @ref: percpu_ref to put
 *
 * Decrement the refcount, and if it drops to 0, call the release function
 * (which was passed to percpu_ref_init()).
 */
static inline void percpu_ref_put(struct percpu_ref *ref)
{
	unsigned __percpu *pcpu_count;

	rcu_read_lock_sched();

	pcpu_count = ACCESS_ONCE(ref->pcpu_count);

	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
		__this_cpu_dec(*pcpu_count);
	else if (unlikely(atomic_dec_and_test(&ref->count)))
		ref->release(ref);

	rcu_read_unlock_sched();
}

#endif
diff --git a/lib/Makefile b/lib/Makefile
index 22f0f4e8a9e1..8f8d385187f2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o
+	 earlycpio.o percpu-refcount.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
new file mode 100644
index 000000000000..7deeb6297a48
--- /dev/null
+++ b/lib/percpu-refcount.c
@@ -0,0 +1,158 @@
#define pr_fmt(fmt) "%s: " fmt "\n", __func__

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>

/*
 * Initially, a percpu refcount is just a set of percpu counters. Initially, we
 * don't try to detect the ref hitting 0 - which means that get/put can just
 * increment or decrement the local counter. Note that the counter on a
 * particular cpu can (and will) wrap - this is fine, when we go to shut down
 * the percpu counters will all sum to the correct value.
 *
 * (More precisely: because modular arithmetic is commutative the sum of all the
 * pcpu_count vars will be equal to what it would have been if all the gets and
 * puts were done to a single integer, even if some of the percpu integers
 * overflow or underflow).
 *
 * The real trick to implementing percpu refcounts is shutdown. We can't detect
 * the ref hitting 0 on every put - this would require global synchronization
 * and defeat the whole purpose of using percpu refs.
 *
 * What we do is require the user to keep track of the initial refcount; we know
 * the ref can't hit 0 before the user drops the initial ref, so as long as we
 * convert to non-percpu mode before the initial ref is dropped everything
 * works.
 *
 * Converting to non-percpu mode is done with some RCUish stuff in
 * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t
 * can't hit 0 before we've added up all the percpu refs.
 */
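
A userspace toy (not kernel code) illustrating the modular-arithmetic point above: individual counters may wrap, but their unsigned sum is still the true net count.

#include <assert.h>

int main(void)
{
	unsigned cpu0 = 0, cpu1 = 0;

	cpu0 -= 3;	/* three puts land on cpu0: wraps to 0xfffffffd */
	cpu1 += 5;	/* five gets land on cpu1 */

	/* the sum is taken mod 2^32, so the wrap cancels out: net 2 refs */
	assert(cpu0 + cpu1 == 2);
	return 0;
}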

#define PCPU_COUNT_BIAS		(1U << 31)

/**
 * percpu_ref_init - initialize a percpu refcount
 * @ref: percpu_ref to initialize
 * @release: function which will be called when refcount hits 0
 *
 * Initializes the refcount in single atomic counter mode with a refcount of 1;
 * analogous to atomic_set(ref, 1).
 *
 * Note that @release must not sleep - it may potentially be called from RCU
 * callback context by percpu_ref_kill().
 */
int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
{
	atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS);

	ref->pcpu_count = alloc_percpu(unsigned);
	if (!ref->pcpu_count)
		return -ENOMEM;

	ref->release = release;
	return 0;
}

/**
 * percpu_ref_cancel_init - cancel percpu_ref_init()
 * @ref: percpu_ref to cancel init for
 *
 * Once a percpu_ref is initialized, its destruction is initiated by
 * percpu_ref_kill() and completes asynchronously, which can be painful to
 * do when destroying a half-constructed object in an init failure path.
 *
 * This function destroys @ref without invoking @ref->release and the
 * memory area containing it can be freed immediately on return.  To
 * prevent accidental misuse, it's required that @ref has finished
 * percpu_ref_init(), whether successful or not, but never been used.
 *
 * The weird name and usage restriction are to prevent people from using
 * this function by mistake for normal shutdown instead of
 * percpu_ref_kill().
 */
void percpu_ref_cancel_init(struct percpu_ref *ref)
{
	unsigned __percpu *pcpu_count = ref->pcpu_count;
	int cpu;

	WARN_ON_ONCE(atomic_read(&ref->count) != 1 + PCPU_COUNT_BIAS);

	if (pcpu_count) {
		for_each_possible_cpu(cpu)
			WARN_ON_ONCE(*per_cpu_ptr(pcpu_count, cpu));
		free_percpu(ref->pcpu_count);
	}
}
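
Extending the hypothetical foo_create() from the earlier sketch, this is the init-failure shape percpu_ref_cancel_init() exists for; foo_setup() is an invented stand-in for whatever initialization can fail after percpu_ref_init():

struct foo *foo_create(void)
{
	struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

	if (!foo)
		return NULL;

	if (percpu_ref_init(&foo->ref, foo_release))
		goto err_free;

	if (foo_setup(foo))	/* hypothetical, may fail */
		goto err_ref;

	return foo;

err_ref:
	/* the ref was initialized but never used, so it may be torn
	 * down synchronously rather than via percpu_ref_kill() */
	percpu_ref_cancel_init(&foo->ref);
err_free:
	kfree(foo);
	return NULL;
}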

static void percpu_ref_kill_rcu(struct rcu_head *rcu)
{
	struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);
	unsigned __percpu *pcpu_count = ref->pcpu_count;
	unsigned count = 0;
	int cpu;

	/* Mask out PCPU_REF_DEAD */
	pcpu_count = (unsigned __percpu *)
		(((unsigned long) pcpu_count) & ~PCPU_STATUS_MASK);

	for_each_possible_cpu(cpu)
		count += *per_cpu_ptr(pcpu_count, cpu);

	free_percpu(pcpu_count);

	pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count);

	/*
	 * It's crucial that we sum the percpu counters _before_ adding the sum
	 * to &ref->count; since gets could be happening on one cpu while puts
	 * happen on another, adding a single cpu's count could cause
	 * @ref->count to hit 0 before we've got a consistent value - but the
	 * sum of all the counts will be consistent and correct.
	 *
	 * Subtracting the bias value then has to happen _after_ adding count to
	 * &ref->count; we need the bias value to prevent &ref->count from
	 * reaching 0 before we add the percpu counts.  But doing it at the same
	 * time is equivalent and saves us atomic operations:
	 */

	atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count);

	/* @ref is viewed as dead on all CPUs, send out kill confirmation */
	if (ref->confirm_kill)
		ref->confirm_kill(ref);

	/*
	 * Now we're in single atomic_t mode with a consistent refcount, so it's
	 * safe to drop our initial ref:
	 */
	percpu_ref_put(ref);
}
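
With made-up numbers: percpu_ref_init() set ref->count to 1 + 2^31. Suppose four more gets than puts went through the percpu path before the kill, so the percpu counters sum to 4 (mod 2^32). The atomic_add() above adds 4 - 2^31, leaving ref->count == 1 + 4, the initial ref plus the four still-outstanding ones, with the bias gone. Until that atomic_add(), the bias guaranteed ref->count couldn't reach 0 however many puts had already been redirected to the atomic_t.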

/**
 * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
 * @ref: percpu_ref to kill
 * @confirm_kill: optional confirmation callback
 *
 * Equivalent to percpu_ref_kill() but also schedules kill confirmation if
 * @confirm_kill is not NULL.  @confirm_kill, which may not block, will be
 * called after @ref is seen as dead from all CPUs - all further
 * invocations of percpu_ref_tryget() will fail.  See percpu_ref_tryget()
 * for more details.
 *
 * Due to the way percpu_ref is implemented, @confirm_kill will be called
 * after at least one full RCU grace period has passed but this is an
 * implementation detail and callers must not depend on it.
 */
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
				 percpu_ref_func_t *confirm_kill)
{
	WARN_ONCE(REF_STATUS(ref->pcpu_count) == PCPU_REF_DEAD,
		  "percpu_ref_kill() called more than once!\n");

	ref->pcpu_count = (unsigned __percpu *)
		(((unsigned long) ref->pcpu_count) | PCPU_REF_DEAD);
	ref->confirm_kill = confirm_kill;

	call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu);
}
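
Finally, a sketch of how a caller might use the confirmation callback to wait until tryget is guaranteed to fail; the confirm_done field and the foo_* names extend the hypothetical example used above and are not part of the patch:

#include <linux/completion.h>

struct foo {
	struct percpu_ref	ref;
	struct completion	confirm_done;	/* hypothetical field */
	/* ... */
};

static void foo_confirm_kill(struct percpu_ref *ref)
{
	struct foo *foo = container_of(ref, struct foo, ref);

	complete(&foo->confirm_done);	/* doesn't block, as required */
}

void foo_destroy(struct foo *foo)
{
	init_completion(&foo->confirm_done);
	percpu_ref_kill_and_confirm(&foo->ref, foo_confirm_kill);
	wait_for_completion(&foo->confirm_done);

	/* from here on no percpu_ref_tryget() can succeed;
	 * foo_release() runs once the remaining refs drain */
}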