-rw-r--r-- | include/linux/percpu-refcount.h | 122 | ||||
-rw-r--r-- | lib/Makefile | 2 | ||||
-rw-r--r-- | lib/percpu-refcount.c | 128 |
3 files changed, 251 insertions, 1 deletions
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
new file mode 100644
index 000000000000..24b31ef15932
--- /dev/null
+++ b/include/linux/percpu-refcount.h
@@ -0,0 +1,122 @@ | |||
1 | /* | ||
2 | * Percpu refcounts: | ||
3 | * (C) 2012 Google, Inc. | ||
4 | * Author: Kent Overstreet <koverstreet@google.com> | ||
5 | * | ||
6 | * This implements a refcount with similar semantics to atomic_t - atomic_inc(), | ||
7 | * atomic_dec_and_test() - but percpu. | ||
8 | * | ||
9 | * There's one important difference between percpu refs and normal atomic_t | ||
10 | * refcounts: instead of dropping your initial refcount with a final | ||
11 | * percpu_ref_put(), when you start shutting down you drop it by calling | ||
12 | * percpu_ref_kill() exactly once. | ||
13 | * | ||
14 | * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less | ||
15 | * than an atomic_t - this is because of the way shutdown works, see | ||
16 | * percpu_ref_kill()/PCPU_COUNT_BIAS. | ||
17 | * | ||
18 | * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the | ||
19 | * refcount hitting 0 - it can't, while the ref is in percpu mode. percpu_ref_kill() | ||
20 | * marks the ref as shutting down, so that percpu_ref_put() falls back to the | ||
21 | * atomic_t and checks for it hitting 0; then, from an RCU callback, the per cpu | ||
22 | * counts are collected into the atomic_t, with the appropriate barriers, and the | ||
23 | * initial ref is dropped - after that, release runs once the last ref is put. | ||
24 | * | ||
25 | * USAGE: | ||
26 | * | ||
27 | * See fs/aio.c for some example usage; it's used there for struct kioctx, which | ||
28 | * is created when userspace calls io_setup(), and destroyed when userspace | ||
29 | * calls io_destroy() or the process exits. | ||
30 | * | ||
31 | * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it | ||
32 | * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove | ||
33 | * the kioctx from the process's list of kioctxs - after that, there can't be | ||
34 | * any new users of the kioctx (from lookup_ioctx()), and the kioctx is freed by | ||
35 | * its release function once the last remaining ref is put. | ||
36 | * | ||
37 | * Code that does a two stage shutdown like this needs some way to ensure | ||
38 | * percpu_ref_kill() is only called once - unlike a plain atomic_t refcount, | ||
39 | * the initial ref isn't dropped by whichever put happens to come last, but by | ||
40 | * the one explicit kill. Calling percpu_ref_kill() a second time is a bug; it | ||
41 | * will WARN and do nothing, so the caller must provide its own synchronization | ||
42 | * around teardown. | ||
43 | */ | ||
44 | |||
45 | #ifndef _LINUX_PERCPU_REFCOUNT_H | ||
46 | #define _LINUX_PERCPU_REFCOUNT_H | ||
47 | |||
48 | #include <linux/atomic.h> | ||
49 | #include <linux/kernel.h> | ||
50 | #include <linux/percpu.h> | ||
51 | #include <linux/rcupdate.h> | ||
52 | |||
53 | struct percpu_ref; | ||
54 | typedef void (percpu_ref_release)(struct percpu_ref *); | ||
55 | |||
56 | struct percpu_ref { | ||
57 | atomic_t count; | ||
58 | /* | ||
59 | * The low bit of the pointer indicates whether the ref is still in | ||
60 | * percpu mode; if it's set, the ref has been killed and get/put fall | ||
61 | * back to the atomic_t (this is a hack because we need to keep the | ||
62 | * pointer around for percpu_ref_kill_rcu()) | ||
63 | */ | ||
64 | unsigned __percpu *pcpu_count; | ||
65 | percpu_ref_release *release; | ||
66 | struct rcu_head rcu; | ||
67 | }; | ||
68 | |||
69 | int percpu_ref_init(struct percpu_ref *, percpu_ref_release *); | ||
70 | void percpu_ref_kill(struct percpu_ref *ref); | ||
71 | |||
72 | #define PCPU_STATUS_BITS 2 | ||
73 | #define PCPU_STATUS_MASK ((1 << PCPU_STATUS_BITS) - 1) | ||
74 | #define PCPU_REF_PTR 0 | ||
75 | #define PCPU_REF_DEAD 1 | ||
76 | |||
77 | #define REF_STATUS(count) (((unsigned long) count) & PCPU_STATUS_MASK) | ||
78 | |||
79 | /** | ||
80 | * percpu_ref_get - increment a percpu refcount | ||
81 | * | ||
82 | * Analogous to atomic_inc(). | ||
83 | */ | ||
84 | static inline void percpu_ref_get(struct percpu_ref *ref) | ||
85 | { | ||
86 | unsigned __percpu *pcpu_count; | ||
87 | |||
88 | preempt_disable(); | ||
89 | |||
90 | pcpu_count = ACCESS_ONCE(ref->pcpu_count); | ||
91 | |||
92 | if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR)) | ||
93 | __this_cpu_inc(*pcpu_count); | ||
94 | else | ||
95 | atomic_inc(&ref->count); | ||
96 | |||
97 | preempt_enable(); | ||
98 | } | ||
99 | |||
100 | /** | ||
101 | * percpu_ref_put - decrement a percpu refcount | ||
102 | * | ||
103 | * Decrement the refcount, and if 0, call the release function (which was passed | ||
104 | * to percpu_ref_init()) | ||
105 | */ | ||
106 | static inline void percpu_ref_put(struct percpu_ref *ref) | ||
107 | { | ||
108 | unsigned __percpu *pcpu_count; | ||
109 | |||
110 | preempt_disable(); | ||
111 | |||
112 | pcpu_count = ACCESS_ONCE(ref->pcpu_count); | ||
113 | |||
114 | if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR)) | ||
115 | __this_cpu_dec(*pcpu_count); | ||
116 | else if (unlikely(atomic_dec_and_test(&ref->count))) | ||
117 | ref->release(ref); | ||
118 | |||
119 | preempt_enable(); | ||
120 | } | ||
121 | |||
122 | #endif | ||
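As a concrete companion to the USAGE comment above, here is a minimal sketch of how a user of this header might wire the API up, loosely modeled on the kioctx pattern it describes. struct foo, foo_release() and the helper functions are hypothetical names invented for illustration, not part of this patch; note that, per the semantics implemented below in lib/percpu-refcount.c, percpu_ref_kill() itself drops the initial ref taken by percpu_ref_init().

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

/* Hypothetical object protected by a percpu ref; not part of this patch. */
struct foo {
	struct percpu_ref	ref;
	/* ... object state ... */
};

/*
 * Called once the refcount hits 0. It may run from RCU callback context
 * (via percpu_ref_kill()), so it must not sleep.
 */
static void foo_release(struct percpu_ref *ref)
{
	struct foo *foo = container_of(ref, struct foo, ref);

	kfree(foo);
}

static struct foo *foo_alloc(void)
{
	struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

	if (!foo)
		return NULL;

	/* The ref starts out in percpu mode, holding the initial ref. */
	if (percpu_ref_init(&foo->ref, foo_release)) {
		kfree(foo);
		return NULL;
	}

	return foo;
}

/* Fast path: get/put are cheap percpu increments/decrements. */
static void foo_do_something(struct foo *foo)
{
	percpu_ref_get(&foo->ref);
	/* ... use foo ... */
	percpu_ref_put(&foo->ref);
}

/*
 * Teardown: first make sure no new users can look foo up, then call
 * percpu_ref_kill() exactly once. It switches the ref to atomic mode and
 * drops the initial ref; foo_release() runs once the last ref is put.
 */
static void foo_destroy(struct foo *foo)
{
	percpu_ref_kill(&foo->ref);
}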
diff --git a/lib/Makefile b/lib/Makefile
index c55a037a354e..386db4bbc265 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ | |||
13 | sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ | 13 | sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \ |
14 | proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ | 14 | proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \ |
15 | is_single_threaded.o plist.o decompress.o kobject_uevent.o \ | 15 | is_single_threaded.o plist.o decompress.o kobject_uevent.o \ |
16 | earlycpio.o | 16 | earlycpio.o percpu-refcount.o |
17 | 17 | ||
18 | obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o | 18 | obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o |
19 | lib-$(CONFIG_MMU) += ioremap.o | 19 | lib-$(CONFIG_MMU) += ioremap.o |
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
new file mode 100644
index 000000000000..6f0ffd702a09
--- /dev/null
+++ b/lib/percpu-refcount.c
@@ -0,0 +1,128 @@ | |||
1 | #define pr_fmt(fmt) "%s: " fmt "\n", __func__ | ||
2 | |||
3 | #include <linux/kernel.h> | ||
4 | #include <linux/percpu-refcount.h> | ||
5 | |||
6 | /* | ||
7 | * Initially, a percpu refcount is just a set of percpu counters, and we don't | ||
8 | * try to detect the ref hitting 0 - which means that get/put can just | ||
9 | * increment or decrement the local counter. Note that the counter on a | ||
10 | * particular cpu can (and will) wrap - this is fine; when we go to shut down, | ||
11 | * the percpu counters will all sum to the correct value. | ||
12 | * | ||
13 | * (More precisely: because modular arithmetic is commutative, the sum of all the | ||
14 | * pcpu_count vars will be equal to what it would have been if all the gets and | ||
15 | * puts were done to a single integer, even if some of the percpu integers | ||
16 | * overflow or underflow). | ||
17 | * | ||
18 | * The real trick to implementing percpu refcounts is shutdown. We can't detect | ||
19 | * the ref hitting 0 on every put - this would require global synchronization | ||
20 | * and defeat the whole purpose of using percpu refs. | ||
21 | * | ||
22 | * What we do is require the user to keep track of the initial refcount; we know | ||
23 | * the ref can't hit 0 before the user drops the initial ref, so as long as we | ||
24 | * convert to non percpu mode before the initial ref is dropped everything | ||
25 | * works. | ||
26 | * | ||
27 | * Converting to non percpu mode is done with some RCUish stuff in | ||
28 | * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t | ||
29 | * can't hit 0 before we've added up all the percpu refs. | ||
30 | */ | ||
31 | |||
32 | #define PCPU_COUNT_BIAS (1U << 31) | ||
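A quick userspace-style illustration (not part of the patch) of the commutativity point made in the comment above: even if one cpu's counter underflows, summing the counters with plain unsigned (i.e. modular) arithmetic still yields the correct net count.

#include <stdio.h>

int main(void)
{
	/* Pretend CPU0 took 5 refs and CPU1 dropped those same 5 refs. */
	unsigned int cpu0 = 0, cpu1 = 0;
	int i;

	for (i = 0; i < 5; i++)
		cpu0++;		/* percpu_ref_get() ran on CPU0 */
	for (i = 0; i < 5; i++)
		cpu1--;		/* percpu_ref_put() ran on CPU1; wraps to 0xfffffffb */

	/* Unsigned addition wraps the same way, so the net comes out to 0. */
	printf("cpu0=%u cpu1=%u net=%u\n", cpu0, cpu1, cpu0 + cpu1);
	return 0;
}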
33 | |||
34 | /** | ||
35 | * percpu_ref_init - initialize a percpu refcount | ||
36 | * @ref: ref to initialize | ||
37 | * @release: function which will be called when refcount hits 0 | ||
38 | * | ||
39 | * Initializes @ref with a refcount of 1, starting out in percpu mode; | ||
40 | * analogous to atomic_set(ref, 1). | ||
41 | * | ||
42 | * Note that @release must not sleep - it may potentially be called from RCU | ||
43 | * callback context by percpu_ref_kill(). | ||
44 | */ | ||
45 | int percpu_ref_init(struct percpu_ref *ref, percpu_ref_release *release) | ||
46 | { | ||
47 | atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); | ||
48 | |||
49 | ref->pcpu_count = alloc_percpu(unsigned); | ||
50 | if (!ref->pcpu_count) | ||
51 | return -ENOMEM; | ||
52 | |||
53 | ref->release = release; | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static void percpu_ref_kill_rcu(struct rcu_head *rcu) | ||
58 | { | ||
59 | struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); | ||
60 | unsigned __percpu *pcpu_count; | ||
61 | unsigned count = 0; | ||
62 | int cpu; | ||
63 | |||
64 | pcpu_count = ACCESS_ONCE(ref->pcpu_count); | ||
65 | |||
66 | /* Mask out PCPU_REF_DEAD */ | ||
67 | pcpu_count = (unsigned __percpu *) | ||
68 | (((unsigned long) pcpu_count) & ~PCPU_STATUS_MASK); | ||
69 | |||
70 | for_each_possible_cpu(cpu) | ||
71 | count += *per_cpu_ptr(pcpu_count, cpu); | ||
72 | |||
73 | free_percpu(pcpu_count); | ||
74 | |||
75 | pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count); | ||
76 | |||
77 | /* | ||
78 | * It's crucial that we sum the percpu counters _before_ adding the sum | ||
79 | * to &ref->count; since gets could be happening on one cpu while puts | ||
80 | * happen on another, adding a single cpu's count could cause | ||
81 | * @ref->count to hit 0 before we've got a consistent value - but the | ||
82 | * sum of all the counts will be consistent and correct. | ||
83 | * | ||
84 | * Subtracting the bias value then has to happen _after_ adding count to | ||
85 | * &ref->count; we need the bias value to prevent &ref->count from | ||
86 | * reaching 0 before we add the percpu counts. But doing it at the same | ||
87 | * time is equivalent and saves us atomic operations: | ||
88 | */ | ||
89 | |||
90 | atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); | ||
91 | |||
92 | /* | ||
93 | * Now we're in single atomic_t mode with a consistent refcount, so it's | ||
94 | * safe to drop our initial ref: | ||
95 | */ | ||
96 | percpu_ref_put(ref); | ||
97 | } | ||
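To make the bias arithmetic above concrete, here is a small userspace simulation (illustrative numbers, not from the patch) of the atomic count over a full lifetime; the bias keeps the count from reaching 0 until percpu_ref_kill_rcu() has folded the percpu counts back in and dropped the initial ref.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BIAS (1U << 31)			/* stands in for PCPU_COUNT_BIAS */

int main(void)
{
	uint32_t count = BIAS + 1;	/* percpu_ref_init(): bias + initial ref */
	uint32_t pcpu = 0;		/* net percpu delta; may wrap, that's fine */

	pcpu += 3;			/* three gets while still in percpu mode */
	pcpu -= 1;			/* one put while still in percpu mode */

	/* percpu_ref_kill() marks the ref dead; later puts hit the atomic_t. */
	count -= 1;			/* a put after kill: the bias keeps us above 0 */
	assert(count != 0);

	/* percpu_ref_kill_rcu(): fold the percpu counts in, minus the bias, ... */
	count += pcpu - BIAS;		/* count == 2: the initial ref + one user ref */
	/* ... then drop the initial ref: */
	count -= 1;

	count -= 1;			/* the last user drops its ref */
	printf("count = %u\n", count);	/* 0: this is where release() gets called */
	assert(count == 0);
	return 0;
}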
98 | |||
99 | /** | ||
100 | * percpu_ref_kill - safely drop initial ref | ||
101 | * | ||
102 | * Must be used to drop the initial ref on a percpu refcount; must be called | ||
103 | * precisely once before shutdown. | ||
104 | * | ||
105 | * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the | ||
106 | * percpu counters and dropping the initial ref. | ||
107 | */ | ||
108 | void percpu_ref_kill(struct percpu_ref *ref) | ||
109 | { | ||
110 | unsigned __percpu *pcpu_count, *old, *new; | ||
111 | |||
112 | pcpu_count = ACCESS_ONCE(ref->pcpu_count); | ||
113 | |||
114 | do { | ||
115 | if (REF_STATUS(pcpu_count) == PCPU_REF_DEAD) { | ||
116 | WARN(1, "percpu_ref_kill() called more than once!\n"); | ||
117 | return; | ||
118 | } | ||
119 | |||
120 | old = pcpu_count; | ||
121 | new = (unsigned __percpu *) | ||
122 | (((unsigned long) pcpu_count)|PCPU_REF_DEAD); | ||
123 | |||
124 | pcpu_count = cmpxchg(&ref->pcpu_count, old, new); | ||
125 | } while (pcpu_count != old); | ||
126 | |||
127 | call_rcu(&ref->rcu, percpu_ref_kill_rcu); | ||
128 | } | ||
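percpu_ref_kill() must be called precisely once and only WARNs when that rule is broken, so a caller whose teardown path can race needs its own once-only guard. One hypothetical way to provide it (struct bar and its dying bit are invented for illustration, not taken from this patch or from fs/aio.c):

#include <linux/bitops.h>
#include <linux/percpu-refcount.h>

struct bar {
	struct percpu_ref	ref;
	unsigned long		dying;	/* bit 0 set once teardown has started */
};

static void bar_shutdown(struct bar *bar)
{
	/* test_and_set_bit() returns the old bit, so only one caller sees 0. */
	if (test_and_set_bit(0, &bar->dying))
		return;

	percpu_ref_kill(&bar->ref);
}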