author	Oleg Nesterov <oleg@redhat.com>	2012-12-17 19:01:32 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-17 20:15:18 -0500
commit	a1fd3e24d8a484b3265a6d485202afe093c058f3 (patch)
tree	472f6480a81abbc04b27eccdb798d80b1685bee0 /lib
parent	53809751ac230a3611b5cdd375f3389f3207d471 (diff)
percpu_rw_semaphore: reimplement to not block the readers unnecessarily
Currently the writer does msleep() plus synchronize_sched() 3 times to acquire/release the semaphore, and during this time the readers are blocked completely, even if the "write" section has not actually started or has already finished.

With this patch down_write/up_write does synchronize_sched() twice, and down_read/up_read are still possible during this time; they just use the slow path.

percpu_down_write() first forces the readers to use the rw_semaphore and increment the "slow" counter to take the lock for reading, then it takes that rw_semaphore for writing and blocks the readers.

Also, with this patch the code relies on the documented behaviour of synchronize_sched(); it doesn't try to pair synchronize_sched() with a barrier.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anton Arapov <anton@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
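To make the intended usage concrete, here is a minimal sketch of how a caller might pair these primitives. The percpu_rw_semaphore functions and types are the ones added by this patch; the "frob" subsystem, its state variable and init hook are hypothetical, for illustration only:

#include <linux/percpu-rwsem.h>

/* hypothetical subsystem state protected by the per-cpu rwsem */
static struct percpu_rw_semaphore frob_rwsem;
static int frob_state;

static int __init frob_init(void)
{
	/* allocates the per-cpu fast counter; can fail with -ENOMEM */
	return percpu_init_rwsem(&frob_rwsem);
}

/* hot path: readers normally take only the per-cpu fast path */
static int frob_read(void)
{
	int val;

	percpu_down_read(&frob_rwsem);
	val = frob_state;
	percpu_up_read(&frob_rwsem);
	return val;
}

/* cold path: the writer excludes all readers, old and new */
static void frob_write(int val)
{
	percpu_down_write(&frob_rwsem);
	frob_state = val;
	percpu_up_write(&frob_rwsem);
}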
Diffstat (limited to 'lib')
-rw-r--r--	lib/Makefile	2
-rw-r--r--	lib/percpu-rwsem.c	154
2 files changed, 155 insertions(+), 1 deletion(-)
diff --git a/lib/Makefile b/lib/Makefile
index e2152fa7ff4d..e959c20efb24 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -9,7 +9,7 @@ endif
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
-	 idr.o int_sqrt.o extable.o \
+	 idr.o int_sqrt.o extable.o percpu-rwsem.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
diff --git a/lib/percpu-rwsem.c b/lib/percpu-rwsem.c
new file mode 100644
index 000000000000..2e03bcfe48f9
--- /dev/null
+++ b/lib/percpu-rwsem.c
@@ -0,0 +1,154 @@
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/percpu.h>
+#include <linux/wait.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+
+int percpu_init_rwsem(struct percpu_rw_semaphore *brw)
+{
+	brw->fast_read_ctr = alloc_percpu(int);
+	if (unlikely(!brw->fast_read_ctr))
+		return -ENOMEM;
+
+	mutex_init(&brw->writer_mutex);
+	init_rwsem(&brw->rw_sem);
+	atomic_set(&brw->slow_read_ctr, 0);
+	init_waitqueue_head(&brw->write_waitq);
+	return 0;
+}
+
+void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+{
+	free_percpu(brw->fast_read_ctr);
+	brw->fast_read_ctr = NULL; /* catch use after free bugs */
+}
+
+/*
+ * This is the fast-path for down_read/up_read, it only needs to ensure
+ * there is no pending writer (!mutex_is_locked() check) and inc/dec the
+ * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
+ * serialize with the preempt-disabled section below.
+ *
+ * The nontrivial part is that we should guarantee acquire/release semantics
+ * in case when
+ *
+ *	R_W: down_write() comes after up_read(), the writer should see all
+ *	     changes done by the reader
+ * or
+ *	W_R: down_read() comes after up_write(), the reader should see all
+ *	     changes done by the writer
+ *
+ * If this helper fails the callers rely on the normal rw_semaphore and
+ * atomic_dec_and_test(), so in this case we have the necessary barriers.
+ *
+ * But if it succeeds we do not have any barriers, mutex_is_locked() or
+ * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
+ * reader inside the critical section. See the comments in down_write and
+ * up_write below.
+ */
+static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+{
+	bool success = false;
+
+	preempt_disable();
+	if (likely(!mutex_is_locked(&brw->writer_mutex))) {
+		__this_cpu_add(*brw->fast_read_ctr, val);
+		success = true;
+	}
+	preempt_enable();
+
+	return success;
+}
+
+/*
+ * Like the normal down_read() this is not recursive, the writer can
+ * come after the first percpu_down_read() and create the deadlock.
+ */
+void percpu_down_read(struct percpu_rw_semaphore *brw)
+{
+	if (likely(update_fast_ctr(brw, +1)))
+		return;
+
+	down_read(&brw->rw_sem);
+	atomic_inc(&brw->slow_read_ctr);
+	up_read(&brw->rw_sem);
+}
+
+void percpu_up_read(struct percpu_rw_semaphore *brw)
+{
+	if (likely(update_fast_ctr(brw, -1)))
+		return;
+
+	/* false-positive is possible but harmless */
+	if (atomic_dec_and_test(&brw->slow_read_ctr))
+		wake_up_all(&brw->write_waitq);
+}
+
+static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+{
+	unsigned int sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		sum += per_cpu(*brw->fast_read_ctr, cpu);
+		per_cpu(*brw->fast_read_ctr, cpu) = 0;
+	}
+
+	return sum;
+}
+
+/*
+ * A writer takes ->writer_mutex to exclude other writers and to force the
+ * readers to switch to the slow mode, note the mutex_is_locked() check in
+ * update_fast_ctr().
+ *
+ * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
+ * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
+ * counter it represents the number of active readers.
+ *
+ * Finally the writer takes ->rw_sem for writing and blocks the new readers,
+ * then waits until the slow counter becomes zero.
+ */
+void percpu_down_write(struct percpu_rw_semaphore *brw)
+{
+	/* also blocks update_fast_ctr() which checks mutex_is_locked() */
+	mutex_lock(&brw->writer_mutex);
+
+	/*
+	 * 1. Ensures mutex_is_locked() is visible to any down_read/up_read
+	 *    so that update_fast_ctr() can't succeed.
+	 *
+	 * 2. Ensures we see the result of every previous this_cpu_add() in
+	 *    update_fast_ctr().
+	 *
+	 * 3. Ensures that if any reader has exited its critical section via
+	 *    fast-path, it executes a full memory barrier before we return.
+	 *    See R_W case in the comment above update_fast_ctr().
+	 */
+	synchronize_sched_expedited();
+
+	/* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
+	atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+
+	/* block the new readers completely */
+	down_write(&brw->rw_sem);
+
+	/* wait for all readers to complete their percpu_up_read() */
+	wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+}
+
+void percpu_up_write(struct percpu_rw_semaphore *brw)
+{
+	/* allow the new readers, but only the slow-path */
+	up_write(&brw->rw_sem);
+
+	/*
+	 * Insert the barrier before the next fast-path in down_read,
+	 * see W_R case in the comment above update_fast_ctr().
+	 */
+	synchronize_sched_expedited();
+	mutex_unlock(&brw->writer_mutex);
+}
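One subtlety worth spelling out, as a standalone illustration rather than part of the patch: a reader may take the fast path on one CPU and drop it on another, so individual fast_read_ctr slots can go negative; clear_fast_ctr() sums them into an unsigned int so the wrapped total still equals the number of readers currently inside their critical sections. A tiny userspace sketch of that arithmetic, with made-up per-CPU values:

/* Standalone illustration (not kernel code): per-CPU deltas of +1 on
 * down_read and -1 on up_read can leave individual counters negative,
 * but their modular (unsigned) sum equals the readers still inside. */
#include <stdio.h>

int main(void)
{
	/* hypothetical snapshot: CPU0 saw two down_reads, CPU1 saw one
	 * up_read from a reader that migrated, CPU2 saw one of each */
	int fast_read_ctr[3] = { 2, -1, 0 };
	unsigned int sum = 0;

	for (int cpu = 0; cpu < 3; cpu++)
		sum += fast_read_ctr[cpu];

	/* 2 + (unsigned)-1 + 0 wraps to 1: one reader is still active */
	printf("active readers = %d\n", (int)sum);
	return 0;
}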