author		Davidlohr Bueso <davidlohr@hp.com>	2014-05-02 14:24:15 -0400
committer	Ingo Molnar <mingo@kernel.org>		2014-06-05 04:38:21 -0400
commit		4fc828e24cd9c385d3a44e1b499ec7fc70239d8a (patch)
tree		9b53c7b53d6d487e03a383486171279965d3af15
parent		3cf2f34e1a3d4d5ff209d087925cf950e52f4805 (diff)
locking/rwsem: Support optimistic spinning
We have reached the point where our mutexes are quite fine tuned for a number of situations. This includes the use of heuristics and optimistic spinning, based on MCS locking techniques.

Exclusive ownership of read-write semaphores is, conceptually, just about the same as for mutexes, making them close cousins. To this end we need to make them both perform similarly, and right now rwsems are simply not up to it. This was discovered by both reverting commit 4fc3f1d6 (mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable) and, similarly, by converting some other mutexes (e.g. i_mmap_mutex) to rwsems. This creates a situation where users have to choose between a rwsem and a mutex while taking this important performance difference into account. Specifically, the biggest difference between the two locks is that when we fail to acquire a mutex in the fastpath, optimistic spinning comes into play and we can avoid a large amount of unnecessary sleeping and the overhead of moving tasks in and out of the wait queue. Rwsems have no such logic.

This patch, based on work from Tim Chen and me, adds support for write-side optimistic spinning when the lock is contended. It also includes support for the recently added cancelable MCS locking for adaptive spinning. Note that this is only applicable to the xadd method; the spinlock rwsem variant remains intact.

Allowing optimistic spinning before putting the writer on the wait queue reduces wait queue contention and provides a greater chance for the rwsem to get acquired. With these changes, rwsem is on par with mutex. The performance benefits can be seen on a number of workloads. For instance, on an 8-socket, 80-core, 64-bit Westmere box, aim7 shows the following improvements in throughput:

 +--------------+---------------------+-----------------+
 | Workload     | throughput-increase | number of users |
 +--------------+---------------------+-----------------+
 | alltests     | 20%                 | >1000           |
 | custom       | 27%, 60%            | 10-100, >1000   |
 | high_systime | 36%, 30%            | >100, >1000     |
 | shared       | 58%, 29%            | 10-100, >1000   |
 +--------------+---------------------+-----------------+

There was also improvement on smaller systems, such as a quad-core x86-64 laptop running a 30 GB PostgreSQL (pgbench) workload, of up to +60% in throughput for over 50 clients. Additionally, benefits were also noticed in exim (mail server) workloads. Furthermore, no performance regressions have been seen at all.

Based-on-work-from: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
[peterz: rej fixup due to comment patches, sched/rt.h header]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alex Shi <alex.shi@linaro.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: "Paul E.McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "Scott J Norton" <scott.norton@hp.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1399055055.6275.15.camel@buesod1.americas.hpqcorp.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
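For orientation, the core idea can be sketched outside the kernel. The following is a minimal userspace illustration only, not the kernel code; the names wlock, owner_running, try_write_lock, optimistic_spin and write_lock are invented for this sketch. A contending writer spins while the current owner is running on a CPU, tries to steal the lock with a cmpxchg, and falls back to queueing and sleeping only when spinning stops being useful. The real logic added by this patch is rwsem_optimistic_spin() and friends in the diff below.

/*
 * Simplified userspace sketch of write-side optimistic spinning.
 * Illustration only; not the kernel implementation shown below.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

struct wlock {
        atomic_long owner;              /* 0 == unlocked, else owning thread id */
        atomic_bool owner_running;      /* is the owner currently on a CPU? */
};

static bool try_write_lock(struct wlock *l, long me)
{
        long expected = 0;
        /* Steal the lock only if nobody holds it. */
        return atomic_compare_exchange_strong(&l->owner, &expected, me);
}

static bool optimistic_spin(struct wlock *l, long me)
{
        /*
         * As long as the owner is actively running it is likely to release
         * the lock soon, so spinning beats sleeping: we avoid the wakeup
         * latency and the cost of moving in and out of the wait queue.
         */
        while (atomic_load(&l->owner) != 0) {
                if (!atomic_load(&l->owner_running))
                        return false;   /* owner is blocked: give up and queue */
                sched_yield();          /* stand-in for cpu_relax() */
        }
        return try_write_lock(l, me);   /* lock looked free: try to grab it */
}

void write_lock(struct wlock *l, long me)
{
        if (try_write_lock(l, me) || optimistic_spin(l, me))
                return;                 /* fast path or successful spin */
        /* Slow path: enqueue on a wait list and sleep (omitted here). */
}

In the patch itself the owner check is done on task_struct::on_cpu under RCU, and concurrent spinners are serialized with the cancelable MCS (osq) lock rather than a plain flag.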
-rw-r--r--	include/linux/rwsem.h		 25
-rw-r--r--	kernel/locking/rwsem-xadd.c	225
-rw-r--r--	kernel/locking/rwsem.c		 31
3 files changed, 248 insertions, 33 deletions
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 03f3b05e8ec1..3e108f154cb6 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,6 +16,7 @@
 
 #include <linux/atomic.h>
 
+struct optimistic_spin_queue;
 struct rw_semaphore;
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -23,9 +24,17 @@ struct rw_semaphore;
 #else
 /* All arch specific implementations share the same struct */
 struct rw_semaphore {
         long count;
         raw_spinlock_t wait_lock;
         struct list_head wait_list;
+#ifdef CONFIG_SMP
+        /*
+         * Write owner. Used as a speculative check to see
+         * if the owner is running on the cpu.
+         */
+        struct task_struct *owner;
+        struct optimistic_spin_queue *osq; /* spinner MCS lock */
+#endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
         struct lockdep_map dep_map;
 #endif
@@ -55,11 +64,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 # define __RWSEM_DEP_MAP_INIT(lockname)
 #endif
 
+#ifdef CONFIG_SMP
+#define __RWSEM_INITIALIZER(name)                       \
+        { RWSEM_UNLOCKED_VALUE,                         \
+          __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),     \
+          LIST_HEAD_INIT((name).wait_list),             \
+          NULL, /* owner */                             \
+          NULL /* mcs lock */                           \
+          __RWSEM_DEP_MAP_INIT(name) }
+#else
 #define __RWSEM_INITIALIZER(name)                       \
         { RWSEM_UNLOCKED_VALUE,                         \
           __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),     \
           LIST_HEAD_INIT((name).wait_list)              \
           __RWSEM_DEP_MAP_INIT(name) }
+#endif
 
 #define DECLARE_RWSEM(name) \
         struct rw_semaphore name = __RWSEM_INITIALIZER(name)
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index b4219ff87b8c..4a75278142cd 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,11 +5,17 @@
  *
  * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
  * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/sched/rt.h>
+
+#include "mcs_spinlock.h"
 
 /*
  * Guide to the rw_semaphore's count field for common values.
@@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
         sem->count = RWSEM_UNLOCKED_VALUE;
         raw_spin_lock_init(&sem->wait_lock);
         INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_SMP
+        sem->owner = NULL;
+        sem->osq = NULL;
+#endif
 }
 
 EXPORT_SYMBOL(__init_rwsem);
@@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
 }
 
 /*
- * wait for the read lock to be granted
+ * Wait for the read lock to be granted
  */
 __visible
 struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
@@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
         return sem;
 }
 
+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+        if (!(count & RWSEM_ACTIVE_MASK)) {
+                /* try acquiring the write lock */
+                if (sem->count == RWSEM_WAITING_BIAS &&
+                    cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+                            RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+                        if (!list_is_singular(&sem->wait_list))
+                                rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+                        return true;
+                }
+        }
+        return false;
+}
+
+#ifdef CONFIG_SMP
 /*
- * wait until we successfully acquire the write lock
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+        long old, count = ACCESS_ONCE(sem->count);
+
+        while (true) {
+                if (!(count == 0 || count == RWSEM_WAITING_BIAS))
+                        return false;
+
+                old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+                if (old == count)
+                        return true;
+
+                count = old;
+        }
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+        struct task_struct *owner;
+        bool on_cpu = true;
+
+        if (need_resched())
+                return 0;
+
+        rcu_read_lock();
+        owner = ACCESS_ONCE(sem->owner);
+        if (owner)
+                on_cpu = owner->on_cpu;
+        rcu_read_unlock();
+
+        /*
+         * If sem->owner is not set, the rwsem owner may have
+         * just acquired it and not set the owner yet or the rwsem
+         * has been released.
+         */
+        return on_cpu;
+}
+
+static inline bool owner_running(struct rw_semaphore *sem,
+                                 struct task_struct *owner)
+{
+        if (sem->owner != owner)
+                return false;
+
+        /*
+         * Ensure we emit the owner->on_cpu, dereference _after_ checking
+         * sem->owner still matches owner, if that fails, owner might
+         * point to free()d memory, if it still matches, the rcu_read_lock()
+         * ensures the memory stays valid.
+         */
+        barrier();
+
+        return owner->on_cpu;
+}
+
+static noinline
+bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+{
+        rcu_read_lock();
+        while (owner_running(sem, owner)) {
+                if (need_resched())
+                        break;
+
+                arch_mutex_cpu_relax();
+        }
+        rcu_read_unlock();
+
+        /*
+         * We break out the loop above on need_resched() or when the
+         * owner changed, which is a sign for heavy contention. Return
+         * success only when sem->owner is NULL.
+         */
+        return sem->owner == NULL;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+        struct task_struct *owner;
+        bool taken = false;
+
+        preempt_disable();
+
+        /* sem->wait_lock should not be held when doing optimistic spinning */
+        if (!rwsem_can_spin_on_owner(sem))
+                goto done;
+
+        if (!osq_lock(&sem->osq))
+                goto done;
+
+        while (true) {
+                owner = ACCESS_ONCE(sem->owner);
+                if (owner && !rwsem_spin_on_owner(sem, owner))
+                        break;
+
+                /* wait_lock will be acquired if write_lock is obtained */
+                if (rwsem_try_write_lock_unqueued(sem)) {
+                        taken = true;
+                        break;
+                }
+
+                /*
+                 * When there's no owner, we might have preempted between the
+                 * owner acquiring the lock and setting the owner field. If
+                 * we're an RT task that will live-lock because we won't let
+                 * the owner complete.
+                 */
+                if (!owner && (need_resched() || rt_task(current)))
+                        break;
+
+                /*
+                 * The cpu_relax() call is a compiler barrier which forces
+                 * everything in this loop to be re-loaded. We don't need
+                 * memory barriers as we'll eventually observe the right
+                 * values at the cost of a few extra spins.
+                 */
+                arch_mutex_cpu_relax();
+        }
+        osq_unlock(&sem->osq);
+done:
+        preempt_enable();
+        return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+        return false;
+}
+#endif
+
+/*
+ * Wait until we successfully acquire the write lock
  */
 __visible
 struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 {
-        long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
+        long count;
+        bool waiting = true; /* any queued threads before us */
         struct rwsem_waiter waiter;
-        struct task_struct *tsk = current;
 
-        /* set up my own style of waitqueue */
-        waiter.task = tsk;
+        /* undo write bias from down_write operation, stop active locking */
+        count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+        /* do optimistic spinning and steal lock if possible */
+        if (rwsem_optimistic_spin(sem))
+                return sem;
+
+        /*
+         * Optimistic spinning failed, proceed to the slowpath
+         * and block until we can acquire the sem.
+         */
+        waiter.task = current;
         waiter.type = RWSEM_WAITING_FOR_WRITE;
 
         raw_spin_lock_irq(&sem->wait_lock);
+
+        /* account for this before adding a new element to the list */
         if (list_empty(&sem->wait_list))
-                adjustment += RWSEM_WAITING_BIAS;
+                waiting = false;
+
         list_add_tail(&waiter.list, &sem->wait_list);
 
         /* we're now waiting on the lock, but no longer actively locking */
-        count = rwsem_atomic_update(adjustment, sem);
+        if (waiting) {
+                count = ACCESS_ONCE(sem->count);
 
-        /* If there were already threads queued before us and there are no
-         * active writers, the lock must be read owned; so we try to wake
-         * any read locks that were queued ahead of us. */
-        if (count > RWSEM_WAITING_BIAS &&
-            adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
-                sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+                /*
+                 * If there were already threads queued before us and there are no
+                 * active writers, the lock must be read owned; so we try to wake
+                 * any read locks that were queued ahead of us.
+                 */
+                if (count > RWSEM_WAITING_BIAS)
+                        sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+
+        } else
+                count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
 
         /* wait until we successfully acquire the lock */
-        set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+        set_current_state(TASK_UNINTERRUPTIBLE);
         while (true) {
-                if (!(count & RWSEM_ACTIVE_MASK)) {
-                        /* Try acquiring the write lock. */
-                        count = RWSEM_ACTIVE_WRITE_BIAS;
-                        if (!list_is_singular(&sem->wait_list))
-                                count += RWSEM_WAITING_BIAS;
-
-                        if (sem->count == RWSEM_WAITING_BIAS &&
-                            cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
-                                                        RWSEM_WAITING_BIAS)
-                                break;
-                }
-
+                if (rwsem_try_write_lock(count, sem))
+                        break;
                 raw_spin_unlock_irq(&sem->wait_lock);
 
                 /* Block until there are no active lockers. */
                 do {
                         schedule();
-                        set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+                        set_current_state(TASK_UNINTERRUPTIBLE);
                 } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
 
                 raw_spin_lock_irq(&sem->wait_lock);
         }
+        __set_current_state(TASK_RUNNING);
 
         list_del(&waiter.list);
         raw_spin_unlock_irq(&sem->wait_lock);
-        tsk->state = TASK_RUNNING;
 
         return sem;
 }
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..42f806de49d4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@
 
 #include <linux/atomic.h>
 
+#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+        sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+        sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
+
 /*
  * lock for reading
  */
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
 
         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+        rwsem_set_owner(sem);
 }
 
 EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
 {
         int ret = __down_write_trylock(sem);
 
-        if (ret == 1)
+        if (ret == 1) {
                 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+                rwsem_set_owner(sem);
+        }
+
         return ret;
 }
 
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
 {
         rwsem_release(&sem->dep_map, 1, _RET_IP_);
 
+        rwsem_clear_owner(sem);
         __up_write(sem);
 }
 
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
          * lockdep: a downgraded write will live on as a write
          * dependency.
          */
+        rwsem_clear_owner(sem);
         __downgrade_write(sem);
 }
 
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
         rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
 
         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+        rwsem_set_owner(sem);
 }
 
 EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
         rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
 
         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+        rwsem_set_owner(sem);
 }
 
 EXPORT_SYMBOL(down_write_nested);