author		Davidlohr Bueso <davidlohr@hp.com>	2014-05-02 14:24:15 -0400
committer	Ingo Molnar <mingo@kernel.org>		2014-06-05 04:38:21 -0400
commit		4fc828e24cd9c385d3a44e1b499ec7fc70239d8a (patch)
tree		9b53c7b53d6d487e03a383486171279965d3af15
parent		3cf2f34e1a3d4d5ff209d087925cf950e52f4805 (diff)
locking/rwsem: Support optimistic spinning
We have reached the point where our mutexes are quite fine tuned for a number of situations. This includes the use of heuristics and optimistic spinning, based on MCS locking techniques.

Exclusive ownership of read-write semaphores is, conceptually, just about the same as for mutexes, making them close cousins. To this end we need to make them both perform similarly, and right now rwsems are simply not up to it. This was discovered by both reverting commit 4fc3f1d6 (mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable) and, similarly, by converting some other mutexes (e.g. i_mmap_mutex) to rwsems. This creates a situation where users have to choose between a rwsem and a mutex while taking this important performance difference into account. Specifically, the biggest difference between the two locks is that when we fail to acquire a mutex in the fastpath, optimistic spinning comes into play and we can avoid a large amount of unnecessary sleeping and the overhead of moving tasks in and out of the wait queue. Rwsems have no such logic.

This patch, based on work from Tim Chen and me, adds support for write-side optimistic spinning when the lock is contended. It also includes support for the recently added cancelable MCS locking for adaptive spinning. Note that this is only applicable to the xadd method; the spinlock rwsem variant remains intact.

Allowing optimistic spinning before putting the writer on the wait queue reduces wait queue contention and provides a greater chance for the rwsem to get acquired. With these changes, rwsem is on par with mutex. The performance benefits can be seen on a number of workloads. For instance, on an 8-socket, 80-core, 64-bit Westmere box, aim7 shows the following improvements in throughput:

 +--------------+---------------------+-----------------+
 | Workload     | throughput-increase | number of users |
 +--------------+---------------------+-----------------+
 | alltests     | 20%                 | >1000           |
 | custom       | 27%, 60%            | 10-100, >1000   |
 | high_systime | 36%, 30%            | >100, >1000     |
 | shared       | 58%, 29%            | 10-100, >1000   |
 +--------------+---------------------+-----------------+

There was also improvement on smaller systems, such as a quad-core x86-64 laptop running a 30 GB PostgreSQL (pgbench) workload, of up to +60% in throughput for over 50 clients. Additionally, benefits were also noticed in exim (mail server) workloads. Furthermore, no performance regressions have been seen at all.

Based-on-work-from: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
[peterz: rej fixup due to comment patches, sched/rt.h header]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alex Shi <alex.shi@linaro.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: "Paul E.McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "Scott J Norton" <scott.norton@hp.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1399055055.6275.15.camel@buesod1.americas.hpqcorp.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
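For orientation, the core idea can be sketched outside the kernel. The following is a minimal userspace illustration only, not the kernel code; the names wlock, owner_running, try_write_lock, optimistic_spin and write_lock are invented for this sketch. A contending writer spins while the current owner is running on a CPU, tries to steal the lock with a cmpxchg, and falls back to queueing and sleeping only when spinning stops being useful. The real logic added by this patch is rwsem_optimistic_spin() and friends in the diff below.

/*
 * Simplified userspace sketch of write-side optimistic spinning.
 * Illustration only; not the kernel implementation shown below.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

struct wlock {
        atomic_long owner;              /* 0 == unlocked, else owning thread id */
        atomic_bool owner_running;      /* is the owner currently on a CPU? */
};

static bool try_write_lock(struct wlock *l, long me)
{
        long expected = 0;
        /* Steal the lock only if nobody holds it. */
        return atomic_compare_exchange_strong(&l->owner, &expected, me);
}

static bool optimistic_spin(struct wlock *l, long me)
{
        /*
         * As long as the owner is actively running it is likely to release
         * the lock soon, so spinning beats sleeping: we avoid the wakeup
         * latency and the cost of moving in and out of the wait queue.
         */
        while (atomic_load(&l->owner) != 0) {
                if (!atomic_load(&l->owner_running))
                        return false;   /* owner is blocked: give up and queue */
                sched_yield();          /* stand-in for cpu_relax() */
        }
        return try_write_lock(l, me);   /* lock looked free: try to grab it */
}

void write_lock(struct wlock *l, long me)
{
        if (try_write_lock(l, me) || optimistic_spin(l, me))
                return;                 /* fast path or successful spin */
        /* Slow path: enqueue on a wait list and sleep (omitted here). */
}

In the patch itself the owner check is done on task_struct::on_cpu under RCU, and concurrent spinners are serialized with the cancelable MCS (osq) lock rather than a plain flag.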
-rw-r--r--	include/linux/rwsem.h		 25
-rw-r--r--	kernel/locking/rwsem-xadd.c	225
-rw-r--r--	kernel/locking/rwsem.c		 31
3 files changed, 248 insertions, 33 deletions
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 03f3b05e8ec1..3e108f154cb6 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,6 +16,7 @@
 
 #include <linux/atomic.h>
 
+struct optimistic_spin_queue;
 struct rw_semaphore;
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -23,9 +24,17 @@ struct rw_semaphore;
 #else
 /* All arch specific implementations share the same struct */
 struct rw_semaphore {
         long count;
         raw_spinlock_t wait_lock;
         struct list_head wait_list;
+#ifdef CONFIG_SMP
+        /*
+         * Write owner. Used as a speculative check to see
+         * if the owner is running on the cpu.
+         */
+        struct task_struct *owner;
+        struct optimistic_spin_queue *osq; /* spinner MCS lock */
+#endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
         struct lockdep_map dep_map;
 #endif
@@ -55,11 +64,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 # define __RWSEM_DEP_MAP_INIT(lockname)
 #endif
 
+#ifdef CONFIG_SMP
+#define __RWSEM_INITIALIZER(name)                       \
+        { RWSEM_UNLOCKED_VALUE,                         \
+          __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),     \
+          LIST_HEAD_INIT((name).wait_list),             \
+          NULL, /* owner */                             \
+          NULL /* mcs lock */                           \
+          __RWSEM_DEP_MAP_INIT(name) }
+#else
 #define __RWSEM_INITIALIZER(name)                       \
         { RWSEM_UNLOCKED_VALUE,                         \
           __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),     \
           LIST_HEAD_INIT((name).wait_list)              \
           __RWSEM_DEP_MAP_INIT(name) }
+#endif
 
 #define DECLARE_RWSEM(name) \
         struct rw_semaphore name = __RWSEM_INITIALIZER(name)
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index b4219ff87b8c..4a75278142cd 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,11 +5,17 @@
  *
  * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
  * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/sched/rt.h>
+
+#include "mcs_spinlock.h"
 
 /*
  * Guide to the rw_semaphore's count field for common values.
@@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
         sem->count = RWSEM_UNLOCKED_VALUE;
         raw_spin_lock_init(&sem->wait_lock);
         INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_SMP
+        sem->owner = NULL;
+        sem->osq = NULL;
+#endif
 }
 
 EXPORT_SYMBOL(__init_rwsem);
@@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
 }
 
 /*
- * wait for the read lock to be granted
+ * Wait for the read lock to be granted
  */
 __visible
 struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
@@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
         return sem;
 }
 
+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+        if (!(count & RWSEM_ACTIVE_MASK)) {
+                /* try acquiring the write lock */
+                if (sem->count == RWSEM_WAITING_BIAS &&
+                    cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+                            RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+                        if (!list_is_singular(&sem->wait_list))
+                                rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+                        return true;
+                }
+        }
+        return false;
+}
+
+#ifdef CONFIG_SMP
 /*
- * wait until we successfully acquire the write lock
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+        long old, count = ACCESS_ONCE(sem->count);
+
+        while (true) {
+                if (!(count == 0 || count == RWSEM_WAITING_BIAS))
+                        return false;
+
+                old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+                if (old == count)
+                        return true;
+
+                count = old;
+        }
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+        struct task_struct *owner;
+        bool on_cpu = true;
+
+        if (need_resched())
+                return 0;
+
+        rcu_read_lock();
+        owner = ACCESS_ONCE(sem->owner);
+        if (owner)
+                on_cpu = owner->on_cpu;
+        rcu_read_unlock();
+
+        /*
+         * If sem->owner is not set, the rwsem owner may have
+         * just acquired it and not set the owner yet or the rwsem
+         * has been released.
+         */
+        return on_cpu;
+}
+
+static inline bool owner_running(struct rw_semaphore *sem,
+                                 struct task_struct *owner)
+{
+        if (sem->owner != owner)
+                return false;
+
+        /*
+         * Ensure we emit the owner->on_cpu, dereference _after_ checking
+         * sem->owner still matches owner, if that fails, owner might
+         * point to free()d memory, if it still matches, the rcu_read_lock()
+         * ensures the memory stays valid.
+         */
+        barrier();
+
+        return owner->on_cpu;
+}
+
+static noinline
+bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+{
+        rcu_read_lock();
+        while (owner_running(sem, owner)) {
+                if (need_resched())
+                        break;
+
+                arch_mutex_cpu_relax();
+        }
+        rcu_read_unlock();
+
+        /*
+         * We break out the loop above on need_resched() or when the
+         * owner changed, which is a sign for heavy contention. Return
+         * success only when sem->owner is NULL.
+         */
+        return sem->owner == NULL;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+        struct task_struct *owner;
+        bool taken = false;
+
+        preempt_disable();
+
+        /* sem->wait_lock should not be held when doing optimistic spinning */
+        if (!rwsem_can_spin_on_owner(sem))
+                goto done;
+
+        if (!osq_lock(&sem->osq))
+                goto done;
+
+        while (true) {
+                owner = ACCESS_ONCE(sem->owner);
+                if (owner && !rwsem_spin_on_owner(sem, owner))
+                        break;
+
+                /* wait_lock will be acquired if write_lock is obtained */
+                if (rwsem_try_write_lock_unqueued(sem)) {
+                        taken = true;
+                        break;
+                }
+
+                /*
+                 * When there's no owner, we might have preempted between the
+                 * owner acquiring the lock and setting the owner field. If
+                 * we're an RT task that will live-lock because we won't let
+                 * the owner complete.
+                 */
+                if (!owner && (need_resched() || rt_task(current)))
+                        break;
+
+                /*
+                 * The cpu_relax() call is a compiler barrier which forces
+                 * everything in this loop to be re-loaded. We don't need
+                 * memory barriers as we'll eventually observe the right
+                 * values at the cost of a few extra spins.
+                 */
+                arch_mutex_cpu_relax();
+        }
+        osq_unlock(&sem->osq);
+done:
+        preempt_enable();
+        return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+        return false;
+}
+#endif
+
+/*
+ * Wait until we successfully acquire the write lock
  */
 __visible
 struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 {
-        long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
+        long count;
+        bool waiting = true; /* any queued threads before us */
         struct rwsem_waiter waiter;
-        struct task_struct *tsk = current;
 
-        /* set up my own style of waitqueue */
-        waiter.task = tsk;
+        /* undo write bias from down_write operation, stop active locking */
+        count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+        /* do optimistic spinning and steal lock if possible */
+        if (rwsem_optimistic_spin(sem))
+                return sem;
+
+        /*
+         * Optimistic spinning failed, proceed to the slowpath
+         * and block until we can acquire the sem.
+         */
+        waiter.task = current;
         waiter.type = RWSEM_WAITING_FOR_WRITE;
 
         raw_spin_lock_irq(&sem->wait_lock);
+
+        /* account for this before adding a new element to the list */
         if (list_empty(&sem->wait_list))
-                adjustment += RWSEM_WAITING_BIAS;
+                waiting = false;
+
         list_add_tail(&waiter.list, &sem->wait_list);
 
         /* we're now waiting on the lock, but no longer actively locking */
-        count = rwsem_atomic_update(adjustment, sem);
+        if (waiting) {
+                count = ACCESS_ONCE(sem->count);
 
-        /* If there were already threads queued before us and there are no
-         * active writers, the lock must be read owned; so we try to wake
-         * any read locks that were queued ahead of us. */
-        if (count > RWSEM_WAITING_BIAS &&
-            adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
-                sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+                /*
+                 * If there were already threads queued before us and there are no
+                 * active writers, the lock must be read owned; so we try to wake
+                 * any read locks that were queued ahead of us.
+                 */
+                if (count > RWSEM_WAITING_BIAS)
+                        sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+
+        } else
+                count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
 
         /* wait until we successfully acquire the lock */
-        set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+        set_current_state(TASK_UNINTERRUPTIBLE);
         while (true) {
-                if (!(count & RWSEM_ACTIVE_MASK)) {
-                        /* Try acquiring the write lock. */
-                        count = RWSEM_ACTIVE_WRITE_BIAS;
-                        if (!list_is_singular(&sem->wait_list))
-                                count += RWSEM_WAITING_BIAS;
-
-                        if (sem->count == RWSEM_WAITING_BIAS &&
-                            cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
-                                                        RWSEM_WAITING_BIAS)
-                                break;
-                }
-
+                if (rwsem_try_write_lock(count, sem))
+                        break;
                 raw_spin_unlock_irq(&sem->wait_lock);
 
                 /* Block until there are no active lockers. */
                 do {
                         schedule();
-                        set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+                        set_current_state(TASK_UNINTERRUPTIBLE);
                 } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
 
                 raw_spin_lock_irq(&sem->wait_lock);
         }
+        __set_current_state(TASK_RUNNING);
 
         list_del(&waiter.list);
         raw_spin_unlock_irq(&sem->wait_lock);
-        tsk->state = TASK_RUNNING;
 
         return sem;
 }
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..42f806de49d4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@
 
 #include <linux/atomic.h>
 
+#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+        sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+        sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
+
 /*
  * lock for reading
  */
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
 
         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+        rwsem_set_owner(sem);
 }
 
 EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
 {
         int ret = __down_write_trylock(sem);
 
-        if (ret == 1)
+        if (ret == 1) {
                 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+                rwsem_set_owner(sem);
+        }
+
         return ret;
 }
 
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
 {
         rwsem_release(&sem->dep_map, 1, _RET_IP_);
 
+        rwsem_clear_owner(sem);
         __up_write(sem);
 }
 
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
          * lockdep: a downgraded write will live on as a write
          * dependency.
          */
+        rwsem_clear_owner(sem);
         __downgrade_write(sem);
 }
 
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
         rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
 
         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+        rwsem_set_owner(sem);
 }
 
 EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
         rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
 
         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+        rwsem_set_owner(sem);
 }
 
 EXPORT_SYMBOL(down_write_nested);