author    Michel Lespinasse <walken@google.com>  2013-05-07 09:45:59 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-05-07 10:20:16 -0400
commit    fe6e674c6187d4f452a679ced7e95262bd517936 (patch)
tree      ba04818049ec225a541f278940532473d3005b9c /lib/rwsem.c
parent    8cf5322ce69afea1fab6a6270db24d057d664798 (diff)
rwsem: implement support for write lock stealing on the fastpath
When we decide to wake up readers, we must first grant them as many read locks as necessary, and then actually wake up all these readers. But in order to know how many read shares to grant, we must first count the readers at the head of the queue. This might take a while if there are many readers, and we want to be protected against a writer stealing the lock while we're counting. To that end, we grant the first reader lock before counting how many more readers are queued.

We also require some adjustments to the wake_type semantics.

RWSEM_WAKE_NO_ACTIVE used to mean that we had found the count to be RWSEM_WAITING_BIAS, in which case the rwsem was known to be free as nobody could steal it while we hold the wait_lock. This doesn't make sense once we implement fastpath write lock stealing, so we now use RWSEM_WAKE_ANY in that case.

Similarly, when rwsem_down_write_failed found that a read lock was active, it would use RWSEM_WAKE_READ_OWNED which signalled that new readers could be woken without checking first that the rwsem was available. We can't do that anymore since the existing readers might release their read locks, and a writer could steal the lock before we wake up additional readers. So, we have to use a new RWSEM_WAKE_READERS value to indicate we only want to wake readers, but we don't currently hold any read lock.

Signed-off-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Acked-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
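[Editor's illustration] To make the grant-before-count step above concrete, here is a minimal userspace sketch of the protocol. It is not part of the patch: sem->count is modeled with a C11 atomic, rwsem_atomic_update() is modeled by the hypothetical helper update(), and try_reader_grant_model() is an illustrative name, not a kernel function. The bias values assume the 32-bit layout used by the kernel's generic rwsem code.

	#include <stdatomic.h>
	#include <stdio.h>

	/* 32-bit bias layout, as in the kernel's rwsem headers (illustrative) */
	#define RWSEM_ACTIVE_MASK	 0x0000ffffL
	#define RWSEM_ACTIVE_BIAS	 0x00000001L
	#define RWSEM_WAITING_BIAS	(-0x00010000L)
	#define RWSEM_ACTIVE_READ_BIAS	 RWSEM_ACTIVE_BIAS
	#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

	static atomic_long count;	/* stands in for sem->count */

	/* Models rwsem_atomic_update(): add delta and return the new value. */
	static long update(long delta)
	{
		return atomic_fetch_add(&count, delta) + delta;
	}

	/*
	 * Grant one read lock before counting queued readers.  Returns 1 if
	 * the grant is held (safe to count and wake more readers), 0 if a
	 * writer owns the lock and the waker must bail out.
	 */
	static int try_reader_grant_model(void)
	{
		long adjustment = RWSEM_ACTIVE_READ_BIAS;
		long oldcount;

		for (;;) {
			oldcount = update(adjustment) - adjustment;
			if (oldcount >= RWSEM_WAITING_BIAS)
				return 1;	/* grant held */
			/* A writer stole the lock: undo our reader grant. */
			if (update(-adjustment) & RWSEM_ACTIVE_MASK)
				return 0;	/* writer still active */
			/* The thief already released the lock: retry. */
		}
	}

	int main(void)
	{
		/* Case 1: lock free, waiters queued -> grant succeeds. */
		atomic_store(&count, RWSEM_WAITING_BIAS);
		printf("free:   %d\n", try_reader_grant_model());

		/* Case 2: a writer stole the lock -> waker bails out. */
		atomic_store(&count, RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS);
		printf("stolen: %d\n", try_reader_grant_model());
		return 0;
	}

In the actual patch the waker runs this logic while holding sem->wait_lock, so only one thread performs the grant at a time; the retry branch covers the case where the stealing writer has already released the lock, in which case the waker must grant again rather than leave the queued readers sleeping.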
Diffstat (limited to 'lib/rwsem.c')
-rw-r--r--  lib/rwsem.c  64
1 file changed, 32 insertions, 32 deletions
diff --git a/lib/rwsem.c b/lib/rwsem.c
index 9a675fa9d78e..bbe48c04f363 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -4,6 +4,7 @@
  * Derived from arch/i386/kernel/semaphore.c
  *
  * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
+ * and Michel Lespinasse <walken@google.com>
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
@@ -41,13 +42,11 @@ struct rwsem_waiter {
 	enum rwsem_waiter_type type;
 };
 
-/* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and
- * RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held
- * since the rwsem value was observed.
- */
-#define RWSEM_WAKE_ANY        0 /* Wake whatever's at head of wait list */
-#define RWSEM_WAKE_NO_ACTIVE  1 /* rwsem was observed with no active thread */
-#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */
+enum rwsem_wake_type {
+	RWSEM_WAKE_ANY,		/* Wake whatever's at head of wait list */
+	RWSEM_WAKE_READERS,	/* Wake readers only */
+	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
+};
 
 /*
  * handle the lock release when processes blocked on it that can now run
@@ -60,16 +59,16 @@ struct rwsem_waiter {
  * - writers are only woken if downgrading is false
  */
 static struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
+__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
 {
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
 	struct list_head *next;
-	signed long woken, loop, adjustment;
+	signed long oldcount, woken, loop, adjustment;
 
 	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
 	if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
-		if (wake_type != RWSEM_WAKE_READ_OWNED)
+		if (wake_type == RWSEM_WAKE_ANY)
 			/* Wake writer at the front of the queue, but do not
 			 * grant it the lock yet as we want other writers
 			 * to be able to steal it. Readers, on the other hand,
@@ -79,24 +78,24 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 		goto out;
 	}
 
-	/* If we come here from up_xxxx(), another thread might have reached
-	 * rwsem_down_failed_common() before we acquired the spinlock and
-	 * woken up a waiter, making it now active. We prefer to check for
-	 * this first in order to not spend too much time with the spinlock
-	 * held if we're not going to be able to wake up readers in the end.
-	 *
-	 * Note that we do not need to update the rwsem count: any writer
-	 * trying to acquire rwsem will run rwsem_down_write_failed() due
-	 * to the waiting threads and block trying to acquire the spinlock.
-	 *
-	 * We use a dummy atomic update in order to acquire the cache line
-	 * exclusively since we expect to succeed and run the final rwsem
-	 * count adjustment pretty soon.
+	/* Writers might steal the lock before we grant it to the next reader.
+	 * We prefer to do the first reader grant before counting readers
+	 * so we can bail out early if a writer stole the lock.
 	 */
-	if (wake_type == RWSEM_WAKE_ANY &&
-	    rwsem_atomic_update(0, sem) < RWSEM_WAITING_BIAS)
-		/* Someone grabbed the sem for write already */
-		goto out;
+	adjustment = 0;
+	if (wake_type != RWSEM_WAKE_READ_OWNED) {
+		adjustment = RWSEM_ACTIVE_READ_BIAS;
+ try_reader_grant:
+		oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+		if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
+			/* A writer stole the lock. Undo our reader grant. */
+			if (rwsem_atomic_update(-adjustment, sem) &
+						RWSEM_ACTIVE_MASK)
+				goto out;
+			/* Last active locker left. Retry waking readers. */
+			goto try_reader_grant;
+		}
+	}
 
 	/* Grant an infinite number of read locks to the readers at the front
 	 * of the queue. Note we increment the 'active part' of the count by
@@ -114,12 +113,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type)
 
 	} while (waiter->type != RWSEM_WAITING_FOR_WRITE);
 
-	adjustment = woken * RWSEM_ACTIVE_READ_BIAS;
+	adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
 	if (waiter->type != RWSEM_WAITING_FOR_WRITE)
 		/* hit end of list above */
 		adjustment -= RWSEM_WAITING_BIAS;
 
-	rwsem_atomic_add(adjustment, sem);
+	if (adjustment)
+		rwsem_atomic_add(adjustment, sem);
 
 	next = sem->wait_list.next;
 	loop = woken;
@@ -164,8 +164,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 	count = rwsem_atomic_update(adjustment, sem);
 
 	/* If there are no active locks, wake the front queued process(es). */
-	if (count == RWSEM_WAITING_BIAS)
-		sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE);
+	if (!(count & RWSEM_ACTIVE_MASK))
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
 
 	raw_spin_unlock_irq(&sem->wait_lock);
 
@@ -209,7 +209,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 	 * any read locks that were queued ahead of us. */
 	if (count > RWSEM_WAITING_BIAS &&
 	    adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
-		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
+		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
 
 	/* wait until we successfully acquire the lock */
 	set_task_state(tsk, TASK_UNINTERRUPTIBLE);