Diffstat (limited to 'kernel/locking/qspinlock.c')
 -rw-r--r--  kernel/locking/qspinlock.c  |  82
 1 file changed, 64 insertions, 18 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5..393d1874b9e0 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
  * (C) Copyright 2013-2014 Red Hat, Inc.
  * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
  *
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
  *          Peter Zijlstra <peterz@infradead.org>
  */
 
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
         struct __qspinlock *l = (void *)lock;
 
-        return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+        /*
+         * Use release semantics to make sure that the MCS node is properly
+         * initialized before changing the tail code.
+         */
+        return (u32)xchg_release(&l->tail,
+                                 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
 }
 
 #else /* _Q_PENDING_BITS == 8 */
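
Note on the release semantics above: xchg_release() pairs with an acquire on the side that later walks to the published MCS node, so the node's fields are guaranteed to be initialized by the time they are observed. A minimal userspace C11 sketch of that release/acquire pairing (illustration only, not kernel code; node, published and the thread functions are made-up names):

/*
 * Illustration only (not kernel code): publish a node with a release
 * exchange, consume it with an acquire load. Build with: cc -pthread.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct node { int data; };

static struct node slot;
static _Atomic(struct node *) published;

static void *publisher(void *arg)
{
        (void)arg;
        slot.data = 42;                 /* initialize the node first */
        /* release: orders the initialization before the publication */
        atomic_exchange_explicit(&published, &slot, memory_order_release);
        return NULL;
}

static void *waiter(void *arg)
{
        struct node *n;

        (void)arg;
        /* acquire: pairs with the release exchange above */
        while (!(n = atomic_load_explicit(&published, memory_order_acquire)))
                ;
        printf("data = %d\n", n->data); /* guaranteed to print 42 */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, publisher, NULL);
        pthread_create(&b, NULL, waiter, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}

The cmpxchg-based xchg_tail() variant in the next hunk relies on the same ordering guarantee.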
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 
         for (;;) {
                 new = (val & _Q_LOCKED_PENDING_MASK) | tail;
-                old = atomic_cmpxchg(&lock->val, val, new);
+                /*
+                 * Use release semantics to make sure that the MCS node is
+                 * properly initialized before changing the tail code.
+                 */
+                old = atomic_cmpxchg_release(&lock->val, val, new);
                 if (old == val)
                         break;
 
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
  */
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+                                           struct mcs_spinlock *prev) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
                                            struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
-                                           struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+                                                  struct mcs_spinlock *node)
+                                                  { return 0; }
 
 #define pv_enabled()            false
 
 #define pv_init_node            __pv_init_node
 #define pv_wait_node            __pv_wait_node
 #define pv_kick_node            __pv_kick_node
-#define pv_wait_head            __pv_wait_head
+#define pv_wait_head_or_lock    __pv_wait_head_or_lock
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath       native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
                 if (val == new)
                         new |= _Q_PENDING_VAL;
 
-                old = atomic_cmpxchg(&lock->val, val, new);
+                /*
+                 * Acquire semantic is required here as the function may
+                 * return immediately if the lock was free.
+                 */
+                old = atomic_cmpxchg_acquire(&lock->val, val, new);
                 if (old == val)
                         break;
 
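
The acquire requirement noted above is the general rule for lock acquisition: when the cmpxchg succeeds, the caller may return straight into the critical section, so that section must not be allowed to float above the point where the lock was observed free. A hedged C11 trylock analog of this rule (lockval and the 0/1 encoding are stand-ins, not the kernel's qspinlock layout):

/*
 * Illustration only (not kernel code): a trylock must use acquire
 * semantics on success; the failure ordering can stay relaxed because
 * nothing is protected when the lock is not taken.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint lockval;             /* 0 = free, 1 = locked (stand-in) */

bool example_trylock(void)
{
        unsigned int expected = 0;

        return atomic_compare_exchange_strong_explicit(&lockval, &expected, 1,
                                                       memory_order_acquire,
                                                       memory_order_relaxed);
}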
@@ -382,6 +398,7 @@ queue:
          * p,*,* -> n,*,*
          */
         old = xchg_tail(lock, tail);
+        next = NULL;
 
         /*
          * if there was a previous node; link it and wait until reaching the
@@ -391,8 +408,18 @@ queue:
                 prev = decode_tail(old);
                 WRITE_ONCE(prev->next, node);
 
-                pv_wait_node(node);
+                pv_wait_node(node, prev);
                 arch_mcs_spin_lock_contended(&node->locked);
+
+                /*
+                 * While waiting for the MCS lock, the next pointer may have
+                 * been set by another lock waiter. We optimistically load
+                 * the next pointer & prefetch the cacheline for writing
+                 * to reduce latency in the upcoming MCS unlock operation.
+                 */
+                next = READ_ONCE(node->next);
+                if (next)
+                        prefetchw(next);
         }
 
         /*
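
The prefetchw() added above does not change the locking logic; it only pulls the next waiter's node cacheline in for writing early, so the store in the later MCS handoff is cheaper. A userspace sketch of the same idea using the GCC/Clang __builtin_prefetch() write hint (the struct and function names are hypothetical, not kernel code):

/*
 * Illustration only (not kernel code): __builtin_prefetch(p, 1) is the
 * userspace counterpart of prefetchw(); the second argument (1) marks
 * the access as a write, so the line can be brought in for ownership
 * while other work is still going on.
 */
struct fake_mcs_node {
        struct fake_mcs_node *next;
        int locked;
};

void example_handoff(struct fake_mcs_node *self)
{
        struct fake_mcs_node *next = self->next;

        if (next)
                __builtin_prefetch(next, 1);    /* warm the line for writing */

        /* ... the current owner does its own work here ... */

        if (next)
                next->locked = 1;               /* the store the prefetch was for */
}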
@@ -406,11 +433,22 @@ queue:
          * sequentiality; this is because the set_locked() function below
          * does not imply a full barrier.
          *
+         * The PV pv_wait_head_or_lock function, if active, will acquire
+         * the lock and return a non-zero value. So we have to skip the
+         * smp_load_acquire() call. As the next PV queue head hasn't been
+         * designated yet, there is no way for the locked value to become
+         * _Q_SLOW_VAL. So both the set_locked() and the
+         * atomic_cmpxchg_relaxed() calls will be safe.
+         *
+         * If PV isn't active, 0 will be returned instead.
+         *
          */
-        pv_wait_head(lock, node);
-        while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
-                cpu_relax();
+        if ((val = pv_wait_head_or_lock(lock, node)))
+                goto locked;
 
+        smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+
+locked:
         /*
          * claim the lock:
          *
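
smp_cond_acquire() above spins until its condition becomes true and then provides acquire ordering for everything that follows. Roughly the same effect expressed with C11 atomics, as a sketch (lockval and the mask value are stand-ins for the kernel's lock word; the real macro uses a plain read plus a barrier rather than a load-acquire in the loop):

/*
 * Illustration only (not kernel code): spin until the locked/pending
 * byte fields clear, with acquire ordering on the final observation.
 */
#include <stdatomic.h>

#define EXAMPLE_LOCKED_PENDING_MASK     0x0000ffffU     /* stand-in mask */

static atomic_uint lockval;             /* stand-in for the qspinlock word */

unsigned int example_wait_head(void)
{
        unsigned int val;

        while ((val = atomic_load_explicit(&lockval, memory_order_acquire)) &
               EXAMPLE_LOCKED_PENDING_MASK)
                ;       /* a real implementation would cpu_relax() here */
        return val;
}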
@@ -422,11 +460,17 @@ queue:
          * to grab the lock.
          */
         for (;;) {
-                if (val != tail) {
+                /* In the PV case we might already have _Q_LOCKED_VAL set */
+                if ((val & _Q_TAIL_MASK) != tail) {
                         set_locked(lock);
                         break;
                 }
-                old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+                /*
+                 * The smp_load_acquire() call above has provided the necessary
+                 * acquire semantics required for locking. At most two
+                 * iterations of this loop may be ran.
+                 */
+                old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
                 if (old == val)
                         goto release;   /* No contention */
 
@@ -434,10 +478,12 @@ queue:
         }
 
         /*
-         * contended path; wait for next, release.
+         * contended path; wait for next if not observed yet, release.
          */
-        while (!(next = READ_ONCE(node->next)))
-                cpu_relax();
+        if (!next) {
+                while (!(next = READ_ONCE(node->next)))
+                        cpu_relax();
+        }
 
         arch_mcs_spin_unlock_contended(&next->locked);
         pv_kick_node(lock, next);
@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #undef pv_init_node
 #undef pv_wait_node
 #undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
 
 #undef queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath       __pv_queued_spin_lock_slowpath