Diffstat (limited to 'kernel/locking/qspinlock.c')
-rw-r--r--  kernel/locking/qspinlock.c  143
1 file changed, 97 insertions(+), 46 deletions(-)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index bfaeb05123ff..8a8c3c208c5e 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -74,12 +74,24 @@
  */
 
 #include "mcs_spinlock.h"
+#define MAX_NODES 4
 
+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. IOW, only
+ * two of them can fit in a cacheline in this case. That is OK as it is rare
+ * to have more than 2 levels of slowpath nesting in actual use. We don't
+ * want to penalize pvqspinlocks to optimize for a rare case in native
+ * qspinlocks.
+ */
+struct qnode {
+        struct mcs_spinlock mcs;
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define MAX_NODES 8
-#else
-#define MAX_NODES 4
-#endif
+        long reserved[2];
 #endif
+};
 
 /*
  * The pending bit spinning loop count.
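The cacheline argument in the new comment block is easy to sanity-check outside the kernel. The following stand-alone sketch assumes an LP64 target (8-byte pointers and longs) and only loosely mirrors the field layout of mcs_spinlock.h; it is a model, not kernel code.

/*
 * Stand-alone model of the cacheline math above.  Assumes LP64
 * (8-byte pointers and longs); field names only mirror the kernel's
 * mcs_spinlock.h.
 */
#include <stdio.h>

struct mcs_spinlock {                   /* 16 bytes on LP64 */
        struct mcs_spinlock *next;
        int locked;
        int count;
};

struct qnode_native {                   /* native: 4 per 64-byte line */
        struct mcs_spinlock mcs;
};

struct qnode_pv {                       /* PV: padded to 32 bytes, 2 per line */
        struct mcs_spinlock mcs;
        long reserved[2];
};

int main(void)
{
        printf("mcs_spinlock : %zu bytes\n", sizeof(struct mcs_spinlock));
        printf("native qnode : %zu bytes, %zu per 64-byte line\n",
               sizeof(struct qnode_native), 64 / sizeof(struct qnode_native));
        printf("pv qnode     : %zu bytes, %zu per 64-byte line\n",
               sizeof(struct qnode_pv), 64 / sizeof(struct qnode_pv));
        return 0;
}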
@@ -101,7 +113,7 @@
  *
  * PV doubles the storage and uses the second cacheline for PV state.
  */
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
 
 /*
  * We must be able to distinguish between no-tail and the tail at 0:0,
@@ -126,7 +138,13 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
         int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
         int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
 
-        return per_cpu_ptr(&mcs_nodes[idx], cpu);
+        return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+
+static inline __pure
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+        return &((struct qnode *)base + idx)->mcs;
 }
 
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
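The new grab_mcs_node() exists because the per-CPU storage is now laid out as struct qnode, so indexing has to stride by sizeof(struct qnode) rather than sizeof(struct mcs_spinlock) while callers keep working with struct mcs_spinlock pointers. For reference, here is a user-space round-trip of the tail packing that decode_tail() reverses; the constants follow qspinlock_types.h for configurations with fewer than 16K CPUs, which is an assumption of this sketch.

/*
 * Round-trip of the tail encoding that decode_tail() undoes.  The
 * constants follow qspinlock_types.h for NR_CPUS < 16K (assumption);
 * other configurations shift the fields around.
 */
#include <stdio.h>

#define _Q_TAIL_IDX_OFFSET      16
#define _Q_TAIL_IDX_BITS        2
#define _Q_TAIL_IDX_MASK        (((1U << _Q_TAIL_IDX_BITS) - 1) << _Q_TAIL_IDX_OFFSET)
#define _Q_TAIL_CPU_OFFSET      (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)

static unsigned int encode_tail(int cpu, int idx)
{
        return ((unsigned int)(cpu + 1) << _Q_TAIL_CPU_OFFSET) |
               ((unsigned int)idx << _Q_TAIL_IDX_OFFSET);
}

int main(void)
{
        unsigned int tail = encode_tail(3, 1);          /* cpu 3, nesting idx 1 */
        int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
        int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

        printf("tail=0x%x -> cpu=%d idx=%d\n", tail, cpu, idx);
        return 0;
}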
@@ -232,6 +250,20 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 #endif /* _Q_PENDING_BITS == 8 */
 
 /**
+ * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
+ * @lock : Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+        return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
+/**
  * set_locked - Set the lock bit and own the lock
  * @lock: Pointer to queued spinlock structure
  *
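The #ifndef guard lets an architecture header supply its own queued_fetch_set_pending_acquire() (x86 provides one); the generic version is a fetch-or with acquire ordering. A user-space model of what the helper does, with C11 atomics standing in for the kernel atomic API and bit values taken from qspinlock_types.h:

/*
 * Model of the generic helper: atomically set PENDING and return the
 * previous lock word, with acquire ordering.  C11 atomics stand in for
 * the kernel atomic API; bit values follow qspinlock_types.h.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define _Q_LOCKED_VAL   (1U << 0)
#define _Q_PENDING_VAL  (1U << 8)

static uint32_t fetch_set_pending_acquire(_Atomic uint32_t *val)
{
        return atomic_fetch_or_explicit(val, _Q_PENDING_VAL,
                                        memory_order_acquire);
}

int main(void)
{
        _Atomic uint32_t lock = _Q_LOCKED_VAL;  /* 0,0,1: someone holds the lock */
        uint32_t old = fetch_set_pending_acquire(&lock);

        /* old == 0x1, lock == 0x101: the *,*,* -> *,1,* transition above */
        printf("old=0x%x new=0x%x\n", old, (unsigned int)atomic_load(&lock));
        return 0;
}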
@@ -326,43 +358,48 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
         /*
          * trylock || pending
          *
-         * 0,0,0 -> 0,0,1 ; trylock
-         * 0,0,1 -> 0,1,1 ; pending
+         * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
          */
-        val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
-        if (!(val & ~_Q_LOCKED_MASK)) {
-                /*
-                 * We're pending, wait for the owner to go away.
-                 *
-                 * *,1,1 -> *,1,0
-                 *
-                 * this wait loop must be a load-acquire such that we match the
-                 * store-release that clears the locked bit and create lock
-                 * sequentiality; this is because not all
-                 * clear_pending_set_locked() implementations imply full
-                 * barriers.
-                 */
-                if (val & _Q_LOCKED_MASK) {
-                        atomic_cond_read_acquire(&lock->val,
-                                                 !(VAL & _Q_LOCKED_MASK));
-                }
+        val = queued_fetch_set_pending_acquire(lock);
 
-                /*
-                 * take ownership and clear the pending bit.
-                 *
-                 * *,1,0 -> *,0,1
-                 */
-                clear_pending_set_locked(lock);
-                qstat_inc(qstat_lock_pending, true);
-                return;
+        /*
+         * If we observe contention, there is a concurrent locker.
+         *
+         * Undo and queue; our setting of PENDING might have made the
+         * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+         * on @next to become !NULL.
+         */
+        if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+                /* Undo PENDING if we set it. */
+                if (!(val & _Q_PENDING_MASK))
+                        clear_pending(lock);
+
+                goto queue;
         }
 
         /*
-         * If pending was clear but there are waiters in the queue, then
-         * we need to undo our setting of pending before we queue ourselves.
+         * We're pending, wait for the owner to go away.
+         *
+         * 0,1,1 -> 0,1,0
+         *
+         * this wait loop must be a load-acquire such that we match the
+         * store-release that clears the locked bit and create lock
+         * sequentiality; this is because not all
+         * clear_pending_set_locked() implementations imply full
+         * barriers.
+         */
+        if (val & _Q_LOCKED_MASK)
+                atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+
+        /*
+         * take ownership and clear the pending bit.
+         *
+         * 0,1,0 -> 0,0,1
          */
-        if (!(val & _Q_PENDING_MASK))
-                clear_pending(lock);
+        clear_pending_set_locked(lock);
+        qstat_inc(qstat_lock_pending, true);
+        return;
 
         /*
          * End of pending bit optimistic spinning and beginning of MCS
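The reworked trylock/pending path reads more easily with the (tail, pending, locked) transitions spelled out. The sketch below is a single-threaded, user-space walk-through that assumes no other CPU interferes: the owner's unlock is simulated inline so the wait loop terminates, and clear_pending()/clear_pending_set_locked() are modelled with plain atomics instead of the kernel's byte/short stores.

/*
 * Single-threaded walk-through of the new pending path, using the
 * (tail, pending, locked) notation from the comments.  Assumes no
 * concurrent CPUs; the owner's unlock is simulated inline.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define _Q_LOCKED_MASK  0xffU
#define _Q_LOCKED_VAL   (1U << 0)
#define _Q_PENDING_VAL  (1U << 8)
#define _Q_PENDING_MASK (0xffU << 8)

static _Atomic uint32_t lockval = _Q_LOCKED_VAL;        /* 0,0,1: lock held */

int main(void)
{
        /* 0,0,* -> 0,1,*: fetch the old word and set PENDING */
        uint32_t val = atomic_fetch_or_explicit(&lockval, _Q_PENDING_VAL,
                                                memory_order_acquire);

        /* Contention (tail or PENDING was already set): undo and queue. */
        if (val & ~_Q_LOCKED_MASK) {
                if (!(val & _Q_PENDING_MASK))           /* clear_pending() */
                        atomic_fetch_and_explicit(&lockval, ~_Q_PENDING_VAL,
                                                  memory_order_relaxed);
                return 1;                               /* 'goto queue' in the kernel */
        }

        /* Simulate the owner's unlock so the wait below can finish. */
        atomic_fetch_and_explicit(&lockval, ~_Q_LOCKED_MASK,
                                  memory_order_release);

        /* 0,1,1 -> 0,1,0: wait for the owner to go away. */
        while (atomic_load_explicit(&lockval, memory_order_acquire) &
               _Q_LOCKED_MASK)
                ;

        /* 0,1,0 -> 0,0,1: clear_pending_set_locked(); tail is known 0 here. */
        atomic_store_explicit(&lockval, _Q_LOCKED_VAL, memory_order_relaxed);

        printf("lock word: 0x%x\n", (unsigned int)atomic_load(&lockval));
        return 0;
}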
@@ -371,11 +408,16 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 queue:
         qstat_inc(qstat_lock_slowpath, true);
 pv_queue:
-        node = this_cpu_ptr(&mcs_nodes[0]);
+        node = this_cpu_ptr(&qnodes[0].mcs);
         idx = node->count++;
         tail = encode_tail(smp_processor_id(), idx);
 
-        node += idx;
+        node = grab_mcs_node(node, idx);
+
+        /*
+         * Keep counts of non-zero index values:
+         */
+        qstat_inc(qstat_lock_idx1 + idx - 1, idx);
 
         /*
          * Ensure that we increment the head node->count before initialising
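The added qstat_inc() call depends on the per-index event counters being consecutive values (they would come from the matching qspinlock_stat.h change, which this diffstat filters out) and on the second argument acting as the count-it condition, so the common idx == 0 case is never counted. A small user-space model of that pattern; the names are illustrative only.

/*
 * Model of qstat_inc(qstat_lock_idx1 + idx - 1, idx): consecutive
 * counters indexed by (idx - 1), with the second argument as the
 * "only count if non-zero" condition.  Names are illustrative only.
 */
#include <stdio.h>

enum { lock_idx1, lock_idx2, lock_idx3, stat_num };

static unsigned long stats[stat_num];

static void qstat_inc(int stat, int cond)
{
        if (cond)
                stats[stat]++;
}

int main(void)
{
        int samples[] = { 0, 0, 1, 2, 0, 1, 3 };        /* observed nesting idx */
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                qstat_inc(lock_idx1 + samples[i] - 1, samples[i]);

        /* idx == 0 passes a bogus stat index, but cond is false so it is unused */
        printf("idx1=%lu idx2=%lu idx3=%lu\n",
               stats[lock_idx1], stats[lock_idx2], stats[lock_idx3]);
        return 0;
}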
@@ -476,16 +518,25 @@ locked:
          */
 
         /*
-         * In the PV case we might already have _Q_LOCKED_VAL set.
+         * In the PV case we might already have _Q_LOCKED_VAL set, because
+         * of lock stealing; therefore we must also allow:
+         *
+         * n,0,1 -> 0,0,1
          *
-         * The atomic_cond_read_acquire() call above has provided the
-         * necessary acquire semantics required for locking.
+         * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+         * above wait condition, therefore any concurrent setting of
+         * PENDING will make the uncontended transition fail.
          */
-        if (((val & _Q_TAIL_MASK) == tail) &&
-            atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
-                goto release; /* No contention */
+        if ((val & _Q_TAIL_MASK) == tail) {
+                if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+                        goto release; /* No contention */
+        }
 
-        /* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+        /*
+         * Either somebody is queued behind us or _Q_PENDING_VAL got set
+         * which will then detect the remaining tail and queue behind us
+         * ensuring we'll see a @next.
+         */
         set_locked(lock);
 
         /*
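The restructured uncontended handoff relies on atomic_try_cmpxchg_relaxed() refreshing val with the observed lock word when it fails, so the set_locked() fallback operates on current state. Below is a user-space model of the successful n,0,0 -> 0,0,1 case; C11 compare-exchange stands in for the kernel primitive and the tail layout assumes fewer than 16K CPUs.

/*
 * Model of the uncontended n,0,0 -> 0,0,1 handoff.  C11 atomics stand
 * in for atomic_try_cmpxchg_relaxed(), which likewise updates 'val'
 * with the observed word on failure.  Tail layout assumes NR_CPUS < 16K.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define _Q_LOCKED_VAL   (1U << 0)
#define _Q_TAIL_MASK    (0xffffU << 16)

static bool try_cmpxchg_relaxed(_Atomic uint32_t *v, uint32_t *old, uint32_t new)
{
        return atomic_compare_exchange_strong_explicit(v, old, new,
                                                       memory_order_relaxed,
                                                       memory_order_relaxed);
}

int main(void)
{
        uint32_t tail = 1U << 18;               /* encode_tail(0, 0): cpu 0, idx 0 */
        _Atomic uint32_t lockval = tail;        /* n,0,0: we are the only waiter */
        uint32_t val = atomic_load_explicit(&lockval, memory_order_relaxed);

        if ((val & _Q_TAIL_MASK) == tail &&
            try_cmpxchg_relaxed(&lockval, &val, _Q_LOCKED_VAL))
                printf("n,0,0 -> 0,0,1: lock word now 0x%x\n",
                       (unsigned int)atomic_load(&lockval));
        return 0;
}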
@@ -501,7 +552,7 @@ release:
         /*
          * release the node
          */
-        __this_cpu_dec(mcs_nodes[0].count);
+        __this_cpu_dec(qnodes[0].mcs.count);
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
 