about summary refs log tree commit diff stats
path: root/kernel/locking
diff options
context:
space:
mode:
authorWill Deacon <will.deacon@arm.com>2018-04-26 06:34:19 -0400
committerIngo Molnar <mingo@kernel.org>2018-04-27 03:48:47 -0400
commit59fb586b4a07b4e1a0ee577140ab4842ba451acd (patch)
tree52d6f875dcfa5efc1e3b59d2c4a9208d19a02144 /kernel/locking
parentb247be3fe89b6aba928bf80f4453d1c4ba8d2063 (diff)
locking/qspinlock: Remove unbounded cmpxchg() loop from locking slowpath
The qspinlock locking slowpath utilises a "pending" bit as a simple form of an embedded test-and-set lock that can avoid the overhead of explicit queuing in cases where the lock is held but uncontended. This bit is managed using a cmpxchg() loop which tries to transition the uncontended lock word from (0,0,0) -> (0,0,1) or (0,0,1) -> (0,1,1). Unfortunately, the cmpxchg() loop is unbounded and lockers can be starved indefinitely if the lock word is seen to oscillate between unlocked (0,0,0) and locked (0,0,1). This could happen if concurrent lockers are able to take the lock in the cmpxchg() loop without queuing and pass it around amongst themselves. This patch fixes the problem by unconditionally setting _Q_PENDING_VAL using atomic_fetch_or, and then inspecting the old value to see whether we need to spin on the current lock owner, or whether we now effectively hold the lock. The tricky scenario is when concurrent lockers end up queuing on the lock and the lock becomes available, causing us to see a lockword of (n,0,0). With pending now set, simply queuing could lead to deadlock as the head of the queue may not have observed the pending flag being cleared. Conversely, if the head of the queue did observe pending being cleared, then it could transition the lock from (n,0,0) -> (0,0,1) meaning that any attempt to "undo" our setting of the pending bit could race with a concurrent locker trying to set it. We handle this race by preserving the pending bit when taking the lock after reaching the head of the queue and leaving the tail entry intact if we saw pending set, because we know that the tail is going to be updated shortly. 
Signed-off-by: Will Deacon <will.deacon@arm.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Waiman Long <longman@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: boqun.feng@gmail.com Cc: linux-arm-kernel@lists.infradead.org Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1524738868-31318-6-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/locking')
-rw-r--r--  kernel/locking/qspinlock.c           102
-rw-r--r--  kernel/locking/qspinlock_paravirt.h    5
2 files changed, 58 insertions(+), 49 deletions(-)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index a0f7976348f8..e06f67e021d9 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -128,6 +128,17 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
128 128
129#if _Q_PENDING_BITS == 8 129#if _Q_PENDING_BITS == 8
130/** 130/**
131 * clear_pending - clear the pending bit.
132 * @lock: Pointer to queued spinlock structure
133 *
134 * *,1,* -> *,0,*
135 */
136static __always_inline void clear_pending(struct qspinlock *lock)
137{
138 WRITE_ONCE(lock->pending, 0);
139}
140
141/**
131 * clear_pending_set_locked - take ownership and clear the pending bit. 142 * clear_pending_set_locked - take ownership and clear the pending bit.
132 * @lock: Pointer to queued spinlock structure 143 * @lock: Pointer to queued spinlock structure
133 * 144 *
@@ -163,6 +174,17 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
163#else /* _Q_PENDING_BITS == 8 */ 174#else /* _Q_PENDING_BITS == 8 */
164 175
165/** 176/**
177 * clear_pending - clear the pending bit.
178 * @lock: Pointer to queued spinlock structure
179 *
180 * *,1,* -> *,0,*
181 */
182static __always_inline void clear_pending(struct qspinlock *lock)
183{
184 atomic_andnot(_Q_PENDING_VAL, &lock->val);
185}
186
187/**
166 * clear_pending_set_locked - take ownership and clear the pending bit. 188 * clear_pending_set_locked - take ownership and clear the pending bit.
167 * @lock: Pointer to queued spinlock structure 189 * @lock: Pointer to queued spinlock structure
168 * 190 *
@@ -266,7 +288,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
266void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) 288void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
267{ 289{
268 struct mcs_spinlock *prev, *next, *node; 290 struct mcs_spinlock *prev, *next, *node;
269 u32 new, old, tail; 291 u32 old, tail;
270 int idx; 292 int idx;
271 293
272 BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); 294 BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
@@ -290,58 +312,50 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
290 } 312 }
291 313
292 /* 314 /*
315 * If we observe any contention; queue.
316 */
317 if (val & ~_Q_LOCKED_MASK)
318 goto queue;
319
320 /*
293 * trylock || pending 321 * trylock || pending
294 * 322 *
295 * 0,0,0 -> 0,0,1 ; trylock 323 * 0,0,0 -> 0,0,1 ; trylock
296 * 0,0,1 -> 0,1,1 ; pending 324 * 0,0,1 -> 0,1,1 ; pending
297 */ 325 */
298 for (;;) { 326 val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
327 if (!(val & ~_Q_LOCKED_MASK)) {
299 /* 328 /*
300 * If we observe any contention; queue. 329 * We're pending, wait for the owner to go away.
330 *
331 * *,1,1 -> *,1,0
332 *
333 * this wait loop must be a load-acquire such that we match the
334 * store-release that clears the locked bit and create lock
335 * sequentiality; this is because not all
336 * clear_pending_set_locked() implementations imply full
337 * barriers.
301 */ 338 */
302 if (val & ~_Q_LOCKED_MASK) 339 if (val & _Q_LOCKED_MASK) {
303 goto queue; 340 smp_cond_load_acquire(&lock->val.counter,
304 341 !(VAL & _Q_LOCKED_MASK));
305 new = _Q_LOCKED_VAL; 342 }
306 if (val == new)
307 new |= _Q_PENDING_VAL;
308 343
309 /* 344 /*
310 * Acquire semantic is required here as the function may 345 * take ownership and clear the pending bit.
311 * return immediately if the lock was free. 346 *
347 * *,1,0 -> *,0,1
312 */ 348 */
313 old = atomic_cmpxchg_acquire(&lock->val, val, new); 349 clear_pending_set_locked(lock);
314 if (old == val)
315 break;
316
317 val = old;
318 }
319
320 /*
321 * we won the trylock
322 */
323 if (new == _Q_LOCKED_VAL)
324 return; 350 return;
351 }
325 352
326 /* 353 /*
327 * we're pending, wait for the owner to go away. 354 * If pending was clear but there are waiters in the queue, then
328 * 355 * we need to undo our setting of pending before we queue ourselves.
329 * *,1,1 -> *,1,0
330 *
331 * this wait loop must be a load-acquire such that we match the
332 * store-release that clears the locked bit and create lock
333 * sequentiality; this is because not all clear_pending_set_locked()
334 * implementations imply full barriers.
335 */
336 smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
337
338 /*
339 * take ownership and clear the pending bit.
340 *
341 * *,1,0 -> *,0,1
342 */ 356 */
343 clear_pending_set_locked(lock); 357 if (!(val & _Q_PENDING_MASK))
344 return; 358 clear_pending(lock);
345 359
346 /* 360 /*
347 * End of pending bit optimistic spinning and beginning of MCS 361 * End of pending bit optimistic spinning and beginning of MCS
@@ -445,15 +459,15 @@ locked:
445 * claim the lock: 459 * claim the lock:
446 * 460 *
447 * n,0,0 -> 0,0,1 : lock, uncontended 461 * n,0,0 -> 0,0,1 : lock, uncontended
448 * *,0,0 -> *,0,1 : lock, contended 462 * *,*,0 -> *,*,1 : lock, contended
449 * 463 *
450 * If the queue head is the only one in the queue (lock value == tail), 464 * If the queue head is the only one in the queue (lock value == tail)
451 * clear the tail code and grab the lock. Otherwise, we only need 465 * and nobody is pending, clear the tail code and grab the lock.
452 * to grab the lock. 466 * Otherwise, we only need to grab the lock.
453 */ 467 */
454 for (;;) { 468 for (;;) {
455 /* In the PV case we might already have _Q_LOCKED_VAL set */ 469 /* In the PV case we might already have _Q_LOCKED_VAL set */
456 if ((val & _Q_TAIL_MASK) != tail) { 470 if ((val & _Q_TAIL_MASK) != tail || (val & _Q_PENDING_MASK)) {
457 set_locked(lock); 471 set_locked(lock);
458 break; 472 break;
459 } 473 }
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 2711940429f5..2dbad2f25480 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -118,11 +118,6 @@ static __always_inline void set_pending(struct qspinlock *lock)
118 WRITE_ONCE(lock->pending, 1); 118 WRITE_ONCE(lock->pending, 1);
119} 119}
120 120
121static __always_inline void clear_pending(struct qspinlock *lock)
122{
123 WRITE_ONCE(lock->pending, 0);
124}
125
126/* 121/*
127 * The pending bit check in pv_queued_spin_steal_lock() isn't a memory 122 * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
128 * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the 123 * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the