Diffstat (limited to 'kernel/locking/qspinlock.c')
-rw-r--r--   kernel/locking/qspinlock.c | 473
1 files changed, 473 insertions, 0 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000000..38c49202d532
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,473 @@
/*
 * Queued spinlock
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 *
 * Authors: Waiman Long <waiman.long@hp.com>
 *          Peter Zijlstra <peterz@infradead.org>
 */

#ifndef _GEN_PV_LOCK_SLOWPATH

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. The paper below provides a good description for this kind
 * of lock.
 *
 * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
 *
 * This queued spinlock implementation is based on the MCS lock, however to make
 * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
 * API, we must modify it somehow.
 *
 * In particular; where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending (next->locked), we compress both these: {tail,
 * next->locked} into a single u32 value.
 *
 * A spinlock cannot be taken recursively within its own context, and there
 * is a limit to the contexts that can nest: task, softirq, hardirq and nmi.
 * As there are at most 4 nesting levels, the nesting level can be encoded
 * in a 2-bit number. We can then encode the tail by combining this 2-bit
 * nesting level with the cpu number. With one byte for the lock value and
 * 3 bytes for the tail, only a 32-bit word is now needed. Even though we
 * only need 1 bit for the lock, we extend it to a full byte to achieve
 * better performance for architectures that support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node; whereby avoiding the need to carry a node from lock to unlock, and
 * preserving existing lock API. This also makes the unlock code simpler and
 * faster.
 *
 * N.B. The current implementation only supports architectures that allow
 * atomic operations on smaller 8-bit and 16-bit data types.
 *
 */

#include "mcs_spinlock.h"

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define MAX_NODES	8
#else
#define MAX_NODES	4
#endif

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 *
 * PV doubles the storage and uses the second cacheline for PV state.
 */
static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);

/*
 * We must be able to distinguish between no-tail and the tail at 0:0,
 * therefore increment the cpu number by one.
 */

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(idx > 3);
#endif
	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct mcs_spinlock *decode_tail(u32 tail)
{
	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return per_cpu_ptr(&mcs_nodes[idx], cpu);
}
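
/*
 * Illustrative example (assuming the usual constants from
 * asm-generic/qspinlock_types.h: with _Q_PENDING_BITS == 8 we have
 * _Q_TAIL_IDX_OFFSET == 16 and _Q_TAIL_CPU_OFFSET == 18):
 *
 *	encode_tail(cpu = 2, idx = 1) == ((2 + 1) << 18) | (1 << 16)
 *				      == 0x000d0000
 *
 * decode_tail(0x000d0000) then returns per_cpu_ptr(&mcs_nodes[1], 2),
 * i.e. nesting level 1 of CPU 2.
 */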
112 | |||
113 | #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) | ||
114 | |||
115 | /* | ||
116 | * By using the whole 2nd least significant byte for the pending bit, we | ||
117 | * can allow better optimization of the lock acquisition for the pending | ||
118 | * bit holder. | ||
119 | * | ||
120 | * This internal structure is also used by the set_locked function which | ||
121 | * is not restricted to _Q_PENDING_BITS == 8. | ||
122 | */ | ||
123 | struct __qspinlock { | ||
124 | union { | ||
125 | atomic_t val; | ||
126 | #ifdef __LITTLE_ENDIAN | ||
127 | struct { | ||
128 | u8 locked; | ||
129 | u8 pending; | ||
130 | }; | ||
131 | struct { | ||
132 | u16 locked_pending; | ||
133 | u16 tail; | ||
134 | }; | ||
135 | #else | ||
136 | struct { | ||
137 | u16 tail; | ||
138 | u16 locked_pending; | ||
139 | }; | ||
140 | struct { | ||
141 | u8 reserved[2]; | ||
142 | u8 pending; | ||
143 | u8 locked; | ||
144 | }; | ||
145 | #endif | ||
146 | }; | ||
147 | }; | ||
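
/*
 * For reference, with _Q_PENDING_BITS == 8 the 32-bit lock word breaks
 * down as follows (the layout defined in asm-generic/qspinlock_types.h):
 *
 *	bits  0- 7: locked byte
 *	bits  8-15: pending byte
 *	bits 16-17: tail index (nesting level of the MCS node)
 *	bits 18-31: tail CPU number (+1)
 */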
148 | |||
149 | #if _Q_PENDING_BITS == 8 | ||
150 | /** | ||
151 | * clear_pending_set_locked - take ownership and clear the pending bit. | ||
152 | * @lock: Pointer to queued spinlock structure | ||
153 | * | ||
154 | * *,1,0 -> *,0,1 | ||
155 | * | ||
156 | * Lock stealing is not allowed if this function is used. | ||
157 | */ | ||
158 | static __always_inline void clear_pending_set_locked(struct qspinlock *lock) | ||
159 | { | ||
160 | struct __qspinlock *l = (void *)lock; | ||
161 | |||
162 | WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * xchg_tail - Put in the new queue tail code word & retrieve previous one | ||
167 | * @lock : Pointer to queued spinlock structure | ||
168 | * @tail : The new queue tail code word | ||
169 | * Return: The previous queue tail code word | ||
170 | * | ||
171 | * xchg(lock, tail) | ||
172 | * | ||
173 | * p,*,* -> n,*,* ; prev = xchg(lock, node) | ||
174 | */ | ||
175 | static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) | ||
176 | { | ||
177 | struct __qspinlock *l = (void *)lock; | ||
178 | |||
179 | return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; | ||
180 | } | ||
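
/*
 * Note that the 16-bit xchg() on l->tail above suffices only because, with
 * _Q_PENDING_BITS == 8, the tail occupies exactly the upper half-word of
 * the lock word; swapping it can therefore never disturb the locked and
 * pending bytes.
 */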
181 | |||
182 | #else /* _Q_PENDING_BITS == 8 */ | ||
183 | |||
184 | /** | ||
185 | * clear_pending_set_locked - take ownership and clear the pending bit. | ||
186 | * @lock: Pointer to queued spinlock structure | ||
187 | * | ||
188 | * *,1,0 -> *,0,1 | ||
189 | */ | ||
190 | static __always_inline void clear_pending_set_locked(struct qspinlock *lock) | ||
191 | { | ||
192 | atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); | ||
193 | } | ||
194 | |||
195 | /** | ||
196 | * xchg_tail - Put in the new queue tail code word & retrieve previous one | ||
197 | * @lock : Pointer to queued spinlock structure | ||
198 | * @tail : The new queue tail code word | ||
199 | * Return: The previous queue tail code word | ||
200 | * | ||
201 | * xchg(lock, tail) | ||
202 | * | ||
203 | * p,*,* -> n,*,* ; prev = xchg(lock, node) | ||
204 | */ | ||
205 | static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) | ||
206 | { | ||
207 | u32 old, new, val = atomic_read(&lock->val); | ||
208 | |||
209 | for (;;) { | ||
210 | new = (val & _Q_LOCKED_PENDING_MASK) | tail; | ||
211 | old = atomic_cmpxchg(&lock->val, val, new); | ||
212 | if (old == val) | ||
213 | break; | ||
214 | |||
215 | val = old; | ||
216 | } | ||
217 | return old; | ||
218 | } | ||
219 | #endif /* _Q_PENDING_BITS == 8 */ | ||
220 | |||
221 | /** | ||
222 | * set_locked - Set the lock bit and own the lock | ||
223 | * @lock: Pointer to queued spinlock structure | ||
224 | * | ||
225 | * *,*,0 -> *,0,1 | ||
226 | */ | ||
227 | static __always_inline void set_locked(struct qspinlock *lock) | ||
228 | { | ||
229 | struct __qspinlock *l = (void *)lock; | ||
230 | |||
231 | WRITE_ONCE(l->locked, _Q_LOCKED_VAL); | ||
232 | } | ||
233 | |||
234 | |||
235 | /* | ||
236 | * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for | ||
237 | * all the PV callbacks. | ||
238 | */ | ||
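
/*
 * Note: when CONFIG_PARAVIRT_SPINLOCKS is set, this file is effectively
 * compiled twice. The first pass (this one) emits the native slowpath
 * using the no-op PV hooks below; the #include "qspinlock.c" at the bottom
 * of the file then re-enters it with _GEN_PV_LOCK_SLOWPATH defined and the
 * pv_*() hooks redefined, generating __pv_queued_spin_lock_slowpath().
 */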
239 | |||
240 | static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } | ||
241 | static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } | ||
242 | static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } | ||
243 | |||
244 | static __always_inline void __pv_wait_head(struct qspinlock *lock, | ||
245 | struct mcs_spinlock *node) { } | ||
246 | |||
247 | #define pv_enabled() false | ||
248 | |||
249 | #define pv_init_node __pv_init_node | ||
250 | #define pv_wait_node __pv_wait_node | ||
251 | #define pv_kick_node __pv_kick_node | ||
252 | #define pv_wait_head __pv_wait_head | ||
253 | |||
254 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | ||
255 | #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath | ||
256 | #endif | ||
257 | |||
258 | #endif /* _GEN_PV_LOCK_SLOWPATH */ | ||
259 | |||
/**
 * queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \            |  :
 *                       :       | ^--'              |            |  :
 *                       :       v                   |            |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'             |  :
 *   queue               :       | ^--'                            |  :
 *                       :       v                                 |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	u32 new, old, tail;
	int idx;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (pv_enabled())
		goto queue;

	if (virt_queued_spin_lock(lock))
		return;

	/*
	 * wait for in-progress pending->locked hand-overs
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
			cpu_relax();
	}

	/*
	 * trylock || pending
	 *
	 * 0,0,0 -> 0,0,1 ; trylock
	 * 0,0,1 -> 0,1,1 ; pending
	 */
	for (;;) {
		/*
		 * If we observe any contention; queue.
		 */
		if (val & ~_Q_LOCKED_MASK)
			goto queue;

		new = _Q_LOCKED_VAL;
		if (val == new)
			new |= _Q_PENDING_VAL;

		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}

	/*
	 * we won the trylock
	 */
	if (new == _Q_LOCKED_VAL)
		return;

	/*
	 * we're pending, wait for the owner to go away.
	 *
	 * *,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all clear_pending_set_locked()
	 * implementations imply full barriers.
	 */
	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
		cpu_relax();

	/*
	 * take ownership and clear the pending bit.
	 *
	 * *,1,0 -> *,0,1
	 */
	clear_pending_set_locked(lock);
	return;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node = this_cpu_ptr(&mcs_nodes[0]);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	node += idx;
	node->locked = 0;
	node->next = NULL;
	pv_init_node(node);

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);
		WRITE_ONCE(prev->next, node);

		pv_wait_node(node);
		arch_mcs_spin_lock_contended(&node->locked);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 */
	pv_wait_head(lock, node);
	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
		cpu_relax();

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,0,0 -> *,0,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail),
	 * clear the tail code and grab the lock. Otherwise, we only need
	 * to grab the lock.
	 */
	for (;;) {
		if (val != tail) {
			set_locked(lock);
			break;
		}
		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
		if (old == val)
			goto release;	/* No contention */

		val = old;
	}

	/*
	 * contended path; wait for next, release.
	 */
	while (!(next = READ_ONCE(node->next)))
		cpu_relax();

	arch_mcs_spin_unlock_contended(&next->locked);
	pv_kick_node(next);

release:
	/*
	 * release the node
	 */
	this_cpu_dec(mcs_nodes[0].count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);

/*
 * Generate the paravirt code for queued_spin_lock_slowpath().
 */
#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
#define _GEN_PV_LOCK_SLOWPATH

#undef  pv_enabled
#define pv_enabled()	true

#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
#undef pv_wait_head

#undef  queued_spin_lock_slowpath
#define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath

#include "qspinlock_paravirt.h"
#include "qspinlock.c"

#endif