| Field | Value | Date |
|---|---|---|
| author | Waiman Long <Waiman.Long@hp.com> | 2015-04-24 14:56:37 -0400 |
| committer | Ingo Molnar <mingo@kernel.org> | 2015-05-08 06:37:05 -0400 |
| commit | a23db284fe0d1879ca2002bf31077b5efa2fe2ca | |
| tree | 816042635ecb0462f1cb3f35bc8091f1cd46ce19 /kernel/locking | |
| parent | 2aa79af64263190eec610422b07f60e99a7d230a | |
locking/pvqspinlock: Implement simple paravirt support for the qspinlock
Provide a separate (second) version of the spin_lock_slowpath for
paravirt along with a special unlock path.
The second slowpath is generated by adding a few pv hooks to the normal
slowpath; those hooks compile away to nothing for the native case but
expand into special wait/wake code for the pv version.
The MCS queue nodes can use extra storage in the mcs_nodes[] array to
keep track of vCPU state, which allows directed wakeups.
The head contender has no such storage directly visible to the unlocker,
so the unlocker looks the blocked node up in a hash table with open
addressing, keyed on the lock address.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Daniel J Blueman <daniel@numascale.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Douglas Hatch <doug.hatch@hp.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paolo Bonzini <paolo.bonzini@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1429901803-29771-9-git-send-email-Waiman.Long@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
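The generation trick described in the changelog can be boiled down to a minimal standalone sketch: one slowpath body compiled twice, with a hook that is an empty inline for the native build and real wait/wake code for the PV build, produced by having the file re-include itself under a guard macro. All names below (GEN_PV_VERSION, pv_hook, slowpath, native_slowpath, pv_slowpath, pv_demo.c) are made up for illustration; the real mechanism is the _GEN_PV_LOCK_SLOWPATH self-include visible in the qspinlock.c diff further down.

```c
/* pv_demo.c -- hypothetical file name; compile with: gcc pv_demo.c -o pv_demo */
#include <stdio.h>

#ifndef GEN_PV_VERSION

/* First (native) pass: the hook is an empty inline the compiler removes. */
static inline void __native_hook(void) { }
#define pv_hook		__native_hook
#define slowpath	native_slowpath

#endif /* GEN_PV_VERSION */

/* The shared slowpath body: compiled once per pass under a different name. */
void slowpath(void)
{
	pv_hook();				/* no-op natively, wait/wake for PV */
	printf("%s: spinning...\n", __func__);	/* stands in for the MCS spin loop */
}

#ifndef GEN_PV_VERSION
#define GEN_PV_VERSION

/* Second (PV) pass: swap the hook, rename the function, re-include self. */
static inline void __pv_hook(void) { printf("halt this vcpu until kicked\n"); }
#undef	pv_hook
#undef	slowpath
#define pv_hook		__pv_hook
#define slowpath	pv_slowpath

#include __FILE__	/* generates pv_slowpath() from the same body above */

int main(void)
{
	native_slowpath();	/* prints only the spin message */
	pv_slowpath();		/* prints the halt message, then the spin message */
	return 0;
}
#endif /* GEN_PV_VERSION */
```

Compiled as a single .c file, this emits both native_slowpath() and pv_slowpath() from the same source lines, mirroring how native_queued_spin_lock_slowpath() and __pv_queued_spin_lock_slowpath() are produced in the patch below.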
Diffstat (limited to 'kernel/locking')
-rw-r--r--  kernel/locking/qspinlock.c          |  68
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 325
2 files changed, 392 insertions(+), 1 deletion(-)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index fd31a474145d..38c49202d532 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -18,6 +18,9 @@
  * Authors: Waiman Long <waiman.long@hp.com>
  *          Peter Zijlstra <peterz@infradead.org>
  */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
 #include <linux/smp.h>
 #include <linux/bug.h>
 #include <linux/cpumask.h>
@@ -65,13 +68,21 @@
 
 #include "mcs_spinlock.h"
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES	8
+#else
+#define MAX_NODES	4
+#endif
+
 /*
  * Per-CPU queue node structures; we can never have more than 4 nested
  * contexts: task, softirq, hardirq, nmi.
  *
  * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
  */
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[4]);
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
 
 /*
  * We must be able to distinguish between no-tail and the tail at 0:0,
@@ -220,6 +231,32 @@ static __always_inline void set_locked(struct qspinlock *lock)
 	WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
 }
 
+
+/*
+ * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
+
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+					   struct mcs_spinlock *node) { }
+
+#define pv_enabled()		false
+
+#define pv_init_node		__pv_init_node
+#define pv_wait_node		__pv_wait_node
+#define pv_kick_node		__pv_kick_node
+#define pv_wait_head		__pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
 /**
  * queued_spin_lock_slowpath - acquire the queued spinlock
  * @lock: Pointer to queued spinlock structure
@@ -249,6 +286,9 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 
 	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
 
+	if (pv_enabled())
+		goto queue;
+
 	if (virt_queued_spin_lock(lock))
 		return;
 
@@ -325,6 +365,7 @@ queue:
 	node += idx;
 	node->locked = 0;
 	node->next = NULL;
+	pv_init_node(node);
 
 	/*
 	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
@@ -350,6 +391,7 @@ queue:
 		prev = decode_tail(old);
 		WRITE_ONCE(prev->next, node);
 
+		pv_wait_node(node);
 		arch_mcs_spin_lock_contended(&node->locked);
 	}
 
@@ -365,6 +407,7 @@ queue:
 	 * does not imply a full barrier.
 	 *
 	 */
+	pv_wait_head(lock, node);
 	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
 		cpu_relax();
 
@@ -397,6 +440,7 @@ queue:
 		cpu_relax();
 
 	arch_mcs_spin_unlock_contended(&next->locked);
+	pv_kick_node(next);
 
 release:
 	/*
@@ -405,3 +449,25 @@ release:
 	this_cpu_dec(mcs_nodes[0].count);
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queued_spin_unlock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef	pv_enabled
+#define pv_enabled()	true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head
+
+#undef	queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath
+
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 000000000000..b5758a95a8d3
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,325 @@
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ *   pv_kick(cpu)             -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
+
+enum vcpu_state {
+	vcpu_running = 0,
+	vcpu_halted,
+};
+
+struct pv_node {
+	struct mcs_spinlock	mcs;
+	struct mcs_spinlock	__res[3];
+
+	int			cpu;
+	u8			state;
+};
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+	struct qspinlock *lock;
+	struct pv_node   *node;
+};
+
+#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+	if (pv_hash_size < PV_HE_MIN)
+		pv_hash_size = PV_HE_MIN;
+
+	/*
+	 * Allocate space from bootmem which should be page-size aligned
+	 * and hence cacheline aligned.
+	 */
+	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+					       sizeof(struct pv_hash_entry),
+					       pv_hash_size, 0, HASH_EARLY,
+					       &pv_lock_hash_bits, NULL,
+					       pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash)						\
+	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
+	     offset < (1 << pv_lock_hash_bits);						\
+	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+	struct pv_hash_entry *he;
+
+	for_each_hash_entry(he, offset, hash) {
+		if (!cmpxchg(&he->lock, NULL, lock)) {
+			WRITE_ONCE(he->node, node);
+			return &he->lock;
+		}
+	}
+	/*
+	 * Hard assume there is a free entry for us.
+	 *
+	 * This is guaranteed by ensuring every blocked lock only ever consumes
+	 * a single entry, and since we only have 4 nesting levels per CPU
+	 * and allocated 4*nr_possible_cpus(), this must be so.
+	 *
+	 * The single entry is guaranteed by having the lock owner unhash
+	 * before it releases.
+	 */
+	BUG();
+}
+
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+	struct pv_hash_entry *he;
+	struct pv_node *node;
+
+	for_each_hash_entry(he, offset, hash) {
+		if (READ_ONCE(he->lock) == lock) {
+			node = READ_ONCE(he->node);
+			WRITE_ONCE(he->lock, NULL);
+			return node;
+		}
+	}
+	/*
+	 * Hard assume we'll find an entry.
+	 *
+	 * This guarantees a limited lookup time and is itself guaranteed by
+	 * having the lock owner do the unhash -- IFF the unlock sees the
+	 * SLOW flag, there MUST be a hash entry.
+	 */
+	BUG();
+}
+
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+
+	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+
+	pn->cpu = smp_processor_id();
+	pn->state = vcpu_running;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to wake the vcpu again.
+ */
+static void pv_wait_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+	int loop;
+
+	for (;;) {
+		for (loop = SPIN_THRESHOLD; loop; loop--) {
+			if (READ_ONCE(node->locked))
+				return;
+			cpu_relax();
+		}
+
+		/*
+		 * Order pn->state vs pn->locked thusly:
+		 *
+		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
+		 *     MB			      MB
+		 * [L] pn->locked		[RmW] pn->state = vcpu_running
+		 *
+		 * Matches the xchg() from pv_kick_node().
+		 */
+		(void)xchg(&pn->state, vcpu_halted);
+
+		if (!READ_ONCE(node->locked))
+			pv_wait(&pn->state, vcpu_halted);
+
+		/*
+		 * Reset the vCPU state to avoid unncessary CPU kicking
+		 */
+		WRITE_ONCE(pn->state, vcpu_running);
+
+		/*
+		 * If the locked flag is still not set after wakeup, it is a
+		 * spurious wakeup and the vCPU should wait again. However,
+		 * there is a pretty high overhead for CPU halting and kicking.
+		 * So it is better to spin for a while in the hope that the
+		 * MCS lock will be released soon.
+		 */
+	}
+	/*
+	 * By now our node->locked should be 1 and our caller will not actually
+	 * spin-wait for it. We do however rely on our caller to do a
+	 * load-acquire for us.
+	 */
+}
+
+/*
+ * Called after setting next->locked = 1, used to wake those stuck in
+ * pv_wait_node().
+ */
+static void pv_kick_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+
+	/*
+	 * Note that because node->locked is already set, this actual
+	 * mcs_spinlock entry could be re-used already.
+	 *
+	 * This should be fine however, kicking people for no reason is
+	 * harmless.
+	 *
+	 * See the comment in pv_wait_node().
+	 */
+	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+		pv_kick(pn->cpu);
+}
+
+/*
+ * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ */
+static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+	struct __qspinlock *l = (void *)lock;
+	struct qspinlock **lp = NULL;
+	int loop;
+
+	for (;;) {
+		for (loop = SPIN_THRESHOLD; loop; loop--) {
+			if (!READ_ONCE(l->locked))
+				return;
+			cpu_relax();
+		}
+
+		WRITE_ONCE(pn->state, vcpu_halted);
+		if (!lp) { /* ONCE */
+			lp = pv_hash(lock, pn);
+			/*
+			 * lp must be set before setting _Q_SLOW_VAL
+			 *
+			 * [S] lp = lock                [RmW] l = l->locked = 0
+			 *     MB                             MB
+			 * [S] l->locked = _Q_SLOW_VAL  [L]   lp
+			 *
+			 * Matches the cmpxchg() in __pv_queued_spin_unlock().
+			 */
+			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+				/*
+				 * The lock is free and _Q_SLOW_VAL has never
+				 * been set. Therefore we need to unhash before
+				 * getting the lock.
+				 */
+				WRITE_ONCE(*lp, NULL);
+				return;
+			}
+		}
+		pv_wait(&l->locked, _Q_SLOW_VAL);
+
+		/*
+		 * The unlocker should have freed the lock before kicking the
+		 * CPU. So if the lock is still not free, it is a spurious
+		 * wakeup and so the vCPU should wait again after spinning for
+		 * a while.
+		 */
+	}
+
+	/*
+	 * Lock is unlocked now; the caller will acquire it without waiting.
+	 * As with pv_wait_node() we rely on the caller to do a load-acquire
+	 * for us.
+	 */
+}
+
+/*
+ * PV version of the unlock function to be used in stead of
+ * queued_spin_unlock().
+ */
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+	struct pv_node *node;
+
+	/*
+	 * We must not unlock if SLOW, because in that case we must first
+	 * unhash. Otherwise it would be possible to have multiple @lock
+	 * entries, which would be BAD.
+	 */
+	if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
+		return;
+
+	/*
+	 * Since the above failed to release, this must be the SLOW path.
+	 * Therefore start by looking up the blocked node and unhashing it.
+	 */
+	node = pv_unhash(lock);
+
+	/*
+	 * Now that we have a reference to the (likely) blocked pv_node,
+	 * release the lock.
+	 */
+	smp_store_release(&l->locked, 0);
+
+	/*
+	 * At this point the memory pointed at by lock can be freed/reused,
+	 * however we can still use the pv_node to kick the CPU.
+	 */
+	if (READ_ONCE(node->state) == vcpu_halted)
+		pv_kick(node->cpu);
+}
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() near the top of the file to make sure
+ * that the callee-save thunk and the real unlock function are close
+ * to each other sharing consecutive instruction cachelines.
+ */
+#include <asm/qspinlock_paravirt.h>
+
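To make the open-addressing scheme of pv_hash()/pv_unhash() easier to follow outside kernel context, here is a simplified, single-threaded user-space sketch of the same idea: probing starts at the entry group selected by hashing the lock address, walks the table linearly with wraparound, claims the first free slot on insert, and clears the matching slot on unhash. NR_BUCKETS, PER_LINE, demo_hash() and the other names are invented for the demo; the kernel version uses hash_ptr(), cmpxchg() and READ_ONCE()/WRITE_ONCE() where plain loads and stores appear here.

```c
/* Simplified, single-threaded sketch of the pv lock hash (illustration only). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct entry {
	void *lock;	/* key: lock address, NULL when the slot is free */
	int   node;	/* value: stands in for the pv_node pointer */
};

#define NR_BUCKETS	16	/* must be a power of two */
#define PER_LINE	4	/* entries assumed to share one cacheline */

static struct entry table[NR_BUCKETS];

static unsigned int demo_hash(void *lock)
{
	/* Cheap pointer hash; the kernel uses hash_ptr() instead. */
	return (unsigned int)(((uintptr_t)lock >> 4) * 2654435761u) & (NR_BUCKETS - 1);
}

static void demo_insert(void *lock, int node)
{
	unsigned int hash = demo_hash(lock) & ~(PER_LINE - 1);	/* start of the line */
	unsigned int i;

	for (i = 0; i < NR_BUCKETS; i++) {
		struct entry *e = &table[(hash + i) & (NR_BUCKETS - 1)];

		if (!e->lock) {		/* kernel: cmpxchg(&he->lock, NULL, lock) */
			e->lock = lock;
			e->node = node;
			return;
		}
	}
	assert(0 && "table over-committed");	/* kernel: BUG() */
}

static int demo_remove(void *lock)
{
	unsigned int hash = demo_hash(lock) & ~(PER_LINE - 1);
	unsigned int i;

	for (i = 0; i < NR_BUCKETS; i++) {
		struct entry *e = &table[(hash + i) & (NR_BUCKETS - 1)];

		if (e->lock == lock) {
			e->lock = NULL;		/* unhash, like pv_unhash() */
			return e->node;
		}
	}
	assert(0 && "entry must exist");	/* kernel: BUG() */
	return -1;
}

int main(void)
{
	int lock_a, lock_b;	/* any two distinct addresses will do */

	demo_insert(&lock_a, 1);
	demo_insert(&lock_b, 2);
	printf("a -> %d, b -> %d\n", demo_remove(&lock_a), demo_remove(&lock_b));
	return 0;
}
```

The BUG() calls in the kernel correspond to the assert()s here: a blocked lock never consumes more than one slot and the owner unhashes before releasing, so both insertion and lookup are guaranteed to terminate successfully.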
