Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar: "The main changes are: - 'qspinlock' support, enabled on x86: queued spinlocks - these are now the spinlock variant used by x86 as they outperform ticket spinlocks in every category. (Waiman Long) - 'pvqspinlock' support on x86: paravirtualized variant of queued spinlocks. (Waiman Long, Peter Zijlstra) - 'qrwlock' support, enabled on x86: queued rwlocks. Similar to queued spinlocks, they are now the variant used by x86: CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y CONFIG_QUEUED_SPINLOCKS=y CONFIG_ARCH_USE_QUEUED_RWLOCKS=y CONFIG_QUEUED_RWLOCKS=y - various lockdep fixlets - various locking primitives cleanups, further WRITE_ONCE() propagation" * 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits) locking/lockdep: Remove hard coded array size dependency locking/qrwlock: Don't contend with readers when setting _QW_WAITING lockdep: Do not break user-visible string locking/arch: Rename set_mb() to smp_store_mb() locking/arch: Add WRITE_ONCE() to set_mb() rtmutex: Warn if trylock is called from hard/softirq context arch: Remove __ARCH_HAVE_CMPXCHG locking/rtmutex: Drop usage of __HAVE_ARCH_CMPXCHG locking/qrwlock: Rename QUEUE_RWLOCK to QUEUED_RWLOCKS locking/pvqspinlock: Rename QUEUED_SPINLOCK to QUEUED_SPINLOCKS locking/pvqspinlock: Replace xchg() by the more descriptive set_mb() locking/pvqspinlock, x86: Enable PV qspinlock for Xen locking/pvqspinlock, x86: Enable PV qspinlock for KVM locking/pvqspinlock, x86: Implement the paravirt qspinlock call patching locking/pvqspinlock: Implement simple paravirt support for the qspinlock locking/qspinlock: Revert to test-and-set on hypervisors locking/qspinlock: Use a simple write to grab the lock locking/qspinlock: Optimize for smaller NR_CPUS locking/qspinlock: Extract out code snippets for the next patch locking/qspinlock: Add pending bit ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-06-22 17:54:22 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-06-22 17:54:22 -0400
commit: 1bf7067c6e173dc10411704db48338ed69c05565 (patch)
tree: 06d731d9647c525fa598d03d7ec957ff9772ff40 /kernel
parent: fc934d40178ad4e551a17e2733241d9f29fddd70 (diff)
parent: 68722101ec3a0e179408a13708dd020e04f54aab (diff)
11 files changed, 894 insertions, 17 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 08561f1acd13..ebdb0043203a 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER
       def_bool y
       depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
-config ARCH_USE_QUEUE_RWLOCK
+config ARCH_USE_QUEUED_SPINLOCKS
        bool
-config QUEUE_RWLOCK
+config QUEUED_SPINLOCKS
-        def_bool y if ARCH_USE_QUEUE_RWLOCK
+        def_bool y if ARCH_USE_QUEUED_SPINLOCKS
+        depends on SMP
+config ARCH_USE_QUEUED_RWLOCKS
+        bool
+config QUEUED_RWLOCKS
+        def_bool y if ARCH_USE_QUEUED_RWLOCKS
        depends on SMP
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..55ca63ad9622 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2055,7 +2055,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 {
        /*
         * The task state is guaranteed to be set before another task can
-         * wake it. set_current_state() is implemented using set_mb() and
+         * wake it. set_current_state() is implemented using smp_store_mb() and
         * queue_me() calls spin_unlock() upon completion, both serializing
         * access to the hash list and forcing another memory barrier.
         */
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416cca2a..7dd5c9918e4c 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
 obj-$(CONFIG_SMP) += lglock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
@@ -25,5 +26,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
 obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
-obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index aaeae885d9af..456614136f1a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4067,8 +4067,7 @@ void __init lockdep_info(void)
 #ifdef CONFIG_DEBUG_LOCKDEP
        if (lockdep_init_error) {
-                printk("WARNING: lockdep init error! lock-%s was acquired"
+                printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
-                        "before lockdep_init\n", lock_init_error);
                printk("Call stack leading to lockdep invocation was:\n");
                print_stack_trace(&lockdep_init_trace, 0);
        }
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 75e114bdf3f2..fd91aaa4554c 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,6 +17,7 @@
 struct mcs_spinlock {
        struct mcs_spinlock *next;
        int locked; /* 1 if lock acquired */
+        int count;  /* nesting count, see qspinlock.c */
 };
 #ifndef arch_mcs_spin_lock_contended
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f956ede7f90d..6c5da483966b 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,5 +1,5 @@
 /*
- * Queue read/write lock
+ * Queued read/write locks
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -22,6 +22,26 @@
 #include <linux/hardirq.h>
 #include <asm/qrwlock.h>
+/*
+ * This internal data structure is used for optimizing access to some of
+ * the subfields within the atomic_t cnts.
+ */
+struct __qrwlock {
+        union {
+                atomic_t cnts;
+                struct {
+#ifdef __LITTLE_ENDIAN
+                        u8 wmode;       /* Writer mode   */
+                        u8 rcnts[3];    /* Reader counts */
+#else
+                        u8 rcnts[3];    /* Reader counts */
+                        u8 wmode;       /* Writer mode   */
+#endif
+                };
+        };
+        arch_spinlock_t lock;
+};
 /**
 * rspin_until_writer_unlock - inc reader count & spin until writer is gone
 * @lock  : Pointer to queue rwlock structure
@@ -107,10 +127,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
         * or wait for a previous writer to go away.
         */
        for (;;) {
-                cnts = atomic_read(&lock->cnts);
+                struct __qrwlock *l = (struct __qrwlock *)lock;
-                if (!(cnts & _QW_WMASK) &&
-                    (atomic_cmpxchg(&lock->cnts, cnts,
+                if (!READ_ONCE(l->wmode) &&
-                                    cnts | _QW_WAITING) == cnts))
+                   (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
                        break;
                cpu_relax_lowlatency();
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000000..38c49202d532
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,473 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ *          Peter Zijlstra <peterz@infradead.org>
+ */
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to make
+ * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
+ * API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * A spinlock disables recursion of its own context, and there is a limit to
+ * the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ *      atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+#include "mcs_spinlock.h"
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES       8
+#else
+#define MAX_NODES       4
+#endif
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+static inline u32 encode_tail(int cpu, int idx)
+{
+        u32 tail;
+#ifdef CONFIG_DEBUG_SPINLOCK
+        BUG_ON(idx > 3);
+#endif
+        tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+        tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+        return tail;
+}
+static inline struct mcs_spinlock *decode_tail(u32 tail)
+{
+        int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+        int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+        return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ *
+ * This internal structure is also used by the set_locked function which
+ * is not restricted to _Q_PENDING_BITS == 8.
+ */
+struct __qspinlock {
+        union {
+                atomic_t val;
+#ifdef __LITTLE_ENDIAN
+                struct {
+                        u8      locked;
+                        u8      pending;
+                };
+                struct {
+                        u16     locked_pending;
+                        u16     tail;
+                };
+#else
+                struct {
+                        u16     tail;
+                        u16     locked_pending;
+                };
+                struct {
+                        u8      reserved[2];
+                        u8      pending;
+                        u8      locked;
+                };
+#endif
+        };
+};
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+        WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+        struct __qspinlock *l = (void *)lock;
+        return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+#else /* _Q_PENDING_BITS == 8 */
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+        atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+        u32 old, new, val = atomic_read(&lock->val);
+        for (;;) {
+                new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+                old = atomic_cmpxchg(&lock->val, val, new);
+                if (old == val)
+                        break;
+                val = old;
+        }
+        return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+        WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+/*
+ * Generate the native code for queued_spin_lock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+                                           struct mcs_spinlock *node) { }
+#define pv_enabled()            false
+#define pv_init_node            __pv_init_node
+#define pv_wait_node            __pv_wait_node
+#define pv_kick_node            __pv_kick_node
+#define pv_wait_head            __pv_wait_head
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath       native_queued_spin_lock_slowpath
+#endif
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ *              fast     :    slow                                  :    unlock
+ *                       :                                          :
+ * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ *                       :       | ^--------.------.             /  :
+ *                       :       v           \      \            |  :
+ * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
+ *                       :       | ^--'              |           |  :
+ *                       :       v                   |           |  :
+ * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
+ *   queue               :       | ^--'                          |  :
+ *                       :       v                               |  :
+ * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
+ *   queue               :         ^--'                             :
+ */
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+        struct mcs_spinlock *prev, *next, *node;
+        u32 new, old, tail;
+        int idx;
+        BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+        if (pv_enabled())
+                goto queue;
+        if (virt_queued_spin_lock(lock))
+                return;
+        /*
+         * wait for in-progress pending->locked hand-overs
+         *
+         * 0,1,0 -> 0,0,1
+         */
+        if (val == _Q_PENDING_VAL) {
+                while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+                        cpu_relax();
+        }
+        /*
+         * trylock || pending
+         *
+         * 0,0,0 -> 0,0,1 ; trylock
+         * 0,0,1 -> 0,1,1 ; pending
+         */
+        for (;;) {
+                /*
+                 * If we observe any contention; queue.
+                 */
+                if (val & ~_Q_LOCKED_MASK)
+                        goto queue;
+                new = _Q_LOCKED_VAL;
+                if (val == new)
+                        new |= _Q_PENDING_VAL;
+                old = atomic_cmpxchg(&lock->val, val, new);
+                if (old == val)
+                        break;
+                val = old;
+        }
+        /*
+         * we won the trylock
+         */
+        if (new == _Q_LOCKED_VAL)
+                return;
+        /*
+         * we're pending, wait for the owner to go away.
+         *
+         * *,1,1 -> *,1,0
+         *
+         * this wait loop must be a load-acquire such that we match the
+         * store-release that clears the locked bit and create lock
+         * sequentiality; this is because not all clear_pending_set_locked()
+         * implementations imply full barriers.
+         */
+        while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
+                cpu_relax();
+        /*
+         * take ownership and clear the pending bit.
+         *
+         * *,1,0 -> *,0,1
+         */
+        clear_pending_set_locked(lock);
+        return;
+        /*
+         * End of pending bit optimistic spinning and beginning of MCS
+         * queuing.
+         */
+queue:
+        node = this_cpu_ptr(&mcs_nodes[0]);
+        idx = node->count++;
+        tail = encode_tail(smp_processor_id(), idx);
+        node += idx;
+        node->locked = 0;
+        node->next = NULL;
+        pv_init_node(node);
+        /*
+         * We touched a (possibly) cold cacheline in the per-cpu queue node;
+         * attempt the trylock once more in the hope someone let go while we
+         * weren't watching.
+         */
+        if (queued_spin_trylock(lock))
+                goto release;
+        /*
+         * We have already touched the queueing cacheline; don't bother with
+         * pending stuff.
+         *
+         * p,*,* -> n,*,*
+         */
+        old = xchg_tail(lock, tail);
+        /*
+         * if there was a previous node; link it and wait until reaching the
+         * head of the waitqueue.
+         */
+        if (old & _Q_TAIL_MASK) {
+                prev = decode_tail(old);
+                WRITE_ONCE(prev->next, node);
+                pv_wait_node(node);
+                arch_mcs_spin_lock_contended(&node->locked);
+        }
+        /*
+         * we're at the head of the waitqueue, wait for the owner & pending to
+         * go away.
+         *
+         * *,x,y -> *,0,0
+         *
+         * this wait loop must use a load-acquire such that we match the
+         * store-release that clears the locked bit and create lock
+         * sequentiality; this is because the set_locked() function below
+         * does not imply a full barrier.
+         *
+         */
+        pv_wait_head(lock, node);
+        while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+                cpu_relax();
+        /*
+         * claim the lock:
+         *
+         * n,0,0 -> 0,0,1 : lock, uncontended
+         * *,0,0 -> *,0,1 : lock, contended
+         *
+         * If the queue head is the only one in the queue (lock value == tail),
+         * clear the tail code and grab the lock. Otherwise, we only need
+         * to grab the lock.
+         */
+        for (;;) {
+                if (val != tail) {
+                        set_locked(lock);
+                        break;
+                }
+                old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+                if (old == val)
+                        goto release;   /* No contention */
+                val = old;
+        }
+        /*
+         * contended path; wait for next, release.
+         */
+        while (!(next = READ_ONCE(node->next)))
+                cpu_relax();
+        arch_mcs_spin_unlock_contended(&next->locked);
+        pv_kick_node(next);
+release:
+        /*
+         * release the node
+         */
+        this_cpu_dec(mcs_nodes[0].count);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+/*
+ * Generate the paravirt code for queued_spin_lock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+#undef  pv_enabled
+#define pv_enabled()    true
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head
+#undef  queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath       __pv_queued_spin_lock_slowpath
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 000000000000..04ab18151cc8
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,325 @@
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ *   pv_kick(cpu)             -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+#define _Q_SLOW_VAL     (3U << _Q_LOCKED_OFFSET)
+enum vcpu_state {
+        vcpu_running = 0,
+        vcpu_halted,
+};
+struct pv_node {
+        struct mcs_spinlock     mcs;
+        struct mcs_spinlock     __res[3];
+        int                     cpu;
+        u8                      state;
+};
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+        struct qspinlock *lock;
+        struct pv_node   *node;
+};
+#define PV_HE_PER_LINE  (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN       (PAGE_SIZE / sizeof(struct pv_hash_entry))
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+        int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+        if (pv_hash_size < PV_HE_MIN)
+                pv_hash_size = PV_HE_MIN;
+        /*
+         * Allocate space from bootmem which should be page-size aligned
+         * and hence cacheline aligned.
+         */
+        pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+                                               sizeof(struct pv_hash_entry),
+                                               pv_hash_size, 0, HASH_EARLY,
+                                               &pv_lock_hash_bits, NULL,
+                                               pv_hash_size, pv_hash_size);
+}
+#define for_each_hash_entry(he, offset, hash)                                           \
+        for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;       \
+             offset < (1 << pv_lock_hash_bits);                                         \
+             offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+        unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+        struct pv_hash_entry *he;
+        for_each_hash_entry(he, offset, hash) {
+                if (!cmpxchg(&he->lock, NULL, lock)) {
+                        WRITE_ONCE(he->node, node);
+                        return &he->lock;
+                }
+        }
+        /*
+         * Hard assume there is a free entry for us.
+         *
+         * This is guaranteed by ensuring every blocked lock only ever consumes
+         * a single entry, and since we only have 4 nesting levels per CPU
+         * and allocated 4*nr_possible_cpus(), this must be so.
+         *
+         * The single entry is guaranteed by having the lock owner unhash
+         * before it releases.
+         */
+        BUG();
+}
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+        unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+        struct pv_hash_entry *he;
+        struct pv_node *node;
+        for_each_hash_entry(he, offset, hash) {
+                if (READ_ONCE(he->lock) == lock) {
+                        node = READ_ONCE(he->node);
+                        WRITE_ONCE(he->lock, NULL);
+                        return node;
+                }
+        }
+        /*
+         * Hard assume we'll find an entry.
+         *
+         * This guarantees a limited lookup time and is itself guaranteed by
+         * having the lock owner do the unhash -- IFF the unlock sees the
+         * SLOW flag, there MUST be a hash entry.
+         */
+        BUG();
+}
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+        struct pv_node *pn = (struct pv_node *)node;
+        BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+        pn->cpu = smp_processor_id();
+        pn->state = vcpu_running;
+}
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to wake the vcpu again.
+ */
+static void pv_wait_node(struct mcs_spinlock *node)
+{
+        struct pv_node *pn = (struct pv_node *)node;
+        int loop;
+        for (;;) {
+                for (loop = SPIN_THRESHOLD; loop; loop--) {
+                        if (READ_ONCE(node->locked))
+                                return;
+                        cpu_relax();
+                }
+                /*
+                 * Order pn->state vs pn->locked thusly:
+                 *
+                 * [S] pn->state = vcpu_halted    [S] next->locked = 1
+                 *     MB                             MB
+                 * [L] pn->locked               [RmW] pn->state = vcpu_running
+                 *
+                 * Matches the xchg() from pv_kick_node().
+                 */
+                smp_store_mb(pn->state, vcpu_halted);
+                if (!READ_ONCE(node->locked))
+                        pv_wait(&pn->state, vcpu_halted);
+                /*
+                 * Reset the vCPU state to avoid unnecessary CPU kicking
+                 */
+                WRITE_ONCE(pn->state, vcpu_running);
+                /*
+                 * If the locked flag is still not set after wakeup, it is a
+                 * spurious wakeup and the vCPU should wait again. However,
+                 * there is a pretty high overhead for CPU halting and kicking.
+                 * So it is better to spin for a while in the hope that the
+                 * MCS lock will be released soon.
+                 */
+        }
+        /*
+         * By now our node->locked should be 1 and our caller will not actually
+         * spin-wait for it. We do however rely on our caller to do a
+         * load-acquire for us.
+         */
+}
+/*
+ * Called after setting next->locked = 1, used to wake those stuck in
+ * pv_wait_node().
+ */
+static void pv_kick_node(struct mcs_spinlock *node)
+{
+        struct pv_node *pn = (struct pv_node *)node;
+        /*
+         * Note that because node->locked is already set, this actual
+         * mcs_spinlock entry could be re-used already.
+         *
+         * This should be fine however, kicking people for no reason is
+         * harmless.
+         *
+         * See the comment in pv_wait_node().
+         */
+        if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+                pv_kick(pn->cpu);
+}
+/*
+ * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ */
+static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+        struct pv_node *pn = (struct pv_node *)node;
+        struct __qspinlock *l = (void *)lock;
+        struct qspinlock **lp = NULL;
+        int loop;
+        for (;;) {
+                for (loop = SPIN_THRESHOLD; loop; loop--) {
+                        if (!READ_ONCE(l->locked))
+                                return;
+                        cpu_relax();
+                }
+                WRITE_ONCE(pn->state, vcpu_halted);
+                if (!lp) { /* ONCE */
+                        lp = pv_hash(lock, pn);
+                        /*
+                         * lp must be set before setting _Q_SLOW_VAL
+                         *
+                         * [S] lp = lock                [RmW] l = l->locked = 0
+                         *     MB                             MB
+                         * [S] l->locked = _Q_SLOW_VAL  [L]   lp
+                         *
+                         * Matches the cmpxchg() in __pv_queued_spin_unlock().
+                         */
+                        if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+                                /*
+                                 * The lock is free and _Q_SLOW_VAL has never
+                                 * been set. Therefore we need to unhash before
+                                 * getting the lock.
+                                 */
+                                WRITE_ONCE(*lp, NULL);
+                                return;
+                        }
+                }
+                pv_wait(&l->locked, _Q_SLOW_VAL);
+                /*
+                 * The unlocker should have freed the lock before kicking the
+                 * CPU. So if the lock is still not free, it is a spurious
+                 * wakeup and so the vCPU should wait again after spinning for
+                 * a while.
+                 */
+        }
+        /*
+         * Lock is unlocked now; the caller will acquire it without waiting.
+         * As with pv_wait_node() we rely on the caller to do a load-acquire
+         * for us.
+         */
+}
+/*
+ * PV version of the unlock function to be used instead of
+ * queued_spin_unlock().
+ */
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+        struct pv_node *node;
+        /*
+         * We must not unlock if SLOW, because in that case we must first
+         * unhash. Otherwise it would be possible to have multiple @lock
+         * entries, which would be BAD.
+         */
+        if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
+                return;
+        /*
+         * Since the above failed to release, this must be the SLOW path.
+         * Therefore start by looking up the blocked node and unhashing it.
+         */
+        node = pv_unhash(lock);
+        /*
+         * Now that we have a reference to the (likely) blocked pv_node,
+         * release the lock.
+         */
+        smp_store_release(&l->locked, 0);
+        /*
+         * At this point the memory pointed at by lock can be freed/reused,
+         * however we can still use the pv_node to kick the CPU.
+         */
+        if (READ_ONCE(node->state) == vcpu_halted)
+                pv_kick(node->cpu);
+}
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() near the top of the file to make sure
+ * that the callee-save thunk and the real unlock function are close
+ * to each other sharing consecutive instruction cachelines.
+ */
+#include <asm/qspinlock_paravirt.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b025295f4966..30ec5b46cd8c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 }
 /*
- * We can speed up the acquire/release, if the architecture
+ * We can speed up the acquire/release, if there's no debugging state to be
- * supports cmpxchg and if there's no debugging state to be set up
+ * set up.
 */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+#ifndef CONFIG_DEBUG_RT_MUTEXES
 # define rt_mutex_cmpxchg(l,c,n)        (cmpxchg(&l->owner, c, n) == c)
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
@@ -1443,10 +1443,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
 *
 * @lock:       the rt_mutex to be locked
 *
+ * This function can only be called in thread context. It's safe to
+ * call it from atomic regions, but not from hard interrupt or soft
+ * interrupt context.
+ *
 * Returns 1 on success and 0 on contention
 */
 int __sched rt_mutex_trylock(struct rt_mutex *lock)
 {
+        if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+                return 0;
        return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 3417d0172a5d..0f189714e457 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -409,11 +409,24 @@ done:
        return taken;
 }
+/*
+ * Return true if the rwsem has an active spinner.
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+        return osq_is_locked(&sem->osq);
+}
 #else
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 {
        return false;
 }
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+        return false;
+}
 #endif
 /*
@@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
        unsigned long flags;
+        /*
+         * If a spinner is present, it is not necessary to do the wakeup.
+         * Try to do wakeup only if the trylock succeeds to minimize
+         * spinlock contention which may introduce too much delay in the
+         * unlock operation.
+         *
+         *    spinning writer           up_write/up_read caller
+         *    ---------------           -----------------------
+         * [S]   osq_unlock()           [L]   osq
+         *       MB                           RMB
+         * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+         *
+         * Here, it is important to make sure that there won't be a missed
+         * wakeup while the rwsem is free and the only spinning writer goes
+         * to sleep without taking the rwsem. Even when the spinning writer
+         * is just going to break out of the waiting loop, it will still do
+         * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+         * rwsem_has_spinner() is true, it will guarantee at least one
+         * trylock attempt on the rwsem later on.
+         */
+        if (rwsem_has_spinner(sem)) {
+                /*
+                 * The smp_rmb() here is to make sure that the spinner
+                 * state is consulted before reading the wait_lock.
+                 */
+                smp_rmb();
+                if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+                        return sem;
+                goto locked;
+        }
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
        /* do nothing if list empty */
        if (!list_empty(&sem->wait_list))
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..9bc82329eaad 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
         * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
         * an event.
         */
-        set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+        smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
        return timeout;
 }
@@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
         * doesn't imply write barrier and the users expects write
         * barrier semantics on wakeup functions.  The following
         * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-         * and is paired with set_mb() in wait_woken().
+         * and is paired with smp_store_mb() in wait_woken().
         */
        smp_wmb(); /* C */
        wait->flags |= WQ_FLAG_WOKEN;
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-06-22 17:54:22 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-06-22 17:54:22 -0400
commit	1bf7067c6e173dc10411704db48338ed69c05565 (patch)
tree	06d731d9647c525fa598d03d7ec957ff9772ff40 /kernel
parent	fc934d40178ad4e551a17e2733241d9f29fddd70 (diff)
parent	68722101ec3a0e179408a13708dd020e04f54aab (diff)