Merge branch 'core-locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull core locking updates from Ingo Molnar:
 "The biggest change is the MCS spinlock generalization changes from Tim
  Chen, Peter Zijlstra, Jason Low et al.

  There's also lockdep fixes/enhancements from Oleg Nesterov, in
  particular a false negative fix related to
  lockdep_set_novalidate_class() usage"

* 'core-locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (22 commits)
  locking/mutex: Fix debug checks
  locking/mutexes: Add extra reschedule point
  locking/mutexes: Introduce cancelable MCS lock for adaptive spinning
  locking/mutexes: Unlock the mutex without the wait_lock
  locking/mutexes: Modify the way optimistic spinners are queued
  locking/mutexes: Return false if task need_resched() in mutex_can_spin_on_owner()
  locking: Move mcs_spinlock.h into kernel/locking/
  m68k: Skip futex_atomic_cmpxchg_inatomic() test
  futex: Allow architectures to skip futex_atomic_cmpxchg_inatomic() test
  Revert "sched/wait: Suppress Sparse 'variable shadowing' warning"
  lockdep: Change lockdep_set_novalidate_class() to use _and_name
  lockdep: Change mark_held_locks() to check hlock->check instead of lockdep_no_validate
  lockdep: Don't create the wrong dependency on hlock->check == 0
  lockdep: Make held_lock->check and "int check" argument bool
  locking/mcs: Allow architecture specific asm files to be used for contended case
  locking/mcs: Order the header files in Kbuild of each architecture in alphabetical order
  sched/wait: Suppress Sparse 'variable shadowing' warning
  hung_task/Documentation: Fix hung_task_warnings description
  locking/mcs: Allow architectures to hook in to contended paths
  locking/mcs: Micro-optimize the MCS code, add extra comments
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2014-03-31 13:59:39 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-03-31 13:59:39 -0400
commit: 462bf234a82ae1ae9d7628f59bc81022591e1348 (patch)
tree: f75eea7864ae7c72c0757d5d090e38f757b5cb2d /kernel/locking
parent: 455c6fdbd219161bd09b1165f11699d6d73de11c (diff)
parent: 6f008e72cd111a119b5d8de8c5438d892aae99eb (diff)
6 files changed, 353 insertions, 73 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5e7f66..2a9ee96ecf00 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o lglock.o
+obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_lockdep.o = -pg
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index eb8a54783fa0..bf0c6b0dd9c5 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
        for (;;) {
                int distance = curr->lockdep_depth - depth + 1;
-                hlock = curr->held_locks + depth-1;
+                hlock = curr->held_locks + depth - 1;
                /*
                 * Only non-recursive-read entries get new dependencies
                 * added:
                 */
-                if (hlock->read != 2) {
+                if (hlock->read != 2 && hlock->check) {
                        if (!check_prev_add(curr, hlock, next,
                                                distance, trylock_loop))
                                return 0;
@@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
         * (If lookup_chain_cache() returns with 1 it acquires
         * graph_lock for us)
         */
-        if (!hlock->trylock && (hlock->check == 2) &&
+        if (!hlock->trylock && hlock->check &&
            lookup_chain_cache(curr, hlock, chain_key)) {
                /*
                 * Check whether last held lock:
@@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
                BUG_ON(usage_bit >= LOCK_USAGE_STATES);
-                if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys)
+                if (!hlock->check)
                        continue;
                if (!mark_lock(curr, hlock, usage_bit))
@@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        int class_idx;
        u64 chain_key;
-        if (!prove_locking)
-                check = 1;
        if (unlikely(!debug_locks))
                return 0;
@@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
                return 0;
-        if (lock->key == &__lockdep_no_validate__)
+        if (!prove_locking || lock->key == &__lockdep_no_validate__)
-                check = 1;
+                check = 0;
        if (subclass < NR_LOCKDEP_CACHING_CLASSES)
                class = lock->class_cache[subclass];
@@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        hlock->holdtime_stamp = lockstat_clock();
 #endif
-        if (check == 2 && !mark_irqflags(curr, hlock))
+        if (check && !mark_irqflags(curr, hlock))
                return 0;
        /* mark it as used: */
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
new file mode 100644
index 000000000000..838dc9e00669
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.c
@@ -0,0 +1,178 @@
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include "mcs_spinlock.h"
+#ifdef CONFIG_SMP
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ *
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+/*
+ * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
+ * Can return NULL in case we were the last queued and we updated @lock instead.
+ */
+static inline struct optimistic_spin_queue *
+osq_wait_next(struct optimistic_spin_queue **lock,
+              struct optimistic_spin_queue *node,
+              struct optimistic_spin_queue *prev)
+{
+        struct optimistic_spin_queue *next = NULL;
+        for (;;) {
+                if (*lock == node && cmpxchg(lock, node, prev) == node) {
+                        /*
+                         * We were the last queued, we moved @lock back. @prev
+                         * will now observe @lock and will complete its
+                         * unlock()/unqueue().
+                         */
+                        break;
+                }
+                /*
+                 * We must xchg() the @node->next value, because if we were to
+                 * leave it in, a concurrent unlock()/unqueue() from
+                 * @node->next might complete Step-A and think its @prev is
+                 * still valid.
+                 *
+                 * If the concurrent unlock()/unqueue() wins the race, we'll
+                 * wait for either @lock to point to us, through its Step-B, or
+                 * wait for a new @node->next from its Step-C.
+                 */
+                if (node->next) {
+                        next = xchg(&node->next, NULL);
+                        if (next)
+                                break;
+                }
+                arch_mutex_cpu_relax();
+        }
+        return next;
+}
+bool osq_lock(struct optimistic_spin_queue **lock)
+{
+        struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
+        struct optimistic_spin_queue *prev, *next;
+        node->locked = 0;
+        node->next = NULL;
+        node->prev = prev = xchg(lock, node);
+        if (likely(prev == NULL))
+                return true;
+        ACCESS_ONCE(prev->next) = node;
+        /*
+         * Normally @prev is untouchable after the above store; because at that
+         * moment unlock can proceed and wipe the node element from stack.
+         *
+         * However, since our nodes are static per-cpu storage, we're
+         * guaranteed their existence -- this allows us to apply
+         * cmpxchg in an attempt to undo our queueing.
+         */
+        while (!smp_load_acquire(&node->locked)) {
+                /*
+                 * If we need to reschedule bail... so we can block.
+                 */
+                if (need_resched())
+                        goto unqueue;
+                arch_mutex_cpu_relax();
+        }
+        return true;
+unqueue:
+        /*
+         * Step - A  -- stabilize @prev
+         *
+         * Undo our @prev->next assignment; this will make @prev's
+         * unlock()/unqueue() wait for a next pointer since @lock points to us
+         * (or later).
+         */
+        for (;;) {
+                if (prev->next == node &&
+                    cmpxchg(&prev->next, node, NULL) == node)
+                        break;
+                /*
+                 * We can only fail the cmpxchg() racing against an unlock(),
+                 * in which case we should observe @node->locked becoming
+                 * true.
+                 */
+                if (smp_load_acquire(&node->locked))
+                        return true;
+                arch_mutex_cpu_relax();
+                /*
+                 * Or we race against a concurrent unqueue()'s step-B, in which
+                 * case its step-C will write us a new @node->prev pointer.
+                 */
+                prev = ACCESS_ONCE(node->prev);
+        }
+        /*
+         * Step - B -- stabilize @next
+         *
+         * Similar to unlock(), wait for @node->next or move @lock from @node
+         * back to @prev.
+         */
+        next = osq_wait_next(lock, node, prev);
+        if (!next)
+                return false;
+        /*
+         * Step - C -- unlink
+         *
+         * @prev is stable because it's still waiting for a new @prev->next
+         * pointer, @next is stable because our @node->next pointer is NULL and
+         * it will wait in Step-A.
+         */
+        ACCESS_ONCE(next->prev) = prev;
+        ACCESS_ONCE(prev->next) = next;
+        return false;
+}
+void osq_unlock(struct optimistic_spin_queue **lock)
+{
+        struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
+        struct optimistic_spin_queue *next;
+        /*
+         * Fast path for the uncontended case.
+         */
+        if (likely(cmpxchg(lock, node, NULL) == node))
+                return;
+        /*
+         * Second most likely case.
+         */
+        next = xchg(&node->next, NULL);
+        if (next) {
+                ACCESS_ONCE(next->locked) = 1;
+                return;
+        }
+        next = osq_wait_next(lock, node, NULL);
+        if (next)
+                ACCESS_ONCE(next->locked) = 1;
+}
+#endif
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 000000000000..a2dbac4aca6b
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
@@ -0,0 +1,129 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+#include <asm/mcs_spinlock.h>
+struct mcs_spinlock {
+        struct mcs_spinlock *next;
+        int locked; /* 1 if lock acquired */
+};
+#ifndef arch_mcs_spin_lock_contended
+/*
+ * Using smp_load_acquire() provides a memory barrier that ensures
+ * subsequent operations happen after the lock is acquired.
+ */
+#define arch_mcs_spin_lock_contended(l)                                 \
+do {                                                                    \
+        while (!(smp_load_acquire(l)))                                  \
+                arch_mutex_cpu_relax();                                 \
+} while (0)
+#endif
+#ifndef arch_mcs_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section have been completed before
+ * unlocking.
+ */
+#define arch_mcs_spin_unlock_contended(l)                               \
+        smp_store_release((l), 1)
+#endif
+/*
+ * Note: the smp_load_acquire/smp_store_release pair is not
+ * sufficient to form a full memory barrier across
+ * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
+ * For applications that need a full barrier across multiple cpus
+ * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
+ * used after mcs_lock.
+ */
+/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ *
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static inline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+        struct mcs_spinlock *prev;
+        /* Init node */
+        node->locked = 0;
+        node->next   = NULL;
+        prev = xchg(lock, node);
+        if (likely(prev == NULL)) {
+                /*
+                 * Lock acquired, don't need to set node->locked to 1. Threads
+                 * only spin on its own node->locked value for lock acquisition.
+                 * However, since this thread can immediately acquire the lock
+                 * and does not proceed to spin on its own node->locked, this
+                 * value won't be used. If a debug mode is needed to
+                 * audit lock status, then set node->locked value here.
+                 */
+                return;
+        }
+        ACCESS_ONCE(prev->next) = node;
+        /* Wait until the lock holder passes the lock down. */
+        arch_mcs_spin_lock_contended(&node->locked);
+}
+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+        struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+        if (likely(!next)) {
+                /*
+                 * Release the lock by setting it to NULL
+                 */
+                if (likely(cmpxchg(lock, node, NULL) == node))
+                        return;
+                /* Wait until the next pointer is set */
+                while (!(next = ACCESS_ONCE(node->next)))
+                        arch_mutex_cpu_relax();
+        }
+        /* Pass lock to next waiter. */
+        arch_mcs_spin_unlock_contended(&next->locked);
+}
+/*
+ * Cancellable version of the MCS lock above.
+ *
+ * Intended for adaptive spinning of sleeping locks:
+ * mutex_lock()/rwsem_down_{read,write}() etc.
+ */
+struct optimistic_spin_queue {
+        struct optimistic_spin_queue *next, *prev;
+        int locked; /* 1 if lock acquired */
+};
+extern bool osq_lock(struct optimistic_spin_queue **lock);
+extern void osq_unlock(struct optimistic_spin_queue **lock);
+#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index faf6f5b53e77..e1191c996c59 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -83,6 +83,12 @@ void debug_mutex_unlock(struct mutex *lock)
        DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
        mutex_clear_owner(lock);
+        /*
+         * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
+         * mutexes so that we can do it here after we've verified state.
+         */
+        atomic_set(&lock->count, 1);
 }
 void debug_mutex_init(struct mutex *lock, const char *name,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4dd6e4c219de..14fe72cc8ce7 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,6 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
+#include "mcs_spinlock.h"
 /*
 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -33,6 +34,13 @@
 #ifdef CONFIG_DEBUG_MUTEXES
 # include "mutex-debug.h"
 # include <asm-generic/mutex-null.h>
+/*
+ * Must be 0 for the debug case so we do not do the unlock outside of the
+ * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
+ * case.
+ */
+# undef __mutex_slowpath_needs_to_unlock
+# define  __mutex_slowpath_needs_to_unlock()    0
 #else
 # include "mutex.h"
 # include <asm/mutex.h>
@@ -52,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
        INIT_LIST_HEAD(&lock->wait_list);
        mutex_clear_owner(lock);
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-        lock->spin_mlock = NULL;
+        lock->osq = NULL;
 #endif
        debug_mutex_init(lock, name, key);
@@ -111,54 +119,7 @@ EXPORT_SYMBOL(mutex_lock);
 * more or less simultaneously, the spinners need to acquire a MCS lock
 * first before spinning on the owner field.
 *
- * We don't inline mspin_lock() so that perf can correctly account for the
- * time spent in this lock function.
 */
-struct mspin_node {
-        struct mspin_node *next ;
-        int               locked;       /* 1 if lock acquired */
-};
-#define MLOCK(mutex)    ((struct mspin_node **)&((mutex)->spin_mlock))
-static noinline
-void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
-{
-        struct mspin_node *prev;
-        /* Init node */
-        node->locked = 0;
-        node->next   = NULL;
-        prev = xchg(lock, node);
-        if (likely(prev == NULL)) {
-                /* Lock acquired */
-                node->locked = 1;
-                return;
-        }
-        ACCESS_ONCE(prev->next) = node;
-        smp_wmb();
-        /* Wait until the lock holder passes the lock down */
-        while (!ACCESS_ONCE(node->locked))
-                arch_mutex_cpu_relax();
-}
-static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
-{
-        struct mspin_node *next = ACCESS_ONCE(node->next);
-        if (likely(!next)) {
-                /*
-                 * Release the lock by setting it to NULL
-                 */
-                if (cmpxchg(lock, node, NULL) == node)
-                        return;
-                /* Wait until the next pointer is set */
-                while (!(next = ACCESS_ONCE(node->next)))
-                        arch_mutex_cpu_relax();
-        }
-        ACCESS_ONCE(next->locked) = 1;
-        smp_wmb();
-}
 /*
 * Mutex spinning code migrated from kernel/sched/core.c
@@ -212,6 +173,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
        struct task_struct *owner;
        int retval = 1;
+        if (need_resched())
+                return 0;
        rcu_read_lock();
        owner = ACCESS_ONCE(lock->owner);
        if (owner)
@@ -446,9 +410,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
        if (!mutex_can_spin_on_owner(lock))
                goto slowpath;
+        if (!osq_lock(&lock->osq))
+                goto slowpath;
        for (;;) {
                struct task_struct *owner;
-                struct mspin_node  node;
                if (use_ww_ctx && ww_ctx->acquired > 0) {
                        struct ww_mutex *ww;
@@ -463,19 +429,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                         * performed the optimistic spinning cannot be done.
                         */
                        if (ACCESS_ONCE(ww->ctx))
-                                goto slowpath;
+                                break;
                }
                /*
                 * If there's an owner, wait for it to either
                 * release the lock or go to sleep.
                 */
-                mspin_lock(MLOCK(lock), &node);
                owner = ACCESS_ONCE(lock->owner);
-                if (owner && !mutex_spin_on_owner(lock, owner)) {
+                if (owner && !mutex_spin_on_owner(lock, owner))
-                        mspin_unlock(MLOCK(lock), &node);
+                        break;
-                        goto slowpath;
-                }
                if ((atomic_read(&lock->count) == 1) &&
                    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
@@ -488,11 +451,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                        }
                        mutex_set_owner(lock);
-                        mspin_unlock(MLOCK(lock), &node);
+                        osq_unlock(&lock->osq);
                        preempt_enable();
                        return 0;
                }
-                mspin_unlock(MLOCK(lock), &node);
                /*
                 * When there's no owner, we might have preempted between the
@@ -501,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                 * the owner complete.
                 */
                if (!owner && (need_resched() || rt_task(task)))
-                        goto slowpath;
+                        break;
                /*
                 * The cpu_relax() call is a compiler barrier which forces
@@ -511,7 +473,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                 */
                arch_mutex_cpu_relax();
        }
+        osq_unlock(&lock->osq);
 slowpath:
+        /*
+         * If we fell out of the spin path because of need_resched(),
+         * reschedule now, before we try-lock the mutex. This avoids getting
+         * scheduled out right after we obtained the mutex.
+         */
+        if (need_resched())
+                schedule_preempt_disabled();
 #endif
        spin_lock_mutex(&lock->wait_lock, flags);
@@ -717,10 +687,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
        struct mutex *lock = container_of(lock_count, struct mutex, count);
        unsigned long flags;
-        spin_lock_mutex(&lock->wait_lock, flags);
-        mutex_release(&lock->dep_map, nested, _RET_IP_);
-        debug_mutex_unlock(lock);
        /*
         * some architectures leave the lock unlocked in the fastpath failure
         * case, others need to leave it locked. In the latter case we have to
@@ -729,6 +695,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
        if (__mutex_slowpath_needs_to_unlock())
                atomic_set(&lock->count, 1);
+        spin_lock_mutex(&lock->wait_lock, flags);
+        mutex_release(&lock->dep_map, nested, _RET_IP_);
+        debug_mutex_unlock(lock);
        if (!list_empty(&lock->wait_list)) {
                /* get the first entry from the wait-list: */
                struct mutex_waiter *waiter =
author	Linus Torvalds <torvalds@linux-foundation.org>	2014-03-31 13:59:39 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-03-31 13:59:39 -0400
commit	462bf234a82ae1ae9d7628f59bc81022591e1348 (patch)
tree	f75eea7864ae7c72c0757d5d090e38f757b5cb2d /kernel/locking
parent	455c6fdbd219161bd09b1165f11699d6d73de11c (diff)
parent	6f008e72cd111a119b5d8de8c5438d892aae99eb (diff)