Diffstat (limited to 'kernel')
 kernel/lockdep.c        |  29
 kernel/mutex.c          | 151
 kernel/rtmutex-tester.c |   5
 kernel/sched/core.c     |  45
 kernel/sched/features.h |   7
 5 files changed, 165 insertions(+), 72 deletions(-)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8a0efac4f99d..6a3bccba7e7d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
 unsigned long nr_stack_trace_entries;
 static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
 
+static void print_lockdep_off(const char *bug_msg)
+{
+        printk(KERN_DEBUG "%s\n", bug_msg);
+        printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+        printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+}
+
 static int save_trace(struct stack_trace *trace)
 {
         trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
         if (!debug_locks_off_graph_unlock())
                 return 0;
 
-        printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
-        printk("turning off the locking correctness validator.\n");
+        print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
         dump_stack();
 
         return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
         }
         raw_local_irq_restore(flags);
 
-        printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
-        printk("turning off the locking correctness validator.\n");
+        print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
         dump_stack();
         return NULL;
 }
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
         if (!debug_locks_off_graph_unlock())
                 return NULL;
 
-        printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
-        printk("turning off the locking correctness validator.\n");
+        print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
         dump_stack();
         return NULL;
 }
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
         struct lock_class *class = hlock_class(hlock);
         struct list_head *hash_head = chainhashentry(chain_key);
         struct lock_chain *chain;
-        struct held_lock *hlock_curr, *hlock_next;
+        struct held_lock *hlock_curr;
         int i, j;
 
         /*
@@ -2048,8 +2052,7 @@ cache_hit:
                 if (!debug_locks_off_graph_unlock())
                         return 0;
 
-                printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
-                printk("turning off the locking correctness validator.\n");
+                print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
                 dump_stack();
                 return 0;
         }
@@ -2057,12 +2060,10 @@ cache_hit:
         chain->chain_key = chain_key;
         chain->irq_context = hlock->irq_context;
         /* Find the first held_lock of current chain */
-        hlock_next = hlock;
         for (i = curr->lockdep_depth - 1; i >= 0; i--) {
                 hlock_curr = curr->held_locks + i;
-                if (hlock_curr->irq_context != hlock_next->irq_context)
+                if (hlock_curr->irq_context != hlock->irq_context)
                         break;
-                hlock_next = hlock;
         }
         i++;
         chain->depth = curr->lockdep_depth + 1 - i;
@@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 #endif
         if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
                 debug_locks_off();
-                printk("BUG: MAX_LOCK_DEPTH too low, depth: %i  max: %lu!\n",
+                print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
+                printk(KERN_DEBUG "depth: %i  max: %lu!\n",
                        curr->lockdep_depth, MAX_LOCK_DEPTH);
-                printk("turning off the locking correctness validator.\n");
 
                 lockdep_print_held_locks(current);
                 debug_show_all_locks();
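All five overflow paths now funnel through print_lockdep_off(), so each one also asks the reporter to attach /proc/lock_stat. Based on the format strings above, the resulting console report for any of these limits should read roughly as follows (illustrative; the exact appearance depends on the console loglevel and prefix handling):

    BUG: MAX_LOCKDEP_ENTRIES too low!
    turning off the locking correctness validator.
    Please attach the output of /proc/lock_stat to the bug report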
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 52f23011b6e0..ad53a664f113 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -37,6 +37,12 @@
 # include <asm/mutex.h>
 #endif
 
+/*
+ * A negative mutex count indicates that waiters are sleeping waiting for the
+ * mutex.
+ */
+#define MUTEX_SHOW_NO_WAITER(mutex)     (atomic_read(&(mutex)->count) >= 0)
+
 void
 __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
 {
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
         spin_lock_init(&lock->wait_lock);
         INIT_LIST_HEAD(&lock->wait_list);
         mutex_clear_owner(lock);
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+        lock->spin_mlock = NULL;
+#endif
 
         debug_mutex_init(lock, name, key);
 }
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
 EXPORT_SYMBOL(mutex_lock);
 #endif
 
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * In order to avoid a stampede of mutex spinners acquiring the mutex
+ * more or less simultaneously, the spinners need to acquire an MCS lock
+ * first before spinning on the owner field.
+ *
+ * We don't inline mspin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+struct mspin_node {
+        struct mspin_node *next;
+        int                locked;      /* 1 if lock acquired */
+};
+#define MLOCK(mutex)    ((struct mspin_node **)&((mutex)->spin_mlock))
+
+static noinline
+void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
+{
+        struct mspin_node *prev;
+
+        /* Init node */
+        node->locked = 0;
+        node->next   = NULL;
+
+        prev = xchg(lock, node);
+        if (likely(prev == NULL)) {
+                /* Lock acquired */
+                node->locked = 1;
+                return;
+        }
+        ACCESS_ONCE(prev->next) = node;
+        smp_wmb();
+        /* Wait until the lock holder passes the lock down */
+        while (!ACCESS_ONCE(node->locked))
+                arch_mutex_cpu_relax();
+}
+
+static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
+{
+        struct mspin_node *next = ACCESS_ONCE(node->next);
+
+        if (likely(!next)) {
+                /*
+                 * Release the lock by setting it to NULL
+                 */
+                if (cmpxchg(lock, node, NULL) == node)
+                        return;
+                /* Wait until the next pointer is set */
+                while (!(next = ACCESS_ONCE(node->next)))
+                        arch_mutex_cpu_relax();
+        }
+        ACCESS_ONCE(next->locked) = 1;
+        smp_wmb();
+}
+
+/*
+ * Mutex spinning code migrated from kernel/sched/core.c
+ */
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+        if (lock->owner != owner)
+                return false;
+
+        /*
+         * Ensure we emit the owner->on_cpu dereference _after_ checking
+         * that lock->owner still matches owner. If that fails, owner might
+         * point to freed memory; if it still matches, the rcu_read_lock()
+         * ensures the memory stays valid.
+         */
+        barrier();
+
+        return owner->on_cpu;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+static noinline
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+        rcu_read_lock();
+        while (owner_running(lock, owner)) {
+                if (need_resched())
+                        break;
+
+                arch_mutex_cpu_relax();
+        }
+        rcu_read_unlock();
+
+        /*
+         * We break out of the loop above on need_resched() and when the
+         * owner changed, which is a sign of heavy contention. Return
+         * success only when lock->owner is NULL.
+         */
+        return lock->owner == NULL;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline int mutex_can_spin_on_owner(struct mutex *lock)
+{
+        int retval = 1;
+
+        rcu_read_lock();
+        if (lock->owner)
+                retval = lock->owner->on_cpu;
+        rcu_read_unlock();
+        /*
+         * If lock->owner is not set, the owner may have just acquired the
+         * mutex and not yet recorded itself as owner, or the mutex may
+         * already have been released.
+         */
+        return retval;
+}
+#endif
+
 static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
 
 /**
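For readers unfamiliar with MCS queue locks: each waiter spins on a flag in its own stack-allocated node rather than on one shared word, so a lock handoff touches only the successor's cacheline. Below is a minimal userspace analogue of the mspin_lock()/mspin_unlock() pair above, written with C11 <stdatomic.h> in place of the kernel's xchg()/cmpxchg()/ACCESS_ONCE() primitives; it is an illustrative sketch, not the kernel code.

    #include <stdatomic.h>
    #include <stddef.h>
    
    struct mcs_node {
            _Atomic(struct mcs_node *) next;
            atomic_int locked;              /* 1 once the lock is handed to us */
    };
    
    static void mcs_lock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
    {
            struct mcs_node *prev;
    
            atomic_store_explicit(&node->next, NULL, memory_order_relaxed);
            atomic_store_explicit(&node->locked, 0, memory_order_relaxed);
    
            /* Swap ourselves in as the new queue tail (the kernel's xchg()). */
            prev = atomic_exchange(lock, node);
            if (prev == NULL)
                    return;                 /* queue was empty: lock acquired */
    
            /* Link behind the old tail, then spin on our own node only. */
            atomic_store(&prev->next, node);
            while (!atomic_load_explicit(&node->locked, memory_order_acquire))
                    ;                       /* arch_mutex_cpu_relax() in the kernel */
    }
    
    static void mcs_unlock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
    {
            struct mcs_node *next = atomic_load(&node->next);
    
            if (next == NULL) {
                    struct mcs_node *expected = node;
    
                    /* No visible successor: try to reset the tail to NULL. */
                    if (atomic_compare_exchange_strong(lock, &expected,
                                                       (struct mcs_node *)NULL))
                            return;
                    /* A successor is enqueueing; wait for its next-link. */
                    while (!(next = atomic_load(&node->next)))
                            ;
            }
            /* Hand the lock to the successor, which spins on its own flag. */
            atomic_store_explicit(&next->locked, 1, memory_order_release);
    }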
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
          *
          * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
          * to serialize everything.
+         *
+         * The mutex spinners are queued up using an MCS lock so that only
+         * one spinner can compete for the mutex. However, if mutex spinning
+         * isn't going to happen, there is no point in going through the
+         * lock/unlock overhead.
          */
+        if (!mutex_can_spin_on_owner(lock))
+                goto slowpath;
 
         for (;;) {
                 struct task_struct *owner;
+                struct mspin_node  node;
 
                 /*
                  * If there's an owner, wait for it to either
                  * release the lock or go to sleep.
                  */
+                mspin_lock(MLOCK(lock), &node);
                 owner = ACCESS_ONCE(lock->owner);
-                if (owner && !mutex_spin_on_owner(lock, owner))
+                if (owner && !mutex_spin_on_owner(lock, owner)) {
+                        mspin_unlock(MLOCK(lock), &node);
                         break;
+                }
 
-                if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+                if ((atomic_read(&lock->count) == 1) &&
+                    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
                         lock_acquired(&lock->dep_map, ip);
                         mutex_set_owner(lock);
+                        mspin_unlock(MLOCK(lock), &node);
                         preempt_enable();
                         return 0;
                 }
+                mspin_unlock(MLOCK(lock), &node);
 
                 /*
                  * When there's no owner, we might have preempted between the
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
          */
                 arch_mutex_cpu_relax();
         }
+slowpath:
 #endif
         spin_lock_mutex(&lock->wait_lock, flags);
 
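Pieced back together, the optimistic-spin path of __mutex_lock_common() now has the shape below (a condensed sketch of the hunks above, with lockdep and debug bookkeeping elided). Note that every exit from the loop, whether or not the lock was taken, releases the MCS node first:

    if (!mutex_can_spin_on_owner(lock))
            goto slowpath;
    
    for (;;) {
            struct mspin_node node;
    
            mspin_lock(MLOCK(lock), &node);         /* only one spinner past here */
            owner = ACCESS_ONCE(lock->owner);
            if (owner && !mutex_spin_on_owner(lock, owner)) {
                    mspin_unlock(MLOCK(lock), &node);
                    break;                          /* owner slept: take the slowpath */
            }
            if (atomic_read(&lock->count) == 1 &&
                atomic_cmpxchg(&lock->count, 1, 0) == 1) {
                    mutex_set_owner(lock);
                    mspin_unlock(MLOCK(lock), &node);
                    preempt_enable();
                    return 0;                       /* acquired while spinning */
            }
            mspin_unlock(MLOCK(lock), &node);
            /* no owner: retry, or give up if this task needs to reschedule */
    }
    slowpath:
    /* fall through: queue up as a waiter under wait_lock */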
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
         list_add_tail(&waiter.list, &lock->wait_list);
         waiter.task = task;
 
-        if (atomic_xchg(&lock->count, -1) == 1)
+        if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
                 goto done;
 
         lock_contended(&lock->dep_map, ip);
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                  * that when we release the lock, we properly wake up the
                  * other waiters:
                  */
-                if (atomic_xchg(&lock->count, -1) == 1)
+                if (MUTEX_SHOW_NO_WAITER(lock) &&
+                    (atomic_xchg(&lock->count, -1) == 1))
                         break;
 
                 /*
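The MUTEX_SHOW_NO_WAITER() guards follow the classic "test before test-and-set" pattern: a plain atomic read keeps the cacheline in shared state, and the expensive exclusive-mode xchg() is only attempted when it can plausibly change anything. A minimal C11 userspace sketch of the same idea (illustration only, not the kernel API):

    #include <stdatomic.h>
    #include <stdbool.h>
    
    /* Mutex fast path sketch: 1 means unlocked, 0 locked, as in lock->count. */
    static bool try_take_fast(atomic_int *count)
    {
            int expected = 1;
    
            /* Cheap shared read first: skip the RMW when it cannot succeed. */
            if (atomic_load_explicit(count, memory_order_relaxed) != 1)
                    return false;
    
            /* Only now pay for the cacheline-exclusive compare-and-swap. */
            return atomic_compare_exchange_strong(count, &expected, 0);
    }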
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 7890b10084a7..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -14,6 +14,7 @@
 #include <linux/spinlock.h>
 #include <linux/timer.h>
 #include <linux/freezer.h>
+#include <linux/stat.h>
 
 #include "rtmutex.h"
 
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
         return curr - buf;
 }
 
-static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
+static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
+static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
 
 static struct bus_type rttest_subsys = {
         .name = "rttest",
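Besides switching to symbolic mode macros, this tightens the permissions to match the callbacks that actually exist: status only has a show routine, so it becomes owner-read-only (S_IRUSR, 0400), while command only has a store routine and becomes owner-write-only (S_IWUSR, 0200). For reference, DEVICE_ATTR() expands to roughly the following (a sketch; exact layout varies by kernel version):

    static struct device_attribute dev_attr_status = {
            .attr  = { .name = "status", .mode = S_IRUSR }, /* 0400: read-only */
            .show  = sysfs_test_status,
            .store = NULL,
    };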
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67d04651f44b..42053547e0f5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2999,51 +2999,6 @@ void __sched schedule_preempt_disabled(void)
         preempt_disable();
 }
 
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
-        if (lock->owner != owner)
-                return false;
-
-        /*
-         * Ensure we emit the owner->on_cpu, dereference _after_ checking
-         * lock->owner still matches owner, if that fails, owner might
-         * point to free()d memory, if it still matches, the rcu_read_lock()
-         * ensures the memory stays valid.
-         */
-        barrier();
-
-        return owner->on_cpu;
-}
-
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
-{
-        if (!sched_feat(OWNER_SPIN))
-                return 0;
-
-        rcu_read_lock();
-        while (owner_running(lock, owner)) {
-                if (need_resched())
-                        break;
-
-                arch_mutex_cpu_relax();
-        }
-        rcu_read_unlock();
-
-        /*
-         * We break out the loop above on need_resched() and when the
-         * owner changed, which is a sign for heavy contention. Return
-         * success only when lock->owner is NULL.
-         */
-        return lock->owner == NULL;
-}
-#endif
-
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
 
 /*
- * Spin-wait on mutex acquisition when the mutex owner is running on
- * another cpu -- assumes that when the owner is running, it will soon
- * release the lock. Decreases scheduling overhead.
- */
-SCHED_FEAT(OWNER_SPIN, true)
-
-/*
  * Decrement CPU power based on time not spent running tasks
  */
 SCHED_FEAT(NONTASK_POWER, true)