Diffstat (limited to 'kernel/locking')
-rw-r--r--  kernel/locking/lockdep.c          81
-rw-r--r--  kernel/locking/mcs_spinlock.h      6
-rw-r--r--  kernel/locking/mutex.c            51
-rw-r--r--  kernel/locking/osq_lock.c         14
-rw-r--r--  kernel/locking/rtmutex.c           2
-rw-r--r--  kernel/locking/rwsem-spinlock.c    7
-rw-r--r--  kernel/locking/rwsem-xadd.c       98
-rw-r--r--  kernel/locking/rwsem.c            22
-rw-r--r--  kernel/locking/rwsem.h            20
9 files changed, 173 insertions(+), 128 deletions(-)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d4420ad2..ba77ab5f64dd 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class)
 	if (!new_class->name)
 		return 0;
 
-	list_for_each_entry(class, &all_lock_classes, lock_entry) {
+	list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
 		if (new_class->key - new_class->subclass == class->key)
 			return class->name_version;
 		if (class->name && !strcmp(class->name, new_class->name))
@@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	hash_head = classhashentry(key);
 
 	/*
-	 * We can walk the hash lockfree, because the hash only
-	 * grows, and we are careful when adding entries to the end:
+	 * We do an RCU walk of the hash, see lockdep_free_key_range().
 	 */
-	list_for_each_entry(class, hash_head, hash_entry) {
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return NULL;
+
+	list_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key) {
 			/*
 			 * Huh! same key, different name? Did someone trample
@@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
 	struct lock_class *class;
-	unsigned long flags;
+
+	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
 
 	class = look_up_lock_class(lock, subclass);
 	if (likely(class))
@@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	key = lock->key->subkeys + subclass;
 	hash_head = classhashentry(key);
 
-	raw_local_irq_save(flags);
 	if (!graph_lock()) {
-		raw_local_irq_restore(flags);
 		return NULL;
 	}
 	/*
 	 * We have to do the hash-walk again, to avoid races
 	 * with another CPU:
 	 */
-	list_for_each_entry(class, hash_head, hash_entry)
+	list_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key)
 			goto out_unlock_set;
+	}
+
 	/*
 	 * Allocate a new key from the static array, and add it to
 	 * the hash:
 	 */
 	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
 		if (!debug_locks_off_graph_unlock()) {
-			raw_local_irq_restore(flags);
 			return NULL;
 		}
-		raw_local_irq_restore(flags);
 
 		print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
 		dump_stack();
@@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 	if (verbose(class)) {
 		graph_unlock();
-		raw_local_irq_restore(flags);
 
 		printk("\nnew class %p: %s", class->key, class->name);
 		if (class->name_version > 1)
@@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 		printk("\n");
 		dump_stack();
 
-		raw_local_irq_save(flags);
 		if (!graph_lock()) {
-			raw_local_irq_restore(flags);
 			return NULL;
 		}
 	}
 out_unlock_set:
 	graph_unlock();
-	raw_local_irq_restore(flags);
 
 out_set_class_cache:
 	if (!subclass || force)
@@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	entry->distance = distance;
 	entry->trace = *trace;
 	/*
-	 * Since we never remove from the dependency list, the list can
-	 * be walked lockless by other CPUs, it's only allocation
-	 * that must be protected by the spinlock. But this also means
-	 * we must make new entries visible only once writes to the
-	 * entry become visible - hence the RCU op:
+	 * Both allocation and removal are done under the graph lock; but
+	 * iteration is under RCU-sched; see look_up_lock_class() and
+	 * lockdep_free_key_range().
 	 */
 	list_add_tail_rcu(&entry->entry, head);
 
@@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry,
 		else
 			head = &lock->class->locks_before;
 
-		list_for_each_entry(entry, head, entry) {
+		DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+		list_for_each_entry_rcu(entry, head, entry) {
 			if (!lock_accessed(entry)) {
 				unsigned int cq_depth;
 				mark_lock_accessed(entry, lock);
@@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	 * We can walk it lock-free, because entries only get added
 	 * to the hash:
 	 */
-	list_for_each_entry(chain, hash_head, entry) {
+	list_for_each_entry_rcu(chain, hash_head, entry) {
 		if (chain->chain_key == chain_key) {
 cache_hit:
 			debug_atomic_inc(chain_lookup_hits);
@@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	if (unlikely(!debug_locks))
 		return;
 
-	if (subclass)
+	if (subclass) {
+		unsigned long flags;
+
+		if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+			return;
+
+		raw_local_irq_save(flags);
+		current->lockdep_recursion = 1;
 		register_lock_class(lock, subclass, 1);
+		current->lockdep_recursion = 0;
+		raw_local_irq_restore(flags);
+	}
 }
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
@@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size)
 	return addr >= start && addr < start + size;
 }
 
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one sync_sched() before getting here, so we're guaranteed
+ * nobody will look up these exact classes -- they're properly dead but still
+ * allocated.
+ */
 void lockdep_free_key_range(void *start, unsigned long size)
 {
-	struct lock_class *class, *next;
+	struct lock_class *class;
 	struct list_head *head;
 	unsigned long flags;
 	int i;
@@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
-		list_for_each_entry_safe(class, next, head, hash_entry) {
+		list_for_each_entry_rcu(class, head, hash_entry) {
 			if (within(class->key, start, size))
 				zap_class(class);
 			else if (within(class->name, start, size))
@@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size)
 	if (locked)
 		graph_unlock();
 	raw_local_irq_restore(flags);
+
+	/*
+	 * Wait for any possible iterators from look_up_lock_class() to pass
+	 * before continuing to free the memory they refer to.
+	 *
+	 * sync_sched() is sufficient because the read-side is IRQ disable.
+	 */
+	synchronize_sched();
+
+	/*
+	 * XXX at this point we could return the resources to the pool;
+	 * instead we leak them. We would need to change to bitmap allocators
+	 * instead of the linear allocators we have now.
+	 */
 }
 
 void lockdep_reset_lock(struct lockdep_map *lock)
 {
-	struct lock_class *class, *next;
+	struct lock_class *class;
 	struct list_head *head;
 	unsigned long flags;
 	int i, j;
@@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
-		list_for_each_entry_safe(class, next, head, hash_entry) {
+		list_for_each_entry_rcu(class, head, hash_entry) {
 			int match = 0;
 
 			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
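
All of the lockdep hunks above follow one pattern: the class-hash, chain-hash and dependency-list walks become list_for_each_entry_rcu() with interrupts disabled acting as the RCU-sched read side, and lockdep_free_key_range() gains a synchronize_sched() so zapped classes cannot be reused while a walker may still be looking at them. A minimal sketch of that pattern, using hypothetical demo_* names rather than the real lockdep structures:

#include <linux/irqflags.h>
#include <linux/kernel.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_class {
	struct list_head	hash_entry;
	const void		*key;
};

static LIST_HEAD(demo_hash);
static DEFINE_SPINLOCK(demo_lock);

/* Reader: walks the chain with IRQs off, which is an RCU-sched read side. */
static struct demo_class *demo_lookup(const void *key)
{
	struct demo_class *class;

	WARN_ON_ONCE(!irqs_disabled());

	list_for_each_entry_rcu(class, &demo_hash, hash_entry) {
		if (class->key == key)
			return class;
	}
	return NULL;
}

/* Writers: add and remove under the lock, using the _rcu list primitives. */
static void demo_add(struct demo_class *class)
{
	spin_lock(&demo_lock);
	list_add_tail_rcu(&class->hash_entry, &demo_hash);
	spin_unlock(&demo_lock);
}

static void demo_zap(struct demo_class *class)
{
	spin_lock(&demo_lock);
	list_del_rcu(&class->hash_entry);
	spin_unlock(&demo_lock);

	/* Wait for every IRQs-disabled walker before the memory is reused. */
	synchronize_sched();
}

Relying on the IRQs-disabled section as the read side is what lets the teardown path get away with a single synchronize_sched() instead of adding rcu_read_lock()/rcu_read_unlock() to every walker.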
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index d1fe2ba5bac9..75e114bdf3f2 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 		 */
 		return;
 	}
-	ACCESS_ONCE(prev->next) = node;
+	WRITE_ONCE(prev->next, node);
 
 	/* Wait until the lock holder passes the lock down. */
 	arch_mcs_spin_lock_contended(&node->locked);
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 static inline
 void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 {
-	struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+	struct mcs_spinlock *next = READ_ONCE(node->next);
 
 	if (likely(!next)) {
 		/*
@@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 		if (likely(cmpxchg(lock, node, NULL) == node))
 			return;
 		/* Wait until the next pointer is set */
-		while (!(next = ACCESS_ONCE(node->next)))
+		while (!(next = READ_ONCE(node->next)))
 			cpu_relax_lowlatency();
 	}
 
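
The mcs_spinlock.h hunks are part of the tree-wide move from ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE(): the new helpers take the location (and, for stores, the value) as arguments instead of being assigned through a cast, and they remain correct for accesses wider than the machine word. A small before/after sketch; the demo_* wrappers are hypothetical and exist only to show the two spellings side by side:

#include <linux/compiler.h>

struct mcs_spinlock;			/* as defined in mcs_spinlock.h */

static inline void demo_publish_next(struct mcs_spinlock **nextp,
				     struct mcs_spinlock *node)
{
	/* old:  ACCESS_ONCE(*nextp) = node;  */
	WRITE_ONCE(*nextp, node);
}

static inline struct mcs_spinlock *demo_fetch_next(struct mcs_spinlock **nextp)
{
	/* old:  return ACCESS_ONCE(*nextp);  */
	return READ_ONCE(*nextp);
}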
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 94674e5919cb..4cccea6b8934 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,7 +25,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
-#include "mcs_spinlock.h"
+#include <linux/osq_lock.h>
 
 /*
  * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock,
 }
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
-	if (lock->owner != owner)
-		return false;
-
-	/*
-	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
-	 * lock->owner still matches owner, if that fails, owner might
-	 * point to free()d memory, if it still matches, the rcu_read_lock()
-	 * ensures the memory stays valid.
-	 */
-	barrier();
-
-	return owner->on_cpu;
-}
-
 /*
  * Look out! "owner" is an entirely speculative pointer
  * access and not reliable.
  */
 static noinline
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 {
+	bool ret = true;
+
 	rcu_read_lock();
-	while (owner_running(lock, owner)) {
-		if (need_resched())
+	while (lock->owner == owner) {
+		/*
+		 * Ensure we emit the owner->on_cpu, dereference _after_
+		 * checking lock->owner still matches owner. If that fails,
+		 * owner might point to freed memory. If it still matches,
+		 * the rcu_read_lock() ensures the memory stays valid.
+		 */
+		barrier();
+
+		if (!owner->on_cpu || need_resched()) {
+			ret = false;
 			break;
+		}
 
 		cpu_relax_lowlatency();
 	}
 	rcu_read_unlock();
 
-	/*
-	 * We break out the loop above on need_resched() and when the
-	 * owner changed, which is a sign for heavy contention. Return
-	 * success only when lock->owner is NULL.
-	 */
-	return lock->owner == NULL;
+	return ret;
 }
 
 /*
@@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
 		return 0;
 
 	rcu_read_lock();
-	owner = ACCESS_ONCE(lock->owner);
+	owner = READ_ONCE(lock->owner);
 	if (owner)
 		retval = owner->on_cpu;
 	rcu_read_unlock();
@@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 			 * As such, when deadlock detection needs to be
 			 * performed the optimistic spinning cannot be done.
 			 */
-			if (ACCESS_ONCE(ww->ctx))
+			if (READ_ONCE(ww->ctx))
 				break;
 		}
 
@@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 		 * If there's an owner, wait for it to either
 		 * release the lock or go to sleep.
 		 */
-		owner = ACCESS_ONCE(lock->owner);
+		owner = READ_ONCE(lock->owner);
 		if (owner && !mutex_spin_on_owner(lock, owner))
 			break;
 
@@ -490,7 +481,7 @@ static inline int __sched
 __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 {
 	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
-	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+	struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
 
 	if (!hold_ctx)
 		return 0;
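
The rewritten mutex_spin_on_owner() above folds the old owner_running() helper into the loop and uses the return value to say whether spinning is still worthwhile. The underlying idiom is unchanged: dereference the speculative owner pointer only while lock->owner still matches it, with a compiler barrier between the check and the ->on_cpu load, all under rcu_read_lock(). A stripped-down sketch of that idiom with hypothetical demo_* names (it assumes CONFIG_SMP, where task_struct has an on_cpu field):

#include <linux/compiler.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

struct demo_lock {
	struct task_struct *owner;	/* set by the lock holder, or NULL */
};

static bool demo_owner_on_cpu(struct demo_lock *lock, struct task_struct *owner)
{
	bool on_cpu = false;

	rcu_read_lock();
	if (lock->owner == owner) {
		/* Order the ->on_cpu load after the owner re-check. */
		barrier();
		on_cpu = owner->on_cpu;
	}
	rcu_read_unlock();

	return on_cpu;
}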
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index c112d00341b0..dc85ee23a26f 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 
 	prev = decode_cpu(old);
 	node->prev = prev;
-	ACCESS_ONCE(prev->next) = node;
+	WRITE_ONCE(prev->next, node);
 
 	/*
 	 * Normally @prev is untouchable after the above store; because at that
@@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * cmpxchg in an attempt to undo our queueing.
 	 */
 
-	while (!ACCESS_ONCE(node->locked)) {
+	while (!READ_ONCE(node->locked)) {
 		/*
 		 * If we need to reschedule bail... so we can block.
 		 */
@@ -148,7 +148,7 @@ unqueue:
 		 * Or we race against a concurrent unqueue()'s step-B, in which
 		 * case its step-C will write us a new @node->prev pointer.
 		 */
-		prev = ACCESS_ONCE(node->prev);
+		prev = READ_ONCE(node->prev);
 	}
 
 	/*
@@ -170,8 +170,8 @@ unqueue:
 	 * it will wait in Step-A.
 	 */
 
-	ACCESS_ONCE(next->prev) = prev;
-	ACCESS_ONCE(prev->next) = next;
+	WRITE_ONCE(next->prev, prev);
+	WRITE_ONCE(prev->next, next);
 
 	return false;
 }
@@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
 	node = this_cpu_ptr(&osq_node);
 	next = xchg(&node->next, NULL);
 	if (next) {
-		ACCESS_ONCE(next->locked) = 1;
+		WRITE_ONCE(next->locked, 1);
 		return;
 	}
 
 	next = osq_wait_next(lock, node, NULL);
 	if (next)
-		ACCESS_ONCE(next->locked) = 1;
+		WRITE_ONCE(next->locked, 1);
 }
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6357265a31ad..b73279367087 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
  *
  * @task:	the task owning the mutex (owner) for which a chain walk is
  *		probably needed
- * @deadlock_detect: do we have to carry out deadlock detection?
+ * @chwalk:	do we have to carry out deadlock detection?
  * @orig_lock:	the mutex (can be NULL if we are walking the chain to recheck
  *		things for a task that has just got its priority adjusted, and
  *		is waiting on a mutex)
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2555ae15ec14..3a5048572065 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 
 		list_del(&waiter->list);
 		tsk = waiter->task;
+		/*
+		 * Make sure we do not wakeup the next reader before
+		 * setting the nil condition to grant the next reader;
+		 * otherwise we could miss the wakeup on the other
+		 * side and end up sleeping again. See the pairing
+		 * in rwsem_down_read_failed().
+		 */
 		smp_mb();
 		waiter->task = NULL;
 		wake_up_process(tsk);
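
The comment added above explains why smp_mb() has to sit between dequeuing the waiter and clearing waiter->task: the sleeping side re-checks ->task after setting its task state, and the NULL store is what tells it the lock was granted. A rough sketch of that sleeping side, with hypothetical names modeled on the wait loop in rwsem_down_read_failed():

#include <linux/compiler.h>
#include <linux/sched.h>

struct demo_waiter {
	struct task_struct *task;	/* cleared by the waker to grant the lock */
};

/* Sleep until the waker clears ->task; pairs with the waker's smp_mb(). */
static void demo_wait_for_grant(struct demo_waiter *waiter)
{
	while (true) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(waiter->task))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}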
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2f7cc4076f50..3417d0172a5d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -14,8 +14,9 @@
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/sched/rt.h>
+#include <linux/osq_lock.h>
 
-#include "mcs_spinlock.h"
+#include "rwsem.h"
 
 /*
  * Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
 		waiter = list_entry(next, struct rwsem_waiter, list);
 		next = waiter->list.next;
 		tsk = waiter->task;
+		/*
+		 * Make sure we do not wakeup the next reader before
+		 * setting the nil condition to grant the next reader;
+		 * otherwise we could miss the wakeup on the other
+		 * side and end up sleeping again. See the pairing
+		 * in rwsem_down_read_failed().
+		 */
 		smp_mb();
 		waiter->task = NULL;
 		wake_up_process(tsk);
@@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 		    RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
 		if (!list_is_singular(&sem->wait_list))
 			rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+		rwsem_set_owner(sem);
 		return true;
 	}
 
@@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
  */
 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 {
-	long old, count = ACCESS_ONCE(sem->count);
+	long old, count = READ_ONCE(sem->count);
 
 	while (true) {
 		if (!(count == 0 || count == RWSEM_WAITING_BIAS))
 			return false;
 
 		old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
-		if (old == count)
+		if (old == count) {
+			rwsem_set_owner(sem);
 			return true;
+		}
 
 		count = old;
 	}
@@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
 	struct task_struct *owner;
-	bool on_cpu = false;
+	bool ret = true;
 
 	if (need_resched())
 		return false;
 
 	rcu_read_lock();
-	owner = ACCESS_ONCE(sem->owner);
-	if (owner)
-		on_cpu = owner->on_cpu;
-	rcu_read_unlock();
-
-	/*
-	 * If sem->owner is not set, yet we have just recently entered the
-	 * slowpath, then there is a possibility reader(s) may have the lock.
-	 * To be safe, avoid spinning in these situations.
-	 */
-	return on_cpu;
-}
-
-static inline bool owner_running(struct rw_semaphore *sem,
-				 struct task_struct *owner)
-{
-	if (sem->owner != owner)
-		return false;
-
-	/*
-	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
-	 * sem->owner still matches owner, if that fails, owner might
-	 * point to free()d memory, if it still matches, the rcu_read_lock()
-	 * ensures the memory stays valid.
-	 */
-	barrier();
+	owner = READ_ONCE(sem->owner);
+	if (!owner) {
+		long count = READ_ONCE(sem->count);
+		/*
+		 * If sem->owner is not set, yet we have just recently entered the
+		 * slowpath with the lock being active, then there is a possibility
+		 * reader(s) may have the lock. To be safe, bail spinning in these
+		 * situations.
+		 */
+		if (count & RWSEM_ACTIVE_MASK)
+			ret = false;
+		goto done;
+	}
 
-	return owner->on_cpu;
+	ret = owner->on_cpu;
+done:
+	rcu_read_unlock();
+	return ret;
 }
 
 static noinline
 bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
 {
+	long count;
+
 	rcu_read_lock();
-	while (owner_running(sem, owner)) {
-		if (need_resched())
-			break;
+	while (sem->owner == owner) {
+		/*
+		 * Ensure we emit the owner->on_cpu, dereference _after_
+		 * checking sem->owner still matches owner, if that fails,
+		 * owner might point to free()d memory, if it still matches,
+		 * the rcu_read_lock() ensures the memory stays valid.
+		 */
+		barrier();
+
+		/* abort spinning when need_resched or owner is not running */
+		if (!owner->on_cpu || need_resched()) {
+			rcu_read_unlock();
+			return false;
+		}
 
 		cpu_relax_lowlatency();
 	}
 	rcu_read_unlock();
 
+	if (READ_ONCE(sem->owner))
+		return true; /* new owner, continue spinning */
+
 	/*
-	 * We break out the loop above on need_resched() or when the
-	 * owner changed, which is a sign for heavy contention. Return
-	 * success only when sem->owner is NULL.
+	 * When the owner is not set, the lock could be free or
+	 * held by readers. Check the counter to verify the
+	 * state.
 	 */
-	return sem->owner == NULL;
+	count = READ_ONCE(sem->count);
+	return (count == 0 || count == RWSEM_WAITING_BIAS);
 }
 
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 		goto done;
 
 	while (true) {
-		owner = ACCESS_ONCE(sem->owner);
+		owner = READ_ONCE(sem->owner);
 		if (owner && !rwsem_spin_on_owner(sem, owner))
 			break;
 
@@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	if (waiting) {
-		count = ACCESS_ONCE(sem->count);
+		count = READ_ONCE(sem->count);
 
 		/*
 		 * If there were already threads queued before us and there are
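
The reworked rwsem_can_spin_on_owner()/rwsem_spin_on_owner() above have to deal with the fact that readers never set sem->owner: a NULL owner can mean either "the lock is free" or "readers hold it", and only the count can tell those apart. A small sketch of that final decision; the demo_ name is hypothetical, and RWSEM_ACTIVE_MASK/RWSEM_WAITING_BIAS are constants local to rwsem-xadd.c, so code like this would have to live in that file:

#include <linux/compiler.h>
#include <linux/rwsem.h>

/* Keep spinning only if the rwsem looks free or merely has queued waiters. */
static inline bool demo_worth_spinning(struct rw_semaphore *sem)
{
	long count = READ_ONCE(sem->count);

	/* 0: free.  RWSEM_WAITING_BIAS: free, but with sleeping waiters. */
	return count == 0 || count == RWSEM_WAITING_BIAS;
}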
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e2d3bc7f03b4..205be0ce34de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -9,29 +9,9 @@
 #include <linux/sched.h>
 #include <linux/export.h>
 #include <linux/rwsem.h>
-
 #include <linux/atomic.h>
 
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-	sem->owner = current;
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-	sem->owner = NULL;
-}
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-#endif
+#include "rwsem.h"
 
 /*
  * lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000000..870ed9a5b426
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+	sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+	sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
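
The new rwsem.h exists so that rwsem.c and rwsem-xadd.c can share the owner-tracking helpers instead of rwsem.c carrying private copies. A simplified sketch of how the write-lock path in rwsem.c uses them; the demo_ wrappers are hypothetical, and the real down_write()/up_write() additionally handle lockdep annotations and the contended cases:

#include <linux/kernel.h>
#include <linux/rwsem.h>

#include "rwsem.h"

void demo_down_write(struct rw_semaphore *sem)
{
	might_sleep();
	__down_write(sem);		/* architecture/implementation lock path */
	rwsem_set_owner(sem);		/* let optimistic spinners find the writer */
}

void demo_up_write(struct rw_semaphore *sem)
{
	rwsem_clear_owner(sem);		/* clear before the lock is actually released */
	__up_write(sem);
}

Because the owner is set only after the acquire, there is a short window where a writer holds the lock with sem->owner still NULL, which is exactly why rwsem_can_spin_on_owner() above also consults the count.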