author		Mauro Carvalho Chehab <m.chehab@samsung.com>	2014-04-14 11:00:36 -0400
committer	Mauro Carvalho Chehab <m.chehab@samsung.com>	2014-04-14 11:00:36 -0400
commit		277a163c83d7ba93fba1e8980d29a9f8bfcfba6c (patch)
tree		ccfd357d152292958957b6b8a993892e7a8cc95f /kernel/locking
parent		a83b93a7480441a47856dc9104bea970e84cda87 (diff)
parent		c9eaa447e77efe77b7fa4c953bd62de8297fd6c5 (diff)
Merge tag 'v3.15-rc1' into patchwork
Linux 3.15-rc1
* tag 'v3.15-rc1': (12180 commits)
Linux 3.15-rc1
mm: Initialize error in shmem_file_aio_read()
cifs: Use min_t() when comparing "size_t" and "unsigned long"
sym53c8xx_2: Set DID_REQUEUE return code when aborting squeue
powerpc: Don't try to set LPCR unless we're in hypervisor mode
futex: update documentation for ordering guarantees
ceph: fix pr_fmt() redefinition
vti: don't allow to add the same tunnel twice
gre: don't allow to add the same tunnel twice
drivers: net: xen-netfront: fix array initialization bug
missing bits of "splice: fix racy pipe->buffers uses"
cifs: fix the race in cifs_writev()
ceph_sync_{,direct_}write: fix an oops on ceph_osdc_new_request() failure
pktgen: be friendly to LLTX devices
r8152: check RTL8152_UNPLUG
net: sun4i-emac: add promiscuous support
net/apne: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
blackfin: cleanup board files
bf609: clock: drop unused clock bit set/clear functions
Blackfin: bf537: rename "CONFIG_ADT75"
...
Diffstat (limited to 'kernel/locking')
-rw-r--r--	kernel/locking/Makefile		4
-rw-r--r--	kernel/locking/lockdep.c	23
-rw-r--r--	kernel/locking/locktorture.c	452
-rw-r--r--	kernel/locking/mcs_spinlock.c	178
-rw-r--r--	kernel/locking/mcs_spinlock.h	129
-rw-r--r--	kernel/locking/mutex-debug.c	6
-rw-r--r--	kernel/locking/mutex.c		104
-rw-r--r--	kernel/locking/rtmutex.c	12
-rw-r--r--	kernel/locking/rwsem-xadd.c	4
9 files changed, 831 insertions, 81 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5e7f66..b8bdcd4785b7 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
| @@ -1,5 +1,5 @@ | |||
| 1 | 1 | ||
| 2 | obj-y += mutex.o semaphore.o rwsem.o lglock.o | 2 | obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o |
| 3 | 3 | ||
| 4 | ifdef CONFIG_FUNCTION_TRACER | 4 | ifdef CONFIG_FUNCTION_TRACER |
| 5 | CFLAGS_REMOVE_lockdep.o = -pg | 5 | CFLAGS_REMOVE_lockdep.o = -pg |
| @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) | |||
| 14 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | 14 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o |
| 15 | endif | 15 | endif |
| 16 | obj-$(CONFIG_SMP) += spinlock.o | 16 | obj-$(CONFIG_SMP) += spinlock.o |
| 17 | obj-$(CONFIG_SMP) += lglock.o | ||
| 17 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 18 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
| 18 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | 19 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o |
| 19 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | 20 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o |
| @@ -23,3 +24,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o | |||
| 23 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o | 24 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o |
| 24 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o | 25 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o |
| 25 | obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o | 26 | obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o |
| 27 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o | ||
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index eb8a54783fa0..b0e9467922e1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
| @@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
| 1936 | 1936 | ||
| 1937 | for (;;) { | 1937 | for (;;) { |
| 1938 | int distance = curr->lockdep_depth - depth + 1; | 1938 | int distance = curr->lockdep_depth - depth + 1; |
| 1939 | hlock = curr->held_locks + depth-1; | 1939 | hlock = curr->held_locks + depth - 1; |
| 1940 | /* | 1940 | /* |
| 1941 | * Only non-recursive-read entries get new dependencies | 1941 | * Only non-recursive-read entries get new dependencies |
| 1942 | * added: | 1942 | * added: |
| 1943 | */ | 1943 | */ |
| 1944 | if (hlock->read != 2) { | 1944 | if (hlock->read != 2 && hlock->check) { |
| 1945 | if (!check_prev_add(curr, hlock, next, | 1945 | if (!check_prev_add(curr, hlock, next, |
| 1946 | distance, trylock_loop)) | 1946 | distance, trylock_loop)) |
| 1947 | return 0; | 1947 | return 0; |
| @@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, | |||
| 2098 | * (If lookup_chain_cache() returns with 1 it acquires | 2098 | * (If lookup_chain_cache() returns with 1 it acquires |
| 2099 | * graph_lock for us) | 2099 | * graph_lock for us) |
| 2100 | */ | 2100 | */ |
| 2101 | if (!hlock->trylock && (hlock->check == 2) && | 2101 | if (!hlock->trylock && hlock->check && |
| 2102 | lookup_chain_cache(curr, hlock, chain_key)) { | 2102 | lookup_chain_cache(curr, hlock, chain_key)) { |
| 2103 | /* | 2103 | /* |
| 2104 | * Check whether last held lock: | 2104 | * Check whether last held lock: |
| @@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
| 2517 | 2517 | ||
| 2518 | BUG_ON(usage_bit >= LOCK_USAGE_STATES); | 2518 | BUG_ON(usage_bit >= LOCK_USAGE_STATES); |
| 2519 | 2519 | ||
| 2520 | if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) | 2520 | if (!hlock->check) |
| 2521 | continue; | 2521 | continue; |
| 2522 | 2522 | ||
| 2523 | if (!mark_lock(curr, hlock, usage_bit)) | 2523 | if (!mark_lock(curr, hlock, usage_bit)) |
| @@ -2557,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) | |||
| 2557 | debug_atomic_inc(hardirqs_on_events); | 2557 | debug_atomic_inc(hardirqs_on_events); |
| 2558 | } | 2558 | } |
| 2559 | 2559 | ||
| 2560 | void trace_hardirqs_on_caller(unsigned long ip) | 2560 | __visible void trace_hardirqs_on_caller(unsigned long ip) |
| 2561 | { | 2561 | { |
| 2562 | time_hardirqs_on(CALLER_ADDR0, ip); | 2562 | time_hardirqs_on(CALLER_ADDR0, ip); |
| 2563 | 2563 | ||
| @@ -2610,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); | |||
| 2610 | /* | 2610 | /* |
| 2611 | * Hardirqs were disabled: | 2611 | * Hardirqs were disabled: |
| 2612 | */ | 2612 | */ |
| 2613 | void trace_hardirqs_off_caller(unsigned long ip) | 2613 | __visible void trace_hardirqs_off_caller(unsigned long ip) |
| 2614 | { | 2614 | { |
| 2615 | struct task_struct *curr = current; | 2615 | struct task_struct *curr = current; |
| 2616 | 2616 | ||
| @@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3055 | int class_idx; | 3055 | int class_idx; |
| 3056 | u64 chain_key; | 3056 | u64 chain_key; |
| 3057 | 3057 | ||
| 3058 | if (!prove_locking) | ||
| 3059 | check = 1; | ||
| 3060 | |||
| 3061 | if (unlikely(!debug_locks)) | 3058 | if (unlikely(!debug_locks)) |
| 3062 | return 0; | 3059 | return 0; |
| 3063 | 3060 | ||
| @@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3069 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 3066 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
| 3070 | return 0; | 3067 | return 0; |
| 3071 | 3068 | ||
| 3072 | if (lock->key == &__lockdep_no_validate__) | 3069 | if (!prove_locking || lock->key == &__lockdep_no_validate__) |
| 3073 | check = 1; | 3070 | check = 0; |
| 3074 | 3071 | ||
| 3075 | if (subclass < NR_LOCKDEP_CACHING_CLASSES) | 3072 | if (subclass < NR_LOCKDEP_CACHING_CLASSES) |
| 3076 | class = lock->class_cache[subclass]; | 3073 | class = lock->class_cache[subclass]; |
| @@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3138 | hlock->holdtime_stamp = lockstat_clock(); | 3135 | hlock->holdtime_stamp = lockstat_clock(); |
| 3139 | #endif | 3136 | #endif |
| 3140 | 3137 | ||
| 3141 | if (check == 2 && !mark_irqflags(curr, hlock)) | 3138 | if (check && !mark_irqflags(curr, hlock)) |
| 3142 | return 0; | 3139 | return 0; |
| 3143 | 3140 | ||
| 3144 | /* mark it as used: */ | 3141 | /* mark it as used: */ |
| @@ -4191,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task) | |||
| 4191 | } | 4188 | } |
| 4192 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 4189 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
| 4193 | 4190 | ||
| 4194 | void lockdep_sys_exit(void) | 4191 | asmlinkage void lockdep_sys_exit(void) |
| 4195 | { | 4192 | { |
| 4196 | struct task_struct *curr = current; | 4193 | struct task_struct *curr = current; |
| 4197 | 4194 | ||
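The lockdep change above collapses the old three-valued check (0/1/2) into a boolean: an acquisition is either fully validated or not at all, and both the !prove_locking case and the __lockdep_no_validate__ class now simply force check = 0. A hedged, driver-style sketch of how a lock ends up in that class; struct my_dev and my_dev_init() are made-up names, not part of this diff (the driver core, for instance, applies the same mechanism to dev->mutex):

#include <linux/mutex.h>
#include <linux/lockdep.h>

struct my_dev {
	struct mutex lock;
};

static void my_dev_init(struct my_dev *d)
{
	mutex_init(&d->lock);
	/*
	 * Opt this lock out of lockdep validation; after the change above
	 * its acquisitions carry hlock->check == 0, skipping both chain
	 * validation and mark_irqflags().
	 */
	lockdep_set_novalidate_class(&d->lock);
}

That is also why the mark_held_locks() test against __lockdep_no_validate__.subkeys could be replaced by the simpler !hlock->check test.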
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
new file mode 100644
index 000000000000..f26b1a18e34e
--- /dev/null
+++ b/kernel/locking/locktorture.c
| @@ -0,0 +1,452 @@ | |||
| 1 | /* | ||
| 2 | * Module-based torture test facility for locking | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, you can access it online at | ||
| 16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
| 17 | * | ||
| 18 | * Copyright (C) IBM Corporation, 2014 | ||
| 19 | * | ||
| 20 | * Author: Paul E. McKenney <paulmck@us.ibm.com> | ||
| 21 | * Based on kernel/rcu/torture.c. | ||
| 22 | */ | ||
| 23 | #include <linux/types.h> | ||
| 24 | #include <linux/kernel.h> | ||
| 25 | #include <linux/init.h> | ||
| 26 | #include <linux/module.h> | ||
| 27 | #include <linux/kthread.h> | ||
| 28 | #include <linux/err.h> | ||
| 29 | #include <linux/spinlock.h> | ||
| 30 | #include <linux/smp.h> | ||
| 31 | #include <linux/interrupt.h> | ||
| 32 | #include <linux/sched.h> | ||
| 33 | #include <linux/atomic.h> | ||
| 34 | #include <linux/bitops.h> | ||
| 35 | #include <linux/completion.h> | ||
| 36 | #include <linux/moduleparam.h> | ||
| 37 | #include <linux/percpu.h> | ||
| 38 | #include <linux/notifier.h> | ||
| 39 | #include <linux/reboot.h> | ||
| 40 | #include <linux/freezer.h> | ||
| 41 | #include <linux/cpu.h> | ||
| 42 | #include <linux/delay.h> | ||
| 43 | #include <linux/stat.h> | ||
| 44 | #include <linux/slab.h> | ||
| 45 | #include <linux/trace_clock.h> | ||
| 46 | #include <asm/byteorder.h> | ||
| 47 | #include <linux/torture.h> | ||
| 48 | |||
| 49 | MODULE_LICENSE("GPL"); | ||
| 50 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); | ||
| 51 | |||
| 52 | torture_param(int, nwriters_stress, -1, | ||
| 53 | "Number of write-locking stress-test threads"); | ||
| 54 | torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); | ||
| 55 | torture_param(int, onoff_interval, 0, | ||
| 56 | "Time between CPU hotplugs (s), 0=disable"); | ||
| 57 | torture_param(int, shuffle_interval, 3, | ||
| 58 | "Number of jiffies between shuffles, 0=disable"); | ||
| 59 | torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); | ||
| 60 | torture_param(int, stat_interval, 60, | ||
| 61 | "Number of seconds between stats printk()s"); | ||
| 62 | torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); | ||
| 63 | torture_param(bool, verbose, true, | ||
| 64 | "Enable verbose debugging printk()s"); | ||
| 65 | |||
| 66 | static char *torture_type = "spin_lock"; | ||
| 67 | module_param(torture_type, charp, 0444); | ||
| 68 | MODULE_PARM_DESC(torture_type, | ||
| 69 | "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); | ||
| 70 | |||
| 71 | static atomic_t n_lock_torture_errors; | ||
| 72 | |||
| 73 | static struct task_struct *stats_task; | ||
| 74 | static struct task_struct **writer_tasks; | ||
| 75 | |||
| 76 | static int nrealwriters_stress; | ||
| 77 | static bool lock_is_write_held; | ||
| 78 | |||
| 79 | struct lock_writer_stress_stats { | ||
| 80 | long n_write_lock_fail; | ||
| 81 | long n_write_lock_acquired; | ||
| 82 | }; | ||
| 83 | static struct lock_writer_stress_stats *lwsa; | ||
| 84 | |||
| 85 | #if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) | ||
| 86 | #define LOCKTORTURE_RUNNABLE_INIT 1 | ||
| 87 | #else | ||
| 88 | #define LOCKTORTURE_RUNNABLE_INIT 0 | ||
| 89 | #endif | ||
| 90 | int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; | ||
| 91 | module_param(locktorture_runnable, int, 0444); | ||
| 92 | MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); | ||
| 93 | |||
| 94 | /* Forward reference. */ | ||
| 95 | static void lock_torture_cleanup(void); | ||
| 96 | |||
| 97 | /* | ||
| 98 | * Operations vector for selecting different types of tests. | ||
| 99 | */ | ||
| 100 | struct lock_torture_ops { | ||
| 101 | void (*init)(void); | ||
| 102 | int (*writelock)(void); | ||
| 103 | void (*write_delay)(struct torture_random_state *trsp); | ||
| 104 | void (*writeunlock)(void); | ||
| 105 | unsigned long flags; | ||
| 106 | const char *name; | ||
| 107 | }; | ||
| 108 | |||
| 109 | static struct lock_torture_ops *cur_ops; | ||
| 110 | |||
| 111 | /* | ||
| 112 | * Definitions for lock torture testing. | ||
| 113 | */ | ||
| 114 | |||
| 115 | static int torture_lock_busted_write_lock(void) | ||
| 116 | { | ||
| 117 | return 0; /* BUGGY, do not use in real life!!! */ | ||
| 118 | } | ||
| 119 | |||
| 120 | static void torture_lock_busted_write_delay(struct torture_random_state *trsp) | ||
| 121 | { | ||
| 122 | const unsigned long longdelay_us = 100; | ||
| 123 | |||
| 124 | /* We want a long delay occasionally to force massive contention. */ | ||
| 125 | if (!(torture_random(trsp) % | ||
| 126 | (nrealwriters_stress * 2000 * longdelay_us))) | ||
| 127 | mdelay(longdelay_us); | ||
| 128 | #ifdef CONFIG_PREEMPT | ||
| 129 | if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) | ||
| 130 | preempt_schedule(); /* Allow test to be preempted. */ | ||
| 131 | #endif | ||
| 132 | } | ||
| 133 | |||
| 134 | static void torture_lock_busted_write_unlock(void) | ||
| 135 | { | ||
| 136 | /* BUGGY, do not use in real life!!! */ | ||
| 137 | } | ||
| 138 | |||
| 139 | static struct lock_torture_ops lock_busted_ops = { | ||
| 140 | .writelock = torture_lock_busted_write_lock, | ||
| 141 | .write_delay = torture_lock_busted_write_delay, | ||
| 142 | .writeunlock = torture_lock_busted_write_unlock, | ||
| 143 | .name = "lock_busted" | ||
| 144 | }; | ||
| 145 | |||
| 146 | static DEFINE_SPINLOCK(torture_spinlock); | ||
| 147 | |||
| 148 | static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) | ||
| 149 | { | ||
| 150 | spin_lock(&torture_spinlock); | ||
| 151 | return 0; | ||
| 152 | } | ||
| 153 | |||
| 154 | static void torture_spin_lock_write_delay(struct torture_random_state *trsp) | ||
| 155 | { | ||
| 156 | const unsigned long shortdelay_us = 2; | ||
| 157 | const unsigned long longdelay_us = 100; | ||
| 158 | |||
| 159 | /* We want a short delay mostly to emulate likely code, and | ||
| 160 | * we want a long delay occasionally to force massive contention. | ||
| 161 | */ | ||
| 162 | if (!(torture_random(trsp) % | ||
| 163 | (nrealwriters_stress * 2000 * longdelay_us))) | ||
| 164 | mdelay(longdelay_us); | ||
| 165 | if (!(torture_random(trsp) % | ||
| 166 | (nrealwriters_stress * 2 * shortdelay_us))) | ||
| 167 | udelay(shortdelay_us); | ||
| 168 | #ifdef CONFIG_PREEMPT | ||
| 169 | if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) | ||
| 170 | preempt_schedule(); /* Allow test to be preempted. */ | ||
| 171 | #endif | ||
| 172 | } | ||
| 173 | |||
| 174 | static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) | ||
| 175 | { | ||
| 176 | spin_unlock(&torture_spinlock); | ||
| 177 | } | ||
| 178 | |||
| 179 | static struct lock_torture_ops spin_lock_ops = { | ||
| 180 | .writelock = torture_spin_lock_write_lock, | ||
| 181 | .write_delay = torture_spin_lock_write_delay, | ||
| 182 | .writeunlock = torture_spin_lock_write_unlock, | ||
| 183 | .name = "spin_lock" | ||
| 184 | }; | ||
| 185 | |||
| 186 | static int torture_spin_lock_write_lock_irq(void) | ||
| 187 | __acquires(torture_spinlock_irq) | ||
| 188 | { | ||
| 189 | unsigned long flags; | ||
| 190 | |||
| 191 | spin_lock_irqsave(&torture_spinlock, flags); | ||
| 192 | cur_ops->flags = flags; | ||
| 193 | return 0; | ||
| 194 | } | ||
| 195 | |||
| 196 | static void torture_lock_spin_write_unlock_irq(void) | ||
| 197 | __releases(torture_spinlock) | ||
| 198 | { | ||
| 199 | spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); | ||
| 200 | } | ||
| 201 | |||
| 202 | static struct lock_torture_ops spin_lock_irq_ops = { | ||
| 203 | .writelock = torture_spin_lock_write_lock_irq, | ||
| 204 | .write_delay = torture_spin_lock_write_delay, | ||
| 205 | .writeunlock = torture_lock_spin_write_unlock_irq, | ||
| 206 | .name = "spin_lock_irq" | ||
| 207 | }; | ||
| 208 | |||
| 209 | /* | ||
| 210 | * Lock torture writer kthread. Repeatedly acquires and releases | ||
| 211 | * the lock, checking for duplicate acquisitions. | ||
| 212 | */ | ||
| 213 | static int lock_torture_writer(void *arg) | ||
| 214 | { | ||
| 215 | struct lock_writer_stress_stats *lwsp = arg; | ||
| 216 | static DEFINE_TORTURE_RANDOM(rand); | ||
| 217 | |||
| 218 | VERBOSE_TOROUT_STRING("lock_torture_writer task started"); | ||
| 219 | set_user_nice(current, 19); | ||
| 220 | |||
| 221 | do { | ||
| 222 | schedule_timeout_uninterruptible(1); | ||
| 223 | cur_ops->writelock(); | ||
| 224 | if (WARN_ON_ONCE(lock_is_write_held)) | ||
| 225 | lwsp->n_write_lock_fail++; | ||
| 226 | lock_is_write_held = 1; | ||
| 227 | lwsp->n_write_lock_acquired++; | ||
| 228 | cur_ops->write_delay(&rand); | ||
| 229 | lock_is_write_held = 0; | ||
| 230 | cur_ops->writeunlock(); | ||
| 231 | stutter_wait("lock_torture_writer"); | ||
| 232 | } while (!torture_must_stop()); | ||
| 233 | torture_kthread_stopping("lock_torture_writer"); | ||
| 234 | return 0; | ||
| 235 | } | ||
| 236 | |||
| 237 | /* | ||
| 238 | * Create a lock-torture-statistics message in the specified buffer. | ||
| 239 | */ | ||
| 240 | static void lock_torture_printk(char *page) | ||
| 241 | { | ||
| 242 | bool fail = 0; | ||
| 243 | int i; | ||
| 244 | long max = 0; | ||
| 245 | long min = lwsa[0].n_write_lock_acquired; | ||
| 246 | long long sum = 0; | ||
| 247 | |||
| 248 | for (i = 0; i < nrealwriters_stress; i++) { | ||
| 249 | if (lwsa[i].n_write_lock_fail) | ||
| 250 | fail = true; | ||
| 251 | sum += lwsa[i].n_write_lock_acquired; | ||
| 252 | if (max < lwsa[i].n_write_lock_fail) | ||
| 253 | max = lwsa[i].n_write_lock_fail; | ||
| 254 | if (min > lwsa[i].n_write_lock_fail) | ||
| 255 | min = lwsa[i].n_write_lock_fail; | ||
| 256 | } | ||
| 257 | page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); | ||
| 258 | page += sprintf(page, | ||
| 259 | "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", | ||
| 260 | sum, max, min, max / 2 > min ? "???" : "", | ||
| 261 | fail, fail ? "!!!" : ""); | ||
| 262 | if (fail) | ||
| 263 | atomic_inc(&n_lock_torture_errors); | ||
| 264 | } | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Print torture statistics. Caller must ensure that there is only one | ||
| 268 | * call to this function at a given time!!! This is normally accomplished | ||
| 269 | * by relying on the module system to only have one copy of the module | ||
| 270 | * loaded, and then by giving the lock_torture_stats kthread full control | ||
| 271 | * (or the init/cleanup functions when lock_torture_stats thread is not | ||
| 272 | * running). | ||
| 273 | */ | ||
| 274 | static void lock_torture_stats_print(void) | ||
| 275 | { | ||
| 276 | int size = nrealwriters_stress * 200 + 8192; | ||
| 277 | char *buf; | ||
| 278 | |||
| 279 | buf = kmalloc(size, GFP_KERNEL); | ||
| 280 | if (!buf) { | ||
| 281 | pr_err("lock_torture_stats_print: Out of memory, need: %d", | ||
| 282 | size); | ||
| 283 | return; | ||
| 284 | } | ||
| 285 | lock_torture_printk(buf); | ||
| 286 | pr_alert("%s", buf); | ||
| 287 | kfree(buf); | ||
| 288 | } | ||
| 289 | |||
| 290 | /* | ||
| 291 | * Periodically prints torture statistics, if periodic statistics printing | ||
| 292 | * was specified via the stat_interval module parameter. | ||
| 293 | * | ||
| 294 | * No need to worry about fullstop here, since this one doesn't reference | ||
| 295 | * volatile state or register callbacks. | ||
| 296 | */ | ||
| 297 | static int lock_torture_stats(void *arg) | ||
| 298 | { | ||
| 299 | VERBOSE_TOROUT_STRING("lock_torture_stats task started"); | ||
| 300 | do { | ||
| 301 | schedule_timeout_interruptible(stat_interval * HZ); | ||
| 302 | lock_torture_stats_print(); | ||
| 303 | torture_shutdown_absorb("lock_torture_stats"); | ||
| 304 | } while (!torture_must_stop()); | ||
| 305 | torture_kthread_stopping("lock_torture_stats"); | ||
| 306 | return 0; | ||
| 307 | } | ||
| 308 | |||
| 309 | static inline void | ||
| 310 | lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, | ||
| 311 | const char *tag) | ||
| 312 | { | ||
| 313 | pr_alert("%s" TORTURE_FLAG | ||
| 314 | "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", | ||
| 315 | torture_type, tag, nrealwriters_stress, stat_interval, verbose, | ||
| 316 | shuffle_interval, stutter, shutdown_secs, | ||
| 317 | onoff_interval, onoff_holdoff); | ||
| 318 | } | ||
| 319 | |||
| 320 | static void lock_torture_cleanup(void) | ||
| 321 | { | ||
| 322 | int i; | ||
| 323 | |||
| 324 | if (torture_cleanup()) | ||
| 325 | return; | ||
| 326 | |||
| 327 | if (writer_tasks) { | ||
| 328 | for (i = 0; i < nrealwriters_stress; i++) | ||
| 329 | torture_stop_kthread(lock_torture_writer, | ||
| 330 | writer_tasks[i]); | ||
| 331 | kfree(writer_tasks); | ||
| 332 | writer_tasks = NULL; | ||
| 333 | } | ||
| 334 | |||
| 335 | torture_stop_kthread(lock_torture_stats, stats_task); | ||
| 336 | lock_torture_stats_print(); /* -After- the stats thread is stopped! */ | ||
| 337 | |||
| 338 | if (atomic_read(&n_lock_torture_errors)) | ||
| 339 | lock_torture_print_module_parms(cur_ops, | ||
| 340 | "End of test: FAILURE"); | ||
| 341 | else if (torture_onoff_failures()) | ||
| 342 | lock_torture_print_module_parms(cur_ops, | ||
| 343 | "End of test: LOCK_HOTPLUG"); | ||
| 344 | else | ||
| 345 | lock_torture_print_module_parms(cur_ops, | ||
| 346 | "End of test: SUCCESS"); | ||
| 347 | } | ||
| 348 | |||
| 349 | static int __init lock_torture_init(void) | ||
| 350 | { | ||
| 351 | int i; | ||
| 352 | int firsterr = 0; | ||
| 353 | static struct lock_torture_ops *torture_ops[] = { | ||
| 354 | &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, | ||
| 355 | }; | ||
| 356 | |||
| 357 | torture_init_begin(torture_type, verbose, &locktorture_runnable); | ||
| 358 | |||
| 359 | /* Process args and tell the world that the torturer is on the job. */ | ||
| 360 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | ||
| 361 | cur_ops = torture_ops[i]; | ||
| 362 | if (strcmp(torture_type, cur_ops->name) == 0) | ||
| 363 | break; | ||
| 364 | } | ||
| 365 | if (i == ARRAY_SIZE(torture_ops)) { | ||
| 366 | pr_alert("lock-torture: invalid torture type: \"%s\"\n", | ||
| 367 | torture_type); | ||
| 368 | pr_alert("lock-torture types:"); | ||
| 369 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) | ||
| 370 | pr_alert(" %s", torture_ops[i]->name); | ||
| 371 | pr_alert("\n"); | ||
| 372 | torture_init_end(); | ||
| 373 | return -EINVAL; | ||
| 374 | } | ||
| 375 | if (cur_ops->init) | ||
| 376 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | ||
| 377 | |||
| 378 | if (nwriters_stress >= 0) | ||
| 379 | nrealwriters_stress = nwriters_stress; | ||
| 380 | else | ||
| 381 | nrealwriters_stress = 2 * num_online_cpus(); | ||
| 382 | lock_torture_print_module_parms(cur_ops, "Start of test"); | ||
| 383 | |||
| 384 | /* Initialize the statistics so that each run gets its own numbers. */ | ||
| 385 | |||
| 386 | lock_is_write_held = 0; | ||
| 387 | lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); | ||
| 388 | if (lwsa == NULL) { | ||
| 389 | VERBOSE_TOROUT_STRING("lwsa: Out of memory"); | ||
| 390 | firsterr = -ENOMEM; | ||
| 391 | goto unwind; | ||
| 392 | } | ||
| 393 | for (i = 0; i < nrealwriters_stress; i++) { | ||
| 394 | lwsa[i].n_write_lock_fail = 0; | ||
| 395 | lwsa[i].n_write_lock_acquired = 0; | ||
| 396 | } | ||
| 397 | |||
| 398 | /* Start up the kthreads. */ | ||
| 399 | |||
| 400 | if (onoff_interval > 0) { | ||
| 401 | firsterr = torture_onoff_init(onoff_holdoff * HZ, | ||
| 402 | onoff_interval * HZ); | ||
| 403 | if (firsterr) | ||
| 404 | goto unwind; | ||
| 405 | } | ||
| 406 | if (shuffle_interval > 0) { | ||
| 407 | firsterr = torture_shuffle_init(shuffle_interval); | ||
| 408 | if (firsterr) | ||
| 409 | goto unwind; | ||
| 410 | } | ||
| 411 | if (shutdown_secs > 0) { | ||
| 412 | firsterr = torture_shutdown_init(shutdown_secs, | ||
| 413 | lock_torture_cleanup); | ||
| 414 | if (firsterr) | ||
| 415 | goto unwind; | ||
| 416 | } | ||
| 417 | if (stutter > 0) { | ||
| 418 | firsterr = torture_stutter_init(stutter); | ||
| 419 | if (firsterr) | ||
| 420 | goto unwind; | ||
| 421 | } | ||
| 422 | |||
| 423 | writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), | ||
| 424 | GFP_KERNEL); | ||
| 425 | if (writer_tasks == NULL) { | ||
| 426 | VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); | ||
| 427 | firsterr = -ENOMEM; | ||
| 428 | goto unwind; | ||
| 429 | } | ||
| 430 | for (i = 0; i < nrealwriters_stress; i++) { | ||
| 431 | firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], | ||
| 432 | writer_tasks[i]); | ||
| 433 | if (firsterr) | ||
| 434 | goto unwind; | ||
| 435 | } | ||
| 436 | if (stat_interval > 0) { | ||
| 437 | firsterr = torture_create_kthread(lock_torture_stats, NULL, | ||
| 438 | stats_task); | ||
| 439 | if (firsterr) | ||
| 440 | goto unwind; | ||
| 441 | } | ||
| 442 | torture_init_end(); | ||
| 443 | return 0; | ||
| 444 | |||
| 445 | unwind: | ||
| 446 | torture_init_end(); | ||
| 447 | lock_torture_cleanup(); | ||
| 448 | return firsterr; | ||
| 449 | } | ||
| 450 | |||
| 451 | module_init(lock_torture_init); | ||
| 452 | module_exit(lock_torture_cleanup); | ||
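locktorture's ops-vector design keeps the framework generic: adding a lock flavor means writing one more lock_torture_ops and listing it in torture_ops[] in lock_torture_init(), after which it can be selected at load time with the torture_type module parameter (for example torture_type=spin_lock_irq). A hypothetical sketch of a mutex flavor, reusing the spinlock delay callback above; torture_mutex and mutex_lock_ops are made-up names, and #include <linux/mutex.h> would also be needed:

static DEFINE_MUTEX(torture_mutex);

static int torture_mutex_write_lock(void) __acquires(torture_mutex)
{
	mutex_lock(&torture_mutex);
	return 0;
}

static void torture_mutex_write_unlock(void) __releases(torture_mutex)
{
	mutex_unlock(&torture_mutex);
}

static struct lock_torture_ops mutex_lock_ops = {
	.writelock	= torture_mutex_write_lock,
	.write_delay	= torture_spin_lock_write_delay,	/* reuse */
	.writeunlock	= torture_mutex_write_unlock,
	.name		= "mutex_lock"
};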
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
new file mode 100644
index 000000000000..838dc9e00669
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.c
| @@ -0,0 +1,178 @@ | |||
| 1 | |||
| 2 | #include <linux/percpu.h> | ||
| 3 | #include <linux/mutex.h> | ||
| 4 | #include <linux/sched.h> | ||
| 5 | #include "mcs_spinlock.h" | ||
| 6 | |||
| 7 | #ifdef CONFIG_SMP | ||
| 8 | |||
| 9 | /* | ||
| 10 | * An MCS-like lock especially tailored for optimistic spinning by sleeping | ||
| 11 | * lock implementations (mutex, rwsem, etc). | ||
| 12 | * | ||
| 13 | * Using a single mcs node per CPU is safe because sleeping locks should not be | ||
| 14 | * called from interrupt context and we have preemption disabled while | ||
| 15 | * spinning. | ||
| 16 | */ | ||
| 17 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. | ||
| 21 | * Can return NULL in case we were the last queued and we updated @lock instead. | ||
| 22 | */ | ||
| 23 | static inline struct optimistic_spin_queue * | ||
| 24 | osq_wait_next(struct optimistic_spin_queue **lock, | ||
| 25 | struct optimistic_spin_queue *node, | ||
| 26 | struct optimistic_spin_queue *prev) | ||
| 27 | { | ||
| 28 | struct optimistic_spin_queue *next = NULL; | ||
| 29 | |||
| 30 | for (;;) { | ||
| 31 | if (*lock == node && cmpxchg(lock, node, prev) == node) { | ||
| 32 | /* | ||
| 33 | * We were the last queued, we moved @lock back. @prev | ||
| 34 | * will now observe @lock and will complete its | ||
| 35 | * unlock()/unqueue(). | ||
| 36 | */ | ||
| 37 | break; | ||
| 38 | } | ||
| 39 | |||
| 40 | /* | ||
| 41 | * We must xchg() the @node->next value, because if we were to | ||
| 42 | * leave it in, a concurrent unlock()/unqueue() from | ||
| 43 | * @node->next might complete Step-A and think its @prev is | ||
| 44 | * still valid. | ||
| 45 | * | ||
| 46 | * If the concurrent unlock()/unqueue() wins the race, we'll | ||
| 47 | * wait for either @lock to point to us, through its Step-B, or | ||
| 48 | * wait for a new @node->next from its Step-C. | ||
| 49 | */ | ||
| 50 | if (node->next) { | ||
| 51 | next = xchg(&node->next, NULL); | ||
| 52 | if (next) | ||
| 53 | break; | ||
| 54 | } | ||
| 55 | |||
| 56 | arch_mutex_cpu_relax(); | ||
| 57 | } | ||
| 58 | |||
| 59 | return next; | ||
| 60 | } | ||
| 61 | |||
| 62 | bool osq_lock(struct optimistic_spin_queue **lock) | ||
| 63 | { | ||
| 64 | struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); | ||
| 65 | struct optimistic_spin_queue *prev, *next; | ||
| 66 | |||
| 67 | node->locked = 0; | ||
| 68 | node->next = NULL; | ||
| 69 | |||
| 70 | node->prev = prev = xchg(lock, node); | ||
| 71 | if (likely(prev == NULL)) | ||
| 72 | return true; | ||
| 73 | |||
| 74 | ACCESS_ONCE(prev->next) = node; | ||
| 75 | |||
| 76 | /* | ||
| 77 | * Normally @prev is untouchable after the above store; because at that | ||
| 78 | * moment unlock can proceed and wipe the node element from stack. | ||
| 79 | * | ||
| 80 | * However, since our nodes are static per-cpu storage, we're | ||
| 81 | * guaranteed their existence -- this allows us to apply | ||
| 82 | * cmpxchg in an attempt to undo our queueing. | ||
| 83 | */ | ||
| 84 | |||
| 85 | while (!smp_load_acquire(&node->locked)) { | ||
| 86 | /* | ||
| 87 | * If we need to reschedule bail... so we can block. | ||
| 88 | */ | ||
| 89 | if (need_resched()) | ||
| 90 | goto unqueue; | ||
| 91 | |||
| 92 | arch_mutex_cpu_relax(); | ||
| 93 | } | ||
| 94 | return true; | ||
| 95 | |||
| 96 | unqueue: | ||
| 97 | /* | ||
| 98 | * Step - A -- stabilize @prev | ||
| 99 | * | ||
| 100 | * Undo our @prev->next assignment; this will make @prev's | ||
| 101 | * unlock()/unqueue() wait for a next pointer since @lock points to us | ||
| 102 | * (or later). | ||
| 103 | */ | ||
| 104 | |||
| 105 | for (;;) { | ||
| 106 | if (prev->next == node && | ||
| 107 | cmpxchg(&prev->next, node, NULL) == node) | ||
| 108 | break; | ||
| 109 | |||
| 110 | /* | ||
| 111 | * We can only fail the cmpxchg() racing against an unlock(), | ||
| 112 | * in which case we should observe @node->locked becoming | ||
| 113 | * true. | ||
| 114 | */ | ||
| 115 | if (smp_load_acquire(&node->locked)) | ||
| 116 | return true; | ||
| 117 | |||
| 118 | arch_mutex_cpu_relax(); | ||
| 119 | |||
| 120 | /* | ||
| 121 | * Or we race against a concurrent unqueue()'s step-B, in which | ||
| 122 | * case its step-C will write us a new @node->prev pointer. | ||
| 123 | */ | ||
| 124 | prev = ACCESS_ONCE(node->prev); | ||
| 125 | } | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Step - B -- stabilize @next | ||
| 129 | * | ||
| 130 | * Similar to unlock(), wait for @node->next or move @lock from @node | ||
| 131 | * back to @prev. | ||
| 132 | */ | ||
| 133 | |||
| 134 | next = osq_wait_next(lock, node, prev); | ||
| 135 | if (!next) | ||
| 136 | return false; | ||
| 137 | |||
| 138 | /* | ||
| 139 | * Step - C -- unlink | ||
| 140 | * | ||
| 141 | * @prev is stable because it's still waiting for a new @prev->next | ||
| 142 | * pointer, @next is stable because our @node->next pointer is NULL and | ||
| 143 | * it will wait in Step-A. | ||
| 144 | */ | ||
| 145 | |||
| 146 | ACCESS_ONCE(next->prev) = prev; | ||
| 147 | ACCESS_ONCE(prev->next) = next; | ||
| 148 | |||
| 149 | return false; | ||
| 150 | } | ||
| 151 | |||
| 152 | void osq_unlock(struct optimistic_spin_queue **lock) | ||
| 153 | { | ||
| 154 | struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); | ||
| 155 | struct optimistic_spin_queue *next; | ||
| 156 | |||
| 157 | /* | ||
| 158 | * Fast path for the uncontended case. | ||
| 159 | */ | ||
| 160 | if (likely(cmpxchg(lock, node, NULL) == node)) | ||
| 161 | return; | ||
| 162 | |||
| 163 | /* | ||
| 164 | * Second most likely case. | ||
| 165 | */ | ||
| 166 | next = xchg(&node->next, NULL); | ||
| 167 | if (next) { | ||
| 168 | ACCESS_ONCE(next->locked) = 1; | ||
| 169 | return; | ||
| 170 | } | ||
| 171 | |||
| 172 | next = osq_wait_next(lock, node, NULL); | ||
| 173 | if (next) | ||
| 174 | ACCESS_ONCE(next->locked) = 1; | ||
| 175 | } | ||
| 176 | |||
| 177 | #endif | ||
| 178 | |||
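osq_lock()/osq_unlock() give a sleeping-lock slowpath a cancellable spinner queue built from per-CPU nodes: only the queue head spins on the lock itself, and a spinner that needs to reschedule can unqueue (Steps A-C above) instead of blocking everyone behind it. A sketch of the intended caller pattern, mirroring the mutex.c hunk later in this diff; my_sleeping_lock, my_trylock() and my_optimistic_spin() are illustrative names, the usual linux/atomic.h, linux/sched.h and linux/mutex.h includes are assumed, and the caller is assumed to run with preemption disabled (one osq node per CPU):

struct my_sleeping_lock {
	struct optimistic_spin_queue *osq;	/* tail of the spinner queue */
	atomic_t count;				/* 1 == unlocked */
};

static bool my_trylock(struct my_sleeping_lock *lock)
{
	return atomic_cmpxchg(&lock->count, 1, 0) == 1;
}

static bool my_optimistic_spin(struct my_sleeping_lock *lock)
{
	bool taken = false;

	if (!osq_lock(&lock->osq))
		return false;		/* queue contended: take the sleeping slowpath */

	while (!need_resched()) {
		if (my_trylock(lock)) {
			taken = true;
			break;
		}
		arch_mutex_cpu_relax();
	}

	osq_unlock(&lock->osq);
	return taken;
}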
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 000000000000..a2dbac4aca6b
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
| @@ -0,0 +1,129 @@ | |||
| 1 | /* | ||
| 2 | * MCS lock defines | ||
| 3 | * | ||
| 4 | * This file contains the main data structure and API definitions of MCS lock. | ||
| 5 | * | ||
| 6 | * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock | ||
| 7 | * with the desirable properties of being fair, and with each cpu trying | ||
| 8 | * to acquire the lock spinning on a local variable. | ||
| 9 | * It avoids the expensive cache-line bouncing that common test-and-set spin-lock | ||
| 10 | * implementations incur. | ||
| 11 | */ | ||
| 12 | #ifndef __LINUX_MCS_SPINLOCK_H | ||
| 13 | #define __LINUX_MCS_SPINLOCK_H | ||
| 14 | |||
| 15 | #include <asm/mcs_spinlock.h> | ||
| 16 | |||
| 17 | struct mcs_spinlock { | ||
| 18 | struct mcs_spinlock *next; | ||
| 19 | int locked; /* 1 if lock acquired */ | ||
| 20 | }; | ||
| 21 | |||
| 22 | #ifndef arch_mcs_spin_lock_contended | ||
| 23 | /* | ||
| 24 | * Using smp_load_acquire() provides a memory barrier that ensures | ||
| 25 | * subsequent operations happen after the lock is acquired. | ||
| 26 | */ | ||
| 27 | #define arch_mcs_spin_lock_contended(l) \ | ||
| 28 | do { \ | ||
| 29 | while (!(smp_load_acquire(l))) \ | ||
| 30 | arch_mutex_cpu_relax(); \ | ||
| 31 | } while (0) | ||
| 32 | #endif | ||
| 33 | |||
| 34 | #ifndef arch_mcs_spin_unlock_contended | ||
| 35 | /* | ||
| 36 | * smp_store_release() provides a memory barrier to ensure all | ||
| 37 | * operations in the critical section have been completed before | ||
| 38 | * unlocking. | ||
| 39 | */ | ||
| 40 | #define arch_mcs_spin_unlock_contended(l) \ | ||
| 41 | smp_store_release((l), 1) | ||
| 42 | #endif | ||
| 43 | |||
| 44 | /* | ||
| 45 | * Note: the smp_load_acquire/smp_store_release pair is not | ||
| 46 | * sufficient to form a full memory barrier across | ||
| 47 | * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. | ||
| 48 | * For applications that need a full barrier across multiple cpus | ||
| 49 | * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be | ||
| 50 | * used after mcs_lock. | ||
| 51 | */ | ||
| 52 | |||
| 53 | /* | ||
| 54 | * In order to acquire the lock, the caller should declare a local node and | ||
| 55 | * pass a reference of the node to this function in addition to the lock. | ||
| 56 | * If the lock has already been acquired, then this will proceed to spin | ||
| 57 | * on this node->locked until the previous lock holder sets the node->locked | ||
| 58 | * in mcs_spin_unlock(). | ||
| 59 | * | ||
| 60 | * We don't inline mcs_spin_lock() so that perf can correctly account for the | ||
| 61 | * time spent in this lock function. | ||
| 62 | */ | ||
| 63 | static inline | ||
| 64 | void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | ||
| 65 | { | ||
| 66 | struct mcs_spinlock *prev; | ||
| 67 | |||
| 68 | /* Init node */ | ||
| 69 | node->locked = 0; | ||
| 70 | node->next = NULL; | ||
| 71 | |||
| 72 | prev = xchg(lock, node); | ||
| 73 | if (likely(prev == NULL)) { | ||
| 74 | /* | ||
| 75 | * Lock acquired; no need to set node->locked to 1. A thread | ||
| 76 | * only spins on its own node->locked value for lock acquisition. | ||
| 77 | * However, since this thread can immediately acquire the lock | ||
| 78 | * and does not proceed to spin on its own node->locked, this | ||
| 79 | * value won't be used. If a debug mode is needed to | ||
| 80 | * audit lock status, then set node->locked value here. | ||
| 81 | */ | ||
| 82 | return; | ||
| 83 | } | ||
| 84 | ACCESS_ONCE(prev->next) = node; | ||
| 85 | |||
| 86 | /* Wait until the lock holder passes the lock down. */ | ||
| 87 | arch_mcs_spin_lock_contended(&node->locked); | ||
| 88 | } | ||
| 89 | |||
| 90 | /* | ||
| 91 | * Releases the lock. The caller should pass in the corresponding node that | ||
| 92 | * was used to acquire the lock. | ||
| 93 | */ | ||
| 94 | static inline | ||
| 95 | void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | ||
| 96 | { | ||
| 97 | struct mcs_spinlock *next = ACCESS_ONCE(node->next); | ||
| 98 | |||
| 99 | if (likely(!next)) { | ||
| 100 | /* | ||
| 101 | * Release the lock by setting it to NULL | ||
| 102 | */ | ||
| 103 | if (likely(cmpxchg(lock, node, NULL) == node)) | ||
| 104 | return; | ||
| 105 | /* Wait until the next pointer is set */ | ||
| 106 | while (!(next = ACCESS_ONCE(node->next))) | ||
| 107 | arch_mutex_cpu_relax(); | ||
| 108 | } | ||
| 109 | |||
| 110 | /* Pass lock to next waiter. */ | ||
| 111 | arch_mcs_spin_unlock_contended(&next->locked); | ||
| 112 | } | ||
| 113 | |||
| 114 | /* | ||
| 115 | * Cancellable version of the MCS lock above. | ||
| 116 | * | ||
| 117 | * Intended for adaptive spinning of sleeping locks: | ||
| 118 | * mutex_lock()/rwsem_down_{read,write}() etc. | ||
| 119 | */ | ||
| 120 | |||
| 121 | struct optimistic_spin_queue { | ||
| 122 | struct optimistic_spin_queue *next, *prev; | ||
| 123 | int locked; /* 1 if lock acquired */ | ||
| 124 | }; | ||
| 125 | |||
| 126 | extern bool osq_lock(struct optimistic_spin_queue **lock); | ||
| 127 | extern void osq_unlock(struct optimistic_spin_queue **lock); | ||
| 128 | |||
| 129 | #endif /* __LINUX_MCS_SPINLOCK_H */ | ||
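The non-cancellable MCS primitives above are self-contained: the lock is just a tail pointer, and each contender spins only on the locked field of its own node, so the handoff is the only cross-CPU cache traffic. A minimal usage sketch, assuming "mcs_spinlock.h" is included; my_mcs_lock and my_critical_section() are illustrative, and interrupt/preemption handling is left to the caller:

static struct mcs_spinlock *my_mcs_lock;	/* NULL == unlocked */

static void my_critical_section(void)
{
	struct mcs_spinlock node;		/* this contender's queue node */

	mcs_spin_lock(&my_mcs_lock, &node);
	/* ... only one CPU at a time executes here ... */
	mcs_spin_unlock(&my_mcs_lock, &node);
}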
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index faf6f5b53e77..e1191c996c59 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
| @@ -83,6 +83,12 @@ void debug_mutex_unlock(struct mutex *lock) | |||
| 83 | 83 | ||
| 84 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 84 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
| 85 | mutex_clear_owner(lock); | 85 | mutex_clear_owner(lock); |
| 86 | |||
| 87 | /* | ||
| 88 | * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug | ||
| 89 | * mutexes so that we can do it here after we've verified state. | ||
| 90 | */ | ||
| 91 | atomic_set(&lock->count, 1); | ||
| 86 | } | 92 | } |
| 87 | 93 | ||
| 88 | void debug_mutex_init(struct mutex *lock, const char *name, | 94 | void debug_mutex_init(struct mutex *lock, const char *name, |
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4dd6e4c219de..bc73d33c6760 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/spinlock.h> | 25 | #include <linux/spinlock.h> |
| 26 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> |
| 27 | #include <linux/debug_locks.h> | 27 | #include <linux/debug_locks.h> |
| 28 | #include "mcs_spinlock.h" | ||
| 28 | 29 | ||
| 29 | /* | 30 | /* |
| 30 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | 31 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, |
| @@ -33,6 +34,13 @@ | |||
| 33 | #ifdef CONFIG_DEBUG_MUTEXES | 34 | #ifdef CONFIG_DEBUG_MUTEXES |
| 34 | # include "mutex-debug.h" | 35 | # include "mutex-debug.h" |
| 35 | # include <asm-generic/mutex-null.h> | 36 | # include <asm-generic/mutex-null.h> |
| 37 | /* | ||
| 38 | * Must be 0 for the debug case so we do not do the unlock outside of the | ||
| 39 | * wait_lock region. debug_mutex_unlock() will do the actual unlock in this | ||
| 40 | * case. | ||
| 41 | */ | ||
| 42 | # undef __mutex_slowpath_needs_to_unlock | ||
| 43 | # define __mutex_slowpath_needs_to_unlock() 0 | ||
| 36 | #else | 44 | #else |
| 37 | # include "mutex.h" | 45 | # include "mutex.h" |
| 38 | # include <asm/mutex.h> | 46 | # include <asm/mutex.h> |
| @@ -52,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | |||
| 52 | INIT_LIST_HEAD(&lock->wait_list); | 60 | INIT_LIST_HEAD(&lock->wait_list); |
| 53 | mutex_clear_owner(lock); | 61 | mutex_clear_owner(lock); |
| 54 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 62 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 55 | lock->spin_mlock = NULL; | 63 | lock->osq = NULL; |
| 56 | #endif | 64 | #endif |
| 57 | 65 | ||
| 58 | debug_mutex_init(lock, name, key); | 66 | debug_mutex_init(lock, name, key); |
| @@ -67,8 +75,7 @@ EXPORT_SYMBOL(__mutex_init); | |||
| 67 | * We also put the fastpath first in the kernel image, to make sure the | 75 | * We also put the fastpath first in the kernel image, to make sure the |
| 68 | * branch is predicted by the CPU as default-untaken. | 76 | * branch is predicted by the CPU as default-untaken. |
| 69 | */ | 77 | */ |
| 70 | static __used noinline void __sched | 78 | __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); |
| 71 | __mutex_lock_slowpath(atomic_t *lock_count); | ||
| 72 | 79 | ||
| 73 | /** | 80 | /** |
| 74 | * mutex_lock - acquire the mutex | 81 | * mutex_lock - acquire the mutex |
| @@ -111,54 +118,7 @@ EXPORT_SYMBOL(mutex_lock); | |||
| 111 | * more or less simultaneously, the spinners need to acquire a MCS lock | 118 | * more or less simultaneously, the spinners need to acquire a MCS lock |
| 112 | * first before spinning on the owner field. | 119 | * first before spinning on the owner field. |
| 113 | * | 120 | * |
| 114 | * We don't inline mspin_lock() so that perf can correctly account for the | ||
| 115 | * time spent in this lock function. | ||
| 116 | */ | 121 | */ |
| 117 | struct mspin_node { | ||
| 118 | struct mspin_node *next ; | ||
| 119 | int locked; /* 1 if lock acquired */ | ||
| 120 | }; | ||
| 121 | #define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) | ||
| 122 | |||
| 123 | static noinline | ||
| 124 | void mspin_lock(struct mspin_node **lock, struct mspin_node *node) | ||
| 125 | { | ||
| 126 | struct mspin_node *prev; | ||
| 127 | |||
| 128 | /* Init node */ | ||
| 129 | node->locked = 0; | ||
| 130 | node->next = NULL; | ||
| 131 | |||
| 132 | prev = xchg(lock, node); | ||
| 133 | if (likely(prev == NULL)) { | ||
| 134 | /* Lock acquired */ | ||
| 135 | node->locked = 1; | ||
| 136 | return; | ||
| 137 | } | ||
| 138 | ACCESS_ONCE(prev->next) = node; | ||
| 139 | smp_wmb(); | ||
| 140 | /* Wait until the lock holder passes the lock down */ | ||
| 141 | while (!ACCESS_ONCE(node->locked)) | ||
| 142 | arch_mutex_cpu_relax(); | ||
| 143 | } | ||
| 144 | |||
| 145 | static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) | ||
| 146 | { | ||
| 147 | struct mspin_node *next = ACCESS_ONCE(node->next); | ||
| 148 | |||
| 149 | if (likely(!next)) { | ||
| 150 | /* | ||
| 151 | * Release the lock by setting it to NULL | ||
| 152 | */ | ||
| 153 | if (cmpxchg(lock, node, NULL) == node) | ||
| 154 | return; | ||
| 155 | /* Wait until the next pointer is set */ | ||
| 156 | while (!(next = ACCESS_ONCE(node->next))) | ||
| 157 | arch_mutex_cpu_relax(); | ||
| 158 | } | ||
| 159 | ACCESS_ONCE(next->locked) = 1; | ||
| 160 | smp_wmb(); | ||
| 161 | } | ||
| 162 | 122 | ||
| 163 | /* | 123 | /* |
| 164 | * Mutex spinning code migrated from kernel/sched/core.c | 124 | * Mutex spinning code migrated from kernel/sched/core.c |
| @@ -212,6 +172,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) | |||
| 212 | struct task_struct *owner; | 172 | struct task_struct *owner; |
| 213 | int retval = 1; | 173 | int retval = 1; |
| 214 | 174 | ||
| 175 | if (need_resched()) | ||
| 176 | return 0; | ||
| 177 | |||
| 215 | rcu_read_lock(); | 178 | rcu_read_lock(); |
| 216 | owner = ACCESS_ONCE(lock->owner); | 179 | owner = ACCESS_ONCE(lock->owner); |
| 217 | if (owner) | 180 | if (owner) |
| @@ -225,7 +188,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) | |||
| 225 | } | 188 | } |
| 226 | #endif | 189 | #endif |
| 227 | 190 | ||
| 228 | static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); | 191 | __visible __used noinline |
| 192 | void __sched __mutex_unlock_slowpath(atomic_t *lock_count); | ||
| 229 | 193 | ||
| 230 | /** | 194 | /** |
| 231 | * mutex_unlock - release the mutex | 195 | * mutex_unlock - release the mutex |
| @@ -446,9 +410,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 446 | if (!mutex_can_spin_on_owner(lock)) | 410 | if (!mutex_can_spin_on_owner(lock)) |
| 447 | goto slowpath; | 411 | goto slowpath; |
| 448 | 412 | ||
| 413 | if (!osq_lock(&lock->osq)) | ||
| 414 | goto slowpath; | ||
| 415 | |||
| 449 | for (;;) { | 416 | for (;;) { |
| 450 | struct task_struct *owner; | 417 | struct task_struct *owner; |
| 451 | struct mspin_node node; | ||
| 452 | 418 | ||
| 453 | if (use_ww_ctx && ww_ctx->acquired > 0) { | 419 | if (use_ww_ctx && ww_ctx->acquired > 0) { |
| 454 | struct ww_mutex *ww; | 420 | struct ww_mutex *ww; |
| @@ -463,19 +429,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 463 | * performed the optimistic spinning cannot be done. | 429 | * performed the optimistic spinning cannot be done. |
| 464 | */ | 430 | */ |
| 465 | if (ACCESS_ONCE(ww->ctx)) | 431 | if (ACCESS_ONCE(ww->ctx)) |
| 466 | goto slowpath; | 432 | break; |
| 467 | } | 433 | } |
| 468 | 434 | ||
| 469 | /* | 435 | /* |
| 470 | * If there's an owner, wait for it to either | 436 | * If there's an owner, wait for it to either |
| 471 | * release the lock or go to sleep. | 437 | * release the lock or go to sleep. |
| 472 | */ | 438 | */ |
| 473 | mspin_lock(MLOCK(lock), &node); | ||
| 474 | owner = ACCESS_ONCE(lock->owner); | 439 | owner = ACCESS_ONCE(lock->owner); |
| 475 | if (owner && !mutex_spin_on_owner(lock, owner)) { | 440 | if (owner && !mutex_spin_on_owner(lock, owner)) |
| 476 | mspin_unlock(MLOCK(lock), &node); | 441 | break; |
| 477 | goto slowpath; | ||
| 478 | } | ||
| 479 | 442 | ||
| 480 | if ((atomic_read(&lock->count) == 1) && | 443 | if ((atomic_read(&lock->count) == 1) && |
| 481 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { | 444 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { |
| @@ -488,11 +451,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 488 | } | 451 | } |
| 489 | 452 | ||
| 490 | mutex_set_owner(lock); | 453 | mutex_set_owner(lock); |
| 491 | mspin_unlock(MLOCK(lock), &node); | 454 | osq_unlock(&lock->osq); |
| 492 | preempt_enable(); | 455 | preempt_enable(); |
| 493 | return 0; | 456 | return 0; |
| 494 | } | 457 | } |
| 495 | mspin_unlock(MLOCK(lock), &node); | ||
| 496 | 458 | ||
| 497 | /* | 459 | /* |
| 498 | * When there's no owner, we might have preempted between the | 460 | * When there's no owner, we might have preempted between the |
| @@ -501,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 501 | * the owner complete. | 463 | * the owner complete. |
| 502 | */ | 464 | */ |
| 503 | if (!owner && (need_resched() || rt_task(task))) | 465 | if (!owner && (need_resched() || rt_task(task))) |
| 504 | goto slowpath; | 466 | break; |
| 505 | 467 | ||
| 506 | /* | 468 | /* |
| 507 | * The cpu_relax() call is a compiler barrier which forces | 469 | * The cpu_relax() call is a compiler barrier which forces |
| @@ -511,7 +473,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 511 | */ | 473 | */ |
| 512 | arch_mutex_cpu_relax(); | 474 | arch_mutex_cpu_relax(); |
| 513 | } | 475 | } |
| 476 | osq_unlock(&lock->osq); | ||
| 514 | slowpath: | 477 | slowpath: |
| 478 | /* | ||
| 479 | * If we fell out of the spin path because of need_resched(), | ||
| 480 | * reschedule now, before we try-lock the mutex. This avoids getting | ||
| 481 | * scheduled out right after we obtained the mutex. | ||
| 482 | */ | ||
| 483 | if (need_resched()) | ||
| 484 | schedule_preempt_disabled(); | ||
| 515 | #endif | 485 | #endif |
| 516 | spin_lock_mutex(&lock->wait_lock, flags); | 486 | spin_lock_mutex(&lock->wait_lock, flags); |
| 517 | 487 | ||
| @@ -717,10 +687,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) | |||
| 717 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 687 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 718 | unsigned long flags; | 688 | unsigned long flags; |
| 719 | 689 | ||
| 720 | spin_lock_mutex(&lock->wait_lock, flags); | ||
| 721 | mutex_release(&lock->dep_map, nested, _RET_IP_); | ||
| 722 | debug_mutex_unlock(lock); | ||
| 723 | |||
| 724 | /* | 690 | /* |
| 725 | * some architectures leave the lock unlocked in the fastpath failure | 691 | * some architectures leave the lock unlocked in the fastpath failure |
| 726 | * case, others need to leave it locked. In the later case we have to | 692 | * case, others need to leave it locked. In the later case we have to |
| @@ -729,6 +695,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) | |||
| 729 | if (__mutex_slowpath_needs_to_unlock()) | 695 | if (__mutex_slowpath_needs_to_unlock()) |
| 730 | atomic_set(&lock->count, 1); | 696 | atomic_set(&lock->count, 1); |
| 731 | 697 | ||
| 698 | spin_lock_mutex(&lock->wait_lock, flags); | ||
| 699 | mutex_release(&lock->dep_map, nested, _RET_IP_); | ||
| 700 | debug_mutex_unlock(lock); | ||
| 701 | |||
| 732 | if (!list_empty(&lock->wait_list)) { | 702 | if (!list_empty(&lock->wait_list)) { |
| 733 | /* get the first entry from the wait-list: */ | 703 | /* get the first entry from the wait-list: */ |
| 734 | struct mutex_waiter *waiter = | 704 | struct mutex_waiter *waiter = |
| @@ -746,7 +716,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) | |||
| 746 | /* | 716 | /* |
| 747 | * Release the lock, slowpath: | 717 | * Release the lock, slowpath: |
| 748 | */ | 718 | */ |
| 749 | static __used noinline void | 719 | __visible void |
| 750 | __mutex_unlock_slowpath(atomic_t *lock_count) | 720 | __mutex_unlock_slowpath(atomic_t *lock_count) |
| 751 | { | 721 | { |
| 752 | __mutex_unlock_common_slowpath(lock_count, 1); | 722 | __mutex_unlock_common_slowpath(lock_count, 1); |
| @@ -803,7 +773,7 @@ int __sched mutex_lock_killable(struct mutex *lock) | |||
| 803 | } | 773 | } |
| 804 | EXPORT_SYMBOL(mutex_lock_killable); | 774 | EXPORT_SYMBOL(mutex_lock_killable); |
| 805 | 775 | ||
| 806 | static __used noinline void __sched | 776 | __visible void __sched |
| 807 | __mutex_lock_slowpath(atomic_t *lock_count) | 777 | __mutex_lock_slowpath(atomic_t *lock_count) |
| 808 | { | 778 | { |
| 809 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 779 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2e960a2bab81..aa4dff04b594 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
| @@ -213,6 +213,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) | |||
| 213 | } | 213 | } |
| 214 | 214 | ||
| 215 | /* | 215 | /* |
| 216 | * Called by sched_setscheduler() to check whether the priority change | ||
| 217 | * is overruled by a possible priority boosting. | ||
| 218 | */ | ||
| 219 | int rt_mutex_check_prio(struct task_struct *task, int newprio) | ||
| 220 | { | ||
| 221 | if (!task_has_pi_waiters(task)) | ||
| 222 | return 0; | ||
| 223 | |||
| 224 | return task_top_pi_waiter(task)->task->prio <= newprio; | ||
| 225 | } | ||
| 226 | |||
| 227 | /* | ||
| 216 | * Adjust the priority of a task, after its pi_waiters got modified. | 228 | * Adjust the priority of a task, after its pi_waiters got modified. |
| 217 | * | 229 | * |
| 218 | * This can be both boosting and unboosting. task->pi_lock must be held. | 230 | * This can be both boosting and unboosting. task->pi_lock must be held. |
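rt_mutex_check_prio() lets the scheduler ask, before applying a requested priority, whether the task's top PI waiter would still boost it to that level or higher (lower prio numbers mean higher priority, hence the <= comparison). The real call site lives in the scheduler and is outside this kernel/locking diffstat; the sketch below is a hedged illustration with made-up names and a deliberately simplified priority assignment:

/*
 * Assumes the rt_mutex_check_prio() prototype is visible, e.g. alongside
 * the other rt_mutex priority helpers; my_apply_new_prio() is illustrative.
 */
static void my_apply_new_prio(struct task_struct *p, int newprio)
{
	if (rt_mutex_check_prio(p, newprio)) {
		/*
		 * A PI waiter boosts p at least as high as newprio; keep
		 * the boosted priority and let rt_mutex_adjust_prio()
		 * lower it once the boosting waiter goes away.
		 */
		return;
	}
	p->prio = newprio;	/* simplified stand-in for the real setprio path */
}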
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 19c5fa95e0b4..1d66e08e897d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
| @@ -143,6 +143,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
| 143 | /* | 143 | /* |
| 144 | * wait for the read lock to be granted | 144 | * wait for the read lock to be granted |
| 145 | */ | 145 | */ |
| 146 | __visible | ||
| 146 | struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | 147 | struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) |
| 147 | { | 148 | { |
| 148 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; | 149 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; |
| @@ -190,6 +191,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 190 | /* | 191 | /* |
| 191 | * wait until we successfully acquire the write lock | 192 | * wait until we successfully acquire the write lock |
| 192 | */ | 193 | */ |
| 194 | __visible | ||
| 193 | struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | 195 | struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) |
| 194 | { | 196 | { |
| 195 | long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; | 197 | long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; |
| @@ -252,6 +254,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | |||
| 252 | * handle waking up a waiter on the semaphore | 254 | * handle waking up a waiter on the semaphore |
| 253 | * - up_read/up_write has decremented the active part of count if we come here | 255 | * - up_read/up_write has decremented the active part of count if we come here |
| 254 | */ | 256 | */ |
| 257 | __visible | ||
| 255 | struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | 258 | struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) |
| 256 | { | 259 | { |
| 257 | unsigned long flags; | 260 | unsigned long flags; |
| @@ -272,6 +275,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | |||
| 272 | * - caller incremented waiting part of count and discovered it still negative | 275 | * - caller incremented waiting part of count and discovered it still negative |
| 273 | * - just wake up any readers at the front of the queue | 276 | * - just wake up any readers at the front of the queue |
| 274 | */ | 277 | */ |
| 278 | __visible | ||
| 275 | struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | 279 | struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) |
| 276 | { | 280 | { |
| 277 | unsigned long flags; | 281 | unsigned long flags; |
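A recurring change across this merge (lockdep.c, mutex.c, and the rwsem slowpath entry points above) is annotating functions that are reached only from assembly or from arch fastpath macros with __visible. A hedged sketch of the pattern; my_arch_entry_helper() is a made-up example, and the stated rationale (keeping the symbol externally visible so gcc LTO / -fwhole-program builds do not localize or drop it) is an assumption, not text from this diff:

#include <linux/compiler.h>

/* Called from architecture assembly, never from C: */
__visible void my_arch_entry_helper(unsigned long ip)
{
	/* ... */
}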
