author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>   2017-10-09 03:02:35 -0400
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>   2017-10-09 03:02:35 -0400
commit     1236d6bb6e19fc72ffc6bbcdeb1bfefe450e54ee (patch)
tree       47da3feee8e263e8c9352c85cf518e624be3c211 /kernel
parent     750b1a6894ecc9b178c6e3d0a1170122971b2036 (diff)
parent     8a5776a5f49812d29fe4b2d0a2d71675c3facf3f (diff)
Merge 4.14-rc4 into staging-next
We want the staging/iio fixes in here as well to handle merge issues.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/core.c                   |    2
-rw-r--r--  kernel/bpf/devmap.c                 |    6
-rw-r--r--  kernel/bpf/syscall.c                |    6
-rw-r--r--  kernel/bpf/verifier.c               |    7
-rw-r--r--  kernel/cgroup/cgroup.c              |    8
-rw-r--r--  kernel/cpu.c                        |  512
-rw-r--r--  kernel/events/core.c                |    3
-rw-r--r--  kernel/events/ring_buffer.c         |   20
-rw-r--r--  kernel/exit.c                       |   23
-rw-r--r--  kernel/extable.c                    |   45
-rw-r--r--  kernel/fork.c                       |   18
-rw-r--r--  kernel/futex.c                      |   33
-rw-r--r--  kernel/irq/chip.c                   |    2
-rw-r--r--  kernel/irq/generic-chip.c           |    1
-rw-r--r--  kernel/irq/irqdomain.c              |    4
-rw-r--r--  kernel/irq/manage.c                 |    4
-rw-r--r--  kernel/kcmp.c                       |    2
-rw-r--r--  kernel/locking/rwsem-xadd.c         |   27
-rw-r--r--  kernel/memremap.c                   |    4
-rw-r--r--  kernel/params.c                     |   35
-rw-r--r--  kernel/power/suspend.c              |   18
-rw-r--r--  kernel/rcu/tree.c                   |   10
-rw-r--r--  kernel/sched/core.c                 |   24
-rw-r--r--  kernel/sched/debug.c                |    2
-rw-r--r--  kernel/seccomp.c                    |  344
-rw-r--r--  kernel/smpboot.c                    |   25
-rw-r--r--  kernel/sysctl.c                     |   27
-rw-r--r--  kernel/trace/blktrace.c             |   18
-rw-r--r--  kernel/trace/ftrace.c               |   14
-rw-r--r--  kernel/trace/trace.c                |   19
-rw-r--r--  kernel/trace/trace.h                |    2
-rw-r--r--  kernel/trace/trace_mmiotrace.c      |    1
-rw-r--r--  kernel/trace/trace_output.c         |   21
-rw-r--r--  kernel/trace/trace_sched_wakeup.c   |    8
-rw-r--r--  kernel/trace/trace_stack.c          |   15
-rw-r--r--  kernel/watchdog.c                   |  643
-rw-r--r--  kernel/watchdog_hld.c               |  196
37 files changed, 1324 insertions, 825 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ select_insn:
1022 struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; 1022 struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
1023 struct bpf_array *array = container_of(map, struct bpf_array, map); 1023 struct bpf_array *array = container_of(map, struct bpf_array, map);
1024 struct bpf_prog *prog; 1024 struct bpf_prog *prog;
1025 u64 index = BPF_R3; 1025 u32 index = BPF_R3;
1026 1026
1027 if (unlikely(index >= array->map.max_entries)) 1027 if (unlikely(index >= array->map.max_entries))
1028 goto out; 1028 goto out;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 959c9a07f318..e093d9a2c4dd 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
75static struct bpf_map *dev_map_alloc(union bpf_attr *attr) 75static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
76{ 76{
77 struct bpf_dtab *dtab; 77 struct bpf_dtab *dtab;
78 int err = -EINVAL;
78 u64 cost; 79 u64 cost;
79 int err;
80 80
81 /* check sanity of attributes */ 81 /* check sanity of attributes */
82 if (attr->max_entries == 0 || attr->key_size != 4 || 82 if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
108 if (err) 108 if (err)
109 goto free_dtab; 109 goto free_dtab;
110 110
111 err = -ENOMEM;
112
111 /* A per cpu bitfield with a bit per possible net device */ 113 /* A per cpu bitfield with a bit per possible net device */
112 dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), 114 dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
113 __alignof__(unsigned long)); 115 __alignof__(unsigned long));
@@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
128free_dtab: 130free_dtab:
129 free_percpu(dtab->flush_needed); 131 free_percpu(dtab->flush_needed);
130 kfree(dtab); 132 kfree(dtab);
131 return ERR_PTR(-ENOMEM); 133 return ERR_PTR(err);
132} 134}
133 135
134static void dev_map_free(struct bpf_map *map) 136static void dev_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb17e1cd1d43..25d074920a00 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)
186 186
187static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) 187static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
188{ 188{
189 unsigned long flags;
190
189 if (do_idr_lock) 191 if (do_idr_lock)
190 spin_lock_bh(&map_idr_lock); 192 spin_lock_irqsave(&map_idr_lock, flags);
191 else 193 else
192 __acquire(&map_idr_lock); 194 __acquire(&map_idr_lock);
193 195
194 idr_remove(&map_idr, map->id); 196 idr_remove(&map_idr, map->id);
195 197
196 if (do_idr_lock) 198 if (do_idr_lock)
197 spin_unlock_bh(&map_idr_lock); 199 spin_unlock_irqrestore(&map_idr_lock, flags);
198 else 200 else
199 __release(&map_idr_lock); 201 __release(&map_idr_lock);
200} 202}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 799b2451ef2d..b914fbe1383e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4205,7 +4205,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4205 } 4205 }
4206 4206
4207 if (insn->imm == BPF_FUNC_redirect_map) { 4207 if (insn->imm == BPF_FUNC_redirect_map) {
4208 u64 addr = (unsigned long)prog; 4208 /* Note, we cannot use prog directly as imm as subsequent
4209 * rewrites would still change the prog pointer. The only
4210 * stable address we can use is aux, which also works with
4211 * prog clones during blinding.
4212 */
4213 u64 addr = (unsigned long)prog->aux;
4209 struct bpf_insn r4_ld[] = { 4214 struct bpf_insn r4_ld[] = {
4210 BPF_LD_IMM64(BPF_REG_4, addr), 4215 BPF_LD_IMM64(BPF_REG_4, addr),
4211 *insn, 4216 *insn,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..44857278eb8a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2311,6 +2311,14 @@ out_release_tset:
2311 list_del_init(&cset->mg_node); 2311 list_del_init(&cset->mg_node);
2312 } 2312 }
2313 spin_unlock_irq(&css_set_lock); 2313 spin_unlock_irq(&css_set_lock);
2314
2315 /*
2316 * Re-initialize the cgroup_taskset structure in case it is reused
2317 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
2318 * iteration.
2319 */
2320 tset->nr_tasks = 0;
2321 tset->csets = &tset->src_csets;
2314 return ret; 2322 return ret;
2315} 2323}
2316 2324
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..d851df22f5c5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,7 @@
24#include <linux/lockdep.h> 24#include <linux/lockdep.h>
25#include <linux/tick.h> 25#include <linux/tick.h>
26#include <linux/irq.h> 26#include <linux/irq.h>
27#include <linux/nmi.h>
27#include <linux/smpboot.h> 28#include <linux/smpboot.h>
28#include <linux/relay.h> 29#include <linux/relay.h>
29#include <linux/slab.h> 30#include <linux/slab.h>
@@ -46,11 +47,13 @@
46 * @bringup: Single callback bringup or teardown selector 47 * @bringup: Single callback bringup or teardown selector
47 * @cb_state: The state for a single callback (install/uninstall) 48 * @cb_state: The state for a single callback (install/uninstall)
48 * @result: Result of the operation 49 * @result: Result of the operation
49 * @done: Signal completion to the issuer of the task 50 * @done_up: Signal completion to the issuer of the task for cpu-up
51 * @done_down: Signal completion to the issuer of the task for cpu-down
50 */ 52 */
51struct cpuhp_cpu_state { 53struct cpuhp_cpu_state {
52 enum cpuhp_state state; 54 enum cpuhp_state state;
53 enum cpuhp_state target; 55 enum cpuhp_state target;
56 enum cpuhp_state fail;
54#ifdef CONFIG_SMP 57#ifdef CONFIG_SMP
55 struct task_struct *thread; 58 struct task_struct *thread;
56 bool should_run; 59 bool should_run;
@@ -58,18 +61,39 @@ struct cpuhp_cpu_state {
58 bool single; 61 bool single;
59 bool bringup; 62 bool bringup;
60 struct hlist_node *node; 63 struct hlist_node *node;
64 struct hlist_node *last;
61 enum cpuhp_state cb_state; 65 enum cpuhp_state cb_state;
62 int result; 66 int result;
63 struct completion done; 67 struct completion done_up;
68 struct completion done_down;
64#endif 69#endif
65}; 70};
66 71
67static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); 72static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
73 .fail = CPUHP_INVALID,
74};
68 75
69#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) 76#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
70static struct lock_class_key cpuhp_state_key; 77static struct lockdep_map cpuhp_state_up_map =
71static struct lockdep_map cpuhp_state_lock_map = 78 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
72 STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); 79static struct lockdep_map cpuhp_state_down_map =
80 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
81
82
83static void inline cpuhp_lock_acquire(bool bringup)
84{
85 lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
86}
87
88static void inline cpuhp_lock_release(bool bringup)
89{
90 lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
91}
92#else
93
94static void inline cpuhp_lock_acquire(bool bringup) { }
95static void inline cpuhp_lock_release(bool bringup) { }
96
73#endif 97#endif
74 98
75/** 99/**
@@ -123,13 +147,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
123/** 147/**
124 * cpuhp_invoke_callback _ Invoke the callbacks for a given state 148 * cpuhp_invoke_callback _ Invoke the callbacks for a given state
125 * @cpu: The cpu for which the callback should be invoked 149 * @cpu: The cpu for which the callback should be invoked
126 * @step: The step in the state machine 150 * @state: The state to do callbacks for
127 * @bringup: True if the bringup callback should be invoked 151 * @bringup: True if the bringup callback should be invoked
152 * @node: For multi-instance, do a single entry callback for install/remove
153 * @lastp: For multi-instance rollback, remember how far we got
128 * 154 *
129 * Called from cpu hotplug and from the state register machinery. 155 * Called from cpu hotplug and from the state register machinery.
130 */ 156 */
131static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, 157static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
132 bool bringup, struct hlist_node *node) 158 bool bringup, struct hlist_node *node,
159 struct hlist_node **lastp)
133{ 160{
134 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 161 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
135 struct cpuhp_step *step = cpuhp_get_step(state); 162 struct cpuhp_step *step = cpuhp_get_step(state);
@@ -137,7 +164,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
137 int (*cb)(unsigned int cpu); 164 int (*cb)(unsigned int cpu);
138 int ret, cnt; 165 int ret, cnt;
139 166
167 if (st->fail == state) {
168 st->fail = CPUHP_INVALID;
169
170 if (!(bringup ? step->startup.single : step->teardown.single))
171 return 0;
172
173 return -EAGAIN;
174 }
175
140 if (!step->multi_instance) { 176 if (!step->multi_instance) {
177 WARN_ON_ONCE(lastp && *lastp);
141 cb = bringup ? step->startup.single : step->teardown.single; 178 cb = bringup ? step->startup.single : step->teardown.single;
142 if (!cb) 179 if (!cb)
143 return 0; 180 return 0;
@@ -152,6 +189,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
152 189
153 /* Single invocation for instance add/remove */ 190 /* Single invocation for instance add/remove */
154 if (node) { 191 if (node) {
192 WARN_ON_ONCE(lastp && *lastp);
155 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); 193 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
156 ret = cbm(cpu, node); 194 ret = cbm(cpu, node);
157 trace_cpuhp_exit(cpu, st->state, state, ret); 195 trace_cpuhp_exit(cpu, st->state, state, ret);
@@ -161,13 +199,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
161 /* State transition. Invoke on all instances */ 199 /* State transition. Invoke on all instances */
162 cnt = 0; 200 cnt = 0;
163 hlist_for_each(node, &step->list) { 201 hlist_for_each(node, &step->list) {
202 if (lastp && node == *lastp)
203 break;
204
164 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); 205 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
165 ret = cbm(cpu, node); 206 ret = cbm(cpu, node);
166 trace_cpuhp_exit(cpu, st->state, state, ret); 207 trace_cpuhp_exit(cpu, st->state, state, ret);
167 if (ret) 208 if (ret) {
168 goto err; 209 if (!lastp)
210 goto err;
211
212 *lastp = node;
213 return ret;
214 }
169 cnt++; 215 cnt++;
170 } 216 }
217 if (lastp)
218 *lastp = NULL;
171 return 0; 219 return 0;
172err: 220err:
173 /* Rollback the instances if one failed */ 221 /* Rollback the instances if one failed */
@@ -178,12 +226,39 @@ err:
178 hlist_for_each(node, &step->list) { 226 hlist_for_each(node, &step->list) {
179 if (!cnt--) 227 if (!cnt--)
180 break; 228 break;
181 cbm(cpu, node); 229
230 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
231 ret = cbm(cpu, node);
232 trace_cpuhp_exit(cpu, st->state, state, ret);
233 /*
234 * Rollback must not fail,
235 */
236 WARN_ON_ONCE(ret);
182 } 237 }
183 return ret; 238 return ret;
184} 239}
185 240
186#ifdef CONFIG_SMP 241#ifdef CONFIG_SMP
242static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
243{
244 struct completion *done = bringup ? &st->done_up : &st->done_down;
245 wait_for_completion(done);
246}
247
248static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
249{
250 struct completion *done = bringup ? &st->done_up : &st->done_down;
251 complete(done);
252}
253
254/*
255 * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
256 */
257static bool cpuhp_is_atomic_state(enum cpuhp_state state)
258{
259 return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
260}
261
187/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 262/* Serializes the updates to cpu_online_mask, cpu_present_mask */
188static DEFINE_MUTEX(cpu_add_remove_lock); 263static DEFINE_MUTEX(cpu_add_remove_lock);
189bool cpuhp_tasks_frozen; 264bool cpuhp_tasks_frozen;
@@ -271,14 +346,79 @@ void cpu_hotplug_enable(void)
271EXPORT_SYMBOL_GPL(cpu_hotplug_enable); 346EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
272#endif /* CONFIG_HOTPLUG_CPU */ 347#endif /* CONFIG_HOTPLUG_CPU */
273 348
274static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); 349static inline enum cpuhp_state
350cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
351{
352 enum cpuhp_state prev_state = st->state;
353
354 st->rollback = false;
355 st->last = NULL;
356
357 st->target = target;
358 st->single = false;
359 st->bringup = st->state < target;
360
361 return prev_state;
362}
363
364static inline void
365cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
366{
367 st->rollback = true;
368
369 /*
370 * If we have st->last we need to undo partial multi_instance of this
371 * state first. Otherwise start undo at the previous state.
372 */
373 if (!st->last) {
374 if (st->bringup)
375 st->state--;
376 else
377 st->state++;
378 }
379
380 st->target = prev_state;
381 st->bringup = !st->bringup;
382}
383
384/* Regular hotplug invocation of the AP hotplug thread */
385static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
386{
387 if (!st->single && st->state == st->target)
388 return;
389
390 st->result = 0;
391 /*
392 * Make sure the above stores are visible before should_run becomes
393 * true. Paired with the mb() above in cpuhp_thread_fun()
394 */
395 smp_mb();
396 st->should_run = true;
397 wake_up_process(st->thread);
398 wait_for_ap_thread(st, st->bringup);
399}
400
401static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
402{
403 enum cpuhp_state prev_state;
404 int ret;
405
406 prev_state = cpuhp_set_state(st, target);
407 __cpuhp_kick_ap(st);
408 if ((ret = st->result)) {
409 cpuhp_reset_state(st, prev_state);
410 __cpuhp_kick_ap(st);
411 }
412
413 return ret;
414}
275 415
276static int bringup_wait_for_ap(unsigned int cpu) 416static int bringup_wait_for_ap(unsigned int cpu)
277{ 417{
278 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 418 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
279 419
280 /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ 420 /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
281 wait_for_completion(&st->done); 421 wait_for_ap_thread(st, true);
282 if (WARN_ON_ONCE((!cpu_online(cpu)))) 422 if (WARN_ON_ONCE((!cpu_online(cpu))))
283 return -ECANCELED; 423 return -ECANCELED;
284 424
@@ -286,12 +426,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
286 stop_machine_unpark(cpu); 426 stop_machine_unpark(cpu);
287 kthread_unpark(st->thread); 427 kthread_unpark(st->thread);
288 428
289 /* Should we go further up ? */ 429 if (st->target <= CPUHP_AP_ONLINE_IDLE)
290 if (st->target > CPUHP_AP_ONLINE_IDLE) { 430 return 0;
291 __cpuhp_kick_ap_work(st); 431
292 wait_for_completion(&st->done); 432 return cpuhp_kick_ap(st, st->target);
293 }
294 return st->result;
295} 433}
296 434
297static int bringup_cpu(unsigned int cpu) 435static int bringup_cpu(unsigned int cpu)
@@ -317,32 +455,6 @@ static int bringup_cpu(unsigned int cpu)
317/* 455/*
318 * Hotplug state machine related functions 456 * Hotplug state machine related functions
319 */ 457 */
320static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
321{
322 for (st->state++; st->state < st->target; st->state++) {
323 struct cpuhp_step *step = cpuhp_get_step(st->state);
324
325 if (!step->skip_onerr)
326 cpuhp_invoke_callback(cpu, st->state, true, NULL);
327 }
328}
329
330static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
331 enum cpuhp_state target)
332{
333 enum cpuhp_state prev_state = st->state;
334 int ret = 0;
335
336 for (; st->state > target; st->state--) {
337 ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
338 if (ret) {
339 st->target = prev_state;
340 undo_cpu_down(cpu, st);
341 break;
342 }
343 }
344 return ret;
345}
346 458
347static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) 459static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
348{ 460{
@@ -350,7 +462,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
350 struct cpuhp_step *step = cpuhp_get_step(st->state); 462 struct cpuhp_step *step = cpuhp_get_step(st->state);
351 463
352 if (!step->skip_onerr) 464 if (!step->skip_onerr)
353 cpuhp_invoke_callback(cpu, st->state, false, NULL); 465 cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
354 } 466 }
355} 467}
356 468
@@ -362,7 +474,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
362 474
363 while (st->state < target) { 475 while (st->state < target) {
364 st->state++; 476 st->state++;
365 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); 477 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
366 if (ret) { 478 if (ret) {
367 st->target = prev_state; 479 st->target = prev_state;
368 undo_cpu_up(cpu, st); 480 undo_cpu_up(cpu, st);
@@ -379,7 +491,8 @@ static void cpuhp_create(unsigned int cpu)
379{ 491{
380 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 492 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
381 493
382 init_completion(&st->done); 494 init_completion(&st->done_up);
495 init_completion(&st->done_down);
383} 496}
384 497
385static int cpuhp_should_run(unsigned int cpu) 498static int cpuhp_should_run(unsigned int cpu)
@@ -389,69 +502,90 @@ static int cpuhp_should_run(unsigned int cpu)
389 return st->should_run; 502 return st->should_run;
390} 503}
391 504
392/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
393static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
394{
395 enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
396
397 return cpuhp_down_callbacks(cpu, st, target);
398}
399
400/* Execute the online startup callbacks. Used to be CPU_ONLINE */
401static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
402{
403 return cpuhp_up_callbacks(cpu, st, st->target);
404}
405
406/* 505/*
407 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke 506 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
408 * callbacks when a state gets [un]installed at runtime. 507 * callbacks when a state gets [un]installed at runtime.
508 *
509 * Each invocation of this function by the smpboot thread does a single AP
510 * state callback.
511 *
512 * It has 3 modes of operation:
513 * - single: runs st->cb_state
514 * - up: runs ++st->state, while st->state < st->target
515 * - down: runs st->state--, while st->state > st->target
516 *
517 * When complete or on error, should_run is cleared and the completion is fired.
409 */ 518 */
410static void cpuhp_thread_fun(unsigned int cpu) 519static void cpuhp_thread_fun(unsigned int cpu)
411{ 520{
412 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); 521 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
413 int ret = 0; 522 bool bringup = st->bringup;
523 enum cpuhp_state state;
414 524
415 /* 525 /*
416 * Paired with the mb() in cpuhp_kick_ap_work and 526 * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
417 * cpuhp_invoke_ap_callback, so the work set is consistent visible. 527 * that if we see ->should_run we also see the rest of the state.
418 */ 528 */
419 smp_mb(); 529 smp_mb();
420 if (!st->should_run) 530
531 if (WARN_ON_ONCE(!st->should_run))
421 return; 532 return;
422 533
423 st->should_run = false; 534 cpuhp_lock_acquire(bringup);
424 535
425 lock_map_acquire(&cpuhp_state_lock_map);
426 /* Single callback invocation for [un]install ? */
427 if (st->single) { 536 if (st->single) {
428 if (st->cb_state < CPUHP_AP_ONLINE) { 537 state = st->cb_state;
429 local_irq_disable(); 538 st->should_run = false;
430 ret = cpuhp_invoke_callback(cpu, st->cb_state, 539 } else {
431 st->bringup, st->node); 540 if (bringup) {
432 local_irq_enable(); 541 st->state++;
542 state = st->state;
543 st->should_run = (st->state < st->target);
544 WARN_ON_ONCE(st->state > st->target);
433 } else { 545 } else {
434 ret = cpuhp_invoke_callback(cpu, st->cb_state, 546 state = st->state;
435 st->bringup, st->node); 547 st->state--;
548 st->should_run = (st->state > st->target);
549 WARN_ON_ONCE(st->state < st->target);
436 } 550 }
437 } else if (st->rollback) { 551 }
438 BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); 552
553 WARN_ON_ONCE(!cpuhp_is_ap_state(state));
554
555 if (st->rollback) {
556 struct cpuhp_step *step = cpuhp_get_step(state);
557 if (step->skip_onerr)
558 goto next;
559 }
560
561 if (cpuhp_is_atomic_state(state)) {
562 local_irq_disable();
563 st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
564 local_irq_enable();
439 565
440 undo_cpu_down(cpu, st); 566 /*
441 st->rollback = false; 567 * STARTING/DYING must not fail!
568 */
569 WARN_ON_ONCE(st->result);
442 } else { 570 } else {
443 /* Cannot happen .... */ 571 st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
444 BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); 572 }
445 573
446 /* Regular hotplug work */ 574 if (st->result) {
447 if (st->state < st->target) 575 /*
448 ret = cpuhp_ap_online(cpu, st); 576 * If we fail on a rollback, we're up a creek without no
449 else if (st->state > st->target) 577 * paddle, no way forward, no way back. We loose, thanks for
450 ret = cpuhp_ap_offline(cpu, st); 578 * playing.
579 */
580 WARN_ON_ONCE(st->rollback);
581 st->should_run = false;
451 } 582 }
452 lock_map_release(&cpuhp_state_lock_map); 583
453 st->result = ret; 584next:
454 complete(&st->done); 585 cpuhp_lock_release(bringup);
586
587 if (!st->should_run)
588 complete_ap_thread(st, bringup);
455} 589}
456 590
457/* Invoke a single callback on a remote cpu */ 591/* Invoke a single callback on a remote cpu */
@@ -460,62 +594,64 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
460 struct hlist_node *node) 594 struct hlist_node *node)
461{ 595{
462 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 596 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
597 int ret;
463 598
464 if (!cpu_online(cpu)) 599 if (!cpu_online(cpu))
465 return 0; 600 return 0;
466 601
467 lock_map_acquire(&cpuhp_state_lock_map); 602 cpuhp_lock_acquire(false);
468 lock_map_release(&cpuhp_state_lock_map); 603 cpuhp_lock_release(false);
604
605 cpuhp_lock_acquire(true);
606 cpuhp_lock_release(true);
469 607
470 /* 608 /*
471 * If we are up and running, use the hotplug thread. For early calls 609 * If we are up and running, use the hotplug thread. For early calls
472 * we invoke the thread function directly. 610 * we invoke the thread function directly.
473 */ 611 */
474 if (!st->thread) 612 if (!st->thread)
475 return cpuhp_invoke_callback(cpu, state, bringup, node); 613 return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
614
615 st->rollback = false;
616 st->last = NULL;
476 617
618 st->node = node;
619 st->bringup = bringup;
477 st->cb_state = state; 620 st->cb_state = state;
478 st->single = true; 621 st->single = true;
479 st->bringup = bringup;
480 st->node = node;
481 622
482 /* 623 __cpuhp_kick_ap(st);
483 * Make sure the above stores are visible before should_run becomes
484 * true. Paired with the mb() above in cpuhp_thread_fun()
485 */
486 smp_mb();
487 st->should_run = true;
488 wake_up_process(st->thread);
489 wait_for_completion(&st->done);
490 return st->result;
491}
492 624
493/* Regular hotplug invocation of the AP hotplug thread */
494static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
495{
496 st->result = 0;
497 st->single = false;
498 /* 625 /*
499 * Make sure the above stores are visible before should_run becomes 626 * If we failed and did a partial, do a rollback.
500 * true. Paired with the mb() above in cpuhp_thread_fun()
501 */ 627 */
502 smp_mb(); 628 if ((ret = st->result) && st->last) {
503 st->should_run = true; 629 st->rollback = true;
504 wake_up_process(st->thread); 630 st->bringup = !bringup;
631
632 __cpuhp_kick_ap(st);
633 }
634
635 return ret;
505} 636}
506 637
507static int cpuhp_kick_ap_work(unsigned int cpu) 638static int cpuhp_kick_ap_work(unsigned int cpu)
508{ 639{
509 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 640 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
510 enum cpuhp_state state = st->state; 641 enum cpuhp_state prev_state = st->state;
642 int ret;
643
644 cpuhp_lock_acquire(false);
645 cpuhp_lock_release(false);
511 646
512 trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); 647 cpuhp_lock_acquire(true);
513 lock_map_acquire(&cpuhp_state_lock_map); 648 cpuhp_lock_release(true);
514 lock_map_release(&cpuhp_state_lock_map); 649
515 __cpuhp_kick_ap_work(st); 650 trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
516 wait_for_completion(&st->done); 651 ret = cpuhp_kick_ap(st, st->target);
517 trace_cpuhp_exit(cpu, st->state, state, st->result); 652 trace_cpuhp_exit(cpu, st->state, prev_state, ret);
518 return st->result; 653
654 return ret;
519} 655}
520 656
521static struct smp_hotplug_thread cpuhp_threads = { 657static struct smp_hotplug_thread cpuhp_threads = {
@@ -581,6 +717,7 @@ static int take_cpu_down(void *_param)
581 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); 717 struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
582 enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); 718 enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
583 int err, cpu = smp_processor_id(); 719 int err, cpu = smp_processor_id();
720 int ret;
584 721
585 /* Ensure this CPU doesn't handle any more interrupts. */ 722 /* Ensure this CPU doesn't handle any more interrupts. */
586 err = __cpu_disable(); 723 err = __cpu_disable();
@@ -594,8 +731,13 @@ static int take_cpu_down(void *_param)
594 WARN_ON(st->state != CPUHP_TEARDOWN_CPU); 731 WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
595 st->state--; 732 st->state--;
596 /* Invoke the former CPU_DYING callbacks */ 733 /* Invoke the former CPU_DYING callbacks */
597 for (; st->state > target; st->state--) 734 for (; st->state > target; st->state--) {
598 cpuhp_invoke_callback(cpu, st->state, false, NULL); 735 ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
736 /*
737 * DYING must not fail!
738 */
739 WARN_ON_ONCE(ret);
740 }
599 741
600 /* Give up timekeeping duties */ 742 /* Give up timekeeping duties */
601 tick_handover_do_timer(); 743 tick_handover_do_timer();
@@ -639,7 +781,7 @@ static int takedown_cpu(unsigned int cpu)
639 * 781 *
640 * Wait for the stop thread to go away. 782 * Wait for the stop thread to go away.
641 */ 783 */
642 wait_for_completion(&st->done); 784 wait_for_ap_thread(st, false);
643 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); 785 BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
644 786
645 /* Interrupts are moved away from the dying cpu, reenable alloc/free */ 787 /* Interrupts are moved away from the dying cpu, reenable alloc/free */
@@ -658,7 +800,7 @@ static void cpuhp_complete_idle_dead(void *arg)
658{ 800{
659 struct cpuhp_cpu_state *st = arg; 801 struct cpuhp_cpu_state *st = arg;
660 802
661 complete(&st->done); 803 complete_ap_thread(st, false);
662} 804}
663 805
664void cpuhp_report_idle_dead(void) 806void cpuhp_report_idle_dead(void)
@@ -676,11 +818,32 @@ void cpuhp_report_idle_dead(void)
676 cpuhp_complete_idle_dead, st, 0); 818 cpuhp_complete_idle_dead, st, 0);
677} 819}
678 820
679#else 821static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
680#define takedown_cpu NULL 822{
681#endif 823 for (st->state++; st->state < st->target; st->state++) {
824 struct cpuhp_step *step = cpuhp_get_step(st->state);
682 825
683#ifdef CONFIG_HOTPLUG_CPU 826 if (!step->skip_onerr)
827 cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
828 }
829}
830
831static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
832 enum cpuhp_state target)
833{
834 enum cpuhp_state prev_state = st->state;
835 int ret = 0;
836
837 for (; st->state > target; st->state--) {
838 ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
839 if (ret) {
840 st->target = prev_state;
841 undo_cpu_down(cpu, st);
842 break;
843 }
844 }
845 return ret;
846}
684 847
685/* Requires cpu_add_remove_lock to be held */ 848/* Requires cpu_add_remove_lock to be held */
686static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, 849static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
@@ -699,13 +862,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
699 862
700 cpuhp_tasks_frozen = tasks_frozen; 863 cpuhp_tasks_frozen = tasks_frozen;
701 864
702 prev_state = st->state; 865 prev_state = cpuhp_set_state(st, target);
703 st->target = target;
704 /* 866 /*
705 * If the current CPU state is in the range of the AP hotplug thread, 867 * If the current CPU state is in the range of the AP hotplug thread,
706 * then we need to kick the thread. 868 * then we need to kick the thread.
707 */ 869 */
708 if (st->state > CPUHP_TEARDOWN_CPU) { 870 if (st->state > CPUHP_TEARDOWN_CPU) {
871 st->target = max((int)target, CPUHP_TEARDOWN_CPU);
709 ret = cpuhp_kick_ap_work(cpu); 872 ret = cpuhp_kick_ap_work(cpu);
710 /* 873 /*
711 * The AP side has done the error rollback already. Just 874 * The AP side has done the error rollback already. Just
@@ -720,6 +883,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
720 */ 883 */
721 if (st->state > CPUHP_TEARDOWN_CPU) 884 if (st->state > CPUHP_TEARDOWN_CPU)
722 goto out; 885 goto out;
886
887 st->target = target;
723 } 888 }
724 /* 889 /*
725 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need 890 * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
@@ -727,13 +892,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
727 */ 892 */
728 ret = cpuhp_down_callbacks(cpu, st, target); 893 ret = cpuhp_down_callbacks(cpu, st, target);
729 if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { 894 if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
730 st->target = prev_state; 895 cpuhp_reset_state(st, prev_state);
731 st->rollback = true; 896 __cpuhp_kick_ap(st);
732 cpuhp_kick_ap_work(cpu);
733 } 897 }
734 898
735out: 899out:
736 cpus_write_unlock(); 900 cpus_write_unlock();
901 /*
902 * Do post unplug cleanup. This is still protected against
903 * concurrent CPU hotplug via cpu_add_remove_lock.
904 */
905 lockup_detector_cleanup();
737 return ret; 906 return ret;
738} 907}
739 908
@@ -754,11 +923,15 @@ out:
754 cpu_maps_update_done(); 923 cpu_maps_update_done();
755 return err; 924 return err;
756} 925}
926
757int cpu_down(unsigned int cpu) 927int cpu_down(unsigned int cpu)
758{ 928{
759 return do_cpu_down(cpu, CPUHP_OFFLINE); 929 return do_cpu_down(cpu, CPUHP_OFFLINE);
760} 930}
761EXPORT_SYMBOL(cpu_down); 931EXPORT_SYMBOL(cpu_down);
932
933#else
934#define takedown_cpu NULL
762#endif /*CONFIG_HOTPLUG_CPU*/ 935#endif /*CONFIG_HOTPLUG_CPU*/
763 936
764/** 937/**
@@ -772,11 +945,16 @@ void notify_cpu_starting(unsigned int cpu)
772{ 945{
773 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); 946 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
774 enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); 947 enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
948 int ret;
775 949
776 rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ 950 rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
777 while (st->state < target) { 951 while (st->state < target) {
778 st->state++; 952 st->state++;
779 cpuhp_invoke_callback(cpu, st->state, true, NULL); 953 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
954 /*
955 * STARTING must not fail!
956 */
957 WARN_ON_ONCE(ret);
780 } 958 }
781} 959}
782 960
@@ -794,7 +972,7 @@ void cpuhp_online_idle(enum cpuhp_state state)
794 return; 972 return;
795 973
796 st->state = CPUHP_AP_ONLINE_IDLE; 974 st->state = CPUHP_AP_ONLINE_IDLE;
797 complete(&st->done); 975 complete_ap_thread(st, true);
798} 976}
799 977
800/* Requires cpu_add_remove_lock to be held */ 978/* Requires cpu_add_remove_lock to be held */
@@ -829,7 +1007,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
829 1007
830 cpuhp_tasks_frozen = tasks_frozen; 1008 cpuhp_tasks_frozen = tasks_frozen;
831 1009
832 st->target = target; 1010 cpuhp_set_state(st, target);
833 /* 1011 /*
834 * If the current CPU state is in the range of the AP hotplug thread, 1012 * If the current CPU state is in the range of the AP hotplug thread,
835 * then we need to kick the thread once more. 1013 * then we need to kick the thread once more.
@@ -1296,6 +1474,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
1296 struct cpuhp_step *sp = cpuhp_get_step(state); 1474 struct cpuhp_step *sp = cpuhp_get_step(state);
1297 int ret; 1475 int ret;
1298 1476
1477 /*
1478 * If there's nothing to do, we done.
1479 * Relies on the union for multi_instance.
1480 */
1299 if ((bringup && !sp->startup.single) || 1481 if ((bringup && !sp->startup.single) ||
1300 (!bringup && !sp->teardown.single)) 1482 (!bringup && !sp->teardown.single))
1301 return 0; 1483 return 0;
@@ -1307,9 +1489,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
1307 if (cpuhp_is_ap_state(state)) 1489 if (cpuhp_is_ap_state(state))
1308 ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); 1490 ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
1309 else 1491 else
1310 ret = cpuhp_invoke_callback(cpu, state, bringup, node); 1492 ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1311#else 1493#else
1312 ret = cpuhp_invoke_callback(cpu, state, bringup, node); 1494 ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1313#endif 1495#endif
1314 BUG_ON(ret && !bringup); 1496 BUG_ON(ret && !bringup);
1315 return ret; 1497 return ret;
@@ -1641,9 +1823,55 @@ static ssize_t show_cpuhp_target(struct device *dev,
1641} 1823}
1642static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); 1824static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
1643 1825
1826
1827static ssize_t write_cpuhp_fail(struct device *dev,
1828 struct device_attribute *attr,
1829 const char *buf, size_t count)
1830{
1831 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
1832 struct cpuhp_step *sp;
1833 int fail, ret;
1834
1835 ret = kstrtoint(buf, 10, &fail);
1836 if (ret)
1837 return ret;
1838
1839 /*
1840 * Cannot fail STARTING/DYING callbacks.
1841 */
1842 if (cpuhp_is_atomic_state(fail))
1843 return -EINVAL;
1844
1845 /*
1846 * Cannot fail anything that doesn't have callbacks.
1847 */
1848 mutex_lock(&cpuhp_state_mutex);
1849 sp = cpuhp_get_step(fail);
1850 if (!sp->startup.single && !sp->teardown.single)
1851 ret = -EINVAL;
1852 mutex_unlock(&cpuhp_state_mutex);
1853 if (ret)
1854 return ret;
1855
1856 st->fail = fail;
1857
1858 return count;
1859}
1860
1861static ssize_t show_cpuhp_fail(struct device *dev,
1862 struct device_attribute *attr, char *buf)
1863{
1864 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
1865
1866 return sprintf(buf, "%d\n", st->fail);
1867}
1868
1869static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
1870
1644static struct attribute *cpuhp_cpu_attrs[] = { 1871static struct attribute *cpuhp_cpu_attrs[] = {
1645 &dev_attr_state.attr, 1872 &dev_attr_state.attr,
1646 &dev_attr_target.attr, 1873 &dev_attr_target.attr,
1874 &dev_attr_fail.attr,
1647 NULL 1875 NULL
1648}; 1876};
1649 1877
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e691b75b2db..6bc21e202ae4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8171 } 8171 }
8172 } 8172 }
8173 event->tp_event->prog = prog; 8173 event->tp_event->prog = prog;
8174 event->tp_event->bpf_prog_owner = event;
8174 8175
8175 return 0; 8176 return 0;
8176} 8177}
@@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
8185 return; 8186 return;
8186 8187
8187 prog = event->tp_event->prog; 8188 prog = event->tp_event->prog;
8188 if (prog) { 8189 if (prog && event->tp_event->bpf_prog_owner == event) {
8189 event->tp_event->prog = NULL; 8190 event->tp_event->prog = NULL;
8190 bpf_prog_put(prog); 8191 bpf_prog_put(prog);
8191 } 8192 }
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12ee..f684d8e5fa2b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -412,6 +412,19 @@ err:
412 return NULL; 412 return NULL;
413} 413}
414 414
415static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
416{
417 if (rb->aux_overwrite)
418 return false;
419
420 if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
421 rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
422 return true;
423 }
424
425 return false;
426}
427
415/* 428/*
416 * Commit the data written by hardware into the ring buffer by adjusting 429 * Commit the data written by hardware into the ring buffer by adjusting
417 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the 430 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
@@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
451 } 464 }
452 465
453 rb->user_page->aux_head = rb->aux_head; 466 rb->user_page->aux_head = rb->aux_head;
454 if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { 467 if (rb_need_aux_wakeup(rb))
455 wakeup = true; 468 wakeup = true;
456 rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
457 }
458 469
459 if (wakeup) { 470 if (wakeup) {
460 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) 471 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
@@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
484 rb->aux_head += size; 495 rb->aux_head += size;
485 496
486 rb->user_page->aux_head = rb->aux_head; 497 rb->user_page->aux_head = rb->aux_head;
487 if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { 498 if (rb_need_aux_wakeup(rb)) {
488 perf_output_wakeup(handle); 499 perf_output_wakeup(handle);
489 rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
490 handle->wakeup = rb->aux_wakeup + rb->aux_watermark; 500 handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
491 } 501 }
492 502
diff --git a/kernel/exit.c b/kernel/exit.c
index 3481ababd06a..f2cd53e92147 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1600 struct waitid_info info = {.status = 0}; 1600 struct waitid_info info = {.status = 0};
1601 long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); 1601 long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1602 int signo = 0; 1602 int signo = 0;
1603
1603 if (err > 0) { 1604 if (err > 0) {
1604 signo = SIGCHLD; 1605 signo = SIGCHLD;
1605 err = 0; 1606 err = 0;
1606 }
1607
1608 if (!err) {
1609 if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) 1607 if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1610 return -EFAULT; 1608 return -EFAULT;
1611 } 1609 }
@@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid,
1723 if (err > 0) { 1721 if (err > 0) {
1724 signo = SIGCHLD; 1722 signo = SIGCHLD;
1725 err = 0; 1723 err = 0;
1726 } 1724 if (uru) {
1727 1725 /* kernel_waitid() overwrites everything in ru */
1728 if (!err && uru) { 1726 if (COMPAT_USE_64BIT_TIME)
1729 /* kernel_waitid() overwrites everything in ru */ 1727 err = copy_to_user(uru, &ru, sizeof(ru));
1730 if (COMPAT_USE_64BIT_TIME) 1728 else
1731 err = copy_to_user(uru, &ru, sizeof(ru)); 1729 err = put_compat_rusage(&ru, uru);
1732 else 1730 if (err)
1733 err = put_compat_rusage(&ru, uru); 1731 return -EFAULT;
1734 if (err) 1732 }
1735 return -EFAULT;
1736 } 1733 }
1737 1734
1738 if (!infop) 1735 if (!infop)
diff --git a/kernel/extable.c b/kernel/extable.c
index 38c2412401a1..9aa1cc41ecf7 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr)
102 102
103int __kernel_text_address(unsigned long addr) 103int __kernel_text_address(unsigned long addr)
104{ 104{
105 if (core_kernel_text(addr)) 105 if (kernel_text_address(addr))
106 return 1;
107 if (is_module_text_address(addr))
108 return 1;
109 if (is_ftrace_trampoline(addr))
110 return 1;
111 if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
112 return 1;
113 if (is_bpf_text_address(addr))
114 return 1; 106 return 1;
115 /* 107 /*
116 * There might be init symbols in saved stacktraces. 108 * There might be init symbols in saved stacktraces.
@@ -127,17 +119,42 @@ int __kernel_text_address(unsigned long addr)
127 119
128int kernel_text_address(unsigned long addr) 120int kernel_text_address(unsigned long addr)
129{ 121{
122 bool no_rcu;
123 int ret = 1;
124
130 if (core_kernel_text(addr)) 125 if (core_kernel_text(addr))
131 return 1; 126 return 1;
127
128 /*
129 * If a stack dump happens while RCU is not watching, then
130 * RCU needs to be notified that it requires to start
131 * watching again. This can happen either by tracing that
132 * triggers a stack trace, or a WARN() that happens during
133 * coming back from idle, or cpu on or offlining.
134 *
135 * is_module_text_address() as well as the kprobe slots
136 * and is_bpf_text_address() require RCU to be watching.
137 */
138 no_rcu = !rcu_is_watching();
139
140 /* Treat this like an NMI as it can happen anywhere */
141 if (no_rcu)
142 rcu_nmi_enter();
143
132 if (is_module_text_address(addr)) 144 if (is_module_text_address(addr))
133 return 1; 145 goto out;
134 if (is_ftrace_trampoline(addr)) 146 if (is_ftrace_trampoline(addr))
135 return 1; 147 goto out;
136 if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) 148 if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
137 return 1; 149 goto out;
138 if (is_bpf_text_address(addr)) 150 if (is_bpf_text_address(addr))
139 return 1; 151 goto out;
140 return 0; 152 ret = 0;
153out:
154 if (no_rcu)
155 rcu_nmi_exit();
156
157 return ret;
141} 158}
142 159
143/* 160/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..e702cb9ffbd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm)
946} 946}
947EXPORT_SYMBOL_GPL(mmput); 947EXPORT_SYMBOL_GPL(mmput);
948 948
949#ifdef CONFIG_MMU
950static void mmput_async_fn(struct work_struct *work)
951{
952 struct mm_struct *mm = container_of(work, struct mm_struct,
953 async_put_work);
954
955 __mmput(mm);
956}
957
958void mmput_async(struct mm_struct *mm)
959{
960 if (atomic_dec_and_test(&mm->mm_users)) {
961 INIT_WORK(&mm->async_put_work, mmput_async_fn);
962 schedule_work(&mm->async_put_work);
963 }
964}
965#endif
966
949/** 967/**
950 * set_mm_exe_file - change a reference to the mm's executable file 968 * set_mm_exe_file - change a reference to the mm's executable file
951 * 969 *
diff --git a/kernel/futex.c b/kernel/futex.c
index 3d38eaf05492..0518a0bfc746 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)
821/* 821/*
822 * Drops a reference to the pi_state object and frees or caches it 822 * Drops a reference to the pi_state object and frees or caches it
823 * when the last reference is gone. 823 * when the last reference is gone.
824 *
825 * Must be called with the hb lock held.
826 */ 824 */
827static void put_pi_state(struct futex_pi_state *pi_state) 825static void put_pi_state(struct futex_pi_state *pi_state)
828{ 826{
@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)
837 * and has cleaned up the pi_state already 835 * and has cleaned up the pi_state already
838 */ 836 */
839 if (pi_state->owner) { 837 if (pi_state->owner) {
840 raw_spin_lock_irq(&pi_state->owner->pi_lock); 838 struct task_struct *owner;
841 list_del_init(&pi_state->list);
842 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
843 839
844 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 840 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
841 owner = pi_state->owner;
842 if (owner) {
843 raw_spin_lock(&owner->pi_lock);
844 list_del_init(&pi_state->list);
845 raw_spin_unlock(&owner->pi_lock);
846 }
847 rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
848 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
845 } 849 }
846 850
847 if (current->pi_state_cache) 851 if (current->pi_state_cache) {
848 kfree(pi_state); 852 kfree(pi_state);
849 else { 853 } else {
850 /* 854 /*
851 * pi_state->list is already empty. 855 * pi_state->list is already empty.
852 * clear pi_state->owner. 856 * clear pi_state->owner.
@@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr)
907 raw_spin_unlock_irq(&curr->pi_lock); 911 raw_spin_unlock_irq(&curr->pi_lock);
908 912
909 spin_lock(&hb->lock); 913 spin_lock(&hb->lock);
910 914 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
911 raw_spin_lock_irq(&curr->pi_lock); 915 raw_spin_lock(&curr->pi_lock);
912 /* 916 /*
913 * We dropped the pi-lock, so re-check whether this 917 * We dropped the pi-lock, so re-check whether this
914 * task still owns the PI-state: 918 * task still owns the PI-state:
915 */ 919 */
916 if (head->next != next) { 920 if (head->next != next) {
921 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
917 spin_unlock(&hb->lock); 922 spin_unlock(&hb->lock);
918 continue; 923 continue;
919 } 924 }
@@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr)
922 WARN_ON(list_empty(&pi_state->list)); 927 WARN_ON(list_empty(&pi_state->list));
923 list_del_init(&pi_state->list); 928 list_del_init(&pi_state->list);
924 pi_state->owner = NULL; 929 pi_state->owner = NULL;
925 raw_spin_unlock_irq(&curr->pi_lock); 930 raw_spin_unlock(&curr->pi_lock);
926 931
927 get_pi_state(pi_state); 932 get_pi_state(pi_state);
933 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
928 spin_unlock(&hb->lock); 934 spin_unlock(&hb->lock);
929 935
930 rt_mutex_futex_unlock(&pi_state->pi_mutex); 936 rt_mutex_futex_unlock(&pi_state->pi_mutex);
@@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
1208 1214
1209 WARN_ON(!list_empty(&pi_state->list)); 1215 WARN_ON(!list_empty(&pi_state->list));
1210 list_add(&pi_state->list, &p->pi_state_list); 1216 list_add(&pi_state->list, &p->pi_state_list);
1217 /*
1218 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
1219 * because there is no concurrency as the object is not published yet.
1220 */
1211 pi_state->owner = p; 1221 pi_state->owner = p;
1212 raw_spin_unlock_irq(&p->pi_lock); 1222 raw_spin_unlock_irq(&p->pi_lock);
1213 1223
@@ -2878,6 +2888,7 @@ retry:
2878 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2888 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2879 spin_unlock(&hb->lock); 2889 spin_unlock(&hb->lock);
2880 2890
2891 /* drops pi_state->pi_mutex.wait_lock */
2881 ret = wake_futex_pi(uaddr, uval, pi_state); 2892 ret = wake_futex_pi(uaddr, uval, pi_state);
2882 2893
2883 put_pi_state(pi_state); 2894 put_pi_state(pi_state);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f51b7b6d2451..6fc89fd93824 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
202 202
203 irqd_clr_managed_shutdown(d); 203 irqd_clr_managed_shutdown(d);
204 204
205 if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { 205 if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
206 /* 206 /*
207 * Catch code which fiddles with enable_irq() on a managed 207 * Catch code which fiddles with enable_irq() on a managed
208 * and potentially shutdown IRQ. Chained interrupt 208 * and potentially shutdown IRQ. Chained interrupt
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f7086b78ad6e..5270a54b9fa4 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
322 /* Calc pointer to the next generic chip */ 322 /* Calc pointer to the next generic chip */
323 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); 323 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
324 } 324 }
325 d->name = name;
326 return 0; 325 return 0;
327} 326}
328EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); 327EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e84b7056bb08..ac4644e92b49 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
945 struct irq_desc *desc; 945 struct irq_desc *desc;
946 struct irq_domain *domain; 946 struct irq_domain *domain;
947 struct radix_tree_iter iter; 947 struct radix_tree_iter iter;
948 void **slot; 948 void __rcu **slot;
949 int i; 949 int i;
950 950
951 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", 951 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -1453,7 +1453,7 @@ out_free_desc:
1453/* The irq_data was moved, fix the revmap to refer to the new location */ 1453/* The irq_data was moved, fix the revmap to refer to the new location */
1454static void irq_domain_fix_revmap(struct irq_data *d) 1454static void irq_domain_fix_revmap(struct irq_data *d)
1455{ 1455{
1456 void **slot; 1456 void __rcu **slot;
1457 1457
1458 if (d->hwirq < d->domain->revmap_size) 1458 if (d->hwirq < d->domain->revmap_size)
1459 return; /* Not using radix tree. */ 1459 return; /* Not using radix tree. */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 573dc52b0806..d00132b5c325 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id)
1643#endif 1643#endif
1644 1644
1645 action = __free_irq(irq, dev_id); 1645 action = __free_irq(irq, dev_id);
1646
1647 if (!action)
1648 return NULL;
1649
1646 devname = action->name; 1650 devname = action->name;
1647 kfree(action); 1651 kfree(action);
1648 return devname; 1652 return devname;
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index ea34ed8bb952..055bb2962a0b 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1,
131 if (filp_epoll) { 131 if (filp_epoll) {
132 filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); 132 filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
133 fput(filp_epoll); 133 fput(filp_epoll);
134 } else 134 }
135 135
136 if (IS_ERR(filp_tgt)) 136 if (IS_ERR(filp_tgt))
137 return PTR_ERR(filp_tgt); 137 return PTR_ERR(filp_tgt);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 02f660666ab8..1fefe6dcafd7 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
613 DEFINE_WAKE_Q(wake_q); 613 DEFINE_WAKE_Q(wake_q);
614 614
615 /* 615 /*
616 * __rwsem_down_write_failed_common(sem)
617 * rwsem_optimistic_spin(sem)
618 * osq_unlock(sem->osq)
619 * ...
620 * atomic_long_add_return(&sem->count)
621 *
622 * - VS -
623 *
624 * __up_write()
625 * if (atomic_long_sub_return_release(&sem->count) < 0)
626 * rwsem_wake(sem)
627 * osq_is_locked(&sem->osq)
628 *
629 * And __up_write() must observe !osq_is_locked() when it observes the
630 * atomic_long_add_return() in order to not miss a wakeup.
631 *
632 * This boils down to:
633 *
634 * [S.rel] X = 1 [RmW] r0 = (Y += 0)
635 * MB RMB
636 * [RmW] Y += 1 [L] r1 = X
637 *
638 * exists (r0=1 /\ r1=0)
639 */
640 smp_rmb();
641
642 /*
616 * If a spinner is present, it is not necessary to do the wakeup. 643 * If a spinner is present, it is not necessary to do the wakeup.
617 * Try to do wakeup only if the trylock succeeds to minimize 644 * Try to do wakeup only if the trylock succeeds to minimize
618 * spinlock contention which may introduce too much delay in the 645 * spinlock contention which may introduce too much delay in the
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6bcbfbf1a8fd..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
350 pgprot_t pgprot = PAGE_KERNEL; 350 pgprot_t pgprot = PAGE_KERNEL;
351 struct dev_pagemap *pgmap; 351 struct dev_pagemap *pgmap;
352 struct page_map *page_map; 352 struct page_map *page_map;
353 int error, nid, is_ram; 353 int error, nid, is_ram, i = 0;
354 354
355 align_start = res->start & ~(SECTION_SIZE - 1); 355 align_start = res->start & ~(SECTION_SIZE - 1);
356 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) 356 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
448 list_del(&page->lru); 448 list_del(&page->lru);
449 page->pgmap = pgmap; 449 page->pgmap = pgmap;
450 percpu_ref_get(ref); 450 percpu_ref_get(ref);
451 if (!(++i % 1024))
452 cond_resched();
451 } 453 }
452 devres_add(dev, page_map); 454 devres_add(dev, page_map);
453 return __va(res->start); 455 return __va(res->start);
diff --git a/kernel/params.c b/kernel/params.c
index 60b2d8101355..cc9108c2a1fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -224,7 +224,7 @@ char *parse_args(const char *doing,
224 } \ 224 } \
225 int param_get_##name(char *buffer, const struct kernel_param *kp) \ 225 int param_get_##name(char *buffer, const struct kernel_param *kp) \
226 { \ 226 { \
227 return scnprintf(buffer, PAGE_SIZE, format, \ 227 return scnprintf(buffer, PAGE_SIZE, format "\n", \
228 *((type *)kp->arg)); \ 228 *((type *)kp->arg)); \
229 } \ 229 } \
230 const struct kernel_param_ops param_ops_##name = { \ 230 const struct kernel_param_ops param_ops_##name = { \
@@ -236,14 +236,14 @@ char *parse_args(const char *doing,
236 EXPORT_SYMBOL(param_ops_##name) 236 EXPORT_SYMBOL(param_ops_##name)
237 237
238 238
239STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); 239STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
240STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); 240STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
241STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); 241STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
242STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); 242STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
243STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); 243STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
244STANDARD_PARAM_DEF(long, long, "%li", kstrtol); 244STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
245STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); 245STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
246STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); 246STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
247 247
248int param_set_charp(const char *val, const struct kernel_param *kp) 248int param_set_charp(const char *val, const struct kernel_param *kp)
249{ 249{
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);
270 270
271int param_get_charp(char *buffer, const struct kernel_param *kp) 271int param_get_charp(char *buffer, const struct kernel_param *kp)
272{ 272{
273 return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); 273 return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
274} 274}
275EXPORT_SYMBOL(param_get_charp); 275EXPORT_SYMBOL(param_get_charp);
276 276
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);
301int param_get_bool(char *buffer, const struct kernel_param *kp) 301int param_get_bool(char *buffer, const struct kernel_param *kp)
302{ 302{
303 /* Y and N chosen as being relatively non-coder friendly */ 303 /* Y and N chosen as being relatively non-coder friendly */
304 return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); 304 return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
305} 305}
306EXPORT_SYMBOL(param_get_bool); 306EXPORT_SYMBOL(param_get_bool);
307 307
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);
360 360
361int param_get_invbool(char *buffer, const struct kernel_param *kp) 361int param_get_invbool(char *buffer, const struct kernel_param *kp)
362{ 362{
363 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); 363 return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
364} 364}
365EXPORT_SYMBOL(param_get_invbool); 365EXPORT_SYMBOL(param_get_invbool);
366 366
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
460 struct kernel_param p = *kp; 460 struct kernel_param p = *kp;
461 461
462 for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { 462 for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
463 /* Replace \n with comma */
463 if (i) 464 if (i)
464 buffer[off++] = ','; 465 buffer[off - 1] = ',';
465 p.arg = arr->elem + arr->elemsize * i; 466 p.arg = arr->elem + arr->elemsize * i;
466 check_kparam_locked(p.mod); 467 check_kparam_locked(p.mod);
467 ret = arr->ops->get(buffer + off, &p); 468 ret = arr->ops->get(buffer + off, &p);
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);
507int param_get_string(char *buffer, const struct kernel_param *kp) 508int param_get_string(char *buffer, const struct kernel_param *kp)
508{ 509{
509 const struct kparam_string *kps = kp->str; 510 const struct kparam_string *kps = kp->str;
510 return strlcpy(buffer, kps->string, kps->maxlen); 511 return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
511} 512}
512EXPORT_SYMBOL(param_get_string); 513EXPORT_SYMBOL(param_get_string);
513 514
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
549 kernel_param_lock(mk->mod); 550 kernel_param_lock(mk->mod);
550 count = attribute->param->ops->get(buf, attribute->param); 551 count = attribute->param->ops->get(buf, attribute->param);
551 kernel_param_unlock(mk->mod); 552 kernel_param_unlock(mk->mod);
552 if (count > 0) {
553 strcat(buf, "\n");
554 ++count;
555 }
556 return count; 553 return count;
557} 554}
558 555
@@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock);
600/* 597/*
601 * add_sysfs_param - add a parameter to sysfs 598 * add_sysfs_param - add a parameter to sysfs
602 * @mk: struct module_kobject 599 * @mk: struct module_kobject
603 * @kparam: the actual parameter definition to add to sysfs 600 * @kp: the actual parameter definition to add to sysfs
604 * @name: name of parameter 601 * @name: name of parameter
605 * 602 *
 606 * Create a kobject for a (per-module) parameter if mp is NULL, and 603 * Create a kobject for a (per-module) parameter if mp is NULL, and
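The net effect of the scnprintf()/sprintf() changes in this file is that the per-type getters now emit the trailing newline themselves and the sysfs wrapper no longer appends one. A hypothetical out-of-tree module sketch (the param_demo name and both parameters are made up) showing what a read of the corresponding sysfs files looks like after the change:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical module, only meant to illustrate the getter behaviour above. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static int demo_value = 42;
module_param(demo_value, int, 0644);
MODULE_PARM_DESC(demo_value, "demo integer parameter");

static char *demo_name = "staging";
module_param(demo_name, charp, 0444);
MODULE_PARM_DESC(demo_name, "demo string parameter");

static int __init param_demo_init(void)
{
        /*
         * With this patch applied:
         *   cat /sys/module/param_demo/parameters/demo_value  ->  "42\n"
         *   cat /sys/module/param_demo/parameters/demo_name   ->  "staging\n"
         * The '\n' now comes from param_get_int()/param_get_charp() rather
         * than from a strcat() in param_attr_show().
         */
        pr_info("param_demo loaded\n");
        return 0;
}

static void __exit param_demo_exit(void)
{
        pr_info("param_demo unloaded\n");
}

module_init(param_demo_init);
module_exit(param_demo_exit);
MODULE_LICENSE("GPL");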
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3e2b4f519009..ccd2d20e6b06 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -120,22 +120,26 @@ static void s2idle_loop(void)
120 * frozen processes + suspended devices + idle processors. 120 * frozen processes + suspended devices + idle processors.
121 * Thus s2idle_enter() should be called right after 121 * Thus s2idle_enter() should be called right after
122 * all devices have been suspended. 122 * all devices have been suspended.
123 *
124 * Wakeups during the noirq suspend of devices may be spurious,
125 * so prevent them from terminating the loop right away.
123 */ 126 */
124 error = dpm_noirq_suspend_devices(PMSG_SUSPEND); 127 error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
125 if (!error) 128 if (!error)
126 s2idle_enter(); 129 s2idle_enter();
130 else if (error == -EBUSY && pm_wakeup_pending())
131 error = 0;
127 132
128 dpm_noirq_resume_devices(PMSG_RESUME); 133 if (!error && s2idle_ops && s2idle_ops->wake)
129 if (error && (error != -EBUSY || !pm_wakeup_pending())) {
130 dpm_noirq_end();
131 break;
132 }
133
134 if (s2idle_ops && s2idle_ops->wake)
135 s2idle_ops->wake(); 134 s2idle_ops->wake();
136 135
136 dpm_noirq_resume_devices(PMSG_RESUME);
137
137 dpm_noirq_end(); 138 dpm_noirq_end();
138 139
140 if (error)
141 break;
142
139 if (s2idle_ops && s2idle_ops->sync) 143 if (s2idle_ops && s2idle_ops->sync)
140 s2idle_ops->sync(); 144 s2idle_ops->sync();
141 145
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1250e4bd4b85..b0ad62b0e7b8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -882,6 +882,11 @@ void rcu_irq_exit(void)
882 882
883 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); 883 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
884 rdtp = this_cpu_ptr(&rcu_dynticks); 884 rdtp = this_cpu_ptr(&rcu_dynticks);
885
886 /* Page faults can happen in NMI handlers, so check... */
887 if (rdtp->dynticks_nmi_nesting)
888 return;
889
885 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 890 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
886 rdtp->dynticks_nesting < 1); 891 rdtp->dynticks_nesting < 1);
887 if (rdtp->dynticks_nesting <= 1) { 892 if (rdtp->dynticks_nesting <= 1) {
@@ -1015,6 +1020,11 @@ void rcu_irq_enter(void)
1015 1020
1016 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); 1021 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
1017 rdtp = this_cpu_ptr(&rcu_dynticks); 1022 rdtp = this_cpu_ptr(&rcu_dynticks);
1023
1024 /* Page faults can happen in NMI handlers, so check... */
1025 if (rdtp->dynticks_nmi_nesting)
1026 return;
1027
1018 oldval = rdtp->dynticks_nesting; 1028 oldval = rdtp->dynticks_nesting;
1019 rdtp->dynticks_nesting++; 1029 rdtp->dynticks_nesting++;
1020 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 1030 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a6966567da..d17c5da523a0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p)
5166 put_task_stack(p); 5166 put_task_stack(p);
5167} 5167}
5168 5168
5169static inline bool
5170state_filter_match(unsigned long state_filter, struct task_struct *p)
5171{
5172 /* no filter, everything matches */
5173 if (!state_filter)
5174 return true;
5175
5176 /* filter, but doesn't match */
5177 if (!(p->state & state_filter))
5178 return false;
5179
5180 /*
5181 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
5182 * TASK_KILLABLE).
5183 */
5184 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
5185 return false;
5186
5187 return true;
5188}
5189
5190
5169void show_state_filter(unsigned long state_filter) 5191void show_state_filter(unsigned long state_filter)
5170{ 5192{
5171 struct task_struct *g, *p; 5193 struct task_struct *g, *p;
@@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter)
5188 */ 5210 */
5189 touch_nmi_watchdog(); 5211 touch_nmi_watchdog();
5190 touch_all_softlockup_watchdogs(); 5212 touch_all_softlockup_watchdogs();
5191 if (!state_filter || (p->state & state_filter)) 5213 if (state_filter_match(state_filter, p))
5192 sched_show_task(p); 5214 sched_show_task(p);
5193 } 5215 }
5194 5216
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 01217fb5a5de..2f93e4a2d9f6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg)
466} 466}
467#endif 467#endif
468 468
469static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
470
471static void 469static void
472print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 470print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
473{ 471{
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 98b59b5db90b..bb3a38005b9c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -17,11 +17,13 @@
17#include <linux/audit.h> 17#include <linux/audit.h>
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/coredump.h> 19#include <linux/coredump.h>
20#include <linux/kmemleak.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/sched/task_stack.h> 22#include <linux/sched/task_stack.h>
22#include <linux/seccomp.h> 23#include <linux/seccomp.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
24#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/sysctl.h>
25 27
26#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER 28#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
27#include <asm/syscall.h> 29#include <asm/syscall.h>
@@ -42,6 +44,7 @@
42 * get/put helpers should be used when accessing an instance 44 * get/put helpers should be used when accessing an instance
43 * outside of a lifetime-guarded section. In general, this 45 * outside of a lifetime-guarded section. In general, this
44 * is only needed for handling filters shared across tasks. 46 * is only needed for handling filters shared across tasks.
47 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
45 * @prev: points to a previously installed, or inherited, filter 48 * @prev: points to a previously installed, or inherited, filter
46 * @prog: the BPF program to evaluate 49 * @prog: the BPF program to evaluate
47 * 50 *
@@ -57,6 +60,7 @@
57 */ 60 */
58struct seccomp_filter { 61struct seccomp_filter {
59 refcount_t usage; 62 refcount_t usage;
63 bool log;
60 struct seccomp_filter *prev; 64 struct seccomp_filter *prev;
61 struct bpf_prog *prog; 65 struct bpf_prog *prog;
62}; 66};
@@ -171,10 +175,15 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
171/** 175/**
172 * seccomp_run_filters - evaluates all seccomp filters against @sd 176 * seccomp_run_filters - evaluates all seccomp filters against @sd
173 * @sd: optional seccomp data to be passed to filters 177 * @sd: optional seccomp data to be passed to filters
178 * @match: stores struct seccomp_filter that resulted in the return value,
179 * unless filter returned SECCOMP_RET_ALLOW, in which case it will
180 * be unchanged.
174 * 181 *
175 * Returns valid seccomp BPF response codes. 182 * Returns valid seccomp BPF response codes.
176 */ 183 */
177static u32 seccomp_run_filters(const struct seccomp_data *sd) 184#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
185static u32 seccomp_run_filters(const struct seccomp_data *sd,
186 struct seccomp_filter **match)
178{ 187{
179 struct seccomp_data sd_local; 188 struct seccomp_data sd_local;
180 u32 ret = SECCOMP_RET_ALLOW; 189 u32 ret = SECCOMP_RET_ALLOW;
@@ -184,7 +193,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
184 193
185 /* Ensure unexpected behavior doesn't result in failing open. */ 194 /* Ensure unexpected behavior doesn't result in failing open. */
186 if (unlikely(WARN_ON(f == NULL))) 195 if (unlikely(WARN_ON(f == NULL)))
187 return SECCOMP_RET_KILL; 196 return SECCOMP_RET_KILL_PROCESS;
188 197
189 if (!sd) { 198 if (!sd) {
190 populate_seccomp_data(&sd_local); 199 populate_seccomp_data(&sd_local);
@@ -198,8 +207,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
198 for (; f; f = f->prev) { 207 for (; f; f = f->prev) {
199 u32 cur_ret = BPF_PROG_RUN(f->prog, sd); 208 u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
200 209
201 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 210 if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
202 ret = cur_ret; 211 ret = cur_ret;
212 *match = f;
213 }
203 } 214 }
204 return ret; 215 return ret;
205} 216}
@@ -444,6 +455,10 @@ static long seccomp_attach_filter(unsigned int flags,
444 return ret; 455 return ret;
445 } 456 }
446 457
458 /* Set log flag, if present. */
459 if (flags & SECCOMP_FILTER_FLAG_LOG)
460 filter->log = true;
461
447 /* 462 /*
448 * If there is an existing filter, make it the prev and don't drop its 463 * If there is an existing filter, make it the prev and don't drop its
449 * task reference. 464 * task reference.
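A rough userspace sketch of how the new flag is meant to be used, assuming the matching 4.14-rc uapi headers (for SECCOMP_FILTER_FLAG_LOG and SECCOMP_RET_LOG) and a libc that defines SYS_seccomp; the always-RET_LOG filter is purely illustrative:

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
        /* Simplest possible filter: log (and allow) every syscall. */
        struct sock_filter insns[] = {
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_LOG),
        };
        struct sock_fprog prog = {
                .len    = sizeof(insns) / sizeof(insns[0]),
                .filter = insns,
        };

        /* Required so the filter can be installed without CAP_SYS_ADMIN. */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                return 1;

        /*
         * SECCOMP_FILTER_FLAG_LOG is what sets filter->log above; it matters
         * for RET_ERRNO/RET_TRAP/RET_TRACE, while RET_LOG and RET_KILL_* are
         * logged regardless of the flag.
         */
        if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER,
                    SECCOMP_FILTER_FLAG_LOG, &prog))
                return 1;

        puts("filter installed; subsequent syscalls are logged by seccomp");
        return 0;
}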
@@ -458,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags,
458 return 0; 473 return 0;
459} 474}
460 475
476void __get_seccomp_filter(struct seccomp_filter *filter)
477{
478 /* Reference count is bounded by the number of total processes. */
479 refcount_inc(&filter->usage);
480}
481
461/* get_seccomp_filter - increments the reference count of the filter on @tsk */ 482/* get_seccomp_filter - increments the reference count of the filter on @tsk */
462void get_seccomp_filter(struct task_struct *tsk) 483void get_seccomp_filter(struct task_struct *tsk)
463{ 484{
464 struct seccomp_filter *orig = tsk->seccomp.filter; 485 struct seccomp_filter *orig = tsk->seccomp.filter;
465 if (!orig) 486 if (!orig)
466 return; 487 return;
467 /* Reference count is bounded by the number of total processes. */ 488 __get_seccomp_filter(orig);
468 refcount_inc(&orig->usage);
469} 489}
470 490
471static inline void seccomp_filter_free(struct seccomp_filter *filter) 491static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -476,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
476 } 496 }
477} 497}
478 498
479/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ 499static void __put_seccomp_filter(struct seccomp_filter *orig)
480void put_seccomp_filter(struct task_struct *tsk)
481{ 500{
482 struct seccomp_filter *orig = tsk->seccomp.filter;
483 /* Clean up single-reference branches iteratively. */ 501 /* Clean up single-reference branches iteratively. */
484 while (orig && refcount_dec_and_test(&orig->usage)) { 502 while (orig && refcount_dec_and_test(&orig->usage)) {
485 struct seccomp_filter *freeme = orig; 503 struct seccomp_filter *freeme = orig;
@@ -488,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk)
488 } 506 }
489} 507}
490 508
509/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
510void put_seccomp_filter(struct task_struct *tsk)
511{
512 __put_seccomp_filter(tsk->seccomp.filter);
513}
514
491static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) 515static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
492{ 516{
493 memset(info, 0, sizeof(*info)); 517 memset(info, 0, sizeof(*info));
@@ -514,6 +538,65 @@ static void seccomp_send_sigsys(int syscall, int reason)
514} 538}
515#endif /* CONFIG_SECCOMP_FILTER */ 539#endif /* CONFIG_SECCOMP_FILTER */
516 540
541/* For use with seccomp_actions_logged */
542#define SECCOMP_LOG_KILL_PROCESS (1 << 0)
543#define SECCOMP_LOG_KILL_THREAD (1 << 1)
544#define SECCOMP_LOG_TRAP (1 << 2)
545#define SECCOMP_LOG_ERRNO (1 << 3)
546#define SECCOMP_LOG_TRACE (1 << 4)
547#define SECCOMP_LOG_LOG (1 << 5)
548#define SECCOMP_LOG_ALLOW (1 << 6)
549
550static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
551 SECCOMP_LOG_KILL_THREAD |
552 SECCOMP_LOG_TRAP |
553 SECCOMP_LOG_ERRNO |
554 SECCOMP_LOG_TRACE |
555 SECCOMP_LOG_LOG;
556
557static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
558 bool requested)
559{
560 bool log = false;
561
562 switch (action) {
563 case SECCOMP_RET_ALLOW:
564 break;
565 case SECCOMP_RET_TRAP:
566 log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
567 break;
568 case SECCOMP_RET_ERRNO:
569 log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
570 break;
571 case SECCOMP_RET_TRACE:
572 log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
573 break;
574 case SECCOMP_RET_LOG:
575 log = seccomp_actions_logged & SECCOMP_LOG_LOG;
576 break;
577 case SECCOMP_RET_KILL_THREAD:
578 log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
579 break;
580 case SECCOMP_RET_KILL_PROCESS:
581 default:
582 log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
583 }
584
585 /*
586 * Force an audit message to be emitted when the action is RET_KILL_*,
587 * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
588 * allowed to be logged by the admin.
589 */
590 if (log)
591 return __audit_seccomp(syscall, signr, action);
592
593 /*
594 * Let the audit subsystem decide if the action should be audited based
595 * on whether the current task itself is being audited.
596 */
597 return audit_seccomp(syscall, signr, action);
598}
599
517/* 600/*
518 * Secure computing mode 1 allows only read/write/exit/sigreturn. 601 * Secure computing mode 1 allows only read/write/exit/sigreturn.
519 * To be fully secure this must be combined with rlimit 602 * To be fully secure this must be combined with rlimit
@@ -539,7 +622,7 @@ static void __secure_computing_strict(int this_syscall)
539#ifdef SECCOMP_DEBUG 622#ifdef SECCOMP_DEBUG
540 dump_stack(); 623 dump_stack();
541#endif 624#endif
542 audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); 625 seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
543 do_exit(SIGKILL); 626 do_exit(SIGKILL);
544} 627}
545 628
@@ -566,6 +649,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
566 const bool recheck_after_trace) 649 const bool recheck_after_trace)
567{ 650{
568 u32 filter_ret, action; 651 u32 filter_ret, action;
652 struct seccomp_filter *match = NULL;
569 int data; 653 int data;
570 654
571 /* 655 /*
@@ -574,9 +658,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
574 */ 658 */
575 rmb(); 659 rmb();
576 660
577 filter_ret = seccomp_run_filters(sd); 661 filter_ret = seccomp_run_filters(sd, &match);
578 data = filter_ret & SECCOMP_RET_DATA; 662 data = filter_ret & SECCOMP_RET_DATA;
579 action = filter_ret & SECCOMP_RET_ACTION; 663 action = filter_ret & SECCOMP_RET_ACTION_FULL;
580 664
581 switch (action) { 665 switch (action) {
582 case SECCOMP_RET_ERRNO: 666 case SECCOMP_RET_ERRNO:
@@ -637,14 +721,25 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
637 721
638 return 0; 722 return 0;
639 723
724 case SECCOMP_RET_LOG:
725 seccomp_log(this_syscall, 0, action, true);
726 return 0;
727
640 case SECCOMP_RET_ALLOW: 728 case SECCOMP_RET_ALLOW:
729 /*
730 * Note that the "match" filter will always be NULL for
731 * this action since SECCOMP_RET_ALLOW is the starting
732 * state in seccomp_run_filters().
733 */
641 return 0; 734 return 0;
642 735
643 case SECCOMP_RET_KILL: 736 case SECCOMP_RET_KILL_THREAD:
737 case SECCOMP_RET_KILL_PROCESS:
644 default: 738 default:
645 audit_seccomp(this_syscall, SIGSYS, action); 739 seccomp_log(this_syscall, SIGSYS, action, true);
646 /* Dump core only if this is the last remaining thread. */ 740 /* Dump core only if this is the last remaining thread. */
647 if (get_nr_threads(current) == 1) { 741 if (action == SECCOMP_RET_KILL_PROCESS ||
742 get_nr_threads(current) == 1) {
648 siginfo_t info; 743 siginfo_t info;
649 744
650 /* Show the original registers in the dump. */ 745 /* Show the original registers in the dump. */
@@ -653,13 +748,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
653 seccomp_init_siginfo(&info, this_syscall, data); 748 seccomp_init_siginfo(&info, this_syscall, data);
654 do_coredump(&info); 749 do_coredump(&info);
655 } 750 }
656 do_exit(SIGSYS); 751 if (action == SECCOMP_RET_KILL_PROCESS)
752 do_group_exit(SIGSYS);
753 else
754 do_exit(SIGSYS);
657 } 755 }
658 756
659 unreachable(); 757 unreachable();
660 758
661skip: 759skip:
662 audit_seccomp(this_syscall, 0, action); 760 seccomp_log(this_syscall, 0, action, match ? match->log : false);
663 return -1; 761 return -1;
664} 762}
665#else 763#else
@@ -794,6 +892,29 @@ static inline long seccomp_set_mode_filter(unsigned int flags,
794} 892}
795#endif 893#endif
796 894
895static long seccomp_get_action_avail(const char __user *uaction)
896{
897 u32 action;
898
899 if (copy_from_user(&action, uaction, sizeof(action)))
900 return -EFAULT;
901
902 switch (action) {
903 case SECCOMP_RET_KILL_PROCESS:
904 case SECCOMP_RET_KILL_THREAD:
905 case SECCOMP_RET_TRAP:
906 case SECCOMP_RET_ERRNO:
907 case SECCOMP_RET_TRACE:
908 case SECCOMP_RET_LOG:
909 case SECCOMP_RET_ALLOW:
910 break;
911 default:
912 return -EOPNOTSUPP;
913 }
914
915 return 0;
916}
917
797/* Common entry point for both prctl and syscall. */ 918/* Common entry point for both prctl and syscall. */
798static long do_seccomp(unsigned int op, unsigned int flags, 919static long do_seccomp(unsigned int op, unsigned int flags,
799 const char __user *uargs) 920 const char __user *uargs)
@@ -805,6 +926,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
805 return seccomp_set_mode_strict(); 926 return seccomp_set_mode_strict();
806 case SECCOMP_SET_MODE_FILTER: 927 case SECCOMP_SET_MODE_FILTER:
807 return seccomp_set_mode_filter(flags, uargs); 928 return seccomp_set_mode_filter(flags, uargs);
929 case SECCOMP_GET_ACTION_AVAIL:
930 if (flags != 0)
931 return -EINVAL;
932
933 return seccomp_get_action_avail(uargs);
808 default: 934 default:
809 return -EINVAL; 935 return -EINVAL;
810 } 936 }
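The new operation lets userspace probe whether the running kernel knows a given return action before depending on it. A minimal sketch, again assuming the updated <linux/seccomp.h> and a libc that defines SYS_seccomp:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/seccomp.h>

int main(void)
{
        unsigned int action = SECCOMP_RET_LOG;

        /* flags must be 0; returns 0 if the action is known, -EOPNOTSUPP otherwise */
        if (syscall(SYS_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0)
                puts("SECCOMP_RET_LOG is supported by this kernel");
        else
                perror("SECCOMP_GET_ACTION_AVAIL");
        return 0;
}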
@@ -908,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
908 if (!data) 1034 if (!data)
909 goto out; 1035 goto out;
910 1036
911 get_seccomp_filter(task); 1037 __get_seccomp_filter(filter);
912 spin_unlock_irq(&task->sighand->siglock); 1038 spin_unlock_irq(&task->sighand->siglock);
913 1039
914 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) 1040 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
915 ret = -EFAULT; 1041 ret = -EFAULT;
916 1042
917 put_seccomp_filter(task); 1043 __put_seccomp_filter(filter);
918 return ret; 1044 return ret;
919 1045
920out: 1046out:
@@ -922,3 +1048,185 @@ out:
922 return ret; 1048 return ret;
923} 1049}
924#endif 1050#endif
1051
1052#ifdef CONFIG_SYSCTL
1053
1054/* Human readable action names for friendly sysctl interaction */
1055#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process"
1056#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread"
1057#define SECCOMP_RET_TRAP_NAME "trap"
1058#define SECCOMP_RET_ERRNO_NAME "errno"
1059#define SECCOMP_RET_TRACE_NAME "trace"
1060#define SECCOMP_RET_LOG_NAME "log"
1061#define SECCOMP_RET_ALLOW_NAME "allow"
1062
1063static const char seccomp_actions_avail[] =
1064 SECCOMP_RET_KILL_PROCESS_NAME " "
1065 SECCOMP_RET_KILL_THREAD_NAME " "
1066 SECCOMP_RET_TRAP_NAME " "
1067 SECCOMP_RET_ERRNO_NAME " "
1068 SECCOMP_RET_TRACE_NAME " "
1069 SECCOMP_RET_LOG_NAME " "
1070 SECCOMP_RET_ALLOW_NAME;
1071
1072struct seccomp_log_name {
1073 u32 log;
1074 const char *name;
1075};
1076
1077static const struct seccomp_log_name seccomp_log_names[] = {
1078 { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
1079 { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
1080 { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
1081 { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
1082 { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
1083 { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
1084 { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
1085 { }
1086};
1087
1088static bool seccomp_names_from_actions_logged(char *names, size_t size,
1089 u32 actions_logged)
1090{
1091 const struct seccomp_log_name *cur;
1092 bool append_space = false;
1093
1094 for (cur = seccomp_log_names; cur->name && size; cur++) {
1095 ssize_t ret;
1096
1097 if (!(actions_logged & cur->log))
1098 continue;
1099
1100 if (append_space) {
1101 ret = strscpy(names, " ", size);
1102 if (ret < 0)
1103 return false;
1104
1105 names += ret;
1106 size -= ret;
1107 } else
1108 append_space = true;
1109
1110 ret = strscpy(names, cur->name, size);
1111 if (ret < 0)
1112 return false;
1113
1114 names += ret;
1115 size -= ret;
1116 }
1117
1118 return true;
1119}
1120
1121static bool seccomp_action_logged_from_name(u32 *action_logged,
1122 const char *name)
1123{
1124 const struct seccomp_log_name *cur;
1125
1126 for (cur = seccomp_log_names; cur->name; cur++) {
1127 if (!strcmp(cur->name, name)) {
1128 *action_logged = cur->log;
1129 return true;
1130 }
1131 }
1132
1133 return false;
1134}
1135
1136static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
1137{
1138 char *name;
1139
1140 *actions_logged = 0;
1141 while ((name = strsep(&names, " ")) && *name) {
1142 u32 action_logged = 0;
1143
1144 if (!seccomp_action_logged_from_name(&action_logged, name))
1145 return false;
1146
1147 *actions_logged |= action_logged;
1148 }
1149
1150 return true;
1151}
1152
1153static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
1154 void __user *buffer, size_t *lenp,
1155 loff_t *ppos)
1156{
1157 char names[sizeof(seccomp_actions_avail)];
1158 struct ctl_table table;
1159 int ret;
1160
1161 if (write && !capable(CAP_SYS_ADMIN))
1162 return -EPERM;
1163
1164 memset(names, 0, sizeof(names));
1165
1166 if (!write) {
1167 if (!seccomp_names_from_actions_logged(names, sizeof(names),
1168 seccomp_actions_logged))
1169 return -EINVAL;
1170 }
1171
1172 table = *ro_table;
1173 table.data = names;
1174 table.maxlen = sizeof(names);
1175 ret = proc_dostring(&table, write, buffer, lenp, ppos);
1176 if (ret)
1177 return ret;
1178
1179 if (write) {
1180 u32 actions_logged;
1181
1182 if (!seccomp_actions_logged_from_names(&actions_logged,
1183 table.data))
1184 return -EINVAL;
1185
1186 if (actions_logged & SECCOMP_LOG_ALLOW)
1187 return -EINVAL;
1188
1189 seccomp_actions_logged = actions_logged;
1190 }
1191
1192 return 0;
1193}
1194
1195static struct ctl_path seccomp_sysctl_path[] = {
1196 { .procname = "kernel", },
1197 { .procname = "seccomp", },
1198 { }
1199};
1200
1201static struct ctl_table seccomp_sysctl_table[] = {
1202 {
1203 .procname = "actions_avail",
1204 .data = (void *) &seccomp_actions_avail,
1205 .maxlen = sizeof(seccomp_actions_avail),
1206 .mode = 0444,
1207 .proc_handler = proc_dostring,
1208 },
1209 {
1210 .procname = "actions_logged",
1211 .mode = 0644,
1212 .proc_handler = seccomp_actions_logged_handler,
1213 },
1214 { }
1215};
1216
1217static int __init seccomp_sysctl_init(void)
1218{
1219 struct ctl_table_header *hdr;
1220
1221 hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
1222 if (!hdr)
1223 pr_warn("seccomp: sysctl registration failed\n");
1224 else
1225 kmemleak_not_leak(hdr);
1226
1227 return 0;
1228}
1229
1230device_initcall(seccomp_sysctl_init)
1231
1232#endif /* CONFIG_SYSCTL */
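The table above surfaces as kernel.seccomp.actions_avail (0444) and kernel.seccomp.actions_logged (0644, CAP_SYS_ADMIN required to write). A small read-only sketch; the /proc paths follow from seccomp_sysctl_path, the rest is illustrative:

#include <stdio.h>

static void dump(const char *path)
{
        char line[256];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        if (fgets(line, sizeof(line), f))
                printf("%s: %s", path, line);   /* the line already ends in '\n' */
        fclose(f);
}

int main(void)
{
        dump("/proc/sys/kernel/seccomp/actions_avail");     /* space-separated action names */
        dump("/proc/sys/kernel/seccomp/actions_logged");    /* subset currently being logged */
        return 0;
}

Writing a space-separated subset such as "kill_process kill_thread errno" to actions_logged narrows what seccomp_log() reports; as the handler above shows, a set containing "allow" is rejected with -EINVAL.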
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 1d71c051a951..5043e7433f4b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
344 * by the client, but only by calling this function. 344 * by the client, but only by calling this function.
345 * This function can only be called on a registered smp_hotplug_thread. 345 * This function can only be called on a registered smp_hotplug_thread.
346 */ 346 */
347int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, 347void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
348 const struct cpumask *new) 348 const struct cpumask *new)
349{ 349{
350 struct cpumask *old = plug_thread->cpumask; 350 struct cpumask *old = plug_thread->cpumask;
351 cpumask_var_t tmp; 351 static struct cpumask tmp;
352 unsigned int cpu; 352 unsigned int cpu;
353 353
354 if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) 354 lockdep_assert_cpus_held();
355 return -ENOMEM;
356
357 get_online_cpus();
358 mutex_lock(&smpboot_threads_lock); 355 mutex_lock(&smpboot_threads_lock);
359 356
360 /* Park threads that were exclusively enabled on the old mask. */ 357 /* Park threads that were exclusively enabled on the old mask. */
361 cpumask_andnot(tmp, old, new); 358 cpumask_andnot(&tmp, old, new);
362 for_each_cpu_and(cpu, tmp, cpu_online_mask) 359 for_each_cpu_and(cpu, &tmp, cpu_online_mask)
363 smpboot_park_thread(plug_thread, cpu); 360 smpboot_park_thread(plug_thread, cpu);
364 361
365 /* Unpark threads that are exclusively enabled on the new mask. */ 362 /* Unpark threads that are exclusively enabled on the new mask. */
366 cpumask_andnot(tmp, new, old); 363 cpumask_andnot(&tmp, new, old);
367 for_each_cpu_and(cpu, tmp, cpu_online_mask) 364 for_each_cpu_and(cpu, &tmp, cpu_online_mask)
368 smpboot_unpark_thread(plug_thread, cpu); 365 smpboot_unpark_thread(plug_thread, cpu);
369 366
370 cpumask_copy(old, new); 367 cpumask_copy(old, new);
371 368
372 mutex_unlock(&smpboot_threads_lock); 369 mutex_unlock(&smpboot_threads_lock);
373 put_online_cpus();
374
375 free_cpumask_var(tmp);
376
377 return 0;
378} 370}
379EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
380 371
381static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); 372static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
382 373
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..d9c31bc2eaea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = {
367 .data = &sysctl_sched_time_avg, 367 .data = &sysctl_sched_time_avg,
368 .maxlen = sizeof(unsigned int), 368 .maxlen = sizeof(unsigned int),
369 .mode = 0644, 369 .mode = 0644,
370 .proc_handler = proc_dointvec, 370 .proc_handler = proc_dointvec_minmax,
371 .extra1 = &one,
371 }, 372 },
372#ifdef CONFIG_SCHEDSTATS 373#ifdef CONFIG_SCHEDSTATS
373 { 374 {
@@ -871,9 +872,9 @@ static struct ctl_table kern_table[] = {
871#if defined(CONFIG_LOCKUP_DETECTOR) 872#if defined(CONFIG_LOCKUP_DETECTOR)
872 { 873 {
873 .procname = "watchdog", 874 .procname = "watchdog",
874 .data = &watchdog_user_enabled, 875 .data = &watchdog_user_enabled,
875 .maxlen = sizeof (int), 876 .maxlen = sizeof(int),
876 .mode = 0644, 877 .mode = 0644,
877 .proc_handler = proc_watchdog, 878 .proc_handler = proc_watchdog,
878 .extra1 = &zero, 879 .extra1 = &zero,
879 .extra2 = &one, 880 .extra2 = &one,
@@ -889,16 +890,12 @@ static struct ctl_table kern_table[] = {
889 }, 890 },
890 { 891 {
891 .procname = "nmi_watchdog", 892 .procname = "nmi_watchdog",
892 .data = &nmi_watchdog_enabled, 893 .data = &nmi_watchdog_user_enabled,
893 .maxlen = sizeof (int), 894 .maxlen = sizeof(int),
894 .mode = 0644, 895 .mode = NMI_WATCHDOG_SYSCTL_PERM,
895 .proc_handler = proc_nmi_watchdog, 896 .proc_handler = proc_nmi_watchdog,
896 .extra1 = &zero, 897 .extra1 = &zero,
897#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
898 .extra2 = &one, 898 .extra2 = &one,
899#else
900 .extra2 = &zero,
901#endif
902 }, 899 },
903 { 900 {
904 .procname = "watchdog_cpumask", 901 .procname = "watchdog_cpumask",
@@ -910,9 +907,9 @@ static struct ctl_table kern_table[] = {
910#ifdef CONFIG_SOFTLOCKUP_DETECTOR 907#ifdef CONFIG_SOFTLOCKUP_DETECTOR
911 { 908 {
912 .procname = "soft_watchdog", 909 .procname = "soft_watchdog",
913 .data = &soft_watchdog_enabled, 910 .data = &soft_watchdog_user_enabled,
914 .maxlen = sizeof (int), 911 .maxlen = sizeof(int),
915 .mode = 0644, 912 .mode = 0644,
916 .proc_handler = proc_soft_watchdog, 913 .proc_handler = proc_soft_watchdog,
917 .extra1 = &zero, 914 .extra1 = &zero,
918 .extra2 = &one, 915 .extra2 = &one,
@@ -2187,8 +2184,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
2187 if (write) { 2184 if (write) {
2188 if (*lvalp > UINT_MAX) 2185 if (*lvalp > UINT_MAX)
2189 return -EINVAL; 2186 return -EINVAL;
2190 if (*lvalp > UINT_MAX)
2191 return -EINVAL;
2192 *valp = *lvalp; 2187 *valp = *lvalp;
2193 } else { 2188 } else {
2194 unsigned int val = *valp; 2189 unsigned int val = *valp;
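From userspace, the watchdog entries above keep their paths; only the backing variables change to the *_user_enabled ones. A throwaway read-only sketch of the files involved (writes need root, and nmi_watchdog may end up 0444 depending on how NMI_WATCHDOG_SYSCTL_PERM resolves):

#include <stdio.h>

static void show(const char *path)
{
        char buf[32];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s = %s", path, buf);
        fclose(f);
}

int main(void)
{
        show("/proc/sys/kernel/watchdog");       /* overall on/off switch, 0644 */
        show("/proc/sys/kernel/nmi_watchdog");   /* hard lockup detector */
        show("/proc/sys/kernel/soft_watchdog");  /* soft lockup detector, 0644 */
        return 0;
}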
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2a685b45b73b..45a3928544ce 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start)
648} 648}
649EXPORT_SYMBOL_GPL(blk_trace_startstop); 649EXPORT_SYMBOL_GPL(blk_trace_startstop);
650 650
651/*
652 * When reading or writing the blktrace sysfs files, the references to the
653 * opened sysfs or device files should prevent the underlying block device
654 * from being removed. So no further delete protection is really needed.
655 */
656
651/** 657/**
652 * blk_trace_ioctl: - handle the ioctls associated with tracing 658 * blk_trace_ioctl: - handle the ioctls associated with tracing
653 * @bdev: the block device 659 * @bdev: the block device
@@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
665 if (!q) 671 if (!q)
666 return -ENXIO; 672 return -ENXIO;
667 673
668 mutex_lock(&bdev->bd_mutex); 674 mutex_lock(&q->blk_trace_mutex);
669 675
670 switch (cmd) { 676 switch (cmd) {
671 case BLKTRACESETUP: 677 case BLKTRACESETUP:
@@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
691 break; 697 break;
692 } 698 }
693 699
694 mutex_unlock(&bdev->bd_mutex); 700 mutex_unlock(&q->blk_trace_mutex);
695 return ret; 701 return ret;
696} 702}
697 703
@@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1727 if (q == NULL) 1733 if (q == NULL)
1728 goto out_bdput; 1734 goto out_bdput;
1729 1735
1730 mutex_lock(&bdev->bd_mutex); 1736 mutex_lock(&q->blk_trace_mutex);
1731 1737
1732 if (attr == &dev_attr_enable) { 1738 if (attr == &dev_attr_enable) {
1733 ret = sprintf(buf, "%u\n", !!q->blk_trace); 1739 ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1746 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); 1752 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1747 1753
1748out_unlock_bdev: 1754out_unlock_bdev:
1749 mutex_unlock(&bdev->bd_mutex); 1755 mutex_unlock(&q->blk_trace_mutex);
1750out_bdput: 1756out_bdput:
1751 bdput(bdev); 1757 bdput(bdev);
1752out: 1758out:
@@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1788 if (q == NULL) 1794 if (q == NULL)
1789 goto out_bdput; 1795 goto out_bdput;
1790 1796
1791 mutex_lock(&bdev->bd_mutex); 1797 mutex_lock(&q->blk_trace_mutex);
1792 1798
1793 if (attr == &dev_attr_enable) { 1799 if (attr == &dev_attr_enable) {
1794 if (value) 1800 if (value)
@@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1814 } 1820 }
1815 1821
1816out_unlock_bdev: 1822out_unlock_bdev:
1817 mutex_unlock(&bdev->bd_mutex); 1823 mutex_unlock(&q->blk_trace_mutex);
1818out_bdput: 1824out_bdput:
1819 bdput(bdev); 1825 bdput(bdev);
1820out: 1826out:
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6abfafd7f173..8319e09e15b9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
4954static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 4954static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
4955static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); 4955static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
4956 4956
4957static unsigned long save_global_trampoline;
4958static unsigned long save_global_flags;
4959
4960static int __init set_graph_function(char *str) 4957static int __init set_graph_function(char *str)
4961{ 4958{
4962 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 4959 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void)
6808 unregister_pm_notifier(&ftrace_suspend_notifier); 6805 unregister_pm_notifier(&ftrace_suspend_notifier);
6809 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 6806 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
6810 6807
6811#ifdef CONFIG_DYNAMIC_FTRACE
6812 /*
6813 * Function graph does not allocate the trampoline, but
6814 * other global_ops do. We need to reset the ALLOC_TRAMP flag
6815 * if one was used.
6816 */
6817 global_ops.trampoline = save_global_trampoline;
6818 if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
6819 global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
6820#endif
6821
6822 out: 6808 out:
6823 mutex_unlock(&ftrace_lock); 6809 mutex_unlock(&ftrace_lock);
6824} 6810}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5360b7aec57a..752e5daf0896 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4020,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file)
4020 /* If this file was open for write, then erase contents */ 4020 /* If this file was open for write, then erase contents */
4021 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { 4021 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
4022 int cpu = tracing_get_cpu(inode); 4022 int cpu = tracing_get_cpu(inode);
4023 struct trace_buffer *trace_buf = &tr->trace_buffer;
4024
4025#ifdef CONFIG_TRACER_MAX_TRACE
4026 if (tr->current_trace->print_max)
4027 trace_buf = &tr->max_buffer;
4028#endif
4023 4029
4024 if (cpu == RING_BUFFER_ALL_CPUS) 4030 if (cpu == RING_BUFFER_ALL_CPUS)
4025 tracing_reset_online_cpus(&tr->trace_buffer); 4031 tracing_reset_online_cpus(trace_buf);
4026 else 4032 else
4027 tracing_reset(&tr->trace_buffer, cpu); 4033 tracing_reset(trace_buf, cpu);
4028 } 4034 }
4029 4035
4030 if (file->f_mode & FMODE_READ) { 4036 if (file->f_mode & FMODE_READ) {
@@ -5358,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
5358 if (t == tr->current_trace) 5364 if (t == tr->current_trace)
5359 goto out; 5365 goto out;
5360 5366
5367 /* Some tracers won't work on kernel command line */
5368 if (system_state < SYSTEM_RUNNING && t->noboot) {
5369 pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
5370 t->name);
5371 goto out;
5372 }
5373
5361 /* Some tracers are only allowed for the top level buffer */ 5374 /* Some tracers are only allowed for the top level buffer */
5362 if (!trace_ok_for_array(t, tr)) { 5375 if (!trace_ok_for_array(t, tr)) {
5363 ret = -EINVAL; 5376 ret = -EINVAL;
@@ -5667,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp)
5667 * 5680 *
5668 * iter->pos will be 0 if we haven't read anything. 5681 * iter->pos will be 0 if we haven't read anything.
5669 */ 5682 */
5670 if (!tracing_is_on() && iter->pos) 5683 if (!tracer_tracing_is_on(iter->tr) && iter->pos)
5671 break; 5684 break;
5672 5685
5673 mutex_unlock(&iter->mutex); 5686 mutex_unlock(&iter->mutex);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fb5d54d0d1b3..652c682707cd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -444,6 +444,8 @@ struct tracer {
444#ifdef CONFIG_TRACER_MAX_TRACE 444#ifdef CONFIG_TRACER_MAX_TRACE
445 bool use_max_tr; 445 bool use_max_tr;
446#endif 446#endif
447 /* True if tracer cannot be enabled in kernel param */
448 bool noboot;
447}; 449};
448 450
449 451
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index cd7480d0a201..dca78fc48439 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly =
282 .close = mmio_close, 282 .close = mmio_close,
283 .read = mmio_read, 283 .read = mmio_read,
284 .print_line = mmio_print_line, 284 .print_line = mmio_print_line,
285 .noboot = true,
285}; 286};
286 287
287__init static int init_mmio_trace(void) 288__init static int init_mmio_trace(void)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bac629af2285..c738e764e2a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)
656 return !trace_seq_has_overflowed(s); 656 return !trace_seq_has_overflowed(s);
657} 657}
658 658
659static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
660
661static int task_state_char(unsigned long state)
662{
663 int bit = state ? __ffs(state) + 1 : 0;
664
665 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
666}
667
668/** 659/**
669 * ftrace_find_event - find a registered event 660 * ftrace_find_event - find a registered event
670 * @type: the type of event to look for 661 * @type: the type of event to look for
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
930 921
931 trace_assign_type(field, iter->ent); 922 trace_assign_type(field, iter->ent);
932 923
933 T = task_state_char(field->next_state); 924 T = __task_state_to_char(field->next_state);
934 S = task_state_char(field->prev_state); 925 S = __task_state_to_char(field->prev_state);
935 trace_find_cmdline(field->next_pid, comm); 926 trace_find_cmdline(field->next_pid, comm);
936 trace_seq_printf(&iter->seq, 927 trace_seq_printf(&iter->seq,
937 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 928 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
966 trace_assign_type(field, iter->ent); 957 trace_assign_type(field, iter->ent);
967 958
968 if (!S) 959 if (!S)
969 S = task_state_char(field->prev_state); 960 S = __task_state_to_char(field->prev_state);
970 T = task_state_char(field->next_state); 961 T = __task_state_to_char(field->next_state);
971 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 962 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
972 field->prev_pid, 963 field->prev_pid,
973 field->prev_prio, 964 field->prev_prio,
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
1002 trace_assign_type(field, iter->ent); 993 trace_assign_type(field, iter->ent);
1003 994
1004 if (!S) 995 if (!S)
1005 S = task_state_char(field->prev_state); 996 S = __task_state_to_char(field->prev_state);
1006 T = task_state_char(field->next_state); 997 T = __task_state_to_char(field->next_state);
1007 998
1008 SEQ_PUT_HEX_FIELD(s, field->prev_pid); 999 SEQ_PUT_HEX_FIELD(s, field->prev_pid);
1009 SEQ_PUT_HEX_FIELD(s, field->prev_prio); 1000 SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ddec53b67646..0c331978b1a6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
397 entry = ring_buffer_event_data(event); 397 entry = ring_buffer_event_data(event);
398 entry->prev_pid = prev->pid; 398 entry->prev_pid = prev->pid;
399 entry->prev_prio = prev->prio; 399 entry->prev_prio = prev->prio;
400 entry->prev_state = prev->state; 400 entry->prev_state = __get_task_state(prev);
401 entry->next_pid = next->pid; 401 entry->next_pid = next->pid;
402 entry->next_prio = next->prio; 402 entry->next_prio = next->prio;
403 entry->next_state = next->state; 403 entry->next_state = __get_task_state(next);
404 entry->next_cpu = task_cpu(next); 404 entry->next_cpu = task_cpu(next);
405 405
406 if (!call_filter_check_discard(call, entry, buffer, event)) 406 if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
425 entry = ring_buffer_event_data(event); 425 entry = ring_buffer_event_data(event);
426 entry->prev_pid = curr->pid; 426 entry->prev_pid = curr->pid;
427 entry->prev_prio = curr->prio; 427 entry->prev_prio = curr->prio;
428 entry->prev_state = curr->state; 428 entry->prev_state = __get_task_state(curr);
429 entry->next_pid = wakee->pid; 429 entry->next_pid = wakee->pid;
430 entry->next_prio = wakee->prio; 430 entry->next_prio = wakee->prio;
431 entry->next_state = wakee->state; 431 entry->next_state = __get_task_state(wakee);
432 entry->next_cpu = task_cpu(wakee); 432 entry->next_cpu = task_cpu(wakee);
433 433
434 if (!call_filter_check_discard(call, entry, buffer, event)) 434 if (!call_filter_check_discard(call, entry, buffer, event))
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a4df67cbc711..49cb41412eec 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack)
96 if (in_nmi()) 96 if (in_nmi())
97 return; 97 return;
98 98
99 /*
100 * There's a slight chance that we are tracing inside the
101 * RCU infrastructure, and rcu_irq_enter() will not work
102 * as expected.
103 */
104 if (unlikely(rcu_irq_enter_disabled()))
105 return;
106
107 local_irq_save(flags); 99 local_irq_save(flags);
108 arch_spin_lock(&stack_trace_max_lock); 100 arch_spin_lock(&stack_trace_max_lock);
109 101
110 /*
111 * RCU may not be watching, make it see us.
112 * The stack trace code uses rcu_sched.
113 */
114 rcu_irq_enter();
115
116 /* In case another CPU set the tracer_frame on us */ 102 /* In case another CPU set the tracer_frame on us */
117 if (unlikely(!frame_size)) 103 if (unlikely(!frame_size))
118 this_size -= tracer_frame; 104 this_size -= tracer_frame;
@@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack)
205 } 191 }
206 192
207 out: 193 out:
208 rcu_irq_exit();
209 arch_spin_unlock(&stack_trace_max_lock); 194 arch_spin_unlock(&stack_trace_max_lock);
210 local_irq_restore(flags); 195 local_irq_restore(flags);
211} 196}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f5d52024f6b7..6bcb854909c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,20 +29,29 @@
29#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
30#include <linux/kthread.h> 30#include <linux/kthread.h>
31 31
32/* Watchdog configuration */ 32static DEFINE_MUTEX(watchdog_mutex);
33static DEFINE_MUTEX(watchdog_proc_mutex);
34
35int __read_mostly nmi_watchdog_enabled;
36 33
37#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) 34#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
38unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | 35# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
39 NMI_WATCHDOG_ENABLED; 36# define NMI_WATCHDOG_DEFAULT 1
40#else 37#else
41unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; 38# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED)
39# define NMI_WATCHDOG_DEFAULT 0
42#endif 40#endif
43 41
42unsigned long __read_mostly watchdog_enabled;
43int __read_mostly watchdog_user_enabled = 1;
44int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
45int __read_mostly soft_watchdog_user_enabled = 1;
46int __read_mostly watchdog_thresh = 10;
47int __read_mostly nmi_watchdog_available;
48
49struct cpumask watchdog_allowed_mask __read_mostly;
50
51struct cpumask watchdog_cpumask __read_mostly;
52unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
53
44#ifdef CONFIG_HARDLOCKUP_DETECTOR 54#ifdef CONFIG_HARDLOCKUP_DETECTOR
45/* boot commands */
46/* 55/*
47 * Should we panic when a soft-lockup or hard-lockup occurs: 56 * Should we panic when a soft-lockup or hard-lockup occurs:
48 */ 57 */
@@ -56,9 +65,9 @@ unsigned int __read_mostly hardlockup_panic =
56 * kernel command line parameters are parsed, because otherwise it is not 65 * kernel command line parameters are parsed, because otherwise it is not
57 * possible to override this in hardlockup_panic_setup(). 66 * possible to override this in hardlockup_panic_setup().
58 */ 67 */
59void hardlockup_detector_disable(void) 68void __init hardlockup_detector_disable(void)
60{ 69{
61 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; 70 nmi_watchdog_user_enabled = 0;
62} 71}
63 72
64static int __init hardlockup_panic_setup(char *str) 73static int __init hardlockup_panic_setup(char *str)
@@ -68,48 +77,24 @@ static int __init hardlockup_panic_setup(char *str)
68 else if (!strncmp(str, "nopanic", 7)) 77 else if (!strncmp(str, "nopanic", 7))
69 hardlockup_panic = 0; 78 hardlockup_panic = 0;
70 else if (!strncmp(str, "0", 1)) 79 else if (!strncmp(str, "0", 1))
71 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; 80 nmi_watchdog_user_enabled = 0;
72 else if (!strncmp(str, "1", 1)) 81 else if (!strncmp(str, "1", 1))
73 watchdog_enabled |= NMI_WATCHDOG_ENABLED; 82 nmi_watchdog_user_enabled = 1;
74 return 1; 83 return 1;
75} 84}
76__setup("nmi_watchdog=", hardlockup_panic_setup); 85__setup("nmi_watchdog=", hardlockup_panic_setup);
77 86
78#endif 87# ifdef CONFIG_SMP
79
80#ifdef CONFIG_SOFTLOCKUP_DETECTOR
81int __read_mostly soft_watchdog_enabled;
82#endif
83
84int __read_mostly watchdog_user_enabled;
85int __read_mostly watchdog_thresh = 10;
86
87#ifdef CONFIG_SMP
88int __read_mostly sysctl_softlockup_all_cpu_backtrace;
89int __read_mostly sysctl_hardlockup_all_cpu_backtrace; 88int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
90#endif
91struct cpumask watchdog_cpumask __read_mostly;
92unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
93 89
94/* 90static int __init hardlockup_all_cpu_backtrace_setup(char *str)
95 * The 'watchdog_running' variable is set to 1 when the watchdog threads 91{
96 * are registered/started and is set to 0 when the watchdog threads are 92 sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
97 * unregistered/stopped, so it is an indicator whether the threads exist. 93 return 1;
98 */ 94}
99static int __read_mostly watchdog_running; 95__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
100/* 96# endif /* CONFIG_SMP */
101 * If a subsystem has a need to deactivate the watchdog temporarily, it 97#endif /* CONFIG_HARDLOCKUP_DETECTOR */
102 * can use the suspend/resume interface to achieve this. The content of
103 * the 'watchdog_suspended' variable reflects this state. Existing threads
104 * are parked/unparked by the lockup_detector_{suspend|resume} functions
105 * (see comment blocks pertaining to those functions for further details).
106 *
107 * 'watchdog_suspended' also prevents threads from being registered/started
108 * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
109 * of 'watchdog_running' cannot change while the watchdog is deactivated
110 * temporarily (see related code in 'proc' handlers).
111 */
112int __read_mostly watchdog_suspended;
113 98
114/* 99/*
115 * These functions can be overridden if an architecture implements its 100 * These functions can be overridden if an architecture implements its
@@ -121,36 +106,68 @@ int __read_mostly watchdog_suspended;
121 */ 106 */
122int __weak watchdog_nmi_enable(unsigned int cpu) 107int __weak watchdog_nmi_enable(unsigned int cpu)
123{ 108{
109 hardlockup_detector_perf_enable();
124 return 0; 110 return 0;
125} 111}
112
126void __weak watchdog_nmi_disable(unsigned int cpu) 113void __weak watchdog_nmi_disable(unsigned int cpu)
127{ 114{
115 hardlockup_detector_perf_disable();
128} 116}
129 117
130/* 118/* Return 0, if a NMI watchdog is available. Error code otherwise */
131 * watchdog_nmi_reconfigure can be implemented to be notified after any 119int __weak __init watchdog_nmi_probe(void)
132 * watchdog configuration change. The arch hardlockup watchdog should 120{
133 * respond to the following variables: 121 return hardlockup_detector_perf_init();
134 * - nmi_watchdog_enabled 122}
123
124/**
125 * watchdog_nmi_stop - Stop the watchdog for reconfiguration
126 *
127 * The reconfiguration steps are:
128 * watchdog_nmi_stop();
129 * update_variables();
130 * watchdog_nmi_start();
131 */
132void __weak watchdog_nmi_stop(void) { }
133
134/**
135 * watchdog_nmi_start - Start the watchdog after reconfiguration
136 *
137 * Counterpart to watchdog_nmi_stop().
138 *
139 * The following variables have been updated in update_variables() and
140 * contain the currently valid configuration:
141 * - watchdog_enabled
135 * - watchdog_thresh 142 * - watchdog_thresh
136 * - watchdog_cpumask 143 * - watchdog_cpumask
137 * - sysctl_hardlockup_all_cpu_backtrace
138 * - hardlockup_panic
139 * - watchdog_suspended
140 */ 144 */
141void __weak watchdog_nmi_reconfigure(void) 145void __weak watchdog_nmi_start(void) { }
146
147/**
148 * lockup_detector_update_enable - Update the sysctl enable bit
149 *
150 * Caller needs to make sure that the NMI/perf watchdogs are off, so this
151 * can't race with watchdog_nmi_disable().
152 */
153static void lockup_detector_update_enable(void)
142{ 154{
155 watchdog_enabled = 0;
156 if (!watchdog_user_enabled)
157 return;
158 if (nmi_watchdog_available && nmi_watchdog_user_enabled)
159 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
160 if (soft_watchdog_user_enabled)
161 watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
143} 162}
144 163
145
146#ifdef CONFIG_SOFTLOCKUP_DETECTOR 164#ifdef CONFIG_SOFTLOCKUP_DETECTOR
147 165
148/* Helper for online, unparked cpus. */ 166/* Global variables, exported for sysctl */
149#define for_each_watchdog_cpu(cpu) \ 167unsigned int __read_mostly softlockup_panic =
150 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) 168 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
151
152atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
153 169
170static bool softlockup_threads_initialized __read_mostly;
154static u64 __read_mostly sample_period; 171static u64 __read_mostly sample_period;
155 172
156static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 173static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -164,50 +181,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
164static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 181static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
165static unsigned long soft_lockup_nmi_warn; 182static unsigned long soft_lockup_nmi_warn;
166 183
167unsigned int __read_mostly softlockup_panic =
168 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
169
170static int __init softlockup_panic_setup(char *str) 184static int __init softlockup_panic_setup(char *str)
171{ 185{
172 softlockup_panic = simple_strtoul(str, NULL, 0); 186 softlockup_panic = simple_strtoul(str, NULL, 0);
173
174 return 1; 187 return 1;
175} 188}
176__setup("softlockup_panic=", softlockup_panic_setup); 189__setup("softlockup_panic=", softlockup_panic_setup);
177 190
178static int __init nowatchdog_setup(char *str) 191static int __init nowatchdog_setup(char *str)
179{ 192{
180 watchdog_enabled = 0; 193 watchdog_user_enabled = 0;
181 return 1; 194 return 1;
182} 195}
183__setup("nowatchdog", nowatchdog_setup); 196__setup("nowatchdog", nowatchdog_setup);
184 197
185static int __init nosoftlockup_setup(char *str) 198static int __init nosoftlockup_setup(char *str)
186{ 199{
187 watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; 200 soft_watchdog_user_enabled = 0;
188 return 1; 201 return 1;
189} 202}
190__setup("nosoftlockup", nosoftlockup_setup); 203__setup("nosoftlockup", nosoftlockup_setup);
191 204
192#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
206int __read_mostly sysctl_softlockup_all_cpu_backtrace;
207
193static int __init softlockup_all_cpu_backtrace_setup(char *str) 208static int __init softlockup_all_cpu_backtrace_setup(char *str)
194{ 209{
195 sysctl_softlockup_all_cpu_backtrace = 210 sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
196 !!simple_strtol(str, NULL, 0);
197 return 1; 211 return 1;
198} 212}
199__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); 213__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
200#ifdef CONFIG_HARDLOCKUP_DETECTOR
201static int __init hardlockup_all_cpu_backtrace_setup(char *str)
202{
203 sysctl_hardlockup_all_cpu_backtrace =
204 !!simple_strtol(str, NULL, 0);
205 return 1;
206}
207__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
208#endif
209#endif 214#endif
210 215
216static void __lockup_detector_cleanup(void);
217
211/* 218/*
212 * Hard-lockup warnings should be triggered after just a few seconds. Soft- 219 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
213 * lockups can have false positives under extreme conditions. So we generally 220 * lockups can have false positives under extreme conditions. So we generally
@@ -278,11 +285,15 @@ void touch_all_softlockup_watchdogs(void)
278 int cpu; 285 int cpu;
279 286
280 /* 287 /*
281 * this is done lockless 288 * watchdog_mutex cannot be taken here, as this might be called
282 * do we care if a 0 races with a timestamp? 289 * from (soft)interrupt context, so the access to
283 * all it means is the softlock check starts one cycle later 290 * watchdog_allowed_mask might race with a concurrent update.
291 *
292 * The watchdog time stamp can race against a concurrent real
293 * update as well, the only side effect might be a cycle delay for
294 * the softlockup check.
284 */ 295 */
285 for_each_watchdog_cpu(cpu) 296 for_each_cpu(cpu, &watchdog_allowed_mask)
286 per_cpu(watchdog_touch_ts, cpu) = 0; 297 per_cpu(watchdog_touch_ts, cpu) = 0;
287 wq_watchdog_touch(-1); 298 wq_watchdog_touch(-1);
288} 299}
@@ -322,9 +333,6 @@ static void watchdog_interrupt_count(void)
322 __this_cpu_inc(hrtimer_interrupts); 333 __this_cpu_inc(hrtimer_interrupts);
323} 334}
324 335
325static int watchdog_enable_all_cpus(void);
326static void watchdog_disable_all_cpus(void);
327
328/* watchdog kicker functions */ 336/* watchdog kicker functions */
329static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 337static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
330{ 338{
@@ -333,7 +341,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
333 int duration; 341 int duration;
334 int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; 342 int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
335 343
336 if (atomic_read(&watchdog_park_in_progress) != 0) 344 if (!watchdog_enabled)
337 return HRTIMER_NORESTART; 345 return HRTIMER_NORESTART;
338 346
339 /* kick the hardlockup detector */ 347 /* kick the hardlockup detector */
@@ -447,32 +455,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
447 455
448static void watchdog_enable(unsigned int cpu) 456static void watchdog_enable(unsigned int cpu)
449{ 457{
450 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); 458 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
451 459
452 /* kick off the timer for the hardlockup detector */ 460 /*
461 * Start the timer first to prevent the NMI watchdog triggering
462 * before the timer has a chance to fire.
463 */
453 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 464 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
454 hrtimer->function = watchdog_timer_fn; 465 hrtimer->function = watchdog_timer_fn;
455
456 /* Enable the perf event */
457 watchdog_nmi_enable(cpu);
458
459 /* done here because hrtimer_start can only pin to smp_processor_id() */
460 hrtimer_start(hrtimer, ns_to_ktime(sample_period), 466 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
461 HRTIMER_MODE_REL_PINNED); 467 HRTIMER_MODE_REL_PINNED);
462 468
463 /* initialize timestamp */ 469 /* Initialize timestamp */
464 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
465 __touch_watchdog(); 470 __touch_watchdog();
471 /* Enable the perf event */
472 if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
473 watchdog_nmi_enable(cpu);
474
475 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
466} 476}
467 477
468static void watchdog_disable(unsigned int cpu) 478static void watchdog_disable(unsigned int cpu)
469{ 479{
470 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); 480 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
471 481
472 watchdog_set_prio(SCHED_NORMAL, 0); 482 watchdog_set_prio(SCHED_NORMAL, 0);
473 hrtimer_cancel(hrtimer); 483 /*
474 /* disable the perf event */ 484 * Disable the perf event first. Otherwise a large delay
485 * between cancelling the timer and disabling the perf event could
486 * let the perf NMI detect a false positive.
487 */
475 watchdog_nmi_disable(cpu); 488 watchdog_nmi_disable(cpu);
489 hrtimer_cancel(hrtimer);
476} 490}
477 491
478static void watchdog_cleanup(unsigned int cpu, bool online) 492static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -499,21 +513,6 @@ static void watchdog(unsigned int cpu)
499 __this_cpu_write(soft_lockup_hrtimer_cnt, 513 __this_cpu_write(soft_lockup_hrtimer_cnt,
500 __this_cpu_read(hrtimer_interrupts)); 514 __this_cpu_read(hrtimer_interrupts));
501 __touch_watchdog(); 515 __touch_watchdog();
502
503 /*
504 * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
505 * failure path. Check for failures that can occur asynchronously -
506 * for example, when CPUs are on-lined - and shut down the hardware
507 * perf event on each CPU accordingly.
508 *
509 * The only non-obvious place this bit can be cleared is through
510 * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
511 * pr_info here would be too noisy as it would result in a message
512 * every few seconds if the hardlockup was disabled but the softlockup
513 * enabled.
514 */
515 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
516 watchdog_nmi_disable(cpu);
517} 516}
518 517
519static struct smp_hotplug_thread watchdog_threads = { 518static struct smp_hotplug_thread watchdog_threads = {
@@ -527,295 +526,174 @@ static struct smp_hotplug_thread watchdog_threads = {
527 .unpark = watchdog_enable, 526 .unpark = watchdog_enable,
528}; 527};
529 528
530/* 529static void softlockup_update_smpboot_threads(void)
531 * park all watchdog threads that are specified in 'watchdog_cpumask'
532 *
533 * This function returns an error if kthread_park() of a watchdog thread
534 * fails. In this situation, the watchdog threads of some CPUs can already
535 * be parked and the watchdog threads of other CPUs can still be runnable.
536 * Callers are expected to handle this special condition as appropriate in
537 * their context.
538 *
539 * This function may only be called in a context that is protected against
540 * races with CPU hotplug - for example, via get_online_cpus().
541 */
542static int watchdog_park_threads(void)
543{ 530{
544 int cpu, ret = 0; 531 lockdep_assert_held(&watchdog_mutex);
545 532
546 atomic_set(&watchdog_park_in_progress, 1); 533 if (!softlockup_threads_initialized)
534 return;
547 535
548 for_each_watchdog_cpu(cpu) { 536 smpboot_update_cpumask_percpu_thread(&watchdog_threads,
549 ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); 537 &watchdog_allowed_mask);
550 if (ret)
551 break;
552 }
553
554 atomic_set(&watchdog_park_in_progress, 0);
555
556 return ret;
557} 538}
558 539
559/* 540/* Temporarily park all watchdog threads */
560 * unpark all watchdog threads that are specified in 'watchdog_cpumask' 541static void softlockup_park_all_threads(void)
561 *
562 * This function may only be called in a context that is protected against
563 * races with CPU hotplug - for example, via get_online_cpus().
564 */
565static void watchdog_unpark_threads(void)
566{ 542{
567 int cpu; 543 cpumask_clear(&watchdog_allowed_mask);
568 544 softlockup_update_smpboot_threads();
569 for_each_watchdog_cpu(cpu)
570 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
571} 545}
572 546
573static int update_watchdog_all_cpus(void) 547/* Unpark enabled threads */
548static void softlockup_unpark_threads(void)
574{ 549{
575 int ret; 550 cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
576 551 softlockup_update_smpboot_threads();
577 ret = watchdog_park_threads();
578 if (ret)
579 return ret;
580
581 watchdog_unpark_threads();
582
583 return 0;
584} 552}
585 553
586static int watchdog_enable_all_cpus(void) 554static void lockup_detector_reconfigure(void)
587{ 555{
588 int err = 0; 556 cpus_read_lock();
589 557 watchdog_nmi_stop();
590 if (!watchdog_running) { 558 softlockup_park_all_threads();
591 err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, 559 set_sample_period();
592 &watchdog_cpumask); 560 lockup_detector_update_enable();
593 if (err) 561 if (watchdog_enabled && watchdog_thresh)
594 pr_err("Failed to create watchdog threads, disabled\n"); 562 softlockup_unpark_threads();
595 else 563 watchdog_nmi_start();
596 watchdog_running = 1; 564 cpus_read_unlock();
597 } else { 565 /*
598 /* 566 * Must be called outside the cpus locked section to prevent
599 * Enable/disable the lockup detectors or 567 * recursive locking in the perf code.
600 * change the sample period 'on the fly'. 568 */
601 */ 569 __lockup_detector_cleanup();
602 err = update_watchdog_all_cpus();
603
604 if (err) {
605 watchdog_disable_all_cpus();
606 pr_err("Failed to update lockup detectors, disabled\n");
607 }
608 }
609
610 if (err)
611 watchdog_enabled = 0;
612
613 return err;
614} 570}
615 571
616static void watchdog_disable_all_cpus(void) 572/*
573 * Create the watchdog thread infrastructure and configure the detector(s).
574 *
575 * The threads are not unparked as watchdog_allowed_mask is empty. When
576 * the threads are sucessfully initialized, take the proper locks and
577 * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
578 */
579static __init void lockup_detector_setup(void)
617{ 580{
618 if (watchdog_running) { 581 int ret;
619 watchdog_running = 0;
620 smpboot_unregister_percpu_thread(&watchdog_threads);
621 }
622}
623 582
624#ifdef CONFIG_SYSCTL 583 /*
625static int watchdog_update_cpus(void) 584 * If sysctl is off and watchdog got disabled on the command line,
626{ 585 * nothing to do here.
627 return smpboot_update_cpumask_percpu_thread( 586 */
628 &watchdog_threads, &watchdog_cpumask); 587 lockup_detector_update_enable();
629}
630#endif
631 588
632#else /* SOFTLOCKUP */ 589 if (!IS_ENABLED(CONFIG_SYSCTL) &&
633static int watchdog_park_threads(void) 590 !(watchdog_enabled && watchdog_thresh))
634{ 591 return;
635 return 0;
636}
637 592
638static void watchdog_unpark_threads(void) 593 ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
639{ 594 &watchdog_allowed_mask);
640} 595 if (ret) {
596 pr_err("Failed to initialize soft lockup detector threads\n");
597 return;
598 }
641 599
642static int watchdog_enable_all_cpus(void) 600 mutex_lock(&watchdog_mutex);
643{ 601 softlockup_threads_initialized = true;
644 return 0; 602 lockup_detector_reconfigure();
603 mutex_unlock(&watchdog_mutex);
645} 604}
646 605
647static void watchdog_disable_all_cpus(void) 606#else /* CONFIG_SOFTLOCKUP_DETECTOR */
607static inline int watchdog_park_threads(void) { return 0; }
608static inline void watchdog_unpark_threads(void) { }
609static inline int watchdog_enable_all_cpus(void) { return 0; }
610static inline void watchdog_disable_all_cpus(void) { }
611static void lockup_detector_reconfigure(void)
648{ 612{
613 cpus_read_lock();
614 watchdog_nmi_stop();
615 lockup_detector_update_enable();
616 watchdog_nmi_start();
617 cpus_read_unlock();
649} 618}
650 619static inline void lockup_detector_setup(void)
651#ifdef CONFIG_SYSCTL
652static int watchdog_update_cpus(void)
653{ 620{
654 return 0; 621 lockup_detector_reconfigure();
655} 622}
656#endif 623#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
657 624
658static void set_sample_period(void) 625static void __lockup_detector_cleanup(void)
659{ 626{
627 lockdep_assert_held(&watchdog_mutex);
628 hardlockup_detector_perf_cleanup();
660} 629}
661#endif /* SOFTLOCKUP */
662 630
663/* 631/**
664 * Suspend the hard and soft lockup detector by parking the watchdog threads. 632 * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
633 *
634 * Caller must not hold the cpu hotplug rwsem.
665 */ 635 */
666int lockup_detector_suspend(void) 636void lockup_detector_cleanup(void)
667{ 637{
668 int ret = 0; 638 mutex_lock(&watchdog_mutex);
669 639 __lockup_detector_cleanup();
670 get_online_cpus(); 640 mutex_unlock(&watchdog_mutex);
671 mutex_lock(&watchdog_proc_mutex);
672 /*
673 * Multiple suspend requests can be active in parallel (counted by
674 * the 'watchdog_suspended' variable). If the watchdog threads are
675 * running, the first caller takes care that they will be parked.
676 * The state of 'watchdog_running' cannot change while a suspend
677 * request is active (see related code in 'proc' handlers).
678 */
679 if (watchdog_running && !watchdog_suspended)
680 ret = watchdog_park_threads();
681
682 if (ret == 0)
683 watchdog_suspended++;
684 else {
685 watchdog_disable_all_cpus();
686 pr_err("Failed to suspend lockup detectors, disabled\n");
687 watchdog_enabled = 0;
688 }
689
690 watchdog_nmi_reconfigure();
691
692 mutex_unlock(&watchdog_proc_mutex);
693
694 return ret;
695} 641}
696 642
697/* 643/**
698 * Resume the hard and soft lockup detector by unparking the watchdog threads. 644 * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
645 *
646 * Special interface for parisc. It prevents lockup detector warnings from
647 * the default pm_poweroff() function which busy loops forever.
699 */ 648 */
700void lockup_detector_resume(void) 649void lockup_detector_soft_poweroff(void)
701{ 650{
702 mutex_lock(&watchdog_proc_mutex); 651 watchdog_enabled = 0;
703
704 watchdog_suspended--;
705 /*
706 * The watchdog threads are unparked if they were previously running
707 * and if there is no more active suspend request.
708 */
709 if (watchdog_running && !watchdog_suspended)
710 watchdog_unpark_threads();
711
712 watchdog_nmi_reconfigure();
713
714 mutex_unlock(&watchdog_proc_mutex);
715 put_online_cpus();
716} 652}
717 653
718#ifdef CONFIG_SYSCTL 654#ifdef CONFIG_SYSCTL
719 655
720/* 656/* Propagate any changes to the watchdog threads */
721 * Update the run state of the lockup detectors. 657static void proc_watchdog_update(void)
722 */
723static int proc_watchdog_update(void)
724{ 658{
725 int err = 0; 659 /* Remove impossible cpus to keep sysctl output clean. */
726 660 cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
727 /* 661 lockup_detector_reconfigure();
728 * Watchdog threads won't be started if they are already active.
729 * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
730 * care of this. If those threads are already active, the sample
731 * period will be updated and the lockup detectors will be enabled
732 * or disabled 'on the fly'.
733 */
734 if (watchdog_enabled && watchdog_thresh)
735 err = watchdog_enable_all_cpus();
736 else
737 watchdog_disable_all_cpus();
738
739 watchdog_nmi_reconfigure();
740
741 return err;
742
743} 662}
744 663
745/* 664/*
746 * common function for watchdog, nmi_watchdog and soft_watchdog parameter 665 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
747 * 666 *
748 * caller | table->data points to | 'which' contains the flag(s) 667 * caller | table->data points to | 'which'
749 * -------------------|-----------------------|----------------------------- 668 * -------------------|----------------------------|--------------------------
750 * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed 669 * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED |
751 * | | with SOFT_WATCHDOG_ENABLED 670 * | | SOFT_WATCHDOG_ENABLED
752 * -------------------|-----------------------|----------------------------- 671 * -------------------|----------------------------|--------------------------
753 * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED 672 * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED
754 * -------------------|-----------------------|----------------------------- 673 * -------------------|----------------------------|--------------------------
755 * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED 674 * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
756 */ 675 */
757static int proc_watchdog_common(int which, struct ctl_table *table, int write, 676static int proc_watchdog_common(int which, struct ctl_table *table, int write,
758 void __user *buffer, size_t *lenp, loff_t *ppos) 677 void __user *buffer, size_t *lenp, loff_t *ppos)
759{ 678{
760 int err, old, new; 679 int err, old, *param = table->data;
761 int *watchdog_param = (int *)table->data;
762 680
763 get_online_cpus(); 681 mutex_lock(&watchdog_mutex);
764 mutex_lock(&watchdog_proc_mutex);
765 682
766 if (watchdog_suspended) {
767 /* no parameter changes allowed while watchdog is suspended */
768 err = -EAGAIN;
769 goto out;
770 }
771
772 /*
773 * If the parameter is being read return the state of the corresponding
774 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
775 * run state of the lockup detectors.
776 */
777 if (!write) { 683 if (!write) {
778 *watchdog_param = (watchdog_enabled & which) != 0; 684 /*
685 * On read synchronize the userspace interface. This is a
686 * racy snapshot.
687 */
688 *param = (watchdog_enabled & which) != 0;
779 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 689 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
780 } else { 690 } else {
691 old = READ_ONCE(*param);
781 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 692 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
782 if (err) 693 if (!err && old != READ_ONCE(*param))
783 goto out; 694 proc_watchdog_update();
784
785 /*
786 * There is a race window between fetching the current value
787 * from 'watchdog_enabled' and storing the new value. During
788 * this race window, watchdog_nmi_enable() can sneak in and
789 * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
790 * The 'cmpxchg' detects this race and the loop retries.
791 */
792 do {
793 old = watchdog_enabled;
794 /*
795 * If the parameter value is not zero set the
796 * corresponding bit(s), else clear it(them).
797 */
798 if (*watchdog_param)
799 new = old | which;
800 else
801 new = old & ~which;
802 } while (cmpxchg(&watchdog_enabled, old, new) != old);
803
804 /*
805 * Update the run state of the lockup detectors. There is _no_
806 * need to check the value returned by proc_watchdog_update()
807 * and to restore the previous value of 'watchdog_enabled' as
808 * both lockup detectors are disabled if proc_watchdog_update()
809 * returns an error.
810 */
811 if (old == new)
812 goto out;
813
814 err = proc_watchdog_update();
815 } 695 }
816out: 696 mutex_unlock(&watchdog_mutex);
817 mutex_unlock(&watchdog_proc_mutex);
818 put_online_cpus();
819 return err; 697 return err;
820} 698}
821 699
@@ -835,6 +713,8 @@ int proc_watchdog(struct ctl_table *table, int write,
835int proc_nmi_watchdog(struct ctl_table *table, int write, 713int proc_nmi_watchdog(struct ctl_table *table, int write,
836 void __user *buffer, size_t *lenp, loff_t *ppos) 714 void __user *buffer, size_t *lenp, loff_t *ppos)
837{ 715{
716 if (!nmi_watchdog_available && write)
717 return -ENOTSUPP;
838 return proc_watchdog_common(NMI_WATCHDOG_ENABLED, 718 return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
839 table, write, buffer, lenp, ppos); 719 table, write, buffer, lenp, ppos);
840} 720}
@@ -855,39 +735,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
855int proc_watchdog_thresh(struct ctl_table *table, int write, 735int proc_watchdog_thresh(struct ctl_table *table, int write,
856 void __user *buffer, size_t *lenp, loff_t *ppos) 736 void __user *buffer, size_t *lenp, loff_t *ppos)
857{ 737{
858 int err, old, new; 738 int err, old;
859
860 get_online_cpus();
861 mutex_lock(&watchdog_proc_mutex);
862 739
863 if (watchdog_suspended) { 740 mutex_lock(&watchdog_mutex);
864 /* no parameter changes allowed while watchdog is suspended */
865 err = -EAGAIN;
866 goto out;
867 }
868 741
869 old = ACCESS_ONCE(watchdog_thresh); 742 old = READ_ONCE(watchdog_thresh);
870 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 743 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
871 744
872 if (err || !write) 745 if (!err && write && old != READ_ONCE(watchdog_thresh))
873 goto out; 746 proc_watchdog_update();
874
875 /*
876 * Update the sample period. Restore on failure.
877 */
878 new = ACCESS_ONCE(watchdog_thresh);
879 if (old == new)
880 goto out;
881 747
882 set_sample_period(); 748 mutex_unlock(&watchdog_mutex);
883 err = proc_watchdog_update();
884 if (err) {
885 watchdog_thresh = old;
886 set_sample_period();
887 }
888out:
889 mutex_unlock(&watchdog_proc_mutex);
890 put_online_cpus();
891 return err; 749 return err;
892} 750}
893 751
@@ -902,45 +760,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
902{ 760{
903 int err; 761 int err;
904 762
905 get_online_cpus(); 763 mutex_lock(&watchdog_mutex);
906 mutex_lock(&watchdog_proc_mutex);
907
908 if (watchdog_suspended) {
909 /* no parameter changes allowed while watchdog is suspended */
910 err = -EAGAIN;
911 goto out;
912 }
913 764
914 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); 765 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
915 if (!err && write) { 766 if (!err && write)
916 /* Remove impossible cpus to keep sysctl output cleaner. */ 767 proc_watchdog_update();
917 cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
918 cpu_possible_mask);
919
920 if (watchdog_running) {
921 /*
922 * Failure would be due to being unable to allocate
923 * a temporary cpumask, so we are likely not in a
924 * position to do much else to make things better.
925 */
926 if (watchdog_update_cpus() != 0)
927 pr_err("cpumask update failed\n");
928 }
929 768
930 watchdog_nmi_reconfigure(); 769 mutex_unlock(&watchdog_mutex);
931 }
932out:
933 mutex_unlock(&watchdog_proc_mutex);
934 put_online_cpus();
935 return err; 770 return err;
936} 771}
937
938#endif /* CONFIG_SYSCTL */ 772#endif /* CONFIG_SYSCTL */
939 773
940void __init lockup_detector_init(void) 774void __init lockup_detector_init(void)
941{ 775{
942 set_sample_period();
943
944#ifdef CONFIG_NO_HZ_FULL 776#ifdef CONFIG_NO_HZ_FULL
945 if (tick_nohz_full_enabled()) { 777 if (tick_nohz_full_enabled()) {
946 pr_info("Disabling watchdog on nohz_full cores by default\n"); 778 pr_info("Disabling watchdog on nohz_full cores by default\n");
@@ -951,6 +783,7 @@ void __init lockup_detector_init(void)
951 cpumask_copy(&watchdog_cpumask, cpu_possible_mask); 783 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
952#endif 784#endif
953 785
954 if (watchdog_enabled) 786 if (!watchdog_nmi_probe())
955 watchdog_enable_all_cpus(); 787 nmi_watchdog_available = true;
788 lockup_detector_setup();
956} 789}
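The sysctl handlers reworked above back the standard files under /proc/sys/kernel (watchdog, nmi_watchdog, soft_watchdog, watchdog_thresh, watchdog_cpumask). Below is a minimal userspace sketch of how they might be exercised, assuming root and the usual proc paths; with the rewritten handlers a write only triggers proc_watchdog_update() when the stored value actually changes, so rewriting the same threshold is a no-op:

/*
 * Sketch only: read the current watchdog sysctls and write the threshold
 * back unchanged (which the new proc_watchdog_thresh() treats as a no-op).
 */
#include <stdio.h>
#include <stdlib.h>

static int read_int(const char *path, int *val)
{
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%d", val) != 1)
		*val = -1;
	fclose(f);
	return 0;
}

static int write_int(const char *path, int val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	int thresh, nmi;

	if (read_int("/proc/sys/kernel/watchdog_thresh", &thresh) ||
	    read_int("/proc/sys/kernel/nmi_watchdog", &nmi)) {
		perror("read watchdog sysctls");
		return EXIT_FAILURE;
	}
	printf("watchdog_thresh=%d nmi_watchdog=%d\n", thresh, nmi);

	/* Writing the same value does not trigger a reconfigure. */
	if (write_int("/proc/sys/kernel/watchdog_thresh", thresh))
		perror("write watchdog_thresh");
	return 0;
}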
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 3a09ea1b1d3d..71a62ceacdc8 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,8 +21,10 @@
21static DEFINE_PER_CPU(bool, hard_watchdog_warn); 21static DEFINE_PER_CPU(bool, hard_watchdog_warn);
22static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 22static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
23static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 23static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
24static struct cpumask dead_events_mask;
24 25
25static unsigned long hardlockup_allcpu_dumped; 26static unsigned long hardlockup_allcpu_dumped;
27static unsigned int watchdog_cpus;
26 28
27void arch_touch_nmi_watchdog(void) 29void arch_touch_nmi_watchdog(void)
28{ 30{
@@ -103,15 +105,12 @@ static struct perf_event_attr wd_hw_attr = {
103 105
104/* Callback function for perf event subsystem */ 106/* Callback function for perf event subsystem */
105static void watchdog_overflow_callback(struct perf_event *event, 107static void watchdog_overflow_callback(struct perf_event *event,
106 struct perf_sample_data *data, 108 struct perf_sample_data *data,
107 struct pt_regs *regs) 109 struct pt_regs *regs)
108{ 110{
109 /* Ensure the watchdog never gets throttled */ 111 /* Ensure the watchdog never gets throttled */
110 event->hw.interrupts = 0; 112 event->hw.interrupts = 0;
111 113
112 if (atomic_read(&watchdog_park_in_progress) != 0)
113 return;
114
115 if (__this_cpu_read(watchdog_nmi_touch) == true) { 114 if (__this_cpu_read(watchdog_nmi_touch) == true) {
116 __this_cpu_write(watchdog_nmi_touch, false); 115 __this_cpu_write(watchdog_nmi_touch, false);
117 return; 116 return;
@@ -160,104 +159,131 @@ static void watchdog_overflow_callback(struct perf_event *event,
160 return; 159 return;
161} 160}
162 161
163/* 162static int hardlockup_detector_event_create(void)
164 * People like the simple clean cpu node info on boot.
165 * Reduce the watchdog noise by only printing messages
166 * that are different from what cpu0 displayed.
167 */
168static unsigned long firstcpu_err;
169static atomic_t watchdog_cpus;
170
171int watchdog_nmi_enable(unsigned int cpu)
172{ 163{
164 unsigned int cpu = smp_processor_id();
173 struct perf_event_attr *wd_attr; 165 struct perf_event_attr *wd_attr;
174 struct perf_event *event = per_cpu(watchdog_ev, cpu); 166 struct perf_event *evt;
175 int firstcpu = 0;
176
177 /* nothing to do if the hard lockup detector is disabled */
178 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
179 goto out;
180
181 /* is it already setup and enabled? */
182 if (event && event->state > PERF_EVENT_STATE_OFF)
183 goto out;
184
185 /* it is setup but not enabled */
186 if (event != NULL)
187 goto out_enable;
188
189 if (atomic_inc_return(&watchdog_cpus) == 1)
190 firstcpu = 1;
191 167
192 wd_attr = &wd_hw_attr; 168 wd_attr = &wd_hw_attr;
193 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); 169 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
194 170
195 /* Try to register using hardware perf events */ 171 /* Try to register using hardware perf events */
196 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 172 evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
173 watchdog_overflow_callback, NULL);
174 if (IS_ERR(evt)) {
175 pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
176 PTR_ERR(evt));
177 return PTR_ERR(evt);
178 }
179 this_cpu_write(watchdog_ev, evt);
180 return 0;
181}
197 182
198 /* save the first cpu's error for future comparision */ 183/**
199 if (firstcpu && IS_ERR(event)) 184 * hardlockup_detector_perf_enable - Enable the local event
200 firstcpu_err = PTR_ERR(event); 185 */
186void hardlockup_detector_perf_enable(void)
187{
188 if (hardlockup_detector_event_create())
189 return;
201 190
202 if (!IS_ERR(event)) { 191 if (!watchdog_cpus++)
203 /* only print for the first cpu initialized */ 192 pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
204 if (firstcpu || firstcpu_err)
205 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
206 goto out_save;
207 }
208 193
209 /* 194 perf_event_enable(this_cpu_read(watchdog_ev));
210 * Disable the hard lockup detector if _any_ CPU fails to set up
211 * set up the hardware perf event. The watchdog() function checks
212 * the NMI_WATCHDOG_ENABLED bit periodically.
213 *
214 * The barriers are for syncing up watchdog_enabled across all the
215 * cpus, as clear_bit() does not use barriers.
216 */
217 smp_mb__before_atomic();
218 clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
219 smp_mb__after_atomic();
220
221 /* skip displaying the same error again */
222 if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
223 return PTR_ERR(event);
224
225 /* vary the KERN level based on the returned errno */
226 if (PTR_ERR(event) == -EOPNOTSUPP)
227 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
228 else if (PTR_ERR(event) == -ENOENT)
229 pr_warn("disabled (cpu%i): hardware events not enabled\n",
230 cpu);
231 else
232 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
233 cpu, PTR_ERR(event));
234
235 pr_info("Shutting down hard lockup detector on all cpus\n");
236
237 return PTR_ERR(event);
238
239 /* success path */
240out_save:
241 per_cpu(watchdog_ev, cpu) = event;
242out_enable:
243 perf_event_enable(per_cpu(watchdog_ev, cpu));
244out:
245 return 0;
246} 195}
247 196
248void watchdog_nmi_disable(unsigned int cpu) 197/**
198 * hardlockup_detector_perf_disable - Disable the local event
199 */
200void hardlockup_detector_perf_disable(void)
249{ 201{
250 struct perf_event *event = per_cpu(watchdog_ev, cpu); 202 struct perf_event *event = this_cpu_read(watchdog_ev);
251 203
252 if (event) { 204 if (event) {
253 perf_event_disable(event); 205 perf_event_disable(event);
206 cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
207 watchdog_cpus--;
208 }
209}
210
211/**
212 * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
213 *
214 * Called from lockup_detector_cleanup(). Serialized by the caller.
215 */
216void hardlockup_detector_perf_cleanup(void)
217{
218 int cpu;
219
220 for_each_cpu(cpu, &dead_events_mask) {
221 struct perf_event *event = per_cpu(watchdog_ev, cpu);
222
223 /*
224 * Required because for_each_cpu() unconditionally reports
225 * CPU0 as set on UP kernels. Sigh.
226 */
227 if (event)
228 perf_event_release_kernel(event);
254 per_cpu(watchdog_ev, cpu) = NULL; 229 per_cpu(watchdog_ev, cpu) = NULL;
230 }
231 cpumask_clear(&dead_events_mask);
232}
233
234/**
235 * hardlockup_detector_perf_stop - Globally stop watchdog events
236 *
237 * Special interface for x86 to handle the perf HT bug.
238 */
239void __init hardlockup_detector_perf_stop(void)
240{
241 int cpu;
242
243 lockdep_assert_cpus_held();
244
245 for_each_online_cpu(cpu) {
246 struct perf_event *event = per_cpu(watchdog_ev, cpu);
247
248 if (event)
249 perf_event_disable(event);
250 }
251}
255 252
256 /* should be in cleanup, but blocks oprofile */ 253/**
257 perf_event_release_kernel(event); 254 * hardlockup_detector_perf_restart - Globally restart watchdog events
255 *
256 * Special interface for x86 to handle the perf HT bug.
257 */
258void __init hardlockup_detector_perf_restart(void)
259{
260 int cpu;
261
262 lockdep_assert_cpus_held();
263
264 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
265 return;
266
267 for_each_online_cpu(cpu) {
268 struct perf_event *event = per_cpu(watchdog_ev, cpu);
269
270 if (event)
271 perf_event_enable(event);
272 }
273}
274
275/**
276 * hardlockup_detector_perf_init - Probe whether NMI event is available at all
277 */
278int __init hardlockup_detector_perf_init(void)
279{
280 int ret = hardlockup_detector_event_create();
258 281
259 /* watchdog_nmi_enable() expects this to be zero initially. */ 282 if (ret) {
260 if (atomic_dec_and_test(&watchdog_cpus)) 283 pr_info("Perf NMI watchdog permanently disabled\n");
261 firstcpu_err = 0; 284 } else {
285 perf_event_release_kernel(this_cpu_read(watchdog_ev));
286 this_cpu_write(watchdog_ev, NULL);
262 } 287 }
288 return ret;
263} 289}
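hardlockup_detector_event_create() above uses perf_event_create_kernel_counter() to bind a hardware cycles event to the local CPU. For readers less familiar with the perf side, the following userspace sketch creates a comparable CPU-bound cycles counter through perf_event_open(2); the sample period and CPU number are arbitrary illustrations (the kernel derives its period from watchdog_thresh via hw_nmi_get_sample_period()), and running it requires sufficient perf privileges:

/*
 * Userspace sketch: a CPU-cycles counter bound to CPU 0, loosely analogous
 * to the per-CPU event set up by hardlockup_detector_event_create().
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000000ULL;	/* illustrative period */
	attr.disabled = 1;

	/* pid == -1, cpu == 0: count everything running on CPU 0 */
	fd = perf_event_open(&attr, -1, 0, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);	/* mirrors perf_event_enable() */
	sleep(1);
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);	/* mirrors perf_event_disable() */
	close(fd);
	return 0;
}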