Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                       |   6
-rw-r--r--  kernel/audit.c                        |  49
-rw-r--r--  kernel/audit.h                        |   3
-rw-r--r--  kernel/audit_tree.c                   |  92
-rw-r--r--  kernel/audit_watch.c                  |  14
-rw-r--r--  kernel/auditsc.c                      |  15
-rw-r--r--  kernel/bpf/core.c                     |  12
-rw-r--r--  kernel/bpf/verifier.c                 |  12
-rw-r--r--  kernel/events/core.c                  |  41
-rw-r--r--  kernel/fork.c                         | 141
-rw-r--r--  kernel/gcov/base.c                    |   5
-rw-r--r--  kernel/irq/dummychip.c                |   1
-rw-r--r--  kernel/kexec.c                        |   2
-rw-r--r--  kernel/locking/lockdep.c              |  16
-rw-r--r--  kernel/locking/rtmutex.c              |  12
-rw-r--r--  kernel/module.c                       |   9
-rw-r--r--  kernel/params.c                       |   4
-rw-r--r--  kernel/pid.c                          |  15
-rw-r--r--  kernel/printk/printk.c                |  53
-rw-r--r--  kernel/ptrace.c                       |  39
-rw-r--r--  kernel/rcu/tree.c                     |  16
-rw-r--r--  kernel/relay.c                        |   4
-rw-r--r--  kernel/sched/core.c                   |  69
-rw-r--r--  kernel/sched/idle.c                   |  16
-rw-r--r--  kernel/signal.c                       |  14
-rw-r--r--  kernel/smp.c                          |  80
-rw-r--r--  kernel/sys.c                          |  47
-rw-r--r--  kernel/sysctl.c                       |  16
-rw-r--r--  kernel/time/clockevents.c             |   8
-rw-r--r--  kernel/trace/trace.c                  |   2
-rw-r--r--  kernel/trace/trace_events.c           |  19
-rw-r--r--  kernel/trace/trace_functions_graph.c  |   8
-rw-r--r--  kernel/trace/trace_output.c           |   3
-rw-r--r--  kernel/trace/trace_uprobe.c           |   2
-rw-r--r--  kernel/watchdog.c                     |  20
35 files changed, 530 insertions(+), 335 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 0f8f8b0bc1bf..60c302cfb4d3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -197,9 +197,9 @@ x509.genkey:
197 @echo >>x509.genkey "x509_extensions = myexts" 197 @echo >>x509.genkey "x509_extensions = myexts"
198 @echo >>x509.genkey 198 @echo >>x509.genkey
199 @echo >>x509.genkey "[ req_distinguished_name ]" 199 @echo >>x509.genkey "[ req_distinguished_name ]"
200 @echo >>x509.genkey "O = Magrathea" 200 @echo >>x509.genkey "#O = Unspecified company"
201 @echo >>x509.genkey "CN = Glacier signing key" 201 @echo >>x509.genkey "CN = Build time autogenerated kernel key"
202 @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2" 202 @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
203 @echo >>x509.genkey 203 @echo >>x509.genkey
204 @echo >>x509.genkey "[ myexts ]" 204 @echo >>x509.genkey "[ myexts ]"
205 @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" 205 @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
diff --git a/kernel/audit.c b/kernel/audit.c
index 72ab759a0b43..1c13e4267de6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,6 +43,7 @@
43 43
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45 45
46#include <linux/file.h>
46#include <linux/init.h> 47#include <linux/init.h>
47#include <linux/types.h> 48#include <linux/types.h>
48#include <linux/atomic.h> 49#include <linux/atomic.h>
@@ -107,6 +108,7 @@ static u32 audit_rate_limit;
107 * When set to zero, this means unlimited. */ 108 * When set to zero, this means unlimited. */
108static u32 audit_backlog_limit = 64; 109static u32 audit_backlog_limit = 64;
109#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) 110#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
111static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
110static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; 112static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
111static u32 audit_backlog_wait_overflow = 0; 113static u32 audit_backlog_wait_overflow = 0;
112 114
@@ -338,13 +340,13 @@ static int audit_set_backlog_limit(u32 limit)
338static int audit_set_backlog_wait_time(u32 timeout) 340static int audit_set_backlog_wait_time(u32 timeout)
339{ 341{
340 return audit_do_config_change("audit_backlog_wait_time", 342 return audit_do_config_change("audit_backlog_wait_time",
341 &audit_backlog_wait_time, timeout); 343 &audit_backlog_wait_time_master, timeout);
342} 344}
343 345
344static int audit_set_enabled(u32 state) 346static int audit_set_enabled(u32 state)
345{ 347{
346 int rc; 348 int rc;
347 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 349 if (state > AUDIT_LOCKED)
348 return -EINVAL; 350 return -EINVAL;
349 351
350 rc = audit_do_config_change("audit_enabled", &audit_enabled, state); 352 rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
@@ -663,7 +665,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
663 case AUDIT_MAKE_EQUIV: 665 case AUDIT_MAKE_EQUIV:
664 /* Only support auditd and auditctl in initial pid namespace 666 /* Only support auditd and auditctl in initial pid namespace
665 * for now. */ 667 * for now. */
666 if ((task_active_pid_ns(current) != &init_pid_ns)) 668 if (task_active_pid_ns(current) != &init_pid_ns)
667 return -EPERM; 669 return -EPERM;
668 670
669 if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) 671 if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
@@ -834,7 +836,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
834 s.lost = atomic_read(&audit_lost); 836 s.lost = atomic_read(&audit_lost);
835 s.backlog = skb_queue_len(&audit_skb_queue); 837 s.backlog = skb_queue_len(&audit_skb_queue);
836 s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; 838 s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
837 s.backlog_wait_time = audit_backlog_wait_time; 839 s.backlog_wait_time = audit_backlog_wait_time_master;
838 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); 840 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
839 break; 841 break;
840 } 842 }
@@ -877,8 +879,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
877 if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { 879 if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
878 if (sizeof(s) > (size_t)nlh->nlmsg_len) 880 if (sizeof(s) > (size_t)nlh->nlmsg_len)
879 return -EINVAL; 881 return -EINVAL;
880 if (s.backlog_wait_time < 0 || 882 if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
881 s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
882 return -EINVAL; 883 return -EINVAL;
883 err = audit_set_backlog_wait_time(s.backlog_wait_time); 884 err = audit_set_backlog_wait_time(s.backlog_wait_time);
884 if (err < 0) 885 if (err < 0)
@@ -1385,7 +1386,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1385 return NULL; 1386 return NULL;
1386 } 1387 }
1387 1388
1388 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; 1389 if (!reserve)
1390 audit_backlog_wait_time = audit_backlog_wait_time_master;
1389 1391
1390 ab = audit_buffer_alloc(ctx, gfp_mask, type); 1392 ab = audit_buffer_alloc(ctx, gfp_mask, type);
1391 if (!ab) { 1393 if (!ab) {
@@ -1759,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
1759 } else 1761 } else
1760 audit_log_format(ab, " name=(null)"); 1762 audit_log_format(ab, " name=(null)");
1761 1763
1762 if (n->ino != (unsigned long)-1) { 1764 if (n->ino != (unsigned long)-1)
1763 audit_log_format(ab, " inode=%lu" 1765 audit_log_format(ab, " inode=%lu"
1764 " dev=%02x:%02x mode=%#ho" 1766 " dev=%02x:%02x mode=%#ho"
1765 " ouid=%u ogid=%u rdev=%02x:%02x", 1767 " ouid=%u ogid=%u rdev=%02x:%02x",
@@ -1771,7 +1773,6 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
1771 from_kgid(&init_user_ns, n->gid), 1773 from_kgid(&init_user_ns, n->gid),
1772 MAJOR(n->rdev), 1774 MAJOR(n->rdev),
1773 MINOR(n->rdev)); 1775 MINOR(n->rdev));
1774 }
1775 if (n->osid != 0) { 1776 if (n->osid != 0) {
1776 char *ctx = NULL; 1777 char *ctx = NULL;
1777 u32 len; 1778 u32 len;
@@ -1838,11 +1839,29 @@ error_path:
1838} 1839}
1839EXPORT_SYMBOL(audit_log_task_context); 1840EXPORT_SYMBOL(audit_log_task_context);
1840 1841
1842void audit_log_d_path_exe(struct audit_buffer *ab,
1843 struct mm_struct *mm)
1844{
1845 struct file *exe_file;
1846
1847 if (!mm)
1848 goto out_null;
1849
1850 exe_file = get_mm_exe_file(mm);
1851 if (!exe_file)
1852 goto out_null;
1853
1854 audit_log_d_path(ab, " exe=", &exe_file->f_path);
1855 fput(exe_file);
1856 return;
1857out_null:
1858 audit_log_format(ab, " exe=(null)");
1859}
1860
1841void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 1861void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1842{ 1862{
1843 const struct cred *cred; 1863 const struct cred *cred;
1844 char comm[sizeof(tsk->comm)]; 1864 char comm[sizeof(tsk->comm)];
1845 struct mm_struct *mm = tsk->mm;
1846 char *tty; 1865 char *tty;
1847 1866
1848 if (!ab) 1867 if (!ab)
@@ -1878,13 +1897,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1878 audit_log_format(ab, " comm="); 1897 audit_log_format(ab, " comm=");
1879 audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); 1898 audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
1880 1899
1881 if (mm) { 1900 audit_log_d_path_exe(ab, tsk->mm);
1882 down_read(&mm->mmap_sem);
1883 if (mm->exe_file)
1884 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
1885 up_read(&mm->mmap_sem);
1886 } else
1887 audit_log_format(ab, " exe=(null)");
1888 audit_log_task_context(ab); 1901 audit_log_task_context(ab);
1889} 1902}
1890EXPORT_SYMBOL(audit_log_task_info); 1903EXPORT_SYMBOL(audit_log_task_info);
@@ -1915,7 +1928,7 @@ void audit_log_link_denied(const char *operation, struct path *link)
1915 1928
1916 /* Generate AUDIT_PATH record with object. */ 1929 /* Generate AUDIT_PATH record with object. */
1917 name->type = AUDIT_TYPE_NORMAL; 1930 name->type = AUDIT_TYPE_NORMAL;
1918 audit_copy_inode(name, link->dentry, link->dentry->d_inode); 1931 audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry));
1919 audit_log_name(current->audit_context, name, link, 0, NULL); 1932 audit_log_name(current->audit_context, name, link, 0, NULL);
1920out: 1933out:
1921 kfree(name); 1934 kfree(name);
diff --git a/kernel/audit.h b/kernel/audit.h
index 1caa0d345d90..d641f9bb3ed0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -257,6 +257,9 @@ extern struct list_head audit_filter_list[];
257 257
258extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); 258extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
259 259
260extern void audit_log_d_path_exe(struct audit_buffer *ab,
261 struct mm_struct *mm);
262
260/* audit watch functions */ 263/* audit watch functions */
261#ifdef CONFIG_AUDIT_WATCH 264#ifdef CONFIG_AUDIT_WATCH
262extern void audit_put_watch(struct audit_watch *watch); 265extern void audit_put_watch(struct audit_watch *watch);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2e0c97427b33..b0f9877273fc 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -37,6 +37,7 @@ struct audit_chunk {
37 37
38static LIST_HEAD(tree_list); 38static LIST_HEAD(tree_list);
39static LIST_HEAD(prune_list); 39static LIST_HEAD(prune_list);
40static struct task_struct *prune_thread;
40 41
41/* 42/*
42 * One struct chunk is attached to each inode of interest. 43 * One struct chunk is attached to each inode of interest.
@@ -576,7 +577,7 @@ int audit_remove_tree_rule(struct audit_krule *rule)
576 577
577static int compare_root(struct vfsmount *mnt, void *arg) 578static int compare_root(struct vfsmount *mnt, void *arg)
578{ 579{
579 return mnt->mnt_root->d_inode == arg; 580 return d_backing_inode(mnt->mnt_root) == arg;
580} 581}
581 582
582void audit_trim_trees(void) 583void audit_trim_trees(void)
@@ -648,7 +649,58 @@ void audit_put_tree(struct audit_tree *tree)
648 649
649static int tag_mount(struct vfsmount *mnt, void *arg) 650static int tag_mount(struct vfsmount *mnt, void *arg)
650{ 651{
651 return tag_chunk(mnt->mnt_root->d_inode, arg); 652 return tag_chunk(d_backing_inode(mnt->mnt_root), arg);
653}
654
655/*
656 * That gets run when evict_chunk() ends up needing to kill audit_tree.
657 * Runs from a separate thread.
658 */
659static int prune_tree_thread(void *unused)
660{
661 for (;;) {
662 set_current_state(TASK_INTERRUPTIBLE);
663 if (list_empty(&prune_list))
664 schedule();
665 __set_current_state(TASK_RUNNING);
666
667 mutex_lock(&audit_cmd_mutex);
668 mutex_lock(&audit_filter_mutex);
669
670 while (!list_empty(&prune_list)) {
671 struct audit_tree *victim;
672
673 victim = list_entry(prune_list.next,
674 struct audit_tree, list);
675 list_del_init(&victim->list);
676
677 mutex_unlock(&audit_filter_mutex);
678
679 prune_one(victim);
680
681 mutex_lock(&audit_filter_mutex);
682 }
683
684 mutex_unlock(&audit_filter_mutex);
685 mutex_unlock(&audit_cmd_mutex);
686 }
687 return 0;
688}
689
690static int audit_launch_prune(void)
691{
692 if (prune_thread)
693 return 0;
694 prune_thread = kthread_create(prune_tree_thread, NULL,
695 "audit_prune_tree");
696 if (IS_ERR(prune_thread)) {
697 pr_err("cannot start thread audit_prune_tree");
698 prune_thread = NULL;
699 return -ENOMEM;
700 } else {
701 wake_up_process(prune_thread);
702 return 0;
703 }
652} 704}
653 705
654/* called with audit_filter_mutex */ 706/* called with audit_filter_mutex */
@@ -674,6 +726,12 @@ int audit_add_tree_rule(struct audit_krule *rule)
674 /* do not set rule->tree yet */ 726 /* do not set rule->tree yet */
675 mutex_unlock(&audit_filter_mutex); 727 mutex_unlock(&audit_filter_mutex);
676 728
729 if (unlikely(!prune_thread)) {
730 err = audit_launch_prune();
731 if (err)
732 goto Err;
733 }
734
677 err = kern_path(tree->pathname, 0, &path); 735 err = kern_path(tree->pathname, 0, &path);
678 if (err) 736 if (err)
679 goto Err; 737 goto Err;
@@ -811,36 +869,10 @@ int audit_tag_tree(char *old, char *new)
811 return failed; 869 return failed;
812} 870}
813 871
814/*
815 * That gets run when evict_chunk() ends up needing to kill audit_tree.
816 * Runs from a separate thread.
817 */
818static int prune_tree_thread(void *unused)
819{
820 mutex_lock(&audit_cmd_mutex);
821 mutex_lock(&audit_filter_mutex);
822
823 while (!list_empty(&prune_list)) {
824 struct audit_tree *victim;
825
826 victim = list_entry(prune_list.next, struct audit_tree, list);
827 list_del_init(&victim->list);
828
829 mutex_unlock(&audit_filter_mutex);
830
831 prune_one(victim);
832
833 mutex_lock(&audit_filter_mutex);
834 }
835
836 mutex_unlock(&audit_filter_mutex);
837 mutex_unlock(&audit_cmd_mutex);
838 return 0;
839}
840 872
841static void audit_schedule_prune(void) 873static void audit_schedule_prune(void)
842{ 874{
843 kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); 875 wake_up_process(prune_thread);
844} 876}
845 877
846/* 878/*
@@ -907,9 +939,9 @@ static void evict_chunk(struct audit_chunk *chunk)
907 for (n = 0; n < chunk->count; n++) 939 for (n = 0; n < chunk->count; n++)
908 list_del_init(&chunk->owners[n].list); 940 list_del_init(&chunk->owners[n].list);
909 spin_unlock(&hash_lock); 941 spin_unlock(&hash_lock);
942 mutex_unlock(&audit_filter_mutex);
910 if (need_prune) 943 if (need_prune)
911 audit_schedule_prune(); 944 audit_schedule_prune();
912 mutex_unlock(&audit_filter_mutex);
913} 945}
914 946
915static int audit_tree_handle_event(struct fsnotify_group *group, 947static int audit_tree_handle_event(struct fsnotify_group *group,
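
One way to read the audit_tree.c change above: instead of spawning a fresh kthread via kthread_run() every time a chunk is evicted, a single long-lived prune thread is created lazily and afterwards only woken with wake_up_process(). A hedged, stripped-down sketch of that park-and-wake worker loop (the real code checks prune_list under audit_cmd_mutex/audit_filter_mutex; 'have_work' below is purely illustrative):

#include <linux/kthread.h>
#include <linux/sched.h>

static bool have_work;	/* stand-in for !list_empty(&prune_list) */

static int prune_worker(void *unused)
{
	for (;;) {
		/* Park until someone calls wake_up_process() on us. */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!have_work)
			schedule();
		__set_current_state(TASK_RUNNING);

		/* ... take the relevant mutexes and drain the work list ... */
	}
	return 0;
}

With the thread started once (kthread_create() followed by wake_up_process()), audit_schedule_prune() reduces to a single wake_up_process(prune_thread) call.
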
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index ad9c1682f616..6e30024d9aac 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -146,7 +146,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct path *path) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = path->dentry->d_inode; 149 struct inode *inode = d_backing_inode(path->dentry);
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -361,11 +361,11 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
361 struct dentry *d = kern_path_locked(watch->path, parent); 361 struct dentry *d = kern_path_locked(watch->path, parent);
362 if (IS_ERR(d)) 362 if (IS_ERR(d))
363 return PTR_ERR(d); 363 return PTR_ERR(d);
364 mutex_unlock(&parent->dentry->d_inode->i_mutex); 364 mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex);
365 if (d->d_inode) { 365 if (d_is_positive(d)) {
366 /* update watch filter fields */ 366 /* update watch filter fields */
367 watch->dev = d->d_inode->i_sb->s_dev; 367 watch->dev = d_backing_inode(d)->i_sb->s_dev;
368 watch->ino = d->d_inode->i_ino; 368 watch->ino = d_backing_inode(d)->i_ino;
369 } 369 }
370 dput(d); 370 dput(d);
371 return 0; 371 return 0;
@@ -426,7 +426,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
426 return ret; 426 return ret;
427 427
428 /* either find an old parent or attach a new one */ 428 /* either find an old parent or attach a new one */
429 parent = audit_find_parent(parent_path.dentry->d_inode); 429 parent = audit_find_parent(d_backing_inode(parent_path.dentry));
430 if (!parent) { 430 if (!parent) {
431 parent = audit_init_parent(&parent_path); 431 parent = audit_init_parent(&parent_path);
432 if (IS_ERR(parent)) { 432 if (IS_ERR(parent)) {
@@ -482,7 +482,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
482 482
483 switch (data_type) { 483 switch (data_type) {
484 case (FSNOTIFY_EVENT_PATH): 484 case (FSNOTIFY_EVENT_PATH):
485 inode = ((struct path *)data)->dentry->d_inode; 485 inode = d_backing_inode(((struct path *)data)->dentry);
486 break; 486 break;
487 case (FSNOTIFY_EVENT_INODE): 487 case (FSNOTIFY_EVENT_INODE):
488 inode = (struct inode *)data; 488 inode = (struct inode *)data;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index dc4ae70a7413..9fb9d1cb83ce 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1629,7 +1629,7 @@ retry:
1629 rcu_read_lock(); 1629 rcu_read_lock();
1630 seq = read_seqbegin(&rename_lock); 1630 seq = read_seqbegin(&rename_lock);
1631 for(;;) { 1631 for(;;) {
1632 struct inode *inode = d->d_inode; 1632 struct inode *inode = d_backing_inode(d);
1633 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { 1633 if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
1634 struct audit_chunk *chunk; 1634 struct audit_chunk *chunk;
1635 chunk = audit_tree_lookup(inode); 1635 chunk = audit_tree_lookup(inode);
@@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
1754 unsigned int flags) 1754 unsigned int flags)
1755{ 1755{
1756 struct audit_context *context = current->audit_context; 1756 struct audit_context *context = current->audit_context;
1757 const struct inode *inode = dentry->d_inode; 1757 const struct inode *inode = d_backing_inode(dentry);
1758 struct audit_names *n; 1758 struct audit_names *n;
1759 bool parent = flags & AUDIT_INODE_PARENT; 1759 bool parent = flags & AUDIT_INODE_PARENT;
1760 1760
@@ -1853,7 +1853,7 @@ void __audit_inode_child(const struct inode *parent,
1853 const unsigned char type) 1853 const unsigned char type)
1854{ 1854{
1855 struct audit_context *context = current->audit_context; 1855 struct audit_context *context = current->audit_context;
1856 const struct inode *inode = dentry->d_inode; 1856 const struct inode *inode = d_backing_inode(dentry);
1857 const char *dname = dentry->d_name.name; 1857 const char *dname = dentry->d_name.name;
1858 struct audit_names *n, *found_parent = NULL, *found_child = NULL; 1858 struct audit_names *n, *found_parent = NULL, *found_child = NULL;
1859 1859
@@ -2361,7 +2361,6 @@ static void audit_log_task(struct audit_buffer *ab)
2361 kuid_t auid, uid; 2361 kuid_t auid, uid;
2362 kgid_t gid; 2362 kgid_t gid;
2363 unsigned int sessionid; 2363 unsigned int sessionid;
2364 struct mm_struct *mm = current->mm;
2365 char comm[sizeof(current->comm)]; 2364 char comm[sizeof(current->comm)];
2366 2365
2367 auid = audit_get_loginuid(current); 2366 auid = audit_get_loginuid(current);
@@ -2376,13 +2375,7 @@ static void audit_log_task(struct audit_buffer *ab)
2376 audit_log_task_context(ab); 2375 audit_log_task_context(ab);
2377 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); 2376 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
2378 audit_log_untrustedstring(ab, get_task_comm(comm, current)); 2377 audit_log_untrustedstring(ab, get_task_comm(comm, current));
2379 if (mm) { 2378 audit_log_d_path_exe(ab, current->mm);
2380 down_read(&mm->mmap_sem);
2381 if (mm->exe_file)
2382 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
2383 up_read(&mm->mmap_sem);
2384 } else
2385 audit_log_format(ab, " exe=(null)");
2386} 2379}
2387 2380
2388/** 2381/**
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4139a0f8b558..54f0e7fcd0e2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -357,8 +357,8 @@ select_insn:
357 ALU64_MOD_X: 357 ALU64_MOD_X:
358 if (unlikely(SRC == 0)) 358 if (unlikely(SRC == 0))
359 return 0; 359 return 0;
360 tmp = DST; 360 div64_u64_rem(DST, SRC, &tmp);
361 DST = do_div(tmp, SRC); 361 DST = tmp;
362 CONT; 362 CONT;
363 ALU_MOD_X: 363 ALU_MOD_X:
364 if (unlikely(SRC == 0)) 364 if (unlikely(SRC == 0))
@@ -367,8 +367,8 @@ select_insn:
367 DST = do_div(tmp, (u32) SRC); 367 DST = do_div(tmp, (u32) SRC);
368 CONT; 368 CONT;
369 ALU64_MOD_K: 369 ALU64_MOD_K:
370 tmp = DST; 370 div64_u64_rem(DST, IMM, &tmp);
371 DST = do_div(tmp, IMM); 371 DST = tmp;
372 CONT; 372 CONT;
373 ALU_MOD_K: 373 ALU_MOD_K:
374 tmp = (u32) DST; 374 tmp = (u32) DST;
@@ -377,7 +377,7 @@ select_insn:
377 ALU64_DIV_X: 377 ALU64_DIV_X:
378 if (unlikely(SRC == 0)) 378 if (unlikely(SRC == 0))
379 return 0; 379 return 0;
380 do_div(DST, SRC); 380 DST = div64_u64(DST, SRC);
381 CONT; 381 CONT;
382 ALU_DIV_X: 382 ALU_DIV_X:
383 if (unlikely(SRC == 0)) 383 if (unlikely(SRC == 0))
@@ -387,7 +387,7 @@ select_insn:
387 DST = (u32) tmp; 387 DST = (u32) tmp;
388 CONT; 388 CONT;
389 ALU64_DIV_K: 389 ALU64_DIV_K:
390 do_div(DST, IMM); 390 DST = div64_u64(DST, IMM);
391 CONT; 391 CONT;
392 ALU_DIV_K: 392 ALU_DIV_K:
393 tmp = (u32) DST; 393 tmp = (u32) DST;
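
Context for the kernel/bpf/core.c hunk above: do_div() takes a 32-bit divisor, so using it for BPF_ALU64 division and modulo silently truncates a 64-bit SRC or IMM, whereas div64_u64() and div64_u64_rem() operate on full 64-bit operands. A small userspace sketch of the intended 64-bit semantics (not the in-kernel helpers themselves):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for div64_u64()/div64_u64_rem(): full 64-by-64
 * unsigned division, with a divide-by-zero guard folded in for the sketch. */
static uint64_t div64(uint64_t dividend, uint64_t divisor)
{
	return divisor ? dividend / divisor : 0;
}

static uint64_t mod64(uint64_t dividend, uint64_t divisor)
{
	return divisor ? dividend % divisor : 0;
}

int main(void)
{
	uint64_t dst = 0x200000003ULL;	/* needs more than 32 bits */
	uint64_t src = 0x100000000ULL;	/* divisor does not fit in a u32 */

	/* Truncating src to 32 bits (what do_div() would effectively do
	 * here) makes the divisor 0; full 64-bit math gives 2 and 3. */
	printf("div=%llu mod=%llu\n",
	       (unsigned long long)div64(dst, src),
	       (unsigned long long)mod64(dst, src));
	return 0;
}
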
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 630a7bac1e51..47dcd3aa6e23 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1397,7 +1397,8 @@ peek_stack:
1397 /* tell verifier to check for equivalent states 1397 /* tell verifier to check for equivalent states
1398 * after every call and jump 1398 * after every call and jump
1399 */ 1399 */
1400 env->explored_states[t + 1] = STATE_LIST_MARK; 1400 if (t + 1 < insn_cnt)
1401 env->explored_states[t + 1] = STATE_LIST_MARK;
1401 } else { 1402 } else {
1402 /* conditional jump with two edges */ 1403 /* conditional jump with two edges */
1403 ret = push_insn(t, t + 1, FALLTHROUGH, env); 1404 ret = push_insn(t, t + 1, FALLTHROUGH, env);
@@ -1636,6 +1637,8 @@ static int do_check(struct verifier_env *env)
1636 if (err) 1637 if (err)
1637 return err; 1638 return err;
1638 1639
1640 src_reg_type = regs[insn->src_reg].type;
1641
1639 /* check that memory (src_reg + off) is readable, 1642 /* check that memory (src_reg + off) is readable,
1640 * the state of dst_reg will be updated by this func 1643 * the state of dst_reg will be updated by this func
1641 */ 1644 */
@@ -1645,9 +1648,12 @@ static int do_check(struct verifier_env *env)
1645 if (err) 1648 if (err)
1646 return err; 1649 return err;
1647 1650
1648 src_reg_type = regs[insn->src_reg].type; 1651 if (BPF_SIZE(insn->code) != BPF_W) {
1652 insn_idx++;
1653 continue;
1654 }
1649 1655
1650 if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { 1656 if (insn->imm == 0) {
1651 /* saw a valid insn 1657 /* saw a valid insn
1652 * dst_reg = *(u32 *)(src_reg + off) 1658 * dst_reg = *(u32 *)(src_reg + off)
1653 * use reserved 'imm' field to mark this insn 1659 * use reserved 'imm' field to mark this insn
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81aa3a4ece9f..1a3bf48743ce 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -913,10 +913,30 @@ static void put_ctx(struct perf_event_context *ctx)
913 * Those places that change perf_event::ctx will hold both 913 * Those places that change perf_event::ctx will hold both
914 * perf_event_ctx::mutex of the 'old' and 'new' ctx value. 914 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
915 * 915 *
916 * Lock ordering is by mutex address. There is one other site where 916 * Lock ordering is by mutex address. There are two other sites where
917 * perf_event_context::mutex nests and that is put_event(). But remember that 917 * perf_event_context::mutex nests and those are:
918 * that is a parent<->child context relation, and migration does not affect 918 *
919 * children, therefore these two orderings should not interact. 919 * - perf_event_exit_task_context() [ child , 0 ]
920 * __perf_event_exit_task()
921 * sync_child_event()
922 * put_event() [ parent, 1 ]
923 *
924 * - perf_event_init_context() [ parent, 0 ]
925 * inherit_task_group()
926 * inherit_group()
927 * inherit_event()
928 * perf_event_alloc()
929 * perf_init_event()
930 * perf_try_init_event() [ child , 1 ]
931 *
932 * While it appears there is an obvious deadlock here -- the parent and child
933 * nesting levels are inverted between the two. This is in fact safe because
934 * life-time rules separate them. That is an exiting task cannot fork, and a
935 * spawning task cannot (yet) exit.
936 *
937 * But remember that that these are parent<->child context relations, and
938 * migration does not affect children, therefore these two orderings should not
939 * interact.
920 * 940 *
921 * The change in perf_event::ctx does not affect children (as claimed above) 941 * The change in perf_event::ctx does not affect children (as claimed above)
922 * because the sys_perf_event_open() case will install a new event and break 942 * because the sys_perf_event_open() case will install a new event and break
@@ -3657,9 +3677,6 @@ static void perf_remove_from_owner(struct perf_event *event)
3657 } 3677 }
3658} 3678}
3659 3679
3660/*
3661 * Called when the last reference to the file is gone.
3662 */
3663static void put_event(struct perf_event *event) 3680static void put_event(struct perf_event *event)
3664{ 3681{
3665 struct perf_event_context *ctx; 3682 struct perf_event_context *ctx;
@@ -3697,6 +3714,9 @@ int perf_event_release_kernel(struct perf_event *event)
3697} 3714}
3698EXPORT_SYMBOL_GPL(perf_event_release_kernel); 3715EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3699 3716
3717/*
3718 * Called when the last reference to the file is gone.
3719 */
3700static int perf_release(struct inode *inode, struct file *file) 3720static int perf_release(struct inode *inode, struct file *file)
3701{ 3721{
3702 put_event(file->private_data); 3722 put_event(file->private_data);
@@ -7364,7 +7384,12 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7364 return -ENODEV; 7384 return -ENODEV;
7365 7385
7366 if (event->group_leader != event) { 7386 if (event->group_leader != event) {
7367 ctx = perf_event_ctx_lock(event->group_leader); 7387 /*
7388 * This ctx->mutex can nest when we're called through
7389 * inheritance. See the perf_event_ctx_lock_nested() comment.
7390 */
7391 ctx = perf_event_ctx_lock_nested(event->group_leader,
7392 SINGLE_DEPTH_NESTING);
7368 BUG_ON(!ctx); 7393 BUG_ON(!ctx);
7369 } 7394 }
7370 7395
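
The perf_try_init_event() hunk above takes the group leader's ctx->mutex with perf_event_ctx_lock_nested(..., SINGLE_DEPTH_NESTING) because, on the inheritance path, a parent and a child perf_event_context::mutex of the same lock class can legitimately be held at once; the long comment added near put_ctx() explains why the two nesting orders cannot deadlock. A generic, hedged sketch of the lockdep annotation involved (not perf code):

#include <linux/mutex.h>
#include <linux/lockdep.h>

/* Two mutexes of the same lock class held together: lockdep would flag
 * the second acquisition as a possible self-deadlock unless the inner
 * one carries a nesting annotation. A stable ordering rule (here:
 * parent before child) is still required for real deadlock freedom. */
static void lock_pair(struct mutex *parent, struct mutex *child)
{
	mutex_lock(parent);
	mutex_lock_nested(child, SINGLE_DEPTH_NESTING);
}

static void unlock_pair(struct mutex *parent, struct mutex *child)
{
	mutex_unlock(child);
	mutex_unlock(parent);
}
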
diff --git a/kernel/fork.c b/kernel/fork.c
index f2c1e7352298..03c1eaaa6ef5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -74,6 +74,7 @@
74#include <linux/uprobes.h> 74#include <linux/uprobes.h>
75#include <linux/aio.h> 75#include <linux/aio.h>
76#include <linux/compiler.h> 76#include <linux/compiler.h>
77#include <linux/sysctl.h>
77 78
78#include <asm/pgtable.h> 79#include <asm/pgtable.h>
79#include <asm/pgalloc.h> 80#include <asm/pgalloc.h>
@@ -88,6 +89,16 @@
88#include <trace/events/task.h> 89#include <trace/events/task.h>
89 90
90/* 91/*
92 * Minimum number of threads to boot the kernel
93 */
94#define MIN_THREADS 20
95
96/*
97 * Maximum number of threads
98 */
99#define MAX_THREADS FUTEX_TID_MASK
100
101/*
91 * Protected counters by write_lock_irq(&tasklist_lock) 102 * Protected counters by write_lock_irq(&tasklist_lock)
92 */ 103 */
93unsigned long total_forks; /* Handle normal Linux uptimes. */ 104unsigned long total_forks; /* Handle normal Linux uptimes. */
@@ -253,7 +264,30 @@ EXPORT_SYMBOL_GPL(__put_task_struct);
253 264
254void __init __weak arch_task_cache_init(void) { } 265void __init __weak arch_task_cache_init(void) { }
255 266
256void __init fork_init(unsigned long mempages) 267/*
268 * set_max_threads
269 */
270static void set_max_threads(unsigned int max_threads_suggested)
271{
272 u64 threads;
273
274 /*
275 * The number of threads shall be limited such that the thread
276 * structures may only consume a small part of the available memory.
277 */
278 if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
279 threads = MAX_THREADS;
280 else
281 threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
282 (u64) THREAD_SIZE * 8UL);
283
284 if (threads > max_threads_suggested)
285 threads = max_threads_suggested;
286
287 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
288}
289
290void __init fork_init(void)
257{ 291{
258#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 292#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
259#ifndef ARCH_MIN_TASKALIGN 293#ifndef ARCH_MIN_TASKALIGN
@@ -268,18 +302,7 @@ void __init fork_init(unsigned long mempages)
268 /* do the arch specific task caches init */ 302 /* do the arch specific task caches init */
269 arch_task_cache_init(); 303 arch_task_cache_init();
270 304
271 /* 305 set_max_threads(MAX_THREADS);
272 * The default maximum number of threads is set to a safe
273 * value: the thread structures can take up at most half
274 * of memory.
275 */
276 max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
277
278 /*
279 * we need to allow at least 20 threads to boot a system
280 */
281 if (max_threads < 20)
282 max_threads = 20;
283 306
284 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; 307 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
285 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; 308 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
@@ -380,6 +403,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
380 */ 403 */
381 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); 404 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
382 405
406 /* No ordering required: file already has been exposed. */
407 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
408
383 mm->total_vm = oldmm->total_vm; 409 mm->total_vm = oldmm->total_vm;
384 mm->shared_vm = oldmm->shared_vm; 410 mm->shared_vm = oldmm->shared_vm;
385 mm->exec_vm = oldmm->exec_vm; 411 mm->exec_vm = oldmm->exec_vm;
@@ -505,7 +531,13 @@ static inline void mm_free_pgd(struct mm_struct *mm)
505 pgd_free(mm, mm->pgd); 531 pgd_free(mm, mm->pgd);
506} 532}
507#else 533#else
508#define dup_mmap(mm, oldmm) (0) 534static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
535{
536 down_write(&oldmm->mmap_sem);
537 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
538 up_write(&oldmm->mmap_sem);
539 return 0;
540}
509#define mm_alloc_pgd(mm) (0) 541#define mm_alloc_pgd(mm) (0)
510#define mm_free_pgd(mm) 542#define mm_free_pgd(mm)
511#endif /* CONFIG_MMU */ 543#endif /* CONFIG_MMU */
@@ -674,34 +706,53 @@ void mmput(struct mm_struct *mm)
674} 706}
675EXPORT_SYMBOL_GPL(mmput); 707EXPORT_SYMBOL_GPL(mmput);
676 708
709/**
710 * set_mm_exe_file - change a reference to the mm's executable file
711 *
712 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
713 *
714 * Main users are mmput() and sys_execve(). Callers prevent concurrent
715 * invocations: in mmput() nobody alive left, in execve task is single
716 * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
717 * mm->exe_file, but does so without using set_mm_exe_file() in order
718 * to do avoid the need for any locks.
719 */
677void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 720void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
678{ 721{
722 struct file *old_exe_file;
723
724 /*
725 * It is safe to dereference the exe_file without RCU as
726 * this function is only called if nobody else can access
727 * this mm -- see comment above for justification.
728 */
729 old_exe_file = rcu_dereference_raw(mm->exe_file);
730
679 if (new_exe_file) 731 if (new_exe_file)
680 get_file(new_exe_file); 732 get_file(new_exe_file);
681 if (mm->exe_file) 733 rcu_assign_pointer(mm->exe_file, new_exe_file);
682 fput(mm->exe_file); 734 if (old_exe_file)
683 mm->exe_file = new_exe_file; 735 fput(old_exe_file);
684} 736}
685 737
738/**
739 * get_mm_exe_file - acquire a reference to the mm's executable file
740 *
741 * Returns %NULL if mm has no associated executable file.
742 * User must release file via fput().
743 */
686struct file *get_mm_exe_file(struct mm_struct *mm) 744struct file *get_mm_exe_file(struct mm_struct *mm)
687{ 745{
688 struct file *exe_file; 746 struct file *exe_file;
689 747
690 /* We need mmap_sem to protect against races with removal of exe_file */ 748 rcu_read_lock();
691 down_read(&mm->mmap_sem); 749 exe_file = rcu_dereference(mm->exe_file);
692 exe_file = mm->exe_file; 750 if (exe_file && !get_file_rcu(exe_file))
693 if (exe_file) 751 exe_file = NULL;
694 get_file(exe_file); 752 rcu_read_unlock();
695 up_read(&mm->mmap_sem);
696 return exe_file; 753 return exe_file;
697} 754}
698 755EXPORT_SYMBOL(get_mm_exe_file);
699static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
700{
701 /* It's safe to write the exe_file pointer without exe_file_lock because
702 * this is called during fork when the task is not yet in /proc */
703 newmm->exe_file = get_mm_exe_file(oldmm);
704}
705 756
706/** 757/**
707 * get_task_mm - acquire a reference to the task's mm 758 * get_task_mm - acquire a reference to the task's mm
@@ -864,8 +915,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
864 if (!mm_init(mm, tsk)) 915 if (!mm_init(mm, tsk))
865 goto fail_nomem; 916 goto fail_nomem;
866 917
867 dup_mm_exe_file(oldmm, mm);
868
869 err = dup_mmap(mm, oldmm); 918 err = dup_mmap(mm, oldmm);
870 if (err) 919 if (err)
871 goto free_pt; 920 goto free_pt;
@@ -1403,10 +1452,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1403 goto bad_fork_cleanup_io; 1452 goto bad_fork_cleanup_io;
1404 1453
1405 if (pid != &init_struct_pid) { 1454 if (pid != &init_struct_pid) {
1406 retval = -ENOMEM;
1407 pid = alloc_pid(p->nsproxy->pid_ns_for_children); 1455 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
1408 if (!pid) 1456 if (IS_ERR(pid)) {
1457 retval = PTR_ERR(pid);
1409 goto bad_fork_cleanup_io; 1458 goto bad_fork_cleanup_io;
1459 }
1410 } 1460 }
1411 1461
1412 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1462 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -2000,3 +2050,26 @@ int unshare_files(struct files_struct **displaced)
2000 task_unlock(task); 2050 task_unlock(task);
2001 return 0; 2051 return 0;
2002} 2052}
2053
2054int sysctl_max_threads(struct ctl_table *table, int write,
2055 void __user *buffer, size_t *lenp, loff_t *ppos)
2056{
2057 struct ctl_table t;
2058 int ret;
2059 int threads = max_threads;
2060 int min = MIN_THREADS;
2061 int max = MAX_THREADS;
2062
2063 t = *table;
2064 t.data = &threads;
2065 t.extra1 = &min;
2066 t.extra2 = &max;
2067
2068 ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2069 if (ret || !write)
2070 return ret;
2071
2072 set_max_threads(threads);
2073
2074 return 0;
2075}
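
The fork.c rework above makes mm->exe_file an RCU-managed pointer: writers publish with rcu_assign_pointer() and drop the old file afterwards, while get_mm_exe_file() readers no longer take mmap_sem and instead keep the file only if get_file_rcu() can raise a still-nonzero refcount. A generic, hedged sketch of that "inc-not-zero under RCU" pattern (illustrative types, not the actual mm code):

#include <linux/rcupdate.h>
#include <linux/atomic.h>
#include <linux/slab.h>

struct obj {
	atomic_t refcnt;
	struct rcu_head rcu;
};

static struct obj __rcu *current_obj;

/* Reader: the published pointer may be replaced at any moment, so take
 * the RCU read lock, load it once, and only succeed if the refcount is
 * still non-zero (otherwise a concurrent final put is tearing it down). */
static struct obj *obj_get_live(void)
{
	struct obj *o;

	rcu_read_lock();
	o = rcu_dereference(current_obj);
	if (o && !atomic_inc_not_zero(&o->refcnt))
		o = NULL;
	rcu_read_unlock();
	return o;
}

/* Writer (callers serialize updates, as set_mm_exe_file()'s comment
 * requires): publish the new object first, then drop the old reference. */
static void obj_replace(struct obj *new_obj)
{
	struct obj *old = rcu_dereference_raw(current_obj);

	rcu_assign_pointer(current_obj, new_obj);
	if (old && atomic_dec_and_test(&old->refcnt))
		kfree_rcu(old, rcu);
}
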
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index b358a802fd18..a744098e4eb7 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -18,6 +18,7 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h>
21#include "gcov.h" 22#include "gcov.h"
22 23
23static int gcov_events_enabled; 24static int gcov_events_enabled;
@@ -107,8 +108,10 @@ void gcov_enable_events(void)
107 gcov_events_enabled = 1; 108 gcov_events_enabled = 1;
108 109
109 /* Perform event callback for previously registered entries. */ 110 /* Perform event callback for previously registered entries. */
110 while ((info = gcov_info_next(info))) 111 while ((info = gcov_info_next(info))) {
111 gcov_event(GCOV_ADD, info); 112 gcov_event(GCOV_ADD, info);
113 cond_resched();
114 }
112 115
113 mutex_unlock(&gcov_lock); 116 mutex_unlock(&gcov_lock);
114} 117}
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 988dc58e8847..2feb6feca0cc 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -57,5 +57,6 @@ struct irq_chip dummy_irq_chip = {
57 .irq_ack = noop, 57 .irq_ack = noop,
58 .irq_mask = noop, 58 .irq_mask = noop,
59 .irq_unmask = noop, 59 .irq_unmask = noop,
60 .flags = IRQCHIP_SKIP_SET_WAKE,
60}; 61};
61EXPORT_SYMBOL_GPL(dummy_irq_chip); 62EXPORT_SYMBOL_GPL(dummy_irq_chip);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 38c25b1f2fd5..7a36fdcca5bf 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -707,7 +707,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
707 do { 707 do {
708 unsigned long pfn, epfn, addr, eaddr; 708 unsigned long pfn, epfn, addr, eaddr;
709 709
710 pages = kimage_alloc_pages(GFP_KERNEL, order); 710 pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
711 if (!pages) 711 if (!pages)
712 break; 712 break;
713 pfn = page_to_pfn(pages); 713 pfn = page_to_pfn(pages);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index ba77ab5f64dd..a0831e1b99f4 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -551,7 +551,21 @@ static void print_lockdep_cache(struct lockdep_map *lock)
551 551
552static void print_lock(struct held_lock *hlock) 552static void print_lock(struct held_lock *hlock)
553{ 553{
554 print_lock_name(hlock_class(hlock)); 554 /*
555 * We can be called locklessly through debug_show_all_locks() so be
556 * extra careful, the hlock might have been released and cleared.
557 */
558 unsigned int class_idx = hlock->class_idx;
559
560 /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
561 barrier();
562
563 if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
564 printk("<RELEASED>\n");
565 return;
566 }
567
568 print_lock_name(lock_classes + class_idx - 1);
555 printk(", at: "); 569 printk(", at: ");
556 print_ip_sym(hlock->acquire_ip); 570 print_ip_sym(hlock->acquire_ip);
557} 571}
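
The print_lock() fix above is the usual defensive shape for reading state that another CPU may clear concurrently: take one snapshot of hlock->class_idx, add a compiler barrier() (READ_ONCE() cannot be applied to a bitfield), and bounds-check the snapshot before using it to index lock_classes[]. Roughly, as a hedged standalone sketch:

#include <linux/compiler.h>

#define MAX_KEYS 8192

struct klass {
	const char *name;
};

static struct klass classes[MAX_KEYS];

struct held {
	unsigned int class_idx:13;	/* 0 means "no class / released" */
};

/* Lockless reader: work from a single snapshot and validate it, since
 * the owner may release the lock and zero class_idx at any time. */
static const char *held_name(const struct held *h)
{
	unsigned int idx = h->class_idx;

	barrier();	/* keep the compiler from re-reading the bitfield */

	if (!idx || (idx - 1) >= MAX_KEYS)
		return "<RELEASED>";
	return classes[idx - 1].name;
}
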
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b73279367087..b025295f4966 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -265,15 +265,17 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
265} 265}
266 266
267/* 267/*
268 * Called by sched_setscheduler() to check whether the priority change 268 * Called by sched_setscheduler() to get the priority which will be
269 * is overruled by a possible priority boosting. 269 * effective after the change.
270 */ 270 */
271int rt_mutex_check_prio(struct task_struct *task, int newprio) 271int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
272{ 272{
273 if (!task_has_pi_waiters(task)) 273 if (!task_has_pi_waiters(task))
274 return 0; 274 return newprio;
275 275
276 return task_top_pi_waiter(task)->task->prio <= newprio; 276 if (task_top_pi_waiter(task)->task->prio <= newprio)
277 return task_top_pi_waiter(task)->task->prio;
278 return newprio;
277} 279}
278 280
279/* 281/*
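
The rename from rt_mutex_check_prio() to rt_mutex_get_effective_prio() changes the contract from "tell me whether boosting overrules the change" to "tell me the priority that will actually be in effect", which sched_setscheduler() can then compare against the old prio. As a hedged model (lower numbers meaning higher priority):

/* With priority-inheritance waiters present, the effective priority is
 * the more urgent (numerically lower) of the requested priority and the
 * top waiter's priority; without waiters it is simply the request. */
static int effective_prio(int newprio, int top_waiter_prio, int has_pi_waiters)
{
	if (!has_pi_waiters)
		return newprio;
	return top_waiter_prio <= newprio ? top_waiter_prio : newprio;
}

The sched/core.c hunk further below uses exactly this result: when the new effective priority equals the old one, __sched_setscheduler() only stores the new parameters and skips the requeue.
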
diff --git a/kernel/module.c b/kernel/module.c
index 650b038ae520..42a1d2afb217 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -387,9 +387,9 @@ static bool check_symbol(const struct symsearch *syms,
387 pr_warn("Symbol %s is marked as UNUSED, however this module is " 387 pr_warn("Symbol %s is marked as UNUSED, however this module is "
388 "using it.\n", fsa->name); 388 "using it.\n", fsa->name);
389 pr_warn("This symbol will go away in the future.\n"); 389 pr_warn("This symbol will go away in the future.\n");
390 pr_warn("Please evalute if this is the right api to use and if " 390 pr_warn("Please evaluate if this is the right api to use and "
391 "it really is, submit a report the linux kernel " 391 "if it really is, submit a report to the linux kernel "
392 "mailinglist together with submitting your code for " 392 "mailing list together with submitting your code for "
393 "inclusion.\n"); 393 "inclusion.\n");
394 } 394 }
395#endif 395#endif
@@ -2511,7 +2511,8 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
2511 return err; 2511 return err;
2512 2512
2513 /* Suck in entire file: we'll want most of it. */ 2513 /* Suck in entire file: we'll want most of it. */
2514 info->hdr = vmalloc(info->len); 2514 info->hdr = __vmalloc(info->len,
2515 GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL);
2515 if (!info->hdr) 2516 if (!info->hdr)
2516 return -ENOMEM; 2517 return -ENOMEM;
2517 2518
diff --git a/kernel/params.c b/kernel/params.c
index 728e05b167de..a22d6a759b1a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -173,9 +173,9 @@ static char *next_arg(char *args, char **param, char **val)
173 if (args[i-1] == '"') 173 if (args[i-1] == '"')
174 args[i-1] = '\0'; 174 args[i-1] = '\0';
175 } 175 }
176 if (quoted && args[i-1] == '"')
177 args[i-1] = '\0';
178 } 176 }
177 if (quoted && args[i-1] == '"')
178 args[i-1] = '\0';
179 179
180 if (args[i]) { 180 if (args[i]) {
181 args[i] = '\0'; 181 args[i] = '\0';
diff --git a/kernel/pid.c b/kernel/pid.c
index cd36a5e0d173..4fd07d5b7baf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -182,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
182 spin_unlock_irq(&pidmap_lock); 182 spin_unlock_irq(&pidmap_lock);
183 kfree(page); 183 kfree(page);
184 if (unlikely(!map->page)) 184 if (unlikely(!map->page))
185 break; 185 return -ENOMEM;
186 } 186 }
187 if (likely(atomic_read(&map->nr_free))) { 187 if (likely(atomic_read(&map->nr_free))) {
188 for ( ; ; ) { 188 for ( ; ; ) {
@@ -210,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
210 } 210 }
211 pid = mk_pid(pid_ns, map, offset); 211 pid = mk_pid(pid_ns, map, offset);
212 } 212 }
213 return -1; 213 return -EAGAIN;
214} 214}
215 215
216int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) 216int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
@@ -301,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns)
301 int i, nr; 301 int i, nr;
302 struct pid_namespace *tmp; 302 struct pid_namespace *tmp;
303 struct upid *upid; 303 struct upid *upid;
304 int retval = -ENOMEM;
304 305
305 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); 306 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
306 if (!pid) 307 if (!pid)
307 goto out; 308 return ERR_PTR(retval);
308 309
309 tmp = ns; 310 tmp = ns;
310 pid->level = ns->level; 311 pid->level = ns->level;
311 for (i = ns->level; i >= 0; i--) { 312 for (i = ns->level; i >= 0; i--) {
312 nr = alloc_pidmap(tmp); 313 nr = alloc_pidmap(tmp);
313 if (nr < 0) 314 if (IS_ERR_VALUE(nr)) {
315 retval = nr;
314 goto out_free; 316 goto out_free;
317 }
315 318
316 pid->numbers[i].nr = nr; 319 pid->numbers[i].nr = nr;
317 pid->numbers[i].ns = tmp; 320 pid->numbers[i].ns = tmp;
@@ -339,7 +342,6 @@ struct pid *alloc_pid(struct pid_namespace *ns)
339 } 342 }
340 spin_unlock_irq(&pidmap_lock); 343 spin_unlock_irq(&pidmap_lock);
341 344
342out:
343 return pid; 345 return pid;
344 346
345out_unlock: 347out_unlock:
@@ -351,8 +353,7 @@ out_free:
351 free_pidmap(pid->numbers + i); 353 free_pidmap(pid->numbers + i);
352 354
353 kmem_cache_free(ns->pid_cachep, pid); 355 kmem_cache_free(ns->pid_cachep, pid);
354 pid = NULL; 356 return ERR_PTR(retval);
355 goto out;
356} 357}
357 358
358void disable_pid_allocation(struct pid_namespace *ns) 359void disable_pid_allocation(struct pid_namespace *ns)
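
The alloc_pid() change above switches from the "NULL on failure" convention to returning ERR_PTR(-ENOMEM) or ERR_PTR(-EAGAIN), and copy_process() is updated to match. A small, hedged caller-side sketch of consuming that convention (names illustrative, not the actual copy_process() code):

#include <linux/err.h>
#include <linux/pid.h>

/* An ERR_PTR()-returning allocator lets the caller propagate the exact
 * errno (pid map exhausted vs. out of memory) instead of guessing. */
static int attach_new_pid(struct pid_namespace *ns, struct pid **out)
{
	struct pid *pid;

	pid = alloc_pid(ns);
	if (IS_ERR(pid))
		return PTR_ERR(pid);	/* -ENOMEM or -EAGAIN */

	*out = pid;
	return 0;
}
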
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 879edfc5ee52..c099b082cd02 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2017,24 +2017,6 @@ int add_preferred_console(char *name, int idx, char *options)
2017 return __add_preferred_console(name, idx, options, NULL); 2017 return __add_preferred_console(name, idx, options, NULL);
2018} 2018}
2019 2019
2020int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
2021{
2022 struct console_cmdline *c;
2023 int i;
2024
2025 for (i = 0, c = console_cmdline;
2026 i < MAX_CMDLINECONSOLES && c->name[0];
2027 i++, c++)
2028 if (strcmp(c->name, name) == 0 && c->index == idx) {
2029 strlcpy(c->name, name_new, sizeof(c->name));
2030 c->options = options;
2031 c->index = idx_new;
2032 return i;
2033 }
2034 /* not found */
2035 return -1;
2036}
2037
2038bool console_suspend_enabled = true; 2020bool console_suspend_enabled = true;
2039EXPORT_SYMBOL(console_suspend_enabled); 2021EXPORT_SYMBOL(console_suspend_enabled);
2040 2022
@@ -2436,9 +2418,6 @@ void register_console(struct console *newcon)
2436 if (preferred_console < 0 || bcon || !console_drivers) 2418 if (preferred_console < 0 || bcon || !console_drivers)
2437 preferred_console = selected_console; 2419 preferred_console = selected_console;
2438 2420
2439 if (newcon->early_setup)
2440 newcon->early_setup();
2441
2442 /* 2421 /*
2443 * See if we want to use this console driver. If we 2422 * See if we want to use this console driver. If we
2444 * didn't select a console we take the first one 2423 * didn't select a console we take the first one
@@ -2464,23 +2443,27 @@ void register_console(struct console *newcon)
2464 for (i = 0, c = console_cmdline; 2443 for (i = 0, c = console_cmdline;
2465 i < MAX_CMDLINECONSOLES && c->name[0]; 2444 i < MAX_CMDLINECONSOLES && c->name[0];
2466 i++, c++) { 2445 i++, c++) {
2467 BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); 2446 if (!newcon->match ||
2468 if (strcmp(c->name, newcon->name) != 0) 2447 newcon->match(newcon, c->name, c->index, c->options) != 0) {
2469 continue; 2448 /* default matching */
2470 if (newcon->index >= 0 && 2449 BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
2471 newcon->index != c->index) 2450 if (strcmp(c->name, newcon->name) != 0)
2472 continue; 2451 continue;
2473 if (newcon->index < 0) 2452 if (newcon->index >= 0 &&
2474 newcon->index = c->index; 2453 newcon->index != c->index)
2454 continue;
2455 if (newcon->index < 0)
2456 newcon->index = c->index;
2475 2457
2476 if (_braille_register_console(newcon, c)) 2458 if (_braille_register_console(newcon, c))
2477 return; 2459 return;
2460
2461 if (newcon->setup &&
2462 newcon->setup(newcon, c->options) != 0)
2463 break;
2464 }
2478 2465
2479 if (newcon->setup &&
2480 newcon->setup(newcon, console_cmdline[i].options) != 0)
2481 break;
2482 newcon->flags |= CON_ENABLED; 2466 newcon->flags |= CON_ENABLED;
2483 newcon->index = c->index;
2484 if (i == selected_console) { 2467 if (i == selected_console) {
2485 newcon->flags |= CON_CONSDEV; 2468 newcon->flags |= CON_CONSDEV;
2486 preferred_console = selected_console; 2469 preferred_console = selected_console;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 227fec36b12a..c8e0e050a36a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -456,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
456 456
457static int ptrace_detach(struct task_struct *child, unsigned int data) 457static int ptrace_detach(struct task_struct *child, unsigned int data)
458{ 458{
459 bool dead = false;
460
461 if (!valid_signal(data)) 459 if (!valid_signal(data))
462 return -EIO; 460 return -EIO;
463 461
@@ -467,18 +465,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
467 465
468 write_lock_irq(&tasklist_lock); 466 write_lock_irq(&tasklist_lock);
469 /* 467 /*
470 * This child can be already killed. Make sure de_thread() or 468 * We rely on ptrace_freeze_traced(). It can't be killed and
471 * our sub-thread doing do_wait() didn't do release_task() yet. 469 * untraced by another thread, it can't be a zombie.
472 */ 470 */
473 if (child->ptrace) { 471 WARN_ON(!child->ptrace || child->exit_state);
474 child->exit_code = data; 472 /*
475 dead = __ptrace_detach(current, child); 473 * tasklist_lock avoids the race with wait_task_stopped(), see
476 } 474 * the comment in ptrace_resume().
475 */
476 child->exit_code = data;
477 __ptrace_detach(current, child);
477 write_unlock_irq(&tasklist_lock); 478 write_unlock_irq(&tasklist_lock);
478 479
479 proc_ptrace_connector(child, PTRACE_DETACH); 480 proc_ptrace_connector(child, PTRACE_DETACH);
480 if (unlikely(dead))
481 release_task(child);
482 481
483 return 0; 482 return 0;
484} 483}
@@ -697,6 +696,8 @@ static int ptrace_peek_siginfo(struct task_struct *child,
697static int ptrace_resume(struct task_struct *child, long request, 696static int ptrace_resume(struct task_struct *child, long request,
698 unsigned long data) 697 unsigned long data)
699{ 698{
699 bool need_siglock;
700
700 if (!valid_signal(data)) 701 if (!valid_signal(data))
701 return -EIO; 702 return -EIO;
702 703
@@ -724,8 +725,26 @@ static int ptrace_resume(struct task_struct *child, long request,
724 user_disable_single_step(child); 725 user_disable_single_step(child);
725 } 726 }
726 727
728 /*
729 * Change ->exit_code and ->state under siglock to avoid the race
730 * with wait_task_stopped() in between; a non-zero ->exit_code will
731 * wrongly look like another report from tracee.
732 *
733 * Note that we need siglock even if ->exit_code == data and/or this
734 * status was not reported yet, the new status must not be cleared by
735 * wait_task_stopped() after resume.
736 *
737 * If data == 0 we do not care if wait_task_stopped() reports the old
738 * status and clears the code too; this can't race with the tracee, it
739 * takes siglock after resume.
740 */
741 need_siglock = data && !thread_group_empty(current);
742 if (need_siglock)
743 spin_lock_irq(&child->sighand->siglock);
727 child->exit_code = data; 744 child->exit_code = data;
728 wake_up_state(child, __TASK_TRACED); 745 wake_up_state(child, __TASK_TRACED);
746 if (need_siglock)
747 spin_unlock_irq(&child->sighand->siglock);
729 748
730 return 0; 749 return 0;
731} 750}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 233165da782f..8cf7304b2867 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -162,11 +162,14 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
162static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; 162static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
163module_param(kthread_prio, int, 0644); 163module_param(kthread_prio, int, 0644);
164 164
165/* Delay in jiffies for grace-period initialization delays. */ 165/* Delay in jiffies for grace-period initialization delays, debug only. */
166static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) 166#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
167 ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY 167static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
168 : 0;
169module_param(gp_init_delay, int, 0644); 168module_param(gp_init_delay, int, 0644);
169#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
170static const int gp_init_delay;
171#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
172#define PER_RCU_NODE_PERIOD 10 /* Number of grace periods between delays. */
170 173
171/* 174/*
172 * Track the rcutorture test sequence number and the update version 175 * Track the rcutorture test sequence number and the update version
@@ -1843,9 +1846,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
1843 raw_spin_unlock_irq(&rnp->lock); 1846 raw_spin_unlock_irq(&rnp->lock);
1844 cond_resched_rcu_qs(); 1847 cond_resched_rcu_qs();
1845 ACCESS_ONCE(rsp->gp_activity) = jiffies; 1848 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1846 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) && 1849 if (gp_init_delay > 0 &&
1847 gp_init_delay > 0 && 1850 !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
1848 !(rsp->gpnum % (rcu_num_nodes * 10)))
1849 schedule_timeout_uninterruptible(gp_init_delay); 1851 schedule_timeout_uninterruptible(gp_init_delay);
1850 } 1852 }
1851 1853
diff --git a/kernel/relay.c b/kernel/relay.c
index 5a56d3c8dc03..e9dbaeb8fd65 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,7 +407,7 @@ static inline void relay_set_buf_dentry(struct rchan_buf *buf,
407 struct dentry *dentry) 407 struct dentry *dentry)
408{ 408{
409 buf->dentry = dentry; 409 buf->dentry = dentry;
410 buf->dentry->d_inode->i_size = buf->early_bytes; 410 d_inode(buf->dentry)->i_size = buf->early_bytes;
411} 411}
412 412
413static struct dentry *relay_create_buf_file(struct rchan *chan, 413static struct dentry *relay_create_buf_file(struct rchan *chan,
@@ -733,7 +733,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
733 buf->padding[old_subbuf] = buf->prev_padding; 733 buf->padding[old_subbuf] = buf->prev_padding;
734 buf->subbufs_produced++; 734 buf->subbufs_produced++;
735 if (buf->dentry) 735 if (buf->dentry)
736 buf->dentry->d_inode->i_size += 736 d_inode(buf->dentry)->i_size +=
737 buf->chan->subbuf_size - 737 buf->chan->subbuf_size -
738 buf->padding[old_subbuf]; 738 buf->padding[old_subbuf];
739 else 739 else
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9123a82cbb6..57bd333bc4ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1016,13 +1016,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1016 rq_clock_skip_update(rq, true); 1016 rq_clock_skip_update(rq, true);
1017} 1017}
1018 1018
1019static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
1020
1021void register_task_migration_notifier(struct notifier_block *n)
1022{
1023 atomic_notifier_chain_register(&task_migration_notifier, n);
1024}
1025
1026#ifdef CONFIG_SMP 1019#ifdef CONFIG_SMP
1027void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1020void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1028{ 1021{
@@ -1053,18 +1046,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1053 trace_sched_migrate_task(p, new_cpu); 1046 trace_sched_migrate_task(p, new_cpu);
1054 1047
1055 if (task_cpu(p) != new_cpu) { 1048 if (task_cpu(p) != new_cpu) {
1056 struct task_migration_notifier tmn;
1057
1058 if (p->sched_class->migrate_task_rq) 1049 if (p->sched_class->migrate_task_rq)
1059 p->sched_class->migrate_task_rq(p, new_cpu); 1050 p->sched_class->migrate_task_rq(p, new_cpu);
1060 p->se.nr_migrations++; 1051 p->se.nr_migrations++;
1061 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); 1052 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
1062
1063 tmn.task = p;
1064 tmn.from_cpu = task_cpu(p);
1065 tmn.to_cpu = new_cpu;
1066
1067 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
1068 } 1053 }
1069 1054
1070 __set_task_cpu(p, new_cpu); 1055 __set_task_cpu(p, new_cpu);
@@ -3315,15 +3300,18 @@ static void __setscheduler_params(struct task_struct *p,
3315 3300
3316/* Actually do priority change: must hold pi & rq lock. */ 3301/* Actually do priority change: must hold pi & rq lock. */
3317static void __setscheduler(struct rq *rq, struct task_struct *p, 3302static void __setscheduler(struct rq *rq, struct task_struct *p,
3318 const struct sched_attr *attr) 3303 const struct sched_attr *attr, bool keep_boost)
3319{ 3304{
3320 __setscheduler_params(p, attr); 3305 __setscheduler_params(p, attr);
3321 3306
3322 /* 3307 /*
3323 * If we get here, there was no pi waiters boosting the 3308 * Keep a potential priority boosting if called from
3324 * task. It is safe to use the normal prio. 3309 * sched_setscheduler().
3325 */ 3310 */
3326 p->prio = normal_prio(p); 3311 if (keep_boost)
3312 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
3313 else
3314 p->prio = normal_prio(p);
3327 3315
3328 if (dl_prio(p->prio)) 3316 if (dl_prio(p->prio))
3329 p->sched_class = &dl_sched_class; 3317 p->sched_class = &dl_sched_class;
@@ -3423,7 +3411,7 @@ static int __sched_setscheduler(struct task_struct *p,
3423 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3411 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3424 MAX_RT_PRIO - 1 - attr->sched_priority; 3412 MAX_RT_PRIO - 1 - attr->sched_priority;
3425 int retval, oldprio, oldpolicy = -1, queued, running; 3413 int retval, oldprio, oldpolicy = -1, queued, running;
3426 int policy = attr->sched_policy; 3414 int new_effective_prio, policy = attr->sched_policy;
3427 unsigned long flags; 3415 unsigned long flags;
3428 const struct sched_class *prev_class; 3416 const struct sched_class *prev_class;
3429 struct rq *rq; 3417 struct rq *rq;
@@ -3605,15 +3593,14 @@ change:
3605 oldprio = p->prio; 3593 oldprio = p->prio;
3606 3594
3607 /* 3595 /*
3608 * Special case for priority boosted tasks. 3596 * Take priority boosted tasks into account. If the new
3609 * 3597 * effective priority is unchanged, we just store the new
3610 * If the new priority is lower or equal (user space view)
3611 * than the current (boosted) priority, we just store the new
3612 * normal parameters and do not touch the scheduler class and 3598 * normal parameters and do not touch the scheduler class and
3613 * the runqueue. This will be done when the task deboost 3599 * the runqueue. This will be done when the task deboost
3614 * itself. 3600 * itself.
3615 */ 3601 */
3616 if (rt_mutex_check_prio(p, newprio)) { 3602 new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
3603 if (new_effective_prio == oldprio) {
3617 __setscheduler_params(p, attr); 3604 __setscheduler_params(p, attr);
3618 task_rq_unlock(rq, p, &flags); 3605 task_rq_unlock(rq, p, &flags);
3619 return 0; 3606 return 0;
@@ -3627,7 +3614,7 @@ change:
3627 put_prev_task(rq, p); 3614 put_prev_task(rq, p);
3628 3615
3629 prev_class = p->sched_class; 3616 prev_class = p->sched_class;
3630 __setscheduler(rq, p, attr); 3617 __setscheduler(rq, p, attr, true);
3631 3618
3632 if (running) 3619 if (running)
3633 p->sched_class->set_curr_task(rq); 3620 p->sched_class->set_curr_task(rq);
@@ -7012,27 +6999,23 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7012 unsigned long flags; 6999 unsigned long flags;
7013 long cpu = (long)hcpu; 7000 long cpu = (long)hcpu;
7014 struct dl_bw *dl_b; 7001 struct dl_bw *dl_b;
7002 bool overflow;
7003 int cpus;
7015 7004
7016 switch (action & ~CPU_TASKS_FROZEN) { 7005 switch (action) {
7017 case CPU_DOWN_PREPARE: 7006 case CPU_DOWN_PREPARE:
7018 /* explicitly allow suspend */ 7007 rcu_read_lock_sched();
7019 if (!(action & CPU_TASKS_FROZEN)) { 7008 dl_b = dl_bw_of(cpu);
7020 bool overflow;
7021 int cpus;
7022
7023 rcu_read_lock_sched();
7024 dl_b = dl_bw_of(cpu);
7025 7009
7026 raw_spin_lock_irqsave(&dl_b->lock, flags); 7010 raw_spin_lock_irqsave(&dl_b->lock, flags);
7027 cpus = dl_bw_cpus(cpu); 7011 cpus = dl_bw_cpus(cpu);
7028 overflow = __dl_overflow(dl_b, cpus, 0, 0); 7012 overflow = __dl_overflow(dl_b, cpus, 0, 0);
7029 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7013 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7030 7014
7031 rcu_read_unlock_sched(); 7015 rcu_read_unlock_sched();
7032 7016
7033 if (overflow) 7017 if (overflow)
7034 return notifier_from_errno(-EBUSY); 7018 return notifier_from_errno(-EBUSY);
7035 }
7036 cpuset_update_active_cpus(false); 7019 cpuset_update_active_cpus(false);
7037 break; 7020 break;
7038 case CPU_DOWN_PREPARE_FROZEN: 7021 case CPU_DOWN_PREPARE_FROZEN:
@@ -7361,7 +7344,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7361 queued = task_on_rq_queued(p); 7344 queued = task_on_rq_queued(p);
7362 if (queued) 7345 if (queued)
7363 dequeue_task(rq, p, 0); 7346 dequeue_task(rq, p, 0);
7364 __setscheduler(rq, p, &attr); 7347 __setscheduler(rq, p, &attr, false);
7365 if (queued) { 7348 if (queued) {
7366 enqueue_task(rq, p, 0); 7349 enqueue_task(rq, p, 0);
7367 resched_curr(rq); 7350 resched_curr(rq);
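
Note on the kernel/sched/core.c hunks above: __setscheduler() now takes a keep_boost flag so that sched_setscheduler() preserves an active rt_mutex priority boost while normalize_task() discards it, and __sched_setscheduler() takes the early exit only when the *effective* priority is unchanged. A minimal userspace sketch of the resulting priority selection; rt_effective() here merely models rt_mutex_get_effective_prio() (more urgent of the requested priority and the top PI waiter's priority), and lower numbers mean higher urgency as in the kernel:

/* Illustrative sketch, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

static int rt_effective(int pi_top_prio, int newprio)
{
	return pi_top_prio < newprio ? pi_top_prio : newprio;
}

static int pick_prio(int pi_top_prio, int normal_prio, bool keep_boost)
{
	/* keep_boost == true  -> sched_setscheduler() path
	 * keep_boost == false -> normalize_task() path */
	return keep_boost ? rt_effective(pi_top_prio, normal_prio)
			  : normal_prio;
}

int main(void)
{
	/* Task boosted to prio 10 by a PI waiter; user requests prio 50. */
	printf("setscheduler keeps boost: %d\n", pick_prio(10, 50, true));  /* 10 */
	printf("normalize drops boost:    %d\n", pick_prio(10, 50, false)); /* 50 */
	return 0;
}
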
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index deef1caa94c6..fefcb1fa5160 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -81,7 +81,6 @@ static void cpuidle_idle_call(void)
81 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 81 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
82 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 82 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
83 int next_state, entered_state; 83 int next_state, entered_state;
84 unsigned int broadcast;
85 bool reflect; 84 bool reflect;
86 85
87 /* 86 /*
@@ -150,17 +149,6 @@ static void cpuidle_idle_call(void)
150 goto exit_idle; 149 goto exit_idle;
151 } 150 }
152 151
153 broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
154
155 /*
156 * Tell the time framework to switch to a broadcast timer
157 * because our local timer will be shutdown. If a local timer
158 * is used from another cpu as a broadcast timer, this call may
159 * fail if it is not available
160 */
161 if (broadcast && tick_broadcast_enter())
162 goto use_default;
163
164 /* Take note of the planned idle state. */ 152 /* Take note of the planned idle state. */
 165 idle_set_state(this_rq(), &drv->states[next_state]); 153 idle_set_state(this_rq(), &drv->states[next_state]);
166 154
@@ -174,8 +162,8 @@ static void cpuidle_idle_call(void)
174 /* The cpu is no longer idle or about to enter idle. */ 162 /* The cpu is no longer idle or about to enter idle. */
175 idle_set_state(this_rq(), NULL); 163 idle_set_state(this_rq(), NULL);
176 164
177 if (broadcast) 165 if (entered_state == -EBUSY)
178 tick_broadcast_exit(); 166 goto use_default;
179 167
180 /* 168 /*
181 * Give the governor an opportunity to reflect on the outcome 169 * Give the governor an opportunity to reflect on the outcome
diff --git a/kernel/signal.c b/kernel/signal.c
index a390499943e4..d51c5ddd855c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2992,11 +2992,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
2992 * Nor can they impersonate a kill()/tgkill(), which adds source info. 2992 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2993 */ 2993 */
2994 if ((info->si_code >= 0 || info->si_code == SI_TKILL) && 2994 if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
2995 (task_pid_vnr(current) != pid)) { 2995 (task_pid_vnr(current) != pid))
2996 /* We used to allow any < 0 si_code */
2997 WARN_ON_ONCE(info->si_code < 0);
2998 return -EPERM; 2996 return -EPERM;
2999 } 2997
3000 info->si_signo = sig; 2998 info->si_signo = sig;
3001 2999
3002 /* POSIX.1b doesn't mention process groups. */ 3000 /* POSIX.1b doesn't mention process groups. */
@@ -3041,12 +3039,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
3041 /* Not even root can pretend to send signals from the kernel. 3039 /* Not even root can pretend to send signals from the kernel.
3042 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3040 * Nor can they impersonate a kill()/tgkill(), which adds source info.
3043 */ 3041 */
3044 if (((info->si_code >= 0 || info->si_code == SI_TKILL)) && 3042 if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
3045 (task_pid_vnr(current) != pid)) { 3043 (task_pid_vnr(current) != pid))
3046 /* We used to allow any < 0 si_code */
3047 WARN_ON_ONCE(info->si_code < 0);
3048 return -EPERM; 3044 return -EPERM;
3049 } 3045
3050 info->si_signo = sig; 3046 info->si_signo = sig;
3051 3047
3052 return do_send_specific(tgid, pid, sig, info); 3048 return do_send_specific(tgid, pid, sig, info);
diff --git a/kernel/smp.c b/kernel/smp.c
index f38a1e692259..07854477c164 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -19,7 +19,7 @@
19 19
20enum { 20enum {
21 CSD_FLAG_LOCK = 0x01, 21 CSD_FLAG_LOCK = 0x01,
22 CSD_FLAG_WAIT = 0x02, 22 CSD_FLAG_SYNCHRONOUS = 0x02,
23}; 23};
24 24
25struct call_function_data { 25struct call_function_data {
@@ -107,7 +107,7 @@ void __init call_function_init(void)
107 */ 107 */
108static void csd_lock_wait(struct call_single_data *csd) 108static void csd_lock_wait(struct call_single_data *csd)
109{ 109{
110 while (csd->flags & CSD_FLAG_LOCK) 110 while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
111 cpu_relax(); 111 cpu_relax();
112} 112}
113 113
@@ -121,19 +121,17 @@ static void csd_lock(struct call_single_data *csd)
121 * to ->flags with any subsequent assignments to other 121 * to ->flags with any subsequent assignments to other
122 * fields of the specified call_single_data structure: 122 * fields of the specified call_single_data structure:
123 */ 123 */
124 smp_mb(); 124 smp_wmb();
125} 125}
126 126
127static void csd_unlock(struct call_single_data *csd) 127static void csd_unlock(struct call_single_data *csd)
128{ 128{
129 WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); 129 WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
130 130
131 /* 131 /*
132 * ensure we're all done before releasing data: 132 * ensure we're all done before releasing data:
133 */ 133 */
134 smp_mb(); 134 smp_store_release(&csd->flags, 0);
135
136 csd->flags &= ~CSD_FLAG_LOCK;
137} 135}
138 136
139static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); 137static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
@@ -144,13 +142,16 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
144 * ->func, ->info, and ->flags set. 142 * ->func, ->info, and ->flags set.
145 */ 143 */
146static int generic_exec_single(int cpu, struct call_single_data *csd, 144static int generic_exec_single(int cpu, struct call_single_data *csd,
147 smp_call_func_t func, void *info, int wait) 145 smp_call_func_t func, void *info)
148{ 146{
149 struct call_single_data csd_stack = { .flags = 0 };
150 unsigned long flags;
151
152
153 if (cpu == smp_processor_id()) { 147 if (cpu == smp_processor_id()) {
148 unsigned long flags;
149
150 /*
151 * We can unlock early even for the synchronous on-stack case,
152 * since we're doing this from the same CPU..
153 */
154 csd_unlock(csd);
 154 local_irq_save(flags); 155 local_irq_save(flags);
 155 func(info); 156 func(info);
 156 local_irq_restore(flags); 157 local_irq_restore(flags);
@@ -158,24 +159,14 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
158 } 159 }
159 160
160 161
161 if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) 162 if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
163 csd_unlock(csd);
162 return -ENXIO; 164 return -ENXIO;
163
164
165 if (!csd) {
166 csd = &csd_stack;
167 if (!wait)
168 csd = this_cpu_ptr(&csd_data);
169 } 165 }
170 166
171 csd_lock(csd);
172
173 csd->func = func; 167 csd->func = func;
174 csd->info = info; 168 csd->info = info;
175 169
176 if (wait)
177 csd->flags |= CSD_FLAG_WAIT;
178
179 /* 170 /*
180 * The list addition should be visible before sending the IPI 171 * The list addition should be visible before sending the IPI
181 * handler locks the list to pull the entry off it because of 172 * handler locks the list to pull the entry off it because of
@@ -190,9 +181,6 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
190 if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) 181 if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
191 arch_send_call_function_single_ipi(cpu); 182 arch_send_call_function_single_ipi(cpu);
192 183
193 if (wait)
194 csd_lock_wait(csd);
195
196 return 0; 184 return 0;
197} 185}
198 186
@@ -250,8 +238,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
250 } 238 }
251 239
252 llist_for_each_entry_safe(csd, csd_next, entry, llist) { 240 llist_for_each_entry_safe(csd, csd_next, entry, llist) {
253 csd->func(csd->info); 241 smp_call_func_t func = csd->func;
254 csd_unlock(csd); 242 void *info = csd->info;
243
244 /* Do we wait until *after* callback? */
245 if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
246 func(info);
247 csd_unlock(csd);
248 } else {
249 csd_unlock(csd);
250 func(info);
251 }
255 } 252 }
256 253
257 /* 254 /*
@@ -274,6 +271,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
274int smp_call_function_single(int cpu, smp_call_func_t func, void *info, 271int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
275 int wait) 272 int wait)
276{ 273{
274 struct call_single_data *csd;
275 struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
277 int this_cpu; 276 int this_cpu;
278 int err; 277 int err;
279 278
@@ -292,7 +291,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
292 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 291 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
293 && !oops_in_progress); 292 && !oops_in_progress);
294 293
295 err = generic_exec_single(cpu, NULL, func, info, wait); 294 csd = &csd_stack;
295 if (!wait) {
296 csd = this_cpu_ptr(&csd_data);
297 csd_lock(csd);
298 }
299
300 err = generic_exec_single(cpu, csd, func, info);
301
302 if (wait)
303 csd_lock_wait(csd);
296 304
297 put_cpu(); 305 put_cpu();
298 306
@@ -321,7 +329,15 @@ int smp_call_function_single_async(int cpu, struct call_single_data *csd)
321 int err = 0; 329 int err = 0;
322 330
323 preempt_disable(); 331 preempt_disable();
324 err = generic_exec_single(cpu, csd, csd->func, csd->info, 0); 332
333 /* We could deadlock if we have to wait here with interrupts disabled! */
334 if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK))
335 csd_lock_wait(csd);
336
337 csd->flags = CSD_FLAG_LOCK;
338 smp_wmb();
339
340 err = generic_exec_single(cpu, csd, csd->func, csd->info);
325 preempt_enable(); 341 preempt_enable();
326 342
327 return err; 343 return err;
@@ -433,6 +449,8 @@ void smp_call_function_many(const struct cpumask *mask,
433 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); 449 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
434 450
435 csd_lock(csd); 451 csd_lock(csd);
452 if (wait)
453 csd->flags |= CSD_FLAG_SYNCHRONOUS;
436 csd->func = func; 454 csd->func = func;
437 csd->info = info; 455 csd->info = info;
438 llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); 456 llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
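
Note on the kernel/smp.c hunks above: CSD_FLAG_WAIT is replaced by CSD_FLAG_SYNCHRONOUS, the flags word becomes the single synchronization point (smp_load_acquire() in csd_lock_wait(), smp_store_release() in csd_unlock()), and the executing CPU unlocks a synchronous csd only after running the callback but an asynchronous one before it, since the owner may reuse or free the descriptor the moment it is unlocked. A rough userspace analogue using C11 atomics; the struct layout and names are illustrative, not the kernel's:

#include <stdatomic.h>

#define CSD_FLAG_LOCK        0x01u
#define CSD_FLAG_SYNCHRONOUS 0x02u

struct csd {
	_Atomic unsigned int flags;
	void (*func)(void *);
	void *info;
};

/* Owner side: spin until the executing CPU releases the descriptor. */
static void csd_lock_wait(struct csd *csd)
{
	while (atomic_load_explicit(&csd->flags, memory_order_acquire) & CSD_FLAG_LOCK)
		;	/* cpu_relax() in the kernel */
}

/* Executing side: the release store makes everything done while "locked"
 * visible before the owner observes the flag clear and reuses the slot. */
static void csd_unlock(struct csd *csd)
{
	atomic_store_explicit(&csd->flags, 0, memory_order_release);
}

/* Mirrors the flush_smp_call_function_queue() change: synchronous callers
 * wait for the callback, so unlock after it; asynchronous descriptors may
 * be reused by their owner as soon as they are unlocked, so unlock first
 * and only then run the callback from saved copies of func/info. */
static void exec_one(struct csd *csd)
{
	void (*func)(void *) = csd->func;
	void *info = csd->info;

	if (atomic_load_explicit(&csd->flags, memory_order_relaxed) & CSD_FLAG_SYNCHRONOUS) {
		func(info);
		csd_unlock(csd);
	} else {
		csd_unlock(csd);
		func(info);
	}
}
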
diff --git a/kernel/sys.c b/kernel/sys.c
index 3be344902316..a4e372b798a5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1649,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
1649 return mask; 1649 return mask;
1650} 1650}
1651 1651
1652static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) 1652static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1653{ 1653{
1654 struct fd exe; 1654 struct fd exe;
1655 struct file *old_exe, *exe_file;
1655 struct inode *inode; 1656 struct inode *inode;
1656 int err; 1657 int err;
1657 1658
1658 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1659
1660 exe = fdget(fd); 1659 exe = fdget(fd);
1661 if (!exe.file) 1660 if (!exe.file)
1662 return -EBADF; 1661 return -EBADF;
@@ -1680,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
1680 /* 1679 /*
1681 * Forbid mm->exe_file change if old file still mapped. 1680 * Forbid mm->exe_file change if old file still mapped.
1682 */ 1681 */
1682 exe_file = get_mm_exe_file(mm);
1683 err = -EBUSY; 1683 err = -EBUSY;
1684 if (mm->exe_file) { 1684 if (exe_file) {
1685 struct vm_area_struct *vma; 1685 struct vm_area_struct *vma;
1686 1686
1687 for (vma = mm->mmap; vma; vma = vma->vm_next) 1687 down_read(&mm->mmap_sem);
1688 if (vma->vm_file && 1688 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1689 path_equal(&vma->vm_file->f_path, 1689 if (!vma->vm_file)
1690 &mm->exe_file->f_path)) 1690 continue;
1691 goto exit; 1691 if (path_equal(&vma->vm_file->f_path,
1692 &exe_file->f_path))
1693 goto exit_err;
1694 }
1695
1696 up_read(&mm->mmap_sem);
1697 fput(exe_file);
1692 } 1698 }
1693 1699
1694 /* 1700 /*
@@ -1702,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
1702 goto exit; 1708 goto exit;
1703 1709
1704 err = 0; 1710 err = 0;
1705 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ 1711 /* set the new file, lockless */
1712 get_file(exe.file);
1713 old_exe = xchg(&mm->exe_file, exe.file);
1714 if (old_exe)
1715 fput(old_exe);
1706exit: 1716exit:
1707 fdput(exe); 1717 fdput(exe);
1708 return err; 1718 return err;
1719exit_err:
1720 up_read(&mm->mmap_sem);
1721 fput(exe_file);
1722 goto exit;
1709} 1723}
1710 1724
1711#ifdef CONFIG_CHECKPOINT_RESTORE 1725#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1840,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
1840 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; 1854 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
1841 } 1855 }
1842 1856
1843 down_write(&mm->mmap_sem);
1844 if (prctl_map.exe_fd != (u32)-1) 1857 if (prctl_map.exe_fd != (u32)-1)
1845 error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); 1858 error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
1846 downgrade_write(&mm->mmap_sem); 1859 down_read(&mm->mmap_sem);
1847 if (error) 1860 if (error)
1848 goto out; 1861 goto out;
1849 1862
@@ -1909,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1909 if (!capable(CAP_SYS_RESOURCE)) 1922 if (!capable(CAP_SYS_RESOURCE))
1910 return -EPERM; 1923 return -EPERM;
1911 1924
1912 if (opt == PR_SET_MM_EXE_FILE) { 1925 if (opt == PR_SET_MM_EXE_FILE)
1913 down_write(&mm->mmap_sem); 1926 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1914 error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
1915 up_write(&mm->mmap_sem);
1916 return error;
1917 }
1918 1927
1919 if (addr >= TASK_SIZE || addr < mmap_min_addr) 1928 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1920 return -EINVAL; 1929 return -EINVAL;
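
Note on the kernel/sys.c hunks above: prctl_set_mm_exe_file() no longer expects the caller to hold mmap_sem; it takes mmap_sem for reading only around the VMA walk and then installs the new exe_file with get_file() + xchg() + fput(), so the pointer swap itself is lockless. A userspace sketch of that swap pattern (the refcounted object and helper names are made up for illustration):

#include <stdatomic.h>
#include <stdlib.h>

struct object {
	_Atomic int refcount;
};

static void get_obj(struct object *o)
{
	atomic_fetch_add(&o->refcount, 1);		/* like get_file() */
}

static void put_obj(struct object *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1)	/* like fput() */
		free(o);
}

static _Atomic(struct object *) current_exe;

static void set_exe(struct object *new_exe)
{
	struct object *old;

	get_obj(new_exe);				/* reference held by the pointer */
	old = atomic_exchange(&current_exe, new_exe);	/* like xchg(&mm->exe_file, ...) */
	if (old)
		put_obj(old);				/* drop the old pointer's reference */
}
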
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 42b7fc2860c1..2082b1a88fb9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -93,11 +93,9 @@
93#include <linux/nmi.h> 93#include <linux/nmi.h>
94#endif 94#endif
95 95
96
97#if defined(CONFIG_SYSCTL) 96#if defined(CONFIG_SYSCTL)
98 97
99/* External variables not in a header file. */ 98/* External variables not in a header file. */
100extern int max_threads;
101extern int suid_dumpable; 99extern int suid_dumpable;
102#ifdef CONFIG_COREDUMP 100#ifdef CONFIG_COREDUMP
103extern int core_uses_pid; 101extern int core_uses_pid;
@@ -710,10 +708,10 @@ static struct ctl_table kern_table[] = {
710#endif 708#endif
711 { 709 {
712 .procname = "threads-max", 710 .procname = "threads-max",
713 .data = &max_threads, 711 .data = NULL,
714 .maxlen = sizeof(int), 712 .maxlen = sizeof(int),
715 .mode = 0644, 713 .mode = 0644,
716 .proc_handler = proc_dointvec, 714 .proc_handler = sysctl_max_threads,
717 }, 715 },
718 { 716 {
719 .procname = "random", 717 .procname = "random",
@@ -1983,7 +1981,15 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
1983 int write, void *data) 1981 int write, void *data)
1984{ 1982{
1985 if (write) { 1983 if (write) {
1986 *valp = *negp ? -*lvalp : *lvalp; 1984 if (*negp) {
1985 if (*lvalp > (unsigned long) INT_MAX + 1)
1986 return -EINVAL;
1987 *valp = -*lvalp;
1988 } else {
1989 if (*lvalp > (unsigned long) INT_MAX)
1990 return -EINVAL;
1991 *valp = *lvalp;
1992 }
1987 } else { 1993 } else {
1988 int val = *valp; 1994 int val = *valp;
1989 if (val < 0) { 1995 if (val < 0) {
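
Note on the kernel/sysctl.c hunks above: threads-max switches to a dedicated sysctl_max_threads handler (its data pointer becomes NULL), and do_proc_dointvec_conv() now rejects written values that do not fit in an int instead of silently truncating them, so -2147483648 remains the most negative accepted value while 2147483648 and above return an error. Restated as a standalone userspace check with the same bounds as the patch (false stands in for -EINVAL):

#include <limits.h>
#include <stdbool.h>

static bool int_conv_write(bool negative, unsigned long lval, int *valp)
{
	if (negative) {
		if (lval > (unsigned long)INT_MAX + 1)
			return false;
		*valp = -lval;
	} else {
		if (lval > (unsigned long)INT_MAX)
			return false;
		*valp = lval;
	}
	return true;
}
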
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 25d942d1da27..637a09461c1d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -117,11 +117,7 @@ static int __clockevents_set_state(struct clock_event_device *dev,
117 /* Transition with new state-specific callbacks */ 117 /* Transition with new state-specific callbacks */
118 switch (state) { 118 switch (state) {
119 case CLOCK_EVT_STATE_DETACHED: 119 case CLOCK_EVT_STATE_DETACHED:
120 /* 120 /* The clockevent device is getting replaced. Shut it down. */
121 * This is an internal state, which is guaranteed to go from
122 * SHUTDOWN to DETACHED. No driver interaction required.
123 */
124 return 0;
125 121
126 case CLOCK_EVT_STATE_SHUTDOWN: 122 case CLOCK_EVT_STATE_SHUTDOWN:
127 return dev->set_state_shutdown(dev); 123 return dev->set_state_shutdown(dev);
@@ -440,7 +436,7 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
440 mutex_unlock(&clockevents_mutex); 436 mutex_unlock(&clockevents_mutex);
441 return ret; 437 return ret;
442} 438}
443EXPORT_SYMBOL_GPL(clockevents_unbind); 439EXPORT_SYMBOL_GPL(clockevents_unbind_device);
444 440
445/* Sanity check of state transition callbacks */ 441/* Sanity check of state transition callbacks */
446static int clockevents_sanity_check(struct clock_event_device *dev) 442static int clockevents_sanity_check(struct clock_event_device *dev)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 91eecaaa43e0..05330494a0df 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6079,7 +6079,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
6079 struct dentry *ret = trace_create_file(name, mode, parent, data, fops); 6079 struct dentry *ret = trace_create_file(name, mode, parent, data, fops);
6080 6080
6081 if (ret) /* See tracing_get_cpu() */ 6081 if (ret) /* See tracing_get_cpu() */
6082 ret->d_inode->i_cdev = (void *)(cpu + 1); 6082 d_inode(ret)->i_cdev = (void *)(cpu + 1);
6083 return ret; 6083 return ret;
6084} 6084}
6085 6085
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7da1dfeb322e..c4de47fc5cca 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -494,8 +494,8 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
494 if (dir) { 494 if (dir) {
495 spin_lock(&dir->d_lock); /* probably unneeded */ 495 spin_lock(&dir->d_lock); /* probably unneeded */
496 list_for_each_entry(child, &dir->d_subdirs, d_child) { 496 list_for_each_entry(child, &dir->d_subdirs, d_child) {
497 if (child->d_inode) /* probably unneeded */ 497 if (d_really_is_positive(child)) /* probably unneeded */
498 child->d_inode->i_private = NULL; 498 d_inode(child)->i_private = NULL;
499 } 499 }
500 spin_unlock(&dir->d_lock); 500 spin_unlock(&dir->d_lock);
501 501
@@ -565,6 +565,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
565static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) 565static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
566{ 566{
567 char *event = NULL, *sub = NULL, *match; 567 char *event = NULL, *sub = NULL, *match;
568 int ret;
568 569
569 /* 570 /*
570 * The buf format can be <subsystem>:<event-name> 571 * The buf format can be <subsystem>:<event-name>
@@ -590,7 +591,13 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
590 event = NULL; 591 event = NULL;
591 } 592 }
592 593
593 return __ftrace_set_clr_event(tr, match, sub, event, set); 594 ret = __ftrace_set_clr_event(tr, match, sub, event, set);
595
596 /* Put back the colon to allow this to be called again */
597 if (buf)
598 *(buf - 1) = ':';
599
600 return ret;
594} 601}
595 602
596/** 603/**
@@ -1753,6 +1760,8 @@ static void update_event_printk(struct ftrace_event_call *call,
1753 ptr++; 1760 ptr++;
1754 /* Check for alpha chars like ULL */ 1761 /* Check for alpha chars like ULL */
1755 } while (isalnum(*ptr)); 1762 } while (isalnum(*ptr));
1763 if (!*ptr)
1764 break;
1756 /* 1765 /*
1757 * A number must have some kind of delimiter after 1766 * A number must have some kind of delimiter after
1758 * it, and we can ignore that too. 1767 * it, and we can ignore that too.
@@ -1779,12 +1788,16 @@ static void update_event_printk(struct ftrace_event_call *call,
1779 do { 1788 do {
1780 ptr++; 1789 ptr++;
1781 } while (isalnum(*ptr) || *ptr == '_'); 1790 } while (isalnum(*ptr) || *ptr == '_');
1791 if (!*ptr)
1792 break;
1782 /* 1793 /*
1783 * If what comes after this variable is a '.' or 1794 * If what comes after this variable is a '.' or
1784 * '->' then we can continue to ignore that string. 1795 * '->' then we can continue to ignore that string.
1785 */ 1796 */
1786 if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { 1797 if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) {
1787 ptr += *ptr == '.' ? 1 : 2; 1798 ptr += *ptr == '.' ? 1 : 2;
1799 if (!*ptr)
1800 break;
1788 goto skip_more; 1801 goto skip_more;
1789 } 1802 }
1790 /* 1803 /*
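
Note on the kernel/trace/trace_events.c hunks above: update_event_printk() gains !*ptr guards so the scan cannot run past the end of the format string, and ftrace_set_clr_event() restores the ':' it overwrote while splitting "<subsystem>:<event>", so the caller can parse the same buffer again. A small userspace illustration of the split-and-restore idea; the strchr-based splitting here is for brevity and is not the kernel's parser:

#include <stdio.h>
#include <string.h>

static int apply(const char *sub, const char *event)
{
	printf("sub=%s event=%s\n", sub, event ? event : "(all)");
	return 0;
}

static int set_clr_event(char *buf)
{
	char *event = strchr(buf, ':');
	int ret;

	if (event)
		*event++ = '\0';	/* split "<subsystem>:<event>" in place */

	ret = apply(buf, event);

	if (event)
		*(event - 1) = ':';	/* put the colon back for the next caller */

	return ret;
}

int main(void)
{
	char buf[] = "sched:sched_switch";

	set_clr_event(buf);
	set_clr_event(buf);		/* still parses: buf was restored */
	return 0;
}
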
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9cfea4c6d314..a51e79688455 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1308,15 +1308,19 @@ void graph_trace_open(struct trace_iterator *iter)
1308{ 1308{
1309 /* pid and depth on the last trace processed */ 1309 /* pid and depth on the last trace processed */
1310 struct fgraph_data *data; 1310 struct fgraph_data *data;
1311 gfp_t gfpflags;
1311 int cpu; 1312 int cpu;
1312 1313
1313 iter->private = NULL; 1314 iter->private = NULL;
1314 1315
1315 data = kzalloc(sizeof(*data), GFP_KERNEL); 1316 /* We can be called in atomic context via ftrace_dump() */
1317 gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
1318
1319 data = kzalloc(sizeof(*data), gfpflags);
1316 if (!data) 1320 if (!data)
1317 goto out_err; 1321 goto out_err;
1318 1322
1319 data->cpu_data = alloc_percpu(struct fgraph_cpu_data); 1323 data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags);
1320 if (!data->cpu_data) 1324 if (!data->cpu_data)
1321 goto out_err_free; 1325 goto out_err_free;
1322 1326
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 692bf7184c8c..25a086bcb700 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -178,12 +178,13 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
178EXPORT_SYMBOL(ftrace_print_hex_seq); 178EXPORT_SYMBOL(ftrace_print_hex_seq);
179 179
180const char * 180const char *
181ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len, 181ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count,
182 size_t el_size) 182 size_t el_size)
183{ 183{
184 const char *ret = trace_seq_buffer_ptr(p); 184 const char *ret = trace_seq_buffer_ptr(p);
185 const char *prefix = ""; 185 const char *prefix = "";
186 void *ptr = (void *)buf; 186 void *ptr = (void *)buf;
187 size_t buf_len = count * el_size;
187 188
188 trace_seq_putc(p, '{'); 189 trace_seq_putc(p, '{');
189 190
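
Note on the kernel/trace/trace_output.c hunk above: the third ftrace_print_array_seq() parameter changes from a byte length to an element count, and the byte length is now derived internally as count * el_size. A hypothetical call site after the change; p and vals are made-up names for illustration:

	u16 vals[8];

	/* old convention: length in bytes, e.g. sizeof(vals)
	 * new convention: number of elements; the function multiplies
	 * by el_size itself */
	ftrace_print_array_seq(p, vals, 8, sizeof(u16));
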
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index d60fe62ec4fa..6dd022c7b5bc 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -443,7 +443,7 @@ static int create_trace_uprobe(int argc, char **argv)
443 if (ret) 443 if (ret)
444 goto fail_address_parse; 444 goto fail_address_parse;
445 445
446 inode = igrab(path.dentry->d_inode); 446 inode = igrab(d_inode(path.dentry));
447 path_put(&path); 447 path_put(&path);
448 448
449 if (!inode || !S_ISREG(inode->i_mode)) { 449 if (!inode || !S_ISREG(inode->i_mode)) {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 2316f50b07a4..581a68a04c64 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -41,6 +41,8 @@
41#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) 41#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
42#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) 42#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
43 43
44static DEFINE_MUTEX(watchdog_proc_mutex);
45
44#ifdef CONFIG_HARDLOCKUP_DETECTOR 46#ifdef CONFIG_HARDLOCKUP_DETECTOR
45static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; 47static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
46#else 48#else
@@ -608,26 +610,36 @@ void watchdog_nmi_enable_all(void)
608{ 610{
609 int cpu; 611 int cpu;
610 612
611 if (!watchdog_user_enabled) 613 mutex_lock(&watchdog_proc_mutex);
612 return; 614
615 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
616 goto unlock;
613 617
614 get_online_cpus(); 618 get_online_cpus();
615 for_each_online_cpu(cpu) 619 for_each_online_cpu(cpu)
616 watchdog_nmi_enable(cpu); 620 watchdog_nmi_enable(cpu);
617 put_online_cpus(); 621 put_online_cpus();
622
623unlock:
624 mutex_unlock(&watchdog_proc_mutex);
618} 625}
619 626
620void watchdog_nmi_disable_all(void) 627void watchdog_nmi_disable_all(void)
621{ 628{
622 int cpu; 629 int cpu;
623 630
631 mutex_lock(&watchdog_proc_mutex);
632
624 if (!watchdog_running) 633 if (!watchdog_running)
625 return; 634 goto unlock;
626 635
627 get_online_cpus(); 636 get_online_cpus();
628 for_each_online_cpu(cpu) 637 for_each_online_cpu(cpu)
629 watchdog_nmi_disable(cpu); 638 watchdog_nmi_disable(cpu);
630 put_online_cpus(); 639 put_online_cpus();
640
641unlock:
642 mutex_unlock(&watchdog_proc_mutex);
631} 643}
632#else 644#else
633static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 645static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
@@ -744,8 +756,6 @@ static int proc_watchdog_update(void)
744 756
745} 757}
746 758
747static DEFINE_MUTEX(watchdog_proc_mutex);
748
749/* 759/*
750 * common function for watchdog, nmi_watchdog and soft_watchdog parameter 760 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
751 * 761 *