Diffstat (limited to 'kernel')
 kernel/audit.c               |  17
 kernel/audit.h               |   4
 kernel/auditsc.c             |   8
 kernel/bpf/verifier.c        |  41
 kernel/cgroup.c              | 148
 kernel/events/core.c         |   8
 kernel/fork.c                |  50
 kernel/futex.c               |  14
 kernel/jump_label.c          |  36
 kernel/kcov.c                |   7
 kernel/locking/mutex-debug.c |  12
 kernel/locking/mutex-debug.h |   4
 kernel/locking/mutex.c       |  15
 kernel/locking/mutex.h       |   2
 kernel/locking/qspinlock.c   |  60
 kernel/power/process.c       |  12
 kernel/sched/core.c          |  42
 kernel/sched/debug.c         |  15
 kernel/sched/fair.c          |  72
 kernel/sched/sched.h         |   2
 kernel/sched/stats.h         |   3
 kernel/trace/bpf_trace.c     |  10
 kernel/trace/trace_printk.c  |   7
 23 files changed, 386 insertions, 203 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 22bb4f24f071..8d528f9930da 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1883,6 +1883,23 @@ out_null:
 	audit_log_format(ab, " exe=(null)");
 }
 
+struct tty_struct *audit_get_tty(struct task_struct *tsk)
+{
+	struct tty_struct *tty = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
+	if (tsk->signal)
+		tty = tty_kref_get(tsk->signal->tty);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+	return tty;
+}
+
+void audit_put_tty(struct tty_struct *tty)
+{
+	tty_kref_put(tty);
+}
+
 void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 {
 	const struct cred *cred;
diff --git a/kernel/audit.h b/kernel/audit.h
index cbbe6bb6496e..a492f4c4e710 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -23,6 +23,7 @@
 #include <linux/audit.h>
 #include <linux/skbuff.h>
 #include <uapi/linux/mqueue.h>
+#include <linux/tty.h>
 
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname(). If we get more names we will allocate
@@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
 extern void audit_log_d_path_exe(struct audit_buffer *ab,
 				 struct mm_struct *mm);
 
+extern struct tty_struct *audit_get_tty(struct task_struct *tsk);
+extern void audit_put_tty(struct tty_struct *tty);
+
 /* audit watch functions */
#ifdef CONFIG_AUDIT_WATCH
 extern void audit_put_watch(struct audit_watch *watch);
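A minimal caller sketch, not part of this commit, of how the helper pair declared above is meant to be used; the wrapper function and its name are hypothetical, while the tty_name()/"(none)" pattern follows the existing audit callers:

	/* Hypothetical caller: pair every audit_get_tty() with audit_put_tty(). */
	static void audit_log_tty_example(struct audit_buffer *ab)
	{
		struct tty_struct *tty = audit_get_tty(current);	/* takes a tty kref */

		audit_log_format(ab, " tty=%s", tty ? tty_name(tty) : "(none)");
		audit_put_tty(tty);	/* tty_kref_put() tolerates NULL */
	}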
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 62ab53d7619c..2672d105cffc 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -63,7 +63,6 @@
 #include <asm/unistd.h>
 #include <linux/security.h>
 #include <linux/list.h>
-#include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
@@ -1985,14 +1984,15 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 	if (!audit_enabled)
 		return;
 
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+	if (!ab)
+		return;
+
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	oldloginuid = from_kuid(&init_user_ns, koldloginuid);
 	loginuid = from_kuid(&init_user_ns, kloginuid),
 	tty = audit_get_tty(current);
 
-	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
-	if (!ab)
-		return;
 	audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
 	audit_log_task_context(ab);
 	audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 668e07903c8f..eec9f90ba030 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -126,31 +126,6 @@
  * are set to NOT_INIT to indicate that they are no longer readable.
  */
 
-/* types of values stored in eBPF registers */
-enum bpf_reg_type {
-	NOT_INIT = 0,		 /* nothing was written into register */
-	UNKNOWN_VALUE,		 /* reg doesn't contain a valid pointer */
-	PTR_TO_CTX,		 /* reg points to bpf_context */
-	CONST_PTR_TO_MAP,	 /* reg points to struct bpf_map */
-	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
-	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
-	FRAME_PTR,		 /* reg == frame_pointer */
-	PTR_TO_STACK,		 /* reg == frame_pointer + imm */
-	CONST_IMM,		 /* constant integer value */
-
-	/* PTR_TO_PACKET represents:
-	 * skb->data
-	 * skb->data + imm
-	 * skb->data + (u16) var
-	 * skb->data + (u16) var + imm
-	 * if (range > 0) then [ptr, ptr + range - off) is safe to access
-	 * if (id > 0) means that some 'var' was added
-	 * if (off > 0) menas that 'imm' was added
-	 */
-	PTR_TO_PACKET,
-	PTR_TO_PACKET_END,	 /* skb->data + headlen */
-};
-
 struct reg_state {
 	enum bpf_reg_type type;
 	union {
@@ -695,10 +670,10 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 
 /* check access to 'struct bpf_context' fields */
 static int check_ctx_access(struct verifier_env *env, int off, int size,
-			    enum bpf_access_type t)
+			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
 {
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t)) {
+	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
 			env->prog->aux->max_ctx_offset = off + size;
@@ -798,21 +773,19 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			mark_reg_unknown_value(state->regs, value_regno);
 
 	} else if (reg->type == PTR_TO_CTX) {
+		enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
 			verbose("R%d leaks addr into ctx\n", value_regno);
 			return -EACCES;
 		}
-		err = check_ctx_access(env, off, size, t);
+		err = check_ctx_access(env, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			mark_reg_unknown_value(state->regs, value_regno);
-			if (off == offsetof(struct __sk_buff, data) &&
-			    env->allow_ptr_leaks)
+			if (env->allow_ptr_leaks)
 				/* note that reg.[id|off|range] == 0 */
-				state->regs[value_regno].type = PTR_TO_PACKET;
-			else if (off == offsetof(struct __sk_buff, data_end) &&
-				 env->allow_ptr_leaks)
-				state->regs[value_regno].type = PTR_TO_PACKET_END;
+				state->regs[value_regno].type = reg_type;
 		}
 
 	} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
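For illustration only, not part of this commit: with the reg_type out-parameter added above, a program type's ->is_valid_access() callback can report what kind of pointer a ctx load yields. The callback below is a hypothetical sketch; the offset-to-type mapping mirrors the __sk_buff handling that this patch removes from check_mem_access():

	/* Hypothetical ->is_valid_access() sketch for a socket-filter style program. */
	static bool example_is_valid_access(int off, int size, enum bpf_access_type type,
					    enum bpf_reg_type *reg_type)
	{
		if (off < 0 || off >= sizeof(struct __sk_buff) || size != sizeof(__u32))
			return false;

		switch (off) {
		case offsetof(struct __sk_buff, data):
			*reg_type = PTR_TO_PACKET;	/* register becomes a packet pointer */
			break;
		case offsetof(struct __sk_buff, data_end):
			*reg_type = PTR_TO_PACKET_END;	/* register becomes the packet end */
			break;
		}
		return true;
	}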
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86cb5c6e8932..75c0ff00aca6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset)
 
 static void put_css_set(struct css_set *cset)
 {
+	unsigned long flags;
+
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset)
 	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irqsave(&css_set_lock, flags);
 	put_css_set_locked(cset);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
 /*
@@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	cset = find_existing_css_set(old_cset, cgrp, template);
 	if (cset)
 		get_css_set(cset);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	if (cset)
 		return cset;
@@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	 * find_existing_css_set() */
 	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
 	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
@@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 		css_get(css);
 	}
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	return cset;
 }
@@ -1192,7 +1194,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	 * Release all the links from cset_links to this hierarchy's
 	 * root cgroup
 	 */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 		list_del(&link->cset_link);
@@ -1200,7 +1202,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 		kfree(link);
 	}
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	if (!list_empty(&root->root_list)) {
 		list_del(&root->root_list);
@@ -1600,11 +1602,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 		ss->root = dst_root;
 		css->cgroup = dcgrp;
 
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		hash_for_each(css_set_table, i, cset, hlist)
 			list_move_tail(&cset->e_cset_node[ss->id],
 				       &dcgrp->e_csets[ss->id]);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 
 		/* default hierarchy doesn't enable controllers by default */
 		dst_root->subsys_mask |= 1 << ssid;
@@ -1640,10 +1642,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	if (!buf)
 		return -ENOMEM;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
 	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	if (len >= PATH_MAX)
 		len = -ERANGE;
@@ -1897,7 +1899,7 @@ static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	if (use_task_css_set_links)
 		goto out_unlock;
@@ -1922,8 +1924,12 @@ static void cgroup_enable_task_cg_lists(void)
 		 * entry won't be deleted though the process has exited.
 		 * Do it while holding siglock so that we don't end up
 		 * racing against cgroup_exit().
+		 *
+		 * Interrupts were already disabled while acquiring
+		 * the css_set_lock, so we do not need to disable it
+		 * again when acquiring the sighand->siglock here.
 		 */
-		spin_lock_irq(&p->sighand->siglock);
+		spin_lock(&p->sighand->siglock);
 		if (!(p->flags & PF_EXITING)) {
 			struct css_set *cset = task_css_set(p);
 
@@ -1932,11 +1938,11 @@ static void cgroup_enable_task_cg_lists(void)
 			list_add_tail(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
 		}
-		spin_unlock_irq(&p->sighand->siglock);
+		spin_unlock(&p->sighand->siglock);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
out_unlock:
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 }
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -2043,13 +2049,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	 * Link the root cgroup in this hierarchy into all the css_set
 	 * objects.
 	 */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	hash_for_each(css_set_table, i, cset, hlist) {
 		link_css_set(&tmp_links, cset, root_cgrp);
 		if (css_set_populated(cset))
 			cgroup_update_populated(root_cgrp, true);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	BUG_ON(!list_empty(&root_cgrp->self.children));
 	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2256,11 +2262,11 @@ out_mount:
 		struct cgroup *cgrp;
 
 		mutex_lock(&cgroup_mutex);
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 
 		cgrp = cset_cgroup_from_root(ns->root_cset, root);
 
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 		mutex_unlock(&cgroup_mutex);
 
 		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
@@ -2337,11 +2343,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 	char *ret;
 
 	mutex_lock(&cgroup_mutex);
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 
 	return ret;
@@ -2369,7 +2375,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 	char *path = NULL;
 
 	mutex_lock(&cgroup_mutex);
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
 
@@ -2382,7 +2388,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 		path = buf;
 	}
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	return path;
 }
@@ -2557,7 +2563,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 	 * the new cgroup. There are no failure cases after here, so this
 	 * is the commit point.
 	 */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(cset, &tset->src_csets, mg_node) {
 		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
 			struct css_set *from_cset = task_css_set(task);
@@ -2568,7 +2574,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 			put_css_set_locked(from_cset);
 		}
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	/*
 	 * Migration is committed, all target tasks are now on dst_csets.
@@ -2597,13 +2603,13 @@ out_cancel_attach:
 		}
 	} while_each_subsys_mask();
out_release_tset:
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_splice_init(&tset->dst_csets, &tset->src_csets);
 	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
 		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
 		list_del_init(&cset->mg_node);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	return ret;
 }
 
@@ -2634,7 +2640,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cgrp = NULL;
@@ -2642,7 +2648,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 		list_del_init(&cset->mg_preload_node);
 		put_css_set_locked(cset);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 }
 
 /**
@@ -2783,7 +2789,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 	 * already PF_EXITING could be freed from underneath us unless we
 	 * take an rcu_read_lock.
 	 */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
@@ -2792,7 +2798,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	return cgroup_taskset_migrate(&tset, root);
 }
@@ -2816,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 		return -EBUSY;
 
 	/* look up all src csets */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
@@ -2826,7 +2832,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	/* prepare dst csets and commit */
 	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
@@ -2859,9 +2865,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 		struct cgroup *cgrp;
 		struct inode *inode;
 
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 
 		while (!cgroup_is_descendant(dst_cgrp, cgrp))
 			cgrp = cgroup_parent(cgrp);
@@ -2962,9 +2968,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (root == &cgrp_dfl_root)
 			continue;
 
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		from_cgrp = task_cgroup_from_root(from, root);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 
 		retval = cgroup_attach_task(from_cgrp, tsk, false);
 		if (retval)
@@ -3080,7 +3086,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	percpu_down_write(&cgroup_threadgroup_rwsem);
 
 	/* look up all csses currently attached to @cgrp's subtree */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
 		struct cgrp_cset_link *link;
 
@@ -3088,14 +3094,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 			cgroup_migrate_add_src(link->cset, dsct,
 					       &preloaded_csets);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	/* NULL dst indicates self on default hierarchy */
 	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
 	if (ret)
 		goto out_finish;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
 		struct task_struct *task, *ntask;
 
@@ -3107,7 +3113,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
 			cgroup_taskset_add(task, &tset);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	ret = cgroup_taskset_migrate(&tset, cgrp->root);
out_finish:
@@ -3908,10 +3914,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 	int count = 0;
 	struct cgrp_cset_link *link;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 		count += atomic_read(&link->cset->refcount);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	return count;
 }
 
@@ -4249,7 +4255,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
 	memset(it, 0, sizeof(*it));
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	it->ss = css->ss;
 
@@ -4262,7 +4268,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
 	css_task_iter_advance_css_set(it);
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 }
 
 /**
@@ -4280,7 +4286,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 		it->cur_task = NULL;
 	}
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	if (it->task_pos) {
 		it->cur_task = list_entry(it->task_pos, struct task_struct,
@@ -4289,7 +4295,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 		css_task_iter_advance(it);
 	}
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	return it->cur_task;
 }
@@ -4303,10 +4309,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 void css_task_iter_end(struct css_task_iter *it)
 {
 	if (it->cur_cset) {
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		list_del(&it->iters_node);
 		put_css_set_locked(it->cur_cset);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 	}
 
 	if (it->cur_task)
@@ -4338,10 +4344,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	mutex_lock(&cgroup_mutex);
 
 	/* all tasks in @from are being moved, all csets are source */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &from->cset_links, cset_link)
 		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
 	if (ret)
@@ -5063,6 +5069,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	memset(css, 0, sizeof(*css));
 	css->cgroup = cgrp;
 	css->ss = ss;
+	css->id = -1;
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
 	css->serial_nr = css_serial_nr_next++;
@@ -5150,7 +5157,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 
 	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
 	if (err < 0)
-		goto err_free_percpu_ref;
+		goto err_free_css;
 	css->id = err;
 
 	/* @css is ready to be brought online now, make it visible */
@@ -5174,9 +5181,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 
err_list_del:
 	list_del_rcu(&css->sibling);
-	cgroup_idr_remove(&ss->css_idr, css->id);
-err_free_percpu_ref:
-	percpu_ref_exit(&css->refcnt);
err_free_css:
 	call_rcu(&css->rcu_head, css_free_rcu_fn);
 	return ERR_PTR(err);
@@ -5451,10 +5455,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 */
 	cgrp->self.flags &= ~CSS_ONLINE;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 		link->cset->dead = true;
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	/* initiate massacre of all css's */
 	for_each_css(css, ssid, cgrp)
@@ -5725,7 +5729,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		goto out;
 
 	mutex_lock(&cgroup_mutex);
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	for_each_root(root) {
 		struct cgroup_subsys *ss;
@@ -5778,7 +5782,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 
 	retval = 0;
out_unlock:
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	kfree(buf);
out:
@@ -5923,13 +5927,13 @@ void cgroup_post_fork(struct task_struct *child)
 	if (use_task_css_set_links) {
 		struct css_set *cset;
 
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
 			get_css_set(cset);
 			css_set_move_task(child, NULL, cset, false);
 		}
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 	}
 
 	/*
@@ -5974,9 +5978,9 @@ void cgroup_exit(struct task_struct *tsk)
 	cset = task_css_set(tsk);
 
 	if (!list_empty(&tsk->cg_list)) {
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		css_set_move_task(tsk, cset, NULL, false);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 	} else {
 		get_css_set(cset);
 	}
@@ -6044,9 +6048,9 @@ static void cgroup_release_agent(struct work_struct *work)
 	if (!pathbuf || !agentbuf)
 		goto out;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	if (!path)
 		goto out;
 
@@ -6306,12 +6310,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 		return ERR_PTR(-EPERM);
 
 	mutex_lock(&cgroup_mutex);
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	cset = task_css_set(current);
 	get_css_set(cset);
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 
 	new_ns = alloc_cgroup_ns();
@@ -6435,7 +6439,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 	if (!name_buf)
 		return -ENOMEM;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -6446,7 +6450,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 			   c->root->hierarchy_id, name_buf);
 	}
 	rcu_read_unlock();
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	kfree(name_buf);
 	return 0;
 }
@@ -6457,7 +6461,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
@@ -6480,7 +6484,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	overflow:
 		seq_puts(seq, " ...\n");
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	return 0;
 }
 
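A short sketch, not part of this commit, of the locking convention the cgroup changes above move to: css_set_lock becomes an IRQ-disabling lock, so plain process-context paths take it with the _irq variants, while a path whose caller's interrupt state is unknown (as put_css_set() is written to tolerate) uses irqsave/irqrestore:

	/* Illustrative only; the function name is hypothetical. */
	static void css_set_lock_usage_sketch(void)
	{
		unsigned long flags;

		spin_lock_irq(&css_set_lock);		/* known process context */
		/* ... walk or update css_set state ... */
		spin_unlock_irq(&css_set_lock);

		spin_lock_irqsave(&css_set_lock, flags);	/* caller's IRQ state unknown */
		/* ... walk or update css_set state ... */
		spin_unlock_irqrestore(&css_set_lock, flags);
	}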
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 274450efea90..85cd41878a74 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3862,10 +3862,8 @@ static void _free_event(struct perf_event *event)
 	if (event->ctx)
 		put_ctx(event->ctx);
 
-	if (event->pmu) {
-		exclusive_event_destroy(event);
-		module_put(event->pmu->module);
-	}
+	exclusive_event_destroy(event);
+	module_put(event->pmu->module);
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
@@ -7531,7 +7529,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 	prog = event->tp_event->prog;
 	if (prog) {
 		event->tp_event->prog = NULL;
-		bpf_prog_put(prog);
+		bpf_prog_put_rcu(prog);
 	}
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5c2c355aa97f..4a7ec0c6c88c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -148,18 +148,18 @@ static inline void free_task_struct(struct task_struct *tsk)
 }
 #endif
 
-void __weak arch_release_thread_info(struct thread_info *ti)
+void __weak arch_release_thread_stack(unsigned long *stack)
 {
 }
 
-#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
 
 /*
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
 # if THREAD_SIZE >= PAGE_SIZE
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
 	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
@@ -172,33 +172,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 	return page ? page_address(page) : NULL;
 }
 
-static inline void free_thread_info(struct thread_info *ti)
+static inline void free_thread_stack(unsigned long *stack)
 {
-	struct page *page = virt_to_page(ti);
+	struct page *page = virt_to_page(stack);
 
 	memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
 				    -(1 << THREAD_SIZE_ORDER));
 	__free_kmem_pages(page, THREAD_SIZE_ORDER);
 }
 # else
-static struct kmem_cache *thread_info_cache;
+static struct kmem_cache *thread_stack_cache;
 
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
-	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_info(struct thread_info *ti)
+static void free_thread_stack(unsigned long *stack)
 {
-	kmem_cache_free(thread_info_cache, ti);
+	kmem_cache_free(thread_stack_cache, stack);
 }
 
-void thread_info_cache_init(void)
+void thread_stack_cache_init(void)
 {
-	thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+	thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
 					      THREAD_SIZE, 0, NULL);
-	BUG_ON(thread_info_cache == NULL);
+	BUG_ON(thread_stack_cache == NULL);
 }
 # endif
 #endif
@@ -221,9 +221,9 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(unsigned long *stack, int account)
 {
-	struct zone *zone = page_zone(virt_to_page(ti));
+	struct zone *zone = page_zone(virt_to_page(stack));
 
 	mod_zone_page_state(zone, NR_KERNEL_STACK, account);
 }
@@ -231,8 +231,8 @@ static void account_kernel_stack(struct thread_info *ti, int account)
 void free_task(struct task_struct *tsk)
 {
 	account_kernel_stack(tsk->stack, -1);
-	arch_release_thread_info(tsk->stack);
-	free_thread_info(tsk->stack);
+	arch_release_thread_stack(tsk->stack);
+	free_thread_stack(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
@@ -343,7 +343,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
-	struct thread_info *ti;
+	unsigned long *stack;
 	int err;
 
 	if (node == NUMA_NO_NODE)
@@ -352,15 +352,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (!tsk)
 		return NULL;
 
-	ti = alloc_thread_info_node(tsk, node);
-	if (!ti)
+	stack = alloc_thread_stack_node(tsk, node);
+	if (!stack)
 		goto free_tsk;
 
 	err = arch_dup_task_struct(tsk, orig);
 	if (err)
-		goto free_ti;
+		goto free_stack;
 
-	tsk->stack = ti;
+	tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
@@ -392,14 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->task_frag.page = NULL;
 	tsk->wake_q.next = NULL;
 
-	account_kernel_stack(ti, 1);
+	account_kernel_stack(stack, 1);
 
 	kcov_task_init(tsk);
 
 	return tsk;
 
-free_ti:
-	free_thread_info(ti);
+free_stack:
+	free_thread_stack(stack);
free_tsk:
 	free_task_struct(tsk);
 	return NULL;
diff --git a/kernel/futex.c b/kernel/futex.c
index ee25f5ba4aca..33664f70e2d2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct page *page;
+	struct page *page, *tail;
 	struct address_space *mapping;
 	int err, ro = 0;
 
@@ -530,7 +530,15 @@ again:
 	 * considered here and page lock forces unnecessarily serialization
 	 * From this point on, mapping will be re-verified if necessary and
 	 * page lock will be acquired only if it is unavoidable
-	 */
+	 *
+	 * Mapping checks require the head page for any compound page so the
+	 * head page and mapping is looked up now. For anonymous pages, it
+	 * does not matter if the page splits in the future as the key is
+	 * based on the address. For filesystem-backed pages, the tail is
+	 * required as the index of the page determines the key. For
+	 * base pages, there is no tail page and tail == page.
+	 */
+	tail = page;
 	page = compound_head(page);
 	mapping = READ_ONCE(page->mapping);
 
@@ -654,7 +662,7 @@ again:
 
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 		key->shared.inode = inode;
-		key->shared.pgoff = basepage_index(page);
+		key->shared.pgoff = basepage_index(tail);
 		rcu_read_unlock();
 	}
 
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 05254eeb4b4e..4b353e0be121 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key);
 
 void static_key_slow_inc(struct static_key *key)
 {
+	int v, v1;
+
 	STATIC_KEY_CHECK_USE();
-	if (atomic_inc_not_zero(&key->enabled))
-		return;
+
+	/*
+	 * Careful if we get concurrent static_key_slow_inc() calls;
+	 * later calls must wait for the first one to _finish_ the
+	 * jump_label_update() process. At the same time, however,
+	 * the jump_label_update() call below wants to see
+	 * static_key_enabled(&key) for jumps to be updated properly.
+	 *
+	 * So give a special meaning to negative key->enabled: it sends
+	 * static_key_slow_inc() down the slow path, and it is non-zero
+	 * so it counts as "enabled" in jump_label_update(). Note that
+	 * atomic_inc_unless_negative() checks >= 0, so roll our own.
+	 */
+	for (v = atomic_read(&key->enabled); v > 0; v = v1) {
+		v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
+		if (likely(v1 == v))
+			return;
+	}
 
 	jump_label_lock();
-	if (atomic_inc_return(&key->enabled) == 1)
+	if (atomic_read(&key->enabled) == 0) {
+		atomic_set(&key->enabled, -1);
 		jump_label_update(key);
+		atomic_set(&key->enabled, 1);
+	} else {
+		atomic_inc(&key->enabled);
+	}
 	jump_label_unlock();
 }
 EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
 static void __static_key_slow_dec(struct static_key *key,
 		unsigned long rate_limit, struct delayed_work *work)
 {
+	/*
+	 * The negative count check is valid even when a negative
+	 * key->enabled is in use by static_key_slow_inc(); a
+	 * __static_key_slow_dec() before the first static_key_slow_inc()
+	 * returns is unbalanced, because all other static_key_slow_inc()
+	 * instances block while the update is in progress.
+	 */
 	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
 		WARN(atomic_read(&key->enabled) < 0,
 		     "jump label: negative count!\n");
diff --git a/kernel/kcov.c b/kernel/kcov.c
index a02f2dddd1d7..8d44b3fea9d0 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = {
 
 static int __init kcov_init(void)
 {
-	if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+	/*
+	 * The kcov debugfs file won't ever get removed and thus,
+	 * there is no need to protect it against removal races. The
+	 * use of debugfs_create_file_unsafe() is actually safe here.
+	 */
+	if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
 		pr_err("failed to create kcov in debugfs\n");
 		return -ENOMEM;
 	}
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 3ef3736002d8..9c951fade415 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
 }
 
 void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
-			    struct thread_info *ti)
+			    struct task_struct *task)
 {
 	SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
 
 	/* Mark the current thread as blocked on the lock: */
-	ti->task->blocked_on = waiter;
+	task->blocked_on = waiter;
 }
 
 void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
-			 struct thread_info *ti)
+			 struct task_struct *task)
 {
 	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
-	DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
-	DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
-	ti->task->blocked_on = NULL;
+	DEBUG_LOCKS_WARN_ON(waiter->task != task);
+	DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
+	task->blocked_on = NULL;
 
 	list_del_init(&waiter->list);
 	waiter->task = NULL;
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..d06ae3bb46c5 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h | |||
@@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, | |||
20 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); | 20 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); |
21 | extern void debug_mutex_add_waiter(struct mutex *lock, | 21 | extern void debug_mutex_add_waiter(struct mutex *lock, |
22 | struct mutex_waiter *waiter, | 22 | struct mutex_waiter *waiter, |
23 | struct thread_info *ti); | 23 | struct task_struct *task); |
24 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 24 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
25 | struct thread_info *ti); | 25 | struct task_struct *task); |
26 | extern void debug_mutex_unlock(struct mutex *lock); | 26 | extern void debug_mutex_unlock(struct mutex *lock); |
27 | extern void debug_mutex_init(struct mutex *lock, const char *name, | 27 | extern void debug_mutex_init(struct mutex *lock, const char *name, |
28 | struct lock_class_key *key); | 28 | struct lock_class_key *key); |
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e364b424b019..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | |||
486 | if (!hold_ctx) | 486 | if (!hold_ctx) |
487 | return 0; | 487 | return 0; |
488 | 488 | ||
489 | if (unlikely(ctx == hold_ctx)) | ||
490 | return -EALREADY; | ||
491 | |||
492 | if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && | 489 | if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && |
493 | (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { | 490 | (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { |
494 | #ifdef CONFIG_DEBUG_MUTEXES | 491 | #ifdef CONFIG_DEBUG_MUTEXES |
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
514 | unsigned long flags; | 511 | unsigned long flags; |
515 | int ret; | 512 | int ret; |
516 | 513 | ||
514 | if (use_ww_ctx) { | ||
515 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | ||
516 | if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) | ||
517 | return -EALREADY; | ||
518 | } | ||
519 | |||
517 | preempt_disable(); | 520 | preempt_disable(); |
518 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); | 521 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); |
519 | 522 | ||
@@ -534,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
534 | goto skip_wait; | 537 | goto skip_wait; |
535 | 538 | ||
536 | debug_mutex_lock_common(lock, &waiter); | 539 | debug_mutex_lock_common(lock, &waiter); |
537 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); | 540 | debug_mutex_add_waiter(lock, &waiter, task); |
538 | 541 | ||
539 | /* add waiting tasks to the end of the waitqueue (FIFO): */ | 542 | /* add waiting tasks to the end of the waitqueue (FIFO): */ |
540 | list_add_tail(&waiter.list, &lock->wait_list); | 543 | list_add_tail(&waiter.list, &lock->wait_list); |
@@ -581,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
581 | } | 584 | } |
582 | __set_task_state(task, TASK_RUNNING); | 585 | __set_task_state(task, TASK_RUNNING); |
583 | 586 | ||
584 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | 587 | mutex_remove_waiter(lock, &waiter, task); |
585 | /* set it to 0 if there are no waiters left: */ | 588 | /* set it to 0 if there are no waiters left: */ |
586 | if (likely(list_empty(&lock->wait_list))) | 589 | if (likely(list_empty(&lock->wait_list))) |
587 | atomic_set(&lock->count, 0); | 590 | atomic_set(&lock->count, 0); |
@@ -602,7 +605,7 @@ skip_wait: | |||
602 | return 0; | 605 | return 0; |
603 | 606 | ||
604 | err: | 607 | err: |
605 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); | 608 | mutex_remove_waiter(lock, &waiter, task); |
606 | spin_unlock_mutex(&lock->wait_lock, flags); | 609 | spin_unlock_mutex(&lock->wait_lock, flags); |
607 | debug_mutex_free_waiter(&waiter); | 610 | debug_mutex_free_waiter(&waiter); |
608 | mutex_release(&lock->dep_map, 1, ip); | 611 | mutex_release(&lock->dep_map, 1, ip); |
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..a68bae5e852a 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h | |||
@@ -13,7 +13,7 @@ | |||
13 | do { spin_lock(lock); (void)(flags); } while (0) | 13 | do { spin_lock(lock); (void)(flags); } while (0) |
14 | #define spin_unlock_mutex(lock, flags) \ | 14 | #define spin_unlock_mutex(lock, flags) \ |
15 | do { spin_unlock(lock); (void)(flags); } while (0) | 15 | do { spin_unlock(lock); (void)(flags); } while (0) |
16 | #define mutex_remove_waiter(lock, waiter, ti) \ | 16 | #define mutex_remove_waiter(lock, waiter, task) \ |
17 | __list_del((waiter)->list.prev, (waiter)->list.next) | 17 | __list_del((waiter)->list.prev, (waiter)->list.next) |
18 | 18 | ||
19 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 19 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ce2f75e32ae1..5fc8c311b8fe 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c | |||
@@ -267,6 +267,66 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, | |||
267 | #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath | 267 | #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath |
268 | #endif | 268 | #endif |
269 | 269 | ||
270 | /* | ||
271 | * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before | ||
272 | * issuing an _unordered_ store to set _Q_LOCKED_VAL. | ||
273 | * | ||
274 | * This means that the store can be delayed, but no later than the | ||
275 | * store-release from the unlock. This means that simply observing | ||
276 | * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. | ||
277 | * | ||
278 | * There are two paths that can issue the unordered store: | ||
279 | * | ||
280 | * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 | ||
281 | * | ||
282 | * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 | ||
283 | * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 | ||
284 | * | ||
285 | * However, in both cases we have other !0 state we've set before to queue | ||
286 | * ourselves: | ||
287 | * | ||
288 | * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our | ||
289 | * load is constrained by that ACQUIRE to not pass before that, and thus must | ||
290 | * observe the store. | ||
291 | * | ||
292 | * For (2) we have a more interesting scenario. We enqueue ourselves using | ||
293 | * xchg_tail(), which ends up being a RELEASE. This in itself is not | ||
294 | * sufficient, however that is followed by an smp_cond_acquire() on the same | ||
295 | * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and | ||
296 | * guarantees we must observe that store. | ||
297 | * | ||
298 | * Therefore both cases have other !0 state that is observable before the | ||
299 | * unordered locked byte store comes through. This means we can use that to | ||
300 | * wait for the lock store, and then wait for an unlock. | ||
301 | */ | ||
302 | #ifndef queued_spin_unlock_wait | ||
303 | void queued_spin_unlock_wait(struct qspinlock *lock) | ||
304 | { | ||
305 | u32 val; | ||
306 | |||
307 | for (;;) { | ||
308 | val = atomic_read(&lock->val); | ||
309 | |||
310 | if (!val) /* not locked, we're done */ | ||
311 | goto done; | ||
312 | |||
313 | if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ | ||
314 | break; | ||
315 | |||
316 | /* not locked, but pending, wait until we observe the lock */ | ||
317 | cpu_relax(); | ||
318 | } | ||
319 | |||
320 | /* any unlock is good */ | ||
321 | while (atomic_read(&lock->val) & _Q_LOCKED_MASK) | ||
322 | cpu_relax(); | ||
323 | |||
324 | done: | ||
325 | smp_rmb(); /* CTRL + RMB -> ACQUIRE */ | ||
326 | } | ||
327 | EXPORT_SYMBOL(queued_spin_unlock_wait); | ||
328 | #endif | ||
329 | |||
270 | #endif /* _GEN_PV_LOCK_SLOWPATH */ | 330 | #endif /* _GEN_PV_LOCK_SLOWPATH */ |
271 | 331 | ||
272 | /** | 332 | /** |
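The long comment above justifies why queued_spin_unlock_wait() may first spin until the locked byte is observed and then spin until it clears. For context, here is a hedged caller-side sketch of the usual spin_unlock_wait() pattern that this implementation backs on qspinlock configurations; the structure and function names are invented for illustration and are not part of this diff.

    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct session {
            spinlock_t lock;
            void *payload;
    };

    void session_teardown(struct session *s)
    {
            /*
             * 's' has already been unpublished, so no new locker can find
             * it; wait for any still-running critical section to drain
             * before freeing, without ever taking the lock ourselves.
             */
            spin_unlock_wait(&s->lock);
            kfree(s->payload);
            kfree(s);
    }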
diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..0c2ee9761d57 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -146,6 +146,18 @@ int freeze_processes(void) | |||
146 | if (!error && !oom_killer_disable()) | 146 | if (!error && !oom_killer_disable()) |
147 | error = -EBUSY; | 147 | error = -EBUSY; |
148 | 148 | ||
149 | /* | ||
150 | * There is a hard-to-fix race between the oom_reaper kernel thread | ||
151 | * and oom_killer_disable(): oom_reaper calls exit_oom_victim() | ||
152 | * before the victim reaches exit_mm, so try to freeze all the tasks | ||
153 | * again and catch such a leftover task. | ||
154 | */ | ||
155 | if (!error) { | ||
156 | pr_info("Double checking all user space processes after OOM killer disable... "); | ||
157 | error = try_to_freeze_tasks(true); | ||
158 | pr_cont("\n"); | ||
159 | } | ||
160 | |||
149 | if (error) | 161 | if (error) |
150 | thaw_processes(); | 162 | thaw_processes(); |
151 | return error; | 163 | return error; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f2cae4620c7..51d7105f529a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1536 | for (;;) { | 1536 | for (;;) { |
1537 | /* Any allowed, online CPU? */ | 1537 | /* Any allowed, online CPU? */ |
1538 | for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { | 1538 | for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { |
1539 | if (!cpu_active(dest_cpu)) | 1539 | if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) |
1540 | continue; | ||
1541 | if (!cpu_online(dest_cpu)) | ||
1540 | continue; | 1542 | continue; |
1541 | goto out; | 1543 | goto out; |
1542 | } | 1544 | } |
@@ -2253,9 +2255,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, | |||
2253 | #endif | 2255 | #endif |
2254 | #endif | 2256 | #endif |
2255 | 2257 | ||
2258 | #ifdef CONFIG_SCHEDSTATS | ||
2259 | |||
2256 | DEFINE_STATIC_KEY_FALSE(sched_schedstats); | 2260 | DEFINE_STATIC_KEY_FALSE(sched_schedstats); |
2261 | static bool __initdata __sched_schedstats = false; | ||
2257 | 2262 | ||
2258 | #ifdef CONFIG_SCHEDSTATS | ||
2259 | static void set_schedstats(bool enabled) | 2263 | static void set_schedstats(bool enabled) |
2260 | { | 2264 | { |
2261 | if (enabled) | 2265 | if (enabled) |
@@ -2278,11 +2282,16 @@ static int __init setup_schedstats(char *str) | |||
2278 | if (!str) | 2282 | if (!str) |
2279 | goto out; | 2283 | goto out; |
2280 | 2284 | ||
2285 | /* | ||
2286 | * This code is called before jump labels have been set up, so we can't | ||
2287 | * change the static branch directly just yet. Instead set a temporary | ||
2288 | * variable so init_schedstats() can do it later. | ||
2289 | */ | ||
2281 | if (!strcmp(str, "enable")) { | 2290 | if (!strcmp(str, "enable")) { |
2282 | set_schedstats(true); | 2291 | __sched_schedstats = true; |
2283 | ret = 1; | 2292 | ret = 1; |
2284 | } else if (!strcmp(str, "disable")) { | 2293 | } else if (!strcmp(str, "disable")) { |
2285 | set_schedstats(false); | 2294 | __sched_schedstats = false; |
2286 | ret = 1; | 2295 | ret = 1; |
2287 | } | 2296 | } |
2288 | out: | 2297 | out: |
@@ -2293,6 +2302,11 @@ out: | |||
2293 | } | 2302 | } |
2294 | __setup("schedstats=", setup_schedstats); | 2303 | __setup("schedstats=", setup_schedstats); |
2295 | 2304 | ||
2305 | static void __init init_schedstats(void) | ||
2306 | { | ||
2307 | set_schedstats(__sched_schedstats); | ||
2308 | } | ||
2309 | |||
2296 | #ifdef CONFIG_PROC_SYSCTL | 2310 | #ifdef CONFIG_PROC_SYSCTL |
2297 | int sysctl_schedstats(struct ctl_table *table, int write, | 2311 | int sysctl_schedstats(struct ctl_table *table, int write, |
2298 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2312 | void __user *buffer, size_t *lenp, loff_t *ppos) |
@@ -2313,8 +2327,10 @@ int sysctl_schedstats(struct ctl_table *table, int write, | |||
2313 | set_schedstats(state); | 2327 | set_schedstats(state); |
2314 | return err; | 2328 | return err; |
2315 | } | 2329 | } |
2316 | #endif | 2330 | #endif /* CONFIG_PROC_SYSCTL */ |
2317 | #endif | 2331 | #else /* !CONFIG_SCHEDSTATS */ |
2332 | static inline void init_schedstats(void) {} | ||
2333 | #endif /* CONFIG_SCHEDSTATS */ | ||
2318 | 2334 | ||
2319 | /* | 2335 | /* |
2320 | * fork()/clone()-time setup: | 2336 | * fork()/clone()-time setup: |
@@ -2521,10 +2537,9 @@ void wake_up_new_task(struct task_struct *p) | |||
2521 | */ | 2537 | */ |
2522 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2538 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
2523 | #endif | 2539 | #endif |
2524 | /* Post initialize new task's util average when its cfs_rq is set */ | 2540 | rq = __task_rq_lock(p, &rf); |
2525 | post_init_entity_util_avg(&p->se); | 2541 | post_init_entity_util_avg(&p->se); |
2526 | 2542 | ||
2527 | rq = __task_rq_lock(p, &rf); | ||
2528 | activate_task(rq, p, 0); | 2543 | activate_task(rq, p, 0); |
2529 | p->on_rq = TASK_ON_RQ_QUEUED; | 2544 | p->on_rq = TASK_ON_RQ_QUEUED; |
2530 | trace_sched_wakeup_new(p); | 2545 | trace_sched_wakeup_new(p); |
@@ -3156,7 +3171,8 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3156 | static inline void schedule_debug(struct task_struct *prev) | 3171 | static inline void schedule_debug(struct task_struct *prev) |
3157 | { | 3172 | { |
3158 | #ifdef CONFIG_SCHED_STACK_END_CHECK | 3173 | #ifdef CONFIG_SCHED_STACK_END_CHECK |
3159 | BUG_ON(task_stack_end_corrupted(prev)); | 3174 | if (task_stack_end_corrupted(prev)) |
3175 | panic("corrupted stack end detected inside scheduler\n"); | ||
3160 | #endif | 3176 | #endif |
3161 | 3177 | ||
3162 | if (unlikely(in_atomic_preempt_off())) { | 3178 | if (unlikely(in_atomic_preempt_off())) { |
@@ -5133,14 +5149,16 @@ void show_state_filter(unsigned long state_filter) | |||
5133 | /* | 5149 | /* |
5134 | * reset the NMI-timeout, listing all files on a slow | 5150 | * reset the NMI-timeout, listing all files on a slow |
5135 | * console might take a lot of time: | 5151 | * console might take a lot of time: |
5152 | * Also, reset softlockup watchdogs on all CPUs, because | ||
5153 | * another CPU might be blocked waiting for us to process | ||
5154 | * an IPI. | ||
5136 | */ | 5155 | */ |
5137 | touch_nmi_watchdog(); | 5156 | touch_nmi_watchdog(); |
5157 | touch_all_softlockup_watchdogs(); | ||
5138 | if (!state_filter || (p->state & state_filter)) | 5158 | if (!state_filter || (p->state & state_filter)) |
5139 | sched_show_task(p); | 5159 | sched_show_task(p); |
5140 | } | 5160 | } |
5141 | 5161 | ||
5142 | touch_all_softlockup_watchdogs(); | ||
5143 | |||
5144 | #ifdef CONFIG_SCHED_DEBUG | 5162 | #ifdef CONFIG_SCHED_DEBUG |
5145 | if (!state_filter) | 5163 | if (!state_filter) |
5146 | sysrq_sched_debug_show(); | 5164 | sysrq_sched_debug_show(); |
@@ -7487,6 +7505,8 @@ void __init sched_init(void) | |||
7487 | #endif | 7505 | #endif |
7488 | init_sched_fair_class(); | 7506 | init_sched_fair_class(); |
7489 | 7507 | ||
7508 | init_schedstats(); | ||
7509 | |||
7490 | scheduler_running = 1; | 7510 | scheduler_running = 1; |
7491 | } | 7511 | } |
7492 | 7512 | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index cf905f655ba1..0368c393a336 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
427 | SPLIT_NS(p->se.vruntime), | 427 | SPLIT_NS(p->se.vruntime), |
428 | (long long)(p->nvcsw + p->nivcsw), | 428 | (long long)(p->nvcsw + p->nivcsw), |
429 | p->prio); | 429 | p->prio); |
430 | #ifdef CONFIG_SCHEDSTATS | 430 | |
431 | if (schedstat_enabled()) { | ||
432 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | ||
433 | SPLIT_NS(p->se.statistics.wait_sum), | ||
434 | SPLIT_NS(p->se.sum_exec_runtime), | ||
435 | SPLIT_NS(p->se.statistics.sum_sleep_runtime)); | ||
436 | } | ||
437 | #else | ||
438 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 431 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
439 | 0LL, 0L, | 432 | SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), |
440 | SPLIT_NS(p->se.sum_exec_runtime), | 433 | SPLIT_NS(p->se.sum_exec_runtime), |
441 | 0LL, 0L); | 434 | SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); |
442 | #endif | 435 | |
443 | #ifdef CONFIG_NUMA_BALANCING | 436 | #ifdef CONFIG_NUMA_BALANCING |
444 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); | 437 | SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); |
445 | #endif | 438 | #endif |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..bdcbeea90c95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
2904 | } | 2904 | } |
2905 | } | 2905 | } |
2906 | 2906 | ||
2907 | /* | ||
2908 | * Unsigned subtract and clamp on underflow. | ||
2909 | * | ||
2910 | * Explicitly do a load-store to ensure the intermediate value never hits | ||
2911 | * memory. This allows lockless observations without ever seeing the negative | ||
2912 | * values. | ||
2913 | */ | ||
2914 | #define sub_positive(_ptr, _val) do { \ | ||
2915 | typeof(_ptr) ptr = (_ptr); \ | ||
2916 | typeof(*ptr) val = (_val); \ | ||
2917 | typeof(*ptr) res, var = READ_ONCE(*ptr); \ | ||
2918 | res = var - val; \ | ||
2919 | if (res > var) \ | ||
2920 | res = 0; \ | ||
2921 | WRITE_ONCE(*ptr, res); \ | ||
2922 | } while (0) | ||
2923 | |||
2907 | /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ | 2924 | /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ |
2908 | static inline int | 2925 | static inline int |
2909 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | 2926 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) |
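The sub_positive() macro added above is small but subtle: it clamps an unsigned subtraction at zero and uses READ_ONCE()/WRITE_ONCE() so a lockless reader never observes a wrapped-around intermediate value. A standalone userspace model of just the clamping behaviour, illustrative only and without the ONCE accessors:

    #include <stdio.h>

    #define sub_positive(_ptr, _val) do {                   \
            unsigned long *ptr = (_ptr);                    \
            unsigned long val = (_val);                     \
            unsigned long var = *ptr;                       \
            unsigned long res = var - val;                  \
            if (res > var)          /* unsigned underflow */\
                    res = 0;                                \
            *ptr = res;                                     \
    } while (0)

    int main(void)
    {
            unsigned long load_avg = 100;

            sub_positive(&load_avg, 40);    /* 60 */
            sub_positive(&load_avg, 400);   /* clamped to 0, not ~2^64 */
            printf("%lu\n", load_avg);
            return 0;
    }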
@@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
2913 | 2930 | ||
2914 | if (atomic_long_read(&cfs_rq->removed_load_avg)) { | 2931 | if (atomic_long_read(&cfs_rq->removed_load_avg)) { |
2915 | s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); | 2932 | s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); |
2916 | sa->load_avg = max_t(long, sa->load_avg - r, 0); | 2933 | sub_positive(&sa->load_avg, r); |
2917 | sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); | 2934 | sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); |
2918 | removed_load = 1; | 2935 | removed_load = 1; |
2919 | } | 2936 | } |
2920 | 2937 | ||
2921 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { | 2938 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { |
2922 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); | 2939 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); |
2923 | sa->util_avg = max_t(long, sa->util_avg - r, 0); | 2940 | sub_positive(&sa->util_avg, r); |
2924 | sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); | 2941 | sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); |
2925 | removed_util = 1; | 2942 | removed_util = 1; |
2926 | } | 2943 | } |
2927 | 2944 | ||
@@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
2994 | &se->avg, se->on_rq * scale_load_down(se->load.weight), | 3011 | &se->avg, se->on_rq * scale_load_down(se->load.weight), |
2995 | cfs_rq->curr == se, NULL); | 3012 | cfs_rq->curr == se, NULL); |
2996 | 3013 | ||
2997 | cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); | 3014 | sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); |
2998 | cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); | 3015 | sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); |
2999 | cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); | 3016 | sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); |
3000 | cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); | 3017 | sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); |
3001 | 3018 | ||
3002 | cfs_rq_util_change(cfs_rq); | 3019 | cfs_rq_util_change(cfs_rq); |
3003 | } | 3020 | } |
@@ -3246,7 +3263,7 @@ static inline void check_schedstat_required(void) | |||
3246 | trace_sched_stat_iowait_enabled() || | 3263 | trace_sched_stat_iowait_enabled() || |
3247 | trace_sched_stat_blocked_enabled() || | 3264 | trace_sched_stat_blocked_enabled() || |
3248 | trace_sched_stat_runtime_enabled()) { | 3265 | trace_sched_stat_runtime_enabled()) { |
3249 | pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " | 3266 | printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " |
3250 | "stat_blocked and stat_runtime require the " | 3267 | "stat_blocked and stat_runtime require the " |
3251 | "kernel parameter schedstats=enabled or " | 3268 | "kernel parameter schedstats=enabled or " |
3252 | "kernel.sched_schedstats=1\n"); | 3269 | "kernel.sched_schedstats=1\n"); |
@@ -4185,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
4185 | if (!cfs_bandwidth_used()) | 4202 | if (!cfs_bandwidth_used()) |
4186 | return; | 4203 | return; |
4187 | 4204 | ||
4205 | /* Synchronize hierarchical throttle counter: */ | ||
4206 | if (unlikely(!cfs_rq->throttle_uptodate)) { | ||
4207 | struct rq *rq = rq_of(cfs_rq); | ||
4208 | struct cfs_rq *pcfs_rq; | ||
4209 | struct task_group *tg; | ||
4210 | |||
4211 | cfs_rq->throttle_uptodate = 1; | ||
4212 | |||
4213 | /* Get closest up-to-date node, because leaves go first: */ | ||
4214 | for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { | ||
4215 | pcfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
4216 | if (pcfs_rq->throttle_uptodate) | ||
4217 | break; | ||
4218 | } | ||
4219 | if (tg) { | ||
4220 | cfs_rq->throttle_count = pcfs_rq->throttle_count; | ||
4221 | cfs_rq->throttled_clock_task = rq_clock_task(rq); | ||
4222 | } | ||
4223 | } | ||
4224 | |||
4188 | /* an active group must be handled by the update_curr()->put() path */ | 4225 | /* an active group must be handled by the update_curr()->put() path */ |
4189 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | 4226 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) |
4190 | return; | 4227 | return; |
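The hunk above lazily synchronizes a cfs_rq's throttle counter by walking up the task-group hierarchy to the closest cfs_rq that is already marked up to date and inheriting its count. Stripped of the scheduler types, the shape of that walk looks roughly like the sketch below; the names are invented for illustration.

    struct node {
            struct node *parent;
            int uptodate;
            int count;
    };

    static void lazy_sync(struct node *n)
    {
            struct node *p;

            if (n->uptodate)
                    return;
            n->uptodate = 1;
            /* Closest up-to-date ancestor wins; leaves are marked first. */
            for (p = n->parent; p; p = p->parent)
                    if (p->uptodate)
                            break;
            if (p)
                    n->count = p->count;
    }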
@@ -4500,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4500 | 4537 | ||
4501 | /* Don't dequeue parent if it has other entities besides us */ | 4538 | /* Don't dequeue parent if it has other entities besides us */ |
4502 | if (cfs_rq->load.weight) { | 4539 | if (cfs_rq->load.weight) { |
4540 | /* Avoid re-evaluating load for this entity: */ | ||
4541 | se = parent_entity(se); | ||
4503 | /* | 4542 | /* |
4504 | * Bias pick_next to pick a task from this cfs_rq, as | 4543 | * Bias pick_next to pick a task from this cfs_rq, as |
4505 | * p is sleeping when it is within its sched_slice. | 4544 | * p is sleeping when it is within its sched_slice. |
4506 | */ | 4545 | */ |
4507 | if (task_sleep && parent_entity(se)) | 4546 | if (task_sleep && se && !throttled_hierarchy(cfs_rq)) |
4508 | set_next_buddy(parent_entity(se)); | 4547 | set_next_buddy(se); |
4509 | |||
4510 | /* avoid re-evaluating load for this entity */ | ||
4511 | se = parent_entity(se); | ||
4512 | break; | 4548 | break; |
4513 | } | 4549 | } |
4514 | flags |= DEQUEUE_SLEEP; | 4550 | flags |= DEQUEUE_SLEEP; |
@@ -8496,8 +8532,9 @@ void free_fair_sched_group(struct task_group *tg) | |||
8496 | 8532 | ||
8497 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | 8533 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) |
8498 | { | 8534 | { |
8499 | struct cfs_rq *cfs_rq; | ||
8500 | struct sched_entity *se; | 8535 | struct sched_entity *se; |
8536 | struct cfs_rq *cfs_rq; | ||
8537 | struct rq *rq; | ||
8501 | int i; | 8538 | int i; |
8502 | 8539 | ||
8503 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 8540 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8512,6 +8549,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8512 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | 8549 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); |
8513 | 8550 | ||
8514 | for_each_possible_cpu(i) { | 8551 | for_each_possible_cpu(i) { |
8552 | rq = cpu_rq(i); | ||
8553 | |||
8515 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8554 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8516 | GFP_KERNEL, cpu_to_node(i)); | 8555 | GFP_KERNEL, cpu_to_node(i)); |
8517 | if (!cfs_rq) | 8556 | if (!cfs_rq) |
@@ -8525,7 +8564,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8525 | init_cfs_rq(cfs_rq); | 8564 | init_cfs_rq(cfs_rq); |
8526 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 8565 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8527 | init_entity_runnable_average(se); | 8566 | init_entity_runnable_average(se); |
8567 | |||
8568 | raw_spin_lock_irq(&rq->lock); | ||
8528 | post_init_entity_util_avg(se); | 8569 | post_init_entity_util_avg(se); |
8570 | raw_spin_unlock_irq(&rq->lock); | ||
8529 | } | 8571 | } |
8530 | 8572 | ||
8531 | return 1; | 8573 | return 1; |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..7cbeb92a1cb9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -437,7 +437,7 @@ struct cfs_rq { | |||
437 | 437 | ||
438 | u64 throttled_clock, throttled_clock_task; | 438 | u64 throttled_clock, throttled_clock_task; |
439 | u64 throttled_clock_task_time; | 439 | u64 throttled_clock_task_time; |
440 | int throttled, throttle_count; | 440 | int throttled, throttle_count, throttle_uptodate; |
441 | struct list_head throttled_list; | 441 | struct list_head throttled_list; |
442 | #endif /* CONFIG_CFS_BANDWIDTH */ | 442 | #endif /* CONFIG_CFS_BANDWIDTH */ |
443 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 443 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 70b3b6a20fb0..78955cbea31c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) | 33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) |
34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) | 34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) |
35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
36 | # define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) | ||
37 | |||
36 | #else /* !CONFIG_SCHEDSTATS */ | 38 | #else /* !CONFIG_SCHEDSTATS */ |
37 | static inline void | 39 | static inline void |
38 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 40 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) |
@@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
47 | # define schedstat_inc(rq, field) do { } while (0) | 49 | # define schedstat_inc(rq, field) do { } while (0) |
48 | # define schedstat_add(rq, field, amt) do { } while (0) | 50 | # define schedstat_add(rq, field, amt) do { } while (0) |
49 | # define schedstat_set(var, val) do { } while (0) | 51 | # define schedstat_set(var, val) do { } while (0) |
52 | # define schedstat_val(rq, field) 0 | ||
50 | #endif | 53 | #endif |
51 | 54 | ||
52 | #ifdef CONFIG_SCHED_INFO | 55 | #ifdef CONFIG_SCHED_INFO |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 037ea6ea3cb2..3de25fbed785 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -208,6 +208,10 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) | |||
208 | event->pmu->count) | 208 | event->pmu->count) |
209 | return -EINVAL; | 209 | return -EINVAL; |
210 | 210 | ||
211 | if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && | ||
212 | event->attr.type != PERF_TYPE_RAW)) | ||
213 | return -EINVAL; | ||
214 | |||
211 | /* | 215 | /* |
212 | * we don't know if the function is run successfully by the | 216 | * we don't know if the function is run successfully by the |
213 | * return value. It can be judged in other places, such as | 217 | * return value. It can be judged in other places, such as |
@@ -347,7 +351,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func | |||
347 | } | 351 | } |
348 | 352 | ||
349 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ | 353 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ |
350 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) | 354 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
355 | enum bpf_reg_type *reg_type) | ||
351 | { | 356 | { |
352 | /* check bounds */ | 357 | /* check bounds */ |
353 | if (off < 0 || off >= sizeof(struct pt_regs)) | 358 | if (off < 0 || off >= sizeof(struct pt_regs)) |
@@ -425,7 +430,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) | |||
425 | } | 430 | } |
426 | } | 431 | } |
427 | 432 | ||
428 | static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type) | 433 | static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
434 | enum bpf_reg_type *reg_type) | ||
429 | { | 435 | { |
430 | if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) | 436 | if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) |
431 | return false; | 437 | return false; |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f96f0383f6c6..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt { | |||
36 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) | 36 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) |
37 | { | 37 | { |
38 | struct trace_bprintk_fmt *pos; | 38 | struct trace_bprintk_fmt *pos; |
39 | |||
40 | if (!fmt) | ||
41 | return ERR_PTR(-EINVAL); | ||
42 | |||
39 | list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { | 43 | list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { |
40 | if (!strcmp(pos->fmt, fmt)) | 44 | if (!strcmp(pos->fmt, fmt)) |
41 | return pos; | 45 | return pos; |
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
57 | for (iter = start; iter < end; iter++) { | 61 | for (iter = start; iter < end; iter++) { |
58 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); | 62 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); |
59 | if (tb_fmt) { | 63 | if (tb_fmt) { |
60 | *iter = tb_fmt->fmt; | 64 | if (!IS_ERR(tb_fmt)) |
65 | *iter = tb_fmt->fmt; | ||
61 | continue; | 66 | continue; |
62 | } | 67 | } |
63 | 68 | ||