path: root/kernel
author    Jason Gunthorpe <jgg@mellanox.com>  2018-01-29 15:26:40 -0500
committer Jason Gunthorpe <jgg@mellanox.com>  2018-01-30 11:30:00 -0500
commit    e7996a9a77fc669387da43ff4823b91cc4872bd0 (patch)
tree      617f0a128e222539d67e8cccc359f1bc4b984900 /kernel
parent    b5fa635aab8f0d39a824c01991266a6d06f007fb (diff)
parent    d8a5b80568a9cb66810e75b182018e9edb68e8ff (diff)
Merge tag v4.15 of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
To resolve conflicts in:

	drivers/infiniband/hw/mlx5/main.c
	drivers/infiniband/hw/mlx5/qp.c

From patches merged into the -rc cycle. The conflict resolution matches
what linux-next has been carrying.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/bpf/arraymap.c | 61
-rw-r--r--  kernel/bpf/core.c | 26
-rw-r--r--  kernel/bpf/hashtab.c | 2
-rw-r--r--  kernel/bpf/inode.c | 40
-rw-r--r--  kernel/bpf/offload.c | 15
-rw-r--r--  kernel/bpf/sockmap.c | 11
-rw-r--r--  kernel/bpf/syscall.c | 2
-rw-r--r--  kernel/bpf/verifier.c | 388
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 6
-rw-r--r--  kernel/cgroup/cgroup.c | 21
-rw-r--r--  kernel/cgroup/debug.c | 4
-rw-r--r--  kernel/cgroup/stat.c | 8
-rw-r--r--  kernel/cpu.c | 26
-rw-r--r--  kernel/crash_core.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/delayacct.c | 42
-rw-r--r--  kernel/events/core.c | 54
-rw-r--r--  kernel/exit.c | 9
-rw-r--r--  kernel/fork.c | 3
-rw-r--r--  kernel/futex.c | 96
-rw-r--r--  kernel/groups.c | 5
-rw-r--r--  kernel/irq/debug.h | 5
-rw-r--r--  kernel/irq/debugfs.c | 1
-rw-r--r--  kernel/irq/generic-chip.c | 11
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/irq/irqdomain.c | 13
-rw-r--r--  kernel/irq/matrix.c | 24
-rw-r--r--  kernel/irq/msi.c | 64
-rw-r--r--  kernel/jump_label.c | 12
-rw-r--r--  kernel/kcov.c | 4
-rw-r--r--  kernel/locking/lockdep.c | 653
-rw-r--r--  kernel/locking/rtmutex.c | 26
-rw-r--r--  kernel/locking/rtmutex_common.h | 1
-rw-r--r--  kernel/locking/spinlock.c | 13
-rw-r--r--  kernel/pid.c | 8
-rw-r--r--  kernel/printk/printk.c | 3
-rw-r--r--  kernel/sched/completion.c | 5
-rw-r--r--  kernel/sched/core.c | 28
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 2
-rw-r--r--  kernel/sched/fair.c | 106
-rw-r--r--  kernel/sched/membarrier.c | 2
-rw-r--r--  kernel/sched/rt.c | 8
-rw-r--r--  kernel/sched/wait.c | 2
-rw-r--r--  kernel/time/Kconfig | 1
-rw-r--r--  kernel/time/hrtimer.c | 3
-rw-r--r--  kernel/time/posix-timers.c | 29
-rw-r--r--  kernel/time/tick-sched.c | 32
-rw-r--r--  kernel/time/timer.c | 37
-rw-r--r--  kernel/trace/Kconfig | 3
-rw-r--r--  kernel/trace/bpf_trace.c | 27
-rw-r--r--  kernel/trace/ftrace.c | 29
-rw-r--r--  kernel/trace/ring_buffer.c | 79
-rw-r--r--  kernel/trace/trace.c | 88
-rw-r--r--  kernel/trace/trace_events.c | 16
-rw-r--r--  kernel/trace/trace_events_trigger.c | 13
-rw-r--r--  kernel/trace/trace_functions.c | 49
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/uid16.c | 1
-rw-r--r--  kernel/workqueue.c | 46
60 files changed, 1169 insertions, 1106 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index d15c0ee4d955..addf7732fb56 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
102{ 102{
103 struct kstatfs sbuf; 103 struct kstatfs sbuf;
104 104
105 if (time_is_before_jiffies(acct->needcheck)) 105 if (time_is_after_jiffies(acct->needcheck))
106 goto out; 106 goto out;
107 107
108 /* May block */ 108 /* May block */
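The acct.c hunk reverses the jiffies comparison in check_free_space(): the possibly-blocking statfs check must be skipped while acct->needcheck is still in the future, which is what time_is_after_jiffies() tests. A minimal userspace sketch of that comparison (not part of the patch; it re-implements the wrap-safe test that include/linux/jiffies.h provides as time_is_after_jiffies(a) == time_before(jiffies, a)):

#include <stdio.h>

typedef unsigned long jiffies_t;

/* Wrap-safe "deadline a is still ahead of now", as time_is_after_jiffies()
 * does against the global jiffies counter. */
static int time_is_after(jiffies_t now, jiffies_t a)
{
	return (long)(now - a) < 0;
}

int main(void)
{
	jiffies_t jiffies = 1000, needcheck = 1500;

	/* Too early: needcheck has not been reached, so check_free_space()
	 * skips the blocking statfs -- the behaviour the fix restores. */
	printf("skip check: %d\n", time_is_after(jiffies, needcheck));	/* 1 */

	jiffies = 2000;		/* deadline passed: run the real check */
	printf("skip check: %d\n", time_is_after(jiffies, needcheck));	/* 0 */
	return 0;
}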
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..ab94d304a634 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
53{ 53{
54 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; 54 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
55 int numa_node = bpf_map_attr_numa_node(attr); 55 int numa_node = bpf_map_attr_numa_node(attr);
56 u32 elem_size, index_mask, max_entries;
57 bool unpriv = !capable(CAP_SYS_ADMIN);
56 struct bpf_array *array; 58 struct bpf_array *array;
57 u64 array_size; 59 u64 array_size, mask64;
58 u32 elem_size;
59 60
60 /* check sanity of attributes */ 61 /* check sanity of attributes */
61 if (attr->max_entries == 0 || attr->key_size != 4 || 62 if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -72,11 +73,32 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
72 73
73 elem_size = round_up(attr->value_size, 8); 74 elem_size = round_up(attr->value_size, 8);
74 75
76 max_entries = attr->max_entries;
77
78 /* On 32 bit archs roundup_pow_of_two() with max_entries that has
79 * upper most bit set in u32 space is undefined behavior due to
80 * resulting 1U << 32, so do it manually here in u64 space.
81 */
82 mask64 = fls_long(max_entries - 1);
83 mask64 = 1ULL << mask64;
84 mask64 -= 1;
85
86 index_mask = mask64;
87 if (unpriv) {
88 /* round up array size to nearest power of 2,
89 * since cpu will speculate within index_mask limits
90 */
91 max_entries = index_mask + 1;
92 /* Check for overflows. */
93 if (max_entries < attr->max_entries)
94 return ERR_PTR(-E2BIG);
95 }
96
75 array_size = sizeof(*array); 97 array_size = sizeof(*array);
76 if (percpu) 98 if (percpu)
77 array_size += (u64) attr->max_entries * sizeof(void *); 99 array_size += (u64) max_entries * sizeof(void *);
78 else 100 else
79 array_size += (u64) attr->max_entries * elem_size; 101 array_size += (u64) max_entries * elem_size;
80 102
81 /* make sure there is no u32 overflow later in round_up() */ 103 /* make sure there is no u32 overflow later in round_up() */
82 if (array_size >= U32_MAX - PAGE_SIZE) 104 if (array_size >= U32_MAX - PAGE_SIZE)
@@ -86,6 +108,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
86 array = bpf_map_area_alloc(array_size, numa_node); 108 array = bpf_map_area_alloc(array_size, numa_node);
87 if (!array) 109 if (!array)
88 return ERR_PTR(-ENOMEM); 110 return ERR_PTR(-ENOMEM);
111 array->index_mask = index_mask;
112 array->map.unpriv_array = unpriv;
89 113
90 /* copy mandatory map attributes */ 114 /* copy mandatory map attributes */
91 array->map.map_type = attr->map_type; 115 array->map.map_type = attr->map_type;
@@ -121,12 +145,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
121 if (unlikely(index >= array->map.max_entries)) 145 if (unlikely(index >= array->map.max_entries))
122 return NULL; 146 return NULL;
123 147
124 return array->value + array->elem_size * index; 148 return array->value + array->elem_size * (index & array->index_mask);
125} 149}
126 150
127/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ 151/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
128static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 152static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
129{ 153{
154 struct bpf_array *array = container_of(map, struct bpf_array, map);
130 struct bpf_insn *insn = insn_buf; 155 struct bpf_insn *insn = insn_buf;
131 u32 elem_size = round_up(map->value_size, 8); 156 u32 elem_size = round_up(map->value_size, 8);
132 const int ret = BPF_REG_0; 157 const int ret = BPF_REG_0;
@@ -135,7 +160,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
135 160
136 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); 161 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
137 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); 162 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
138 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); 163 if (map->unpriv_array) {
164 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
165 *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
166 } else {
167 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
168 }
139 169
140 if (is_power_of_2(elem_size)) { 170 if (is_power_of_2(elem_size)) {
141 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); 171 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -157,7 +187,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
157 if (unlikely(index >= array->map.max_entries)) 187 if (unlikely(index >= array->map.max_entries))
158 return NULL; 188 return NULL;
159 189
160 return this_cpu_ptr(array->pptrs[index]); 190 return this_cpu_ptr(array->pptrs[index & array->index_mask]);
161} 191}
162 192
163int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) 193int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -177,7 +207,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
177 */ 207 */
178 size = round_up(map->value_size, 8); 208 size = round_up(map->value_size, 8);
179 rcu_read_lock(); 209 rcu_read_lock();
180 pptr = array->pptrs[index]; 210 pptr = array->pptrs[index & array->index_mask];
181 for_each_possible_cpu(cpu) { 211 for_each_possible_cpu(cpu) {
182 bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); 212 bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
183 off += size; 213 off += size;
@@ -225,10 +255,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
225 return -EEXIST; 255 return -EEXIST;
226 256
227 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) 257 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
228 memcpy(this_cpu_ptr(array->pptrs[index]), 258 memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
229 value, map->value_size); 259 value, map->value_size);
230 else 260 else
231 memcpy(array->value + array->elem_size * index, 261 memcpy(array->value +
262 array->elem_size * (index & array->index_mask),
232 value, map->value_size); 263 value, map->value_size);
233 return 0; 264 return 0;
234} 265}
@@ -262,7 +293,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
262 */ 293 */
263 size = round_up(map->value_size, 8); 294 size = round_up(map->value_size, 8);
264 rcu_read_lock(); 295 rcu_read_lock();
265 pptr = array->pptrs[index]; 296 pptr = array->pptrs[index & array->index_mask];
266 for_each_possible_cpu(cpu) { 297 for_each_possible_cpu(cpu) {
267 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); 298 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
268 off += size; 299 off += size;
@@ -613,6 +644,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
613static u32 array_of_map_gen_lookup(struct bpf_map *map, 644static u32 array_of_map_gen_lookup(struct bpf_map *map,
614 struct bpf_insn *insn_buf) 645 struct bpf_insn *insn_buf)
615{ 646{
647 struct bpf_array *array = container_of(map, struct bpf_array, map);
616 u32 elem_size = round_up(map->value_size, 8); 648 u32 elem_size = round_up(map->value_size, 8);
617 struct bpf_insn *insn = insn_buf; 649 struct bpf_insn *insn = insn_buf;
618 const int ret = BPF_REG_0; 650 const int ret = BPF_REG_0;
@@ -621,7 +653,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
621 653
622 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); 654 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
623 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); 655 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
624 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); 656 if (map->unpriv_array) {
657 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
658 *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
659 } else {
660 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
661 }
625 if (is_power_of_2(elem_size)) 662 if (is_power_of_2(elem_size))
626 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); 663 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
627 else 664 else
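The arraymap.c changes add an index_mask that is ANDed into every array index for unprivileged programs, so that even speculative loads stay inside the allocation, and the mask is computed in u64 space because roundup_pow_of_two() on a u32 with the top bit set would evaluate 1U << 32. A standalone sketch of that mask math (a demo under stated assumptions, not kernel code; fls_long() is modelled here with __builtin_clzll()):

#include <stdio.h>
#include <stdint.h>

/* fls_long() stand-in: 1-based index of the highest set bit, 0 for 0. */
static unsigned int fls64_sketch(unsigned long long x)
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

int main(void)
{
	uint32_t max_entries = 0x80000001u;	/* top bit set: 1U << 32 is UB */
	uint64_t mask64;
	uint32_t index_mask, rounded;

	/* Same steps as the patch: do the power-of-two rounding in u64. */
	mask64 = fls64_sketch(max_entries - 1);	/* 32 */
	mask64 = 1ULL << mask64;		/* 2^32, fine as a u64 */
	mask64 -= 1;				/* 0xffffffff */
	index_mask = mask64;

	/* Unprivileged maps are rounded up to index_mask + 1 entries so that
	 * "index & index_mask" can never point past the allocation; here the
	 * rounding overflows u32, which the kernel rejects with -E2BIG. */
	rounded = index_mask + 1;
	printf("index_mask=%#x rounded=%#x overflow=%d\n",
	       index_mask, rounded, rounded < max_entries);
	return 0;
}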
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b9f8686a84cf..7949e8b8f94e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
767} 767}
768EXPORT_SYMBOL_GPL(__bpf_call_base); 768EXPORT_SYMBOL_GPL(__bpf_call_base);
769 769
770#ifndef CONFIG_BPF_JIT_ALWAYS_ON
770/** 771/**
771 * __bpf_prog_run - run eBPF program on a given context 772 * __bpf_prog_run - run eBPF program on a given context
772 * @ctx: is the data we are operating on 773 * @ctx: is the data we are operating on
@@ -955,7 +956,7 @@ select_insn:
955 DST = tmp; 956 DST = tmp;
956 CONT; 957 CONT;
957 ALU_MOD_X: 958 ALU_MOD_X:
958 if (unlikely(SRC == 0)) 959 if (unlikely((u32)SRC == 0))
959 return 0; 960 return 0;
960 tmp = (u32) DST; 961 tmp = (u32) DST;
961 DST = do_div(tmp, (u32) SRC); 962 DST = do_div(tmp, (u32) SRC);
@@ -974,7 +975,7 @@ select_insn:
974 DST = div64_u64(DST, SRC); 975 DST = div64_u64(DST, SRC);
975 CONT; 976 CONT;
976 ALU_DIV_X: 977 ALU_DIV_X:
977 if (unlikely(SRC == 0)) 978 if (unlikely((u32)SRC == 0))
978 return 0; 979 return 0;
979 tmp = (u32) DST; 980 tmp = (u32) DST;
980 do_div(tmp, (u32) SRC); 981 do_div(tmp, (u32) SRC);
@@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
1317EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) 1318EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
1318}; 1319};
1319 1320
1321#else
1322static unsigned int __bpf_prog_ret0(const void *ctx,
1323 const struct bpf_insn *insn)
1324{
1325 return 0;
1326}
1327#endif
1328
1320bool bpf_prog_array_compatible(struct bpf_array *array, 1329bool bpf_prog_array_compatible(struct bpf_array *array,
1321 const struct bpf_prog *fp) 1330 const struct bpf_prog *fp)
1322{ 1331{
@@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
1364 */ 1373 */
1365struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) 1374struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
1366{ 1375{
1376#ifndef CONFIG_BPF_JIT_ALWAYS_ON
1367 u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); 1377 u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
1368 1378
1369 fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; 1379 fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
1380#else
1381 fp->bpf_func = __bpf_prog_ret0;
1382#endif
1370 1383
1371 /* eBPF JITs can rewrite the program in case constant 1384 /* eBPF JITs can rewrite the program in case constant
1372 * blinding is active. However, in case of error during 1385 * blinding is active. However, in case of error during
@@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
1376 */ 1389 */
1377 if (!bpf_prog_is_dev_bound(fp->aux)) { 1390 if (!bpf_prog_is_dev_bound(fp->aux)) {
1378 fp = bpf_int_jit_compile(fp); 1391 fp = bpf_int_jit_compile(fp);
1392#ifdef CONFIG_BPF_JIT_ALWAYS_ON
1393 if (!fp->jited) {
1394 *err = -ENOTSUPP;
1395 return fp;
1396 }
1397#endif
1379 } else { 1398 } else {
1380 *err = bpf_prog_offload_compile(fp); 1399 *err = bpf_prog_offload_compile(fp);
1381 if (*err) 1400 if (*err)
@@ -1447,7 +1466,8 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
1447 rcu_read_lock(); 1466 rcu_read_lock();
1448 prog = rcu_dereference(progs)->progs; 1467 prog = rcu_dereference(progs)->progs;
1449 for (; *prog; prog++) 1468 for (; *prog; prog++)
1450 cnt++; 1469 if (*prog != &dummy_bpf_prog.prog)
1470 cnt++;
1451 rcu_read_unlock(); 1471 rcu_read_unlock();
1452 return cnt; 1472 return cnt;
1453} 1473}
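Besides gating the interpreter behind CONFIG_BPF_JIT_ALWAYS_ON, the core.c hunk changes the 32-bit ALU_DIV_X/ALU_MOD_X zero checks to test (u32)SRC, since the divisor actually used is only the low 32 bits of the source register. A small standalone sketch of the case the fix catches (illustrative only, not the interpreter code):

#include <stdio.h>
#include <stdint.h>

/* The divisor of a 32-bit BPF div/mod is only the low 32 bits of SRC, so
 * the zero check has to look at (u32)SRC as well. */
static uint64_t alu32_div(uint64_t dst, uint64_t src)
{
	if ((uint32_t)src == 0)		/* old "src == 0" test would miss this */
		return 0;		/* interpreter aborts the program */
	return (uint32_t)dst / (uint32_t)src;
}

int main(void)
{
	/* Nonzero as a u64, zero as the 32-bit divisor actually used. */
	uint64_t src = 0x100000000ULL;

	printf("%llu\n", (unsigned long long)alu32_div(10, src));	/* 0 */
	printf("%llu\n", (unsigned long long)alu32_div(10, 3));	/* 3 */
	return 0;
}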
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index e469e05c8e83..3905d4bc5b80 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab)
114 pptr = htab_elem_get_ptr(get_htab_elem(htab, i), 114 pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
115 htab->map.key_size); 115 htab->map.key_size);
116 free_percpu(pptr); 116 free_percpu(pptr);
117 cond_resched();
117 } 118 }
118free_elems: 119free_elems:
119 bpf_map_area_free(htab->elems); 120 bpf_map_area_free(htab->elems);
@@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab)
159 goto free_elems; 160 goto free_elems;
160 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, 161 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
161 pptr); 162 pptr);
163 cond_resched();
162 } 164 }
163 165
164skip_percpu_elems: 166skip_percpu_elems:
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 01aaef1a77c5..5bb5e49ef4c3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -368,7 +368,45 @@ out:
368 putname(pname); 368 putname(pname);
369 return ret; 369 return ret;
370} 370}
371EXPORT_SYMBOL_GPL(bpf_obj_get_user); 371
372static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
373{
374 struct bpf_prog *prog;
375 int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
376 if (ret)
377 return ERR_PTR(ret);
378
379 if (inode->i_op == &bpf_map_iops)
380 return ERR_PTR(-EINVAL);
381 if (inode->i_op != &bpf_prog_iops)
382 return ERR_PTR(-EACCES);
383
384 prog = inode->i_private;
385
386 ret = security_bpf_prog(prog);
387 if (ret < 0)
388 return ERR_PTR(ret);
389
390 if (!bpf_prog_get_ok(prog, &type, false))
391 return ERR_PTR(-EINVAL);
392
393 return bpf_prog_inc(prog);
394}
395
396struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
397{
398 struct bpf_prog *prog;
399 struct path path;
400 int ret = kern_path(name, LOOKUP_FOLLOW, &path);
401 if (ret)
402 return ERR_PTR(ret);
403 prog = __get_prog_inode(d_backing_inode(path.dentry), type);
404 if (!IS_ERR(prog))
405 touch_atime(&path);
406 path_put(&path);
407 return prog;
408}
409EXPORT_SYMBOL(bpf_prog_get_type_path);
372 410
373static void bpf_evict_inode(struct inode *inode) 411static void bpf_evict_inode(struct inode *inode)
374{ 412{
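The inode.c hunk adds bpf_prog_get_type_path(), which resolves a bpffs path, verifies permissions and program type, and returns the pinned program with a reference held via bpf_prog_inc(). A hypothetical caller sketch (the surrounding function and the program type are assumptions; only bpf_prog_get_type_path() and bpf_prog_put() come from the kernel API), showing that the reference must be dropped when the caller is done:

/* Hypothetical caller; only bpf_prog_get_type_path() and bpf_prog_put()
 * are real API here, the rest is illustrative. */
static int example_attach_pinned_prog(const char *path)
{
	struct bpf_prog *prog;

	prog = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	/* ... hand prog to the subsystem that will run it ... */

	bpf_prog_put(prog);	/* drop the reference taken by bpf_prog_inc() */
	return 0;
}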
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 68ec884440b7..8455b89d1bbf 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,3 +1,18 @@
1/*
2 * Copyright (C) 2017 Netronome Systems, Inc.
3 *
4 * This software is licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this
6 * source tree.
7 *
8 * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
9 * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
10 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
11 * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
12 * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
13 * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
14 */
15
1#include <linux/bpf.h> 16#include <linux/bpf.h>
2#include <linux/bpf_verifier.h> 17#include <linux/bpf_verifier.h>
3#include <linux/bug.h> 18#include <linux/bug.h>
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41893d9..1712d319c2d8 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map)
591 591
592 write_lock_bh(&sock->sk_callback_lock); 592 write_lock_bh(&sock->sk_callback_lock);
593 psock = smap_psock_sk(sock); 593 psock = smap_psock_sk(sock);
594 smap_list_remove(psock, &stab->sock_map[i]); 594 /* This check handles a racing sock event that can get the
595 smap_release_sock(psock, sock); 595 * sk_callback_lock before this case but after xchg happens
596 * causing the refcnt to hit zero and sock user data (psock)
597 * to be null and queued for garbage collection.
598 */
599 if (likely(psock)) {
600 smap_list_remove(psock, &stab->sock_map[i]);
601 smap_release_sock(psock, sock);
602 }
596 write_unlock_bh(&sock->sk_callback_lock); 603 write_unlock_bh(&sock->sk_callback_lock);
597 } 604 }
598 rcu_read_unlock(); 605 rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c4cfeaa8d5e..5cb783fc8224 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
1057} 1057}
1058EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); 1058EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
1059 1059
1060static bool bpf_prog_get_ok(struct bpf_prog *prog, 1060bool bpf_prog_get_ok(struct bpf_prog *prog,
1061 enum bpf_prog_type *attach_type, bool attach_drv) 1061 enum bpf_prog_type *attach_type, bool attach_drv)
1062{ 1062{
1063 /* not an attachment, just a refcount inc, always allow */ 1063 /* not an attachment, just a refcount inc, always allow */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4593571c404..13551e623501 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
978 return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); 978 return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
979} 979}
980 980
981static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
982{
983 const struct bpf_reg_state *reg = cur_regs(env) + regno;
984
985 return reg->type == PTR_TO_CTX;
986}
987
981static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, 988static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
982 const struct bpf_reg_state *reg, 989 const struct bpf_reg_state *reg,
983 int off, int size, bool strict) 990 int off, int size, bool strict)
@@ -1059,6 +1066,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
1059 break; 1066 break;
1060 case PTR_TO_STACK: 1067 case PTR_TO_STACK:
1061 pointer_desc = "stack "; 1068 pointer_desc = "stack ";
1069 /* The stack spill tracking logic in check_stack_write()
1070 * and check_stack_read() relies on stack accesses being
1071 * aligned.
1072 */
1073 strict = true;
1062 break; 1074 break;
1063 default: 1075 default:
1064 break; 1076 break;
@@ -1067,6 +1079,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
1067 strict); 1079 strict);
1068} 1080}
1069 1081
1082/* truncate register to smaller size (in bytes)
1083 * must be called with size < BPF_REG_SIZE
1084 */
1085static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
1086{
1087 u64 mask;
1088
1089 /* clear high bits in bit representation */
1090 reg->var_off = tnum_cast(reg->var_off, size);
1091
1092 /* fix arithmetic bounds */
1093 mask = ((u64)1 << (size * 8)) - 1;
1094 if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
1095 reg->umin_value &= mask;
1096 reg->umax_value &= mask;
1097 } else {
1098 reg->umin_value = 0;
1099 reg->umax_value = mask;
1100 }
1101 reg->smin_value = reg->umin_value;
1102 reg->smax_value = reg->umax_value;
1103}
1104
1070/* check whether memory at (regno + off) is accessible for t = (read | write) 1105/* check whether memory at (regno + off) is accessible for t = (read | write)
1071 * if t==write, value_regno is a register which value is stored into memory 1106 * if t==write, value_regno is a register which value is stored into memory
1072 * if t==read, value_regno is a register which will receive the value from memory 1107 * if t==read, value_regno is a register which will receive the value from memory
@@ -1200,9 +1235,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
1200 if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && 1235 if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
1201 regs[value_regno].type == SCALAR_VALUE) { 1236 regs[value_regno].type == SCALAR_VALUE) {
1202 /* b/h/w load zero-extends, mark upper bits as known 0 */ 1237 /* b/h/w load zero-extends, mark upper bits as known 0 */
1203 regs[value_regno].var_off = 1238 coerce_reg_to_size(&regs[value_regno], size);
1204 tnum_cast(regs[value_regno].var_off, size);
1205 __update_reg_bounds(&regs[value_regno]);
1206 } 1239 }
1207 return err; 1240 return err;
1208} 1241}
@@ -1232,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
1232 return -EACCES; 1265 return -EACCES;
1233 } 1266 }
1234 1267
1268 if (is_ctx_reg(env, insn->dst_reg)) {
1269 verbose(env, "BPF_XADD stores into R%d context is not allowed\n",
1270 insn->dst_reg);
1271 return -EACCES;
1272 }
1273
1235 /* check whether atomic_add can read the memory */ 1274 /* check whether atomic_add can read the memory */
1236 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, 1275 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
1237 BPF_SIZE(insn->code), BPF_READ, -1); 1276 BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1282,6 +1321,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1282 tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); 1321 tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
1283 verbose(env, "invalid variable stack read R%d var_off=%s\n", 1322 verbose(env, "invalid variable stack read R%d var_off=%s\n",
1284 regno, tn_buf); 1323 regno, tn_buf);
1324 return -EACCES;
1285 } 1325 }
1286 off = regs[regno].off + regs[regno].var_off.value; 1326 off = regs[regno].off + regs[regno].var_off.value;
1287 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || 1327 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
@@ -1674,7 +1714,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1674 return -EINVAL; 1714 return -EINVAL;
1675 } 1715 }
1676 1716
1717 /* With LD_ABS/IND some JITs save/restore skb from r1. */
1677 changes_data = bpf_helper_changes_pkt_data(fn->func); 1718 changes_data = bpf_helper_changes_pkt_data(fn->func);
1719 if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
1720 verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
1721 func_id_name(func_id), func_id);
1722 return -EINVAL;
1723 }
1678 1724
1679 memset(&meta, 0, sizeof(meta)); 1725 memset(&meta, 0, sizeof(meta));
1680 meta.pkt_access = fn->pkt_access; 1726 meta.pkt_access = fn->pkt_access;
@@ -1696,6 +1742,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1696 err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); 1742 err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
1697 if (err) 1743 if (err)
1698 return err; 1744 return err;
1745 if (func_id == BPF_FUNC_tail_call) {
1746 if (meta.map_ptr == NULL) {
1747 verbose(env, "verifier bug\n");
1748 return -EINVAL;
1749 }
1750 env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
1751 }
1699 err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); 1752 err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
1700 if (err) 1753 if (err)
1701 return err; 1754 return err;
@@ -1766,14 +1819,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1766 return 0; 1819 return 0;
1767} 1820}
1768 1821
1769static void coerce_reg_to_32(struct bpf_reg_state *reg)
1770{
1771 /* clear high 32 bits */
1772 reg->var_off = tnum_cast(reg->var_off, 4);
1773 /* Update bounds */
1774 __update_reg_bounds(reg);
1775}
1776
1777static bool signed_add_overflows(s64 a, s64 b) 1822static bool signed_add_overflows(s64 a, s64 b)
1778{ 1823{
1779 /* Do the add in u64, where overflow is well-defined */ 1824 /* Do the add in u64, where overflow is well-defined */
@@ -1794,6 +1839,41 @@ static bool signed_sub_overflows(s64 a, s64 b)
1794 return res > a; 1839 return res > a;
1795} 1840}
1796 1841
1842static bool check_reg_sane_offset(struct bpf_verifier_env *env,
1843 const struct bpf_reg_state *reg,
1844 enum bpf_reg_type type)
1845{
1846 bool known = tnum_is_const(reg->var_off);
1847 s64 val = reg->var_off.value;
1848 s64 smin = reg->smin_value;
1849
1850 if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
1851 verbose(env, "math between %s pointer and %lld is not allowed\n",
1852 reg_type_str[type], val);
1853 return false;
1854 }
1855
1856 if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
1857 verbose(env, "%s pointer offset %d is not allowed\n",
1858 reg_type_str[type], reg->off);
1859 return false;
1860 }
1861
1862 if (smin == S64_MIN) {
1863 verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
1864 reg_type_str[type]);
1865 return false;
1866 }
1867
1868 if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
1869 verbose(env, "value %lld makes %s pointer be out of bounds\n",
1870 smin, reg_type_str[type]);
1871 return false;
1872 }
1873
1874 return true;
1875}
1876
1797/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. 1877/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
1798 * Caller should also handle BPF_MOV case separately. 1878 * Caller should also handle BPF_MOV case separately.
1799 * If we return -EACCES, caller may want to try again treating pointer as a 1879 * If we return -EACCES, caller may want to try again treating pointer as a
@@ -1815,44 +1895,36 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1815 1895
1816 dst_reg = &regs[dst]; 1896 dst_reg = &regs[dst];
1817 1897
1818 if (WARN_ON_ONCE(known && (smin_val != smax_val))) { 1898 if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
1819 print_verifier_state(env, env->cur_state); 1899 smin_val > smax_val || umin_val > umax_val) {
1820 verbose(env, 1900 /* Taint dst register if offset had invalid bounds derived from
1821 "verifier internal error: known but bad sbounds\n"); 1901 * e.g. dead branches.
1822 return -EINVAL; 1902 */
1823 } 1903 __mark_reg_unknown(dst_reg);
1824 if (WARN_ON_ONCE(known && (umin_val != umax_val))) { 1904 return 0;
1825 print_verifier_state(env, env->cur_state);
1826 verbose(env,
1827 "verifier internal error: known but bad ubounds\n");
1828 return -EINVAL;
1829 } 1905 }
1830 1906
1831 if (BPF_CLASS(insn->code) != BPF_ALU64) { 1907 if (BPF_CLASS(insn->code) != BPF_ALU64) {
1832 /* 32-bit ALU ops on pointers produce (meaningless) scalars */ 1908 /* 32-bit ALU ops on pointers produce (meaningless) scalars */
1833 if (!env->allow_ptr_leaks) 1909 verbose(env,
1834 verbose(env, 1910 "R%d 32-bit pointer arithmetic prohibited\n",
1835 "R%d 32-bit pointer arithmetic prohibited\n", 1911 dst);
1836 dst);
1837 return -EACCES; 1912 return -EACCES;
1838 } 1913 }
1839 1914
1840 if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { 1915 if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
1841 if (!env->allow_ptr_leaks) 1916 verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
1842 verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", 1917 dst);
1843 dst);
1844 return -EACCES; 1918 return -EACCES;
1845 } 1919 }
1846 if (ptr_reg->type == CONST_PTR_TO_MAP) { 1920 if (ptr_reg->type == CONST_PTR_TO_MAP) {
1847 if (!env->allow_ptr_leaks) 1921 verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
1848 verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", 1922 dst);
1849 dst);
1850 return -EACCES; 1923 return -EACCES;
1851 } 1924 }
1852 if (ptr_reg->type == PTR_TO_PACKET_END) { 1925 if (ptr_reg->type == PTR_TO_PACKET_END) {
1853 if (!env->allow_ptr_leaks) 1926 verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
1854 verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", 1927 dst);
1855 dst);
1856 return -EACCES; 1928 return -EACCES;
1857 } 1929 }
1858 1930
@@ -1862,6 +1934,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1862 dst_reg->type = ptr_reg->type; 1934 dst_reg->type = ptr_reg->type;
1863 dst_reg->id = ptr_reg->id; 1935 dst_reg->id = ptr_reg->id;
1864 1936
1937 if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
1938 !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
1939 return -EINVAL;
1940
1865 switch (opcode) { 1941 switch (opcode) {
1866 case BPF_ADD: 1942 case BPF_ADD:
1867 /* We can take a fixed offset as long as it doesn't overflow 1943 /* We can take a fixed offset as long as it doesn't overflow
@@ -1915,9 +1991,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1915 case BPF_SUB: 1991 case BPF_SUB:
1916 if (dst_reg == off_reg) { 1992 if (dst_reg == off_reg) {
1917 /* scalar -= pointer. Creates an unknown scalar */ 1993 /* scalar -= pointer. Creates an unknown scalar */
1918 if (!env->allow_ptr_leaks) 1994 verbose(env, "R%d tried to subtract pointer from scalar\n",
1919 verbose(env, "R%d tried to subtract pointer from scalar\n", 1995 dst);
1920 dst);
1921 return -EACCES; 1996 return -EACCES;
1922 } 1997 }
1923 /* We don't allow subtraction from FP, because (according to 1998 /* We don't allow subtraction from FP, because (according to
@@ -1925,9 +2000,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1925 * be able to deal with it. 2000 * be able to deal with it.
1926 */ 2001 */
1927 if (ptr_reg->type == PTR_TO_STACK) { 2002 if (ptr_reg->type == PTR_TO_STACK) {
1928 if (!env->allow_ptr_leaks) 2003 verbose(env, "R%d subtraction from stack pointer prohibited\n",
1929 verbose(env, "R%d subtraction from stack pointer prohibited\n", 2004 dst);
1930 dst);
1931 return -EACCES; 2005 return -EACCES;
1932 } 2006 }
1933 if (known && (ptr_reg->off - smin_val == 2007 if (known && (ptr_reg->off - smin_val ==
@@ -1976,28 +2050,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1976 case BPF_AND: 2050 case BPF_AND:
1977 case BPF_OR: 2051 case BPF_OR:
1978 case BPF_XOR: 2052 case BPF_XOR:
1979 /* bitwise ops on pointers are troublesome, prohibit for now. 2053 /* bitwise ops on pointers are troublesome, prohibit. */
1980 * (However, in principle we could allow some cases, e.g. 2054 verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
1981 * ptr &= ~3 which would reduce min_value by 3.) 2055 dst, bpf_alu_string[opcode >> 4]);
1982 */
1983 if (!env->allow_ptr_leaks)
1984 verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
1985 dst, bpf_alu_string[opcode >> 4]);
1986 return -EACCES; 2056 return -EACCES;
1987 default: 2057 default:
1988 /* other operators (e.g. MUL,LSH) produce non-pointer results */ 2058 /* other operators (e.g. MUL,LSH) produce non-pointer results */
1989 if (!env->allow_ptr_leaks) 2059 verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
1990 verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", 2060 dst, bpf_alu_string[opcode >> 4]);
1991 dst, bpf_alu_string[opcode >> 4]);
1992 return -EACCES; 2061 return -EACCES;
1993 } 2062 }
1994 2063
2064 if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
2065 return -EINVAL;
2066
1995 __update_reg_bounds(dst_reg); 2067 __update_reg_bounds(dst_reg);
1996 __reg_deduce_bounds(dst_reg); 2068 __reg_deduce_bounds(dst_reg);
1997 __reg_bound_offset(dst_reg); 2069 __reg_bound_offset(dst_reg);
1998 return 0; 2070 return 0;
1999} 2071}
2000 2072
2073/* WARNING: This function does calculations on 64-bit values, but the actual
2074 * execution may occur on 32-bit values. Therefore, things like bitshifts
2075 * need extra checks in the 32-bit case.
2076 */
2001static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, 2077static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2002 struct bpf_insn *insn, 2078 struct bpf_insn *insn,
2003 struct bpf_reg_state *dst_reg, 2079 struct bpf_reg_state *dst_reg,
@@ -2008,12 +2084,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2008 bool src_known, dst_known; 2084 bool src_known, dst_known;
2009 s64 smin_val, smax_val; 2085 s64 smin_val, smax_val;
2010 u64 umin_val, umax_val; 2086 u64 umin_val, umax_val;
2087 u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
2011 2088
2012 if (BPF_CLASS(insn->code) != BPF_ALU64) {
2013 /* 32-bit ALU ops are (32,32)->64 */
2014 coerce_reg_to_32(dst_reg);
2015 coerce_reg_to_32(&src_reg);
2016 }
2017 smin_val = src_reg.smin_value; 2089 smin_val = src_reg.smin_value;
2018 smax_val = src_reg.smax_value; 2090 smax_val = src_reg.smax_value;
2019 umin_val = src_reg.umin_value; 2091 umin_val = src_reg.umin_value;
@@ -2021,6 +2093,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2021 src_known = tnum_is_const(src_reg.var_off); 2093 src_known = tnum_is_const(src_reg.var_off);
2022 dst_known = tnum_is_const(dst_reg->var_off); 2094 dst_known = tnum_is_const(dst_reg->var_off);
2023 2095
2096 if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
2097 smin_val > smax_val || umin_val > umax_val) {
2098 /* Taint dst register if offset had invalid bounds derived from
2099 * e.g. dead branches.
2100 */
2101 __mark_reg_unknown(dst_reg);
2102 return 0;
2103 }
2104
2105 if (!src_known &&
2106 opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
2107 __mark_reg_unknown(dst_reg);
2108 return 0;
2109 }
2110
2024 switch (opcode) { 2111 switch (opcode) {
2025 case BPF_ADD: 2112 case BPF_ADD:
2026 if (signed_add_overflows(dst_reg->smin_value, smin_val) || 2113 if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
@@ -2149,9 +2236,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2149 __update_reg_bounds(dst_reg); 2236 __update_reg_bounds(dst_reg);
2150 break; 2237 break;
2151 case BPF_LSH: 2238 case BPF_LSH:
2152 if (umax_val > 63) { 2239 if (umax_val >= insn_bitness) {
2153 /* Shifts greater than 63 are undefined. This includes 2240 /* Shifts greater than 31 or 63 are undefined.
2154 * shifts by a negative number. 2241 * This includes shifts by a negative number.
2155 */ 2242 */
2156 mark_reg_unknown(env, regs, insn->dst_reg); 2243 mark_reg_unknown(env, regs, insn->dst_reg);
2157 break; 2244 break;
@@ -2177,27 +2264,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2177 __update_reg_bounds(dst_reg); 2264 __update_reg_bounds(dst_reg);
2178 break; 2265 break;
2179 case BPF_RSH: 2266 case BPF_RSH:
2180 if (umax_val > 63) { 2267 if (umax_val >= insn_bitness) {
2181 /* Shifts greater than 63 are undefined. This includes 2268 /* Shifts greater than 31 or 63 are undefined.
2182 * shifts by a negative number. 2269 * This includes shifts by a negative number.
2183 */ 2270 */
2184 mark_reg_unknown(env, regs, insn->dst_reg); 2271 mark_reg_unknown(env, regs, insn->dst_reg);
2185 break; 2272 break;
2186 } 2273 }
2187 /* BPF_RSH is an unsigned shift, so make the appropriate casts */ 2274 /* BPF_RSH is an unsigned shift. If the value in dst_reg might
2188 if (dst_reg->smin_value < 0) { 2275 * be negative, then either:
2189 if (umin_val) { 2276 * 1) src_reg might be zero, so the sign bit of the result is
2190 /* Sign bit will be cleared */ 2277 * unknown, so we lose our signed bounds
2191 dst_reg->smin_value = 0; 2278 * 2) it's known negative, thus the unsigned bounds capture the
2192 } else { 2279 * signed bounds
2193 /* Lost sign bit information */ 2280 * 3) the signed bounds cross zero, so they tell us nothing
2194 dst_reg->smin_value = S64_MIN; 2281 * about the result
2195 dst_reg->smax_value = S64_MAX; 2282 * If the value in dst_reg is known nonnegative, then again the
2196 } 2283 * unsigned bounts capture the signed bounds.
2197 } else { 2284 * Thus, in all cases it suffices to blow away our signed bounds
2198 dst_reg->smin_value = 2285 * and rely on inferring new ones from the unsigned bounds and
2199 (u64)(dst_reg->smin_value) >> umax_val; 2286 * var_off of the result.
2200 } 2287 */
2288 dst_reg->smin_value = S64_MIN;
2289 dst_reg->smax_value = S64_MAX;
2201 if (src_known) 2290 if (src_known)
2202 dst_reg->var_off = tnum_rshift(dst_reg->var_off, 2291 dst_reg->var_off = tnum_rshift(dst_reg->var_off,
2203 umin_val); 2292 umin_val);
@@ -2213,6 +2302,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2213 break; 2302 break;
2214 } 2303 }
2215 2304
2305 if (BPF_CLASS(insn->code) != BPF_ALU64) {
2306 /* 32-bit ALU ops are (32,32)->32 */
2307 coerce_reg_to_size(dst_reg, 4);
2308 coerce_reg_to_size(&src_reg, 4);
2309 }
2310
2216 __reg_deduce_bounds(dst_reg); 2311 __reg_deduce_bounds(dst_reg);
2217 __reg_bound_offset(dst_reg); 2312 __reg_bound_offset(dst_reg);
2218 return 0; 2313 return 0;
@@ -2227,7 +2322,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2227 struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; 2322 struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
2228 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; 2323 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
2229 u8 opcode = BPF_OP(insn->code); 2324 u8 opcode = BPF_OP(insn->code);
2230 int rc;
2231 2325
2232 dst_reg = &regs[insn->dst_reg]; 2326 dst_reg = &regs[insn->dst_reg];
2233 src_reg = NULL; 2327 src_reg = NULL;
@@ -2238,43 +2332,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2238 if (src_reg->type != SCALAR_VALUE) { 2332 if (src_reg->type != SCALAR_VALUE) {
2239 if (dst_reg->type != SCALAR_VALUE) { 2333 if (dst_reg->type != SCALAR_VALUE) {
2240 /* Combining two pointers by any ALU op yields 2334 /* Combining two pointers by any ALU op yields
2241 * an arbitrary scalar. 2335 * an arbitrary scalar. Disallow all math except
2336 * pointer subtraction
2242 */ 2337 */
2243 if (!env->allow_ptr_leaks) { 2338 if (opcode == BPF_SUB){
2244 verbose(env, "R%d pointer %s pointer prohibited\n", 2339 mark_reg_unknown(env, regs, insn->dst_reg);
2245 insn->dst_reg, 2340 return 0;
2246 bpf_alu_string[opcode >> 4]);
2247 return -EACCES;
2248 } 2341 }
2249 mark_reg_unknown(env, regs, insn->dst_reg); 2342 verbose(env, "R%d pointer %s pointer prohibited\n",
2250 return 0; 2343 insn->dst_reg,
2344 bpf_alu_string[opcode >> 4]);
2345 return -EACCES;
2251 } else { 2346 } else {
2252 /* scalar += pointer 2347 /* scalar += pointer
2253 * This is legal, but we have to reverse our 2348 * This is legal, but we have to reverse our
2254 * src/dest handling in computing the range 2349 * src/dest handling in computing the range
2255 */ 2350 */
2256 rc = adjust_ptr_min_max_vals(env, insn, 2351 return adjust_ptr_min_max_vals(env, insn,
2257 src_reg, dst_reg); 2352 src_reg, dst_reg);
2258 if (rc == -EACCES && env->allow_ptr_leaks) {
2259 /* scalar += unknown scalar */
2260 __mark_reg_unknown(&off_reg);
2261 return adjust_scalar_min_max_vals(
2262 env, insn,
2263 dst_reg, off_reg);
2264 }
2265 return rc;
2266 } 2353 }
2267 } else if (ptr_reg) { 2354 } else if (ptr_reg) {
2268 /* pointer += scalar */ 2355 /* pointer += scalar */
2269 rc = adjust_ptr_min_max_vals(env, insn, 2356 return adjust_ptr_min_max_vals(env, insn,
2270 dst_reg, src_reg); 2357 dst_reg, src_reg);
2271 if (rc == -EACCES && env->allow_ptr_leaks) {
2272 /* unknown scalar += scalar */
2273 __mark_reg_unknown(dst_reg);
2274 return adjust_scalar_min_max_vals(
2275 env, insn, dst_reg, *src_reg);
2276 }
2277 return rc;
2278 } 2358 }
2279 } else { 2359 } else {
2280 /* Pretend the src is a reg with a known value, since we only 2360 /* Pretend the src is a reg with a known value, since we only
@@ -2283,17 +2363,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2283 off_reg.type = SCALAR_VALUE; 2363 off_reg.type = SCALAR_VALUE;
2284 __mark_reg_known(&off_reg, insn->imm); 2364 __mark_reg_known(&off_reg, insn->imm);
2285 src_reg = &off_reg; 2365 src_reg = &off_reg;
2286 if (ptr_reg) { /* pointer += K */ 2366 if (ptr_reg) /* pointer += K */
2287 rc = adjust_ptr_min_max_vals(env, insn, 2367 return adjust_ptr_min_max_vals(env, insn,
2288 ptr_reg, src_reg); 2368 ptr_reg, src_reg);
2289 if (rc == -EACCES && env->allow_ptr_leaks) {
2290 /* unknown scalar += K */
2291 __mark_reg_unknown(dst_reg);
2292 return adjust_scalar_min_max_vals(
2293 env, insn, dst_reg, off_reg);
2294 }
2295 return rc;
2296 }
2297 } 2369 }
2298 2370
2299 /* Got here implies adding two SCALAR_VALUEs */ 2371 /* Got here implies adding two SCALAR_VALUEs */
@@ -2390,17 +2462,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
2390 return -EACCES; 2462 return -EACCES;
2391 } 2463 }
2392 mark_reg_unknown(env, regs, insn->dst_reg); 2464 mark_reg_unknown(env, regs, insn->dst_reg);
2393 /* high 32 bits are known zero. */ 2465 coerce_reg_to_size(&regs[insn->dst_reg], 4);
2394 regs[insn->dst_reg].var_off = tnum_cast(
2395 regs[insn->dst_reg].var_off, 4);
2396 __update_reg_bounds(&regs[insn->dst_reg]);
2397 } 2466 }
2398 } else { 2467 } else {
2399 /* case: R = imm 2468 /* case: R = imm
2400 * remember the value we stored into this reg 2469 * remember the value we stored into this reg
2401 */ 2470 */
2402 regs[insn->dst_reg].type = SCALAR_VALUE; 2471 regs[insn->dst_reg].type = SCALAR_VALUE;
2403 __mark_reg_known(regs + insn->dst_reg, insn->imm); 2472 if (BPF_CLASS(insn->code) == BPF_ALU64) {
2473 __mark_reg_known(regs + insn->dst_reg,
2474 insn->imm);
2475 } else {
2476 __mark_reg_known(regs + insn->dst_reg,
2477 (u32)insn->imm);
2478 }
2404 } 2479 }
2405 2480
2406 } else if (opcode > BPF_END) { 2481 } else if (opcode > BPF_END) {
@@ -2436,6 +2511,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
2436 return -EINVAL; 2511 return -EINVAL;
2437 } 2512 }
2438 2513
2514 if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
2515 verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
2516 return -EINVAL;
2517 }
2518
2439 if ((opcode == BPF_LSH || opcode == BPF_RSH || 2519 if ((opcode == BPF_LSH || opcode == BPF_RSH ||
2440 opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { 2520 opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
2441 int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; 2521 int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -3431,15 +3511,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
3431 return range_within(rold, rcur) && 3511 return range_within(rold, rcur) &&
3432 tnum_in(rold->var_off, rcur->var_off); 3512 tnum_in(rold->var_off, rcur->var_off);
3433 } else { 3513 } else {
3434 /* if we knew anything about the old value, we're not 3514 /* We're trying to use a pointer in place of a scalar.
3435 * equal, because we can't know anything about the 3515 * Even if the scalar was unbounded, this could lead to
3436 * scalar value of the pointer in the new value. 3516 * pointer leaks because scalars are allowed to leak
3517 * while pointers are not. We could make this safe in
3518 * special cases if root is calling us, but it's
3519 * probably not worth the hassle.
3437 */ 3520 */
3438 return rold->umin_value == 0 && 3521 return false;
3439 rold->umax_value == U64_MAX &&
3440 rold->smin_value == S64_MIN &&
3441 rold->smax_value == S64_MAX &&
3442 tnum_is_unknown(rold->var_off);
3443 } 3522 }
3444 case PTR_TO_MAP_VALUE: 3523 case PTR_TO_MAP_VALUE:
3445 /* If the new min/max/var_off satisfy the old ones and 3524 /* If the new min/max/var_off satisfy the old ones and
@@ -3932,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env)
3932 if (err) 4011 if (err)
3933 return err; 4012 return err;
3934 4013
4014 if (is_ctx_reg(env, insn->dst_reg)) {
4015 verbose(env, "BPF_ST stores into R%d context is not allowed\n",
4016 insn->dst_reg);
4017 return -EACCES;
4018 }
4019
3935 /* check that memory (dst_reg + off) is writeable */ 4020 /* check that memory (dst_reg + off) is writeable */
3936 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, 4021 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
3937 BPF_SIZE(insn->code), BPF_WRITE, 4022 BPF_SIZE(insn->code), BPF_WRITE,
@@ -4384,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4384 int i, cnt, delta = 0; 4469 int i, cnt, delta = 0;
4385 4470
4386 for (i = 0; i < insn_cnt; i++, insn++) { 4471 for (i = 0; i < insn_cnt; i++, insn++) {
4472 if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
4473 insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
4474 /* due to JIT bugs clear upper 32-bits of src register
4475 * before div/mod operation
4476 */
4477 insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
4478 insn_buf[1] = *insn;
4479 cnt = 2;
4480 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
4481 if (!new_prog)
4482 return -ENOMEM;
4483
4484 delta += cnt - 1;
4485 env->prog = prog = new_prog;
4486 insn = new_prog->insnsi + i + delta;
4487 continue;
4488 }
4489
4387 if (insn->code != (BPF_JMP | BPF_CALL)) 4490 if (insn->code != (BPF_JMP | BPF_CALL))
4388 continue; 4491 continue;
4389 4492
@@ -4407,6 +4510,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4407 */ 4510 */
4408 insn->imm = 0; 4511 insn->imm = 0;
4409 insn->code = BPF_JMP | BPF_TAIL_CALL; 4512 insn->code = BPF_JMP | BPF_TAIL_CALL;
4513
4514 /* instead of changing every JIT dealing with tail_call
4515 * emit two extra insns:
4516 * if (index >= max_entries) goto out;
4517 * index &= array->index_mask;
4518 * to avoid out-of-bounds cpu speculation
4519 */
4520 map_ptr = env->insn_aux_data[i + delta].map_ptr;
4521 if (map_ptr == BPF_MAP_PTR_POISON) {
4522 verbose(env, "tail_call abusing map_ptr\n");
4523 return -EINVAL;
4524 }
4525 if (!map_ptr->unpriv_array)
4526 continue;
4527 insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
4528 map_ptr->max_entries, 2);
4529 insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
4530 container_of(map_ptr,
4531 struct bpf_array,
4532 map)->index_mask);
4533 insn_buf[2] = *insn;
4534 cnt = 3;
4535 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
4536 if (!new_prog)
4537 return -ENOMEM;
4538
4539 delta += cnt - 1;
4540 env->prog = prog = new_prog;
4541 insn = new_prog->insnsi + i + delta;
4410 continue; 4542 continue;
4411 } 4543 }
4412 4544
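Among the verifier.c changes, coerce_reg_to_size() replaces the old coerce_reg_to_32()/tnum_cast pattern: after truncating a register to a smaller width it also repairs the unsigned bounds, keeping them only when the cleared high bits of umin and umax agree and otherwise widening to the full [0, mask] range. A standalone sketch of just that bounds logic (assumption: it omits the var_off/tnum tracking and the signed-bound copy that the kernel helper also does):

#include <stdio.h>
#include <stdint.h>

struct bounds { uint64_t umin, umax; };

/* Truncate a tracked value to 'size' bytes and repair its unsigned bounds,
 * mirroring the bounds half of coerce_reg_to_size(). */
static void coerce_to_size(struct bounds *b, int size)
{
	uint64_t mask = ((uint64_t)1 << (size * 8)) - 1;

	if ((b->umin & ~mask) == (b->umax & ~mask)) {
		/* High bits agree: the truncated range is still meaningful. */
		b->umin &= mask;
		b->umax &= mask;
	} else {
		/* High bits differ: the low part could be anything. */
		b->umin = 0;
		b->umax = mask;
	}
}

int main(void)
{
	struct bounds a = { 0x10000000ff00ULL, 0x10000000ffffULL };
	struct bounds b = { 0x0000ff00ULL, 0x2000000000ULL };

	coerce_to_size(&a, 4);	/* -> [0xff00, 0xffff] */
	coerce_to_size(&b, 4);	/* -> [0x0, 0xffffffff] */
	printf("a: [%#llx, %#llx]\n",
	       (unsigned long long)a.umin, (unsigned long long)a.umax);
	printf("b: [%#llx, %#llx]\n",
	       (unsigned long long)b.umin, (unsigned long long)b.umax);
	return 0;
}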
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
123 */ 123 */
124 do { 124 do {
125 css_task_iter_start(&from->self, 0, &it); 125 css_task_iter_start(&from->self, 0, &it);
126 task = css_task_iter_next(&it); 126
127 do {
128 task = css_task_iter_next(&it);
129 } while (task && (task->flags & PF_EXITING));
130
127 if (task) 131 if (task)
128 get_task_struct(task); 132 get_task_struct(task);
129 css_task_iter_end(&it); 133 css_task_iter_end(&it);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe147f24..7e4c44538119 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1397 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, 1397 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1398 cft->name); 1398 cft->name);
1399 else 1399 else
1400 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 1400 strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1401 return buf; 1401 return buf;
1402} 1402}
1403 1403
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1864 1864
1865 root->flags = opts->flags; 1865 root->flags = opts->flags;
1866 if (opts->release_agent) 1866 if (opts->release_agent)
1867 strcpy(root->release_agent_path, opts->release_agent); 1867 strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX);
1868 if (opts->name) 1868 if (opts->name)
1869 strcpy(root->name, opts->name); 1869 strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
1870 if (opts->cpuset_clone_children) 1870 if (opts->cpuset_clone_children)
1871 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1871 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1872} 1872}
@@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
4125 4125
4126static void css_task_iter_advance(struct css_task_iter *it) 4126static void css_task_iter_advance(struct css_task_iter *it)
4127{ 4127{
4128 struct list_head *l = it->task_pos; 4128 struct list_head *next;
4129 4129
4130 lockdep_assert_held(&css_set_lock); 4130 lockdep_assert_held(&css_set_lock);
4131 WARN_ON_ONCE(!l);
4132
4133repeat: 4131repeat:
4134 /* 4132 /*
4135 * Advance iterator to find next entry. cset->tasks is consumed 4133 * Advance iterator to find next entry. cset->tasks is consumed
4136 * first and then ->mg_tasks. After ->mg_tasks, we move onto the 4134 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
4137 * next cset. 4135 * next cset.
4138 */ 4136 */
4139 l = l->next; 4137 next = it->task_pos->next;
4140 4138
4141 if (l == it->tasks_head) 4139 if (next == it->tasks_head)
4142 l = it->mg_tasks_head->next; 4140 next = it->mg_tasks_head->next;
4143 4141
4144 if (l == it->mg_tasks_head) 4142 if (next == it->mg_tasks_head)
4145 css_task_iter_advance_css_set(it); 4143 css_task_iter_advance_css_set(it);
4146 else 4144 else
4147 it->task_pos = l; 4145 it->task_pos = next;
4148 4146
4149 /* if PROCS, skip over tasks which aren't group leaders */ 4147 /* if PROCS, skip over tasks which aren't group leaders */
4150 if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && 4148 if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4449,6 +4447,7 @@ static struct cftype cgroup_base_files[] = {
4449 }, 4447 },
4450 { 4448 {
4451 .name = "cgroup.threads", 4449 .name = "cgroup.threads",
4450 .flags = CFTYPE_NS_DELEGATABLE,
4452 .release = cgroup_procs_release, 4451 .release = cgroup_procs_release,
4453 .seq_start = cgroup_threads_start, 4452 .seq_start = cgroup_threads_start,
4454 .seq_next = cgroup_procs_next, 4453 .seq_next = cgroup_procs_next,
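The cgroup.c changes swap strcpy()/strncpy() for strlcpy() when filling fixed-size buffers such as root->name and root->release_agent_path, so an over-long source is truncated but the destination is always NUL-terminated. A small standalone demo of the difference (strlcpy() is a BSD/kernel helper and may be absent from glibc, so a local stand-in is used here by assumption):

#include <stdio.h>
#include <string.h>

/* Local stand-in for strlcpy(): copy at most size-1 bytes and always
 * NUL-terminate; returns the length of src. */
static size_t strlcpy_sketch(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (size) {
		size_t n = len >= size ? size - 1 : len;

		memcpy(dst, src, n);
		dst[n] = '\0';
	}
	return len;
}

int main(void)
{
	char a[8], b[8];
	const char *name = "a-much-too-long-name";

	strncpy(a, name, sizeof(a));		/* fills all 8 bytes, no '\0' */
	strlcpy_sketch(b, name, sizeof(b));	/* 7 chars + '\0' */

	printf("strlcpy result: \"%s\"\n", b);
	printf("strncpy left a terminator: %s\n",
	       memchr(a, '\0', sizeof(a)) ? "yes" : "no");	/* no */
	return 0;
}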
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 5f780d8f6a9d..9caeda610249 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
50 50
51 spin_lock_irq(&css_set_lock); 51 spin_lock_irq(&css_set_lock);
52 rcu_read_lock(); 52 rcu_read_lock();
53 cset = rcu_dereference(current->cgroups); 53 cset = task_css_set(current);
54 refcnt = refcount_read(&cset->refcount); 54 refcnt = refcount_read(&cset->refcount);
55 seq_printf(seq, "css_set %pK %d", cset, refcnt); 55 seq_printf(seq, "css_set %pK %d", cset, refcnt);
56 if (refcnt > cset->nr_tasks) 56 if (refcnt > cset->nr_tasks)
@@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
96 96
97 spin_lock_irq(&css_set_lock); 97 spin_lock_irq(&css_set_lock);
98 rcu_read_lock(); 98 rcu_read_lock();
99 cset = rcu_dereference(current->cgroups); 99 cset = task_css_set(current);
100 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 100 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
101 struct cgroup *c = link->cgrp; 101 struct cgroup *c = link->cgrp;
102 102
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
index 133b465691d6..1e111dd455c4 100644
--- a/kernel/cgroup/stat.c
+++ b/kernel/cgroup/stat.c
@@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp)
296 } 296 }
297 297
298 /* ->updated_children list is self terminated */ 298 /* ->updated_children list is self terminated */
299 for_each_possible_cpu(cpu) 299 for_each_possible_cpu(cpu) {
300 cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; 300 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
301
302 cstat->updated_children = cgrp;
303 u64_stats_init(&cstat->sync);
304 }
301 305
302 prev_cputime_init(&cgrp->stat.prev_cputime); 306 prev_cputime_init(&cgrp->stat.prev_cputime);
303 307
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 04892a82f6ac..53f7dc65f9a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map =
80 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); 80 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
81 81
82 82
83static void inline cpuhp_lock_acquire(bool bringup) 83static inline void cpuhp_lock_acquire(bool bringup)
84{ 84{
85 lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); 85 lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
86} 86}
87 87
88static void inline cpuhp_lock_release(bool bringup) 88static inline void cpuhp_lock_release(bool bringup)
89{ 89{
90 lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); 90 lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
91} 91}
92#else 92#else
93 93
94static void inline cpuhp_lock_acquire(bool bringup) { } 94static inline void cpuhp_lock_acquire(bool bringup) { }
95static void inline cpuhp_lock_release(bool bringup) { } 95static inline void cpuhp_lock_release(bool bringup) { }
96 96
97#endif 97#endif
98 98
@@ -780,8 +780,8 @@ static int takedown_cpu(unsigned int cpu)
780 BUG_ON(cpu_online(cpu)); 780 BUG_ON(cpu_online(cpu));
781 781
782 /* 782 /*
783 * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all 783 * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
784 * runnable tasks from the cpu, there's only the idle task left now 784 * all runnable tasks from the CPU, there's only the idle task left now
785 * that the migration thread is done doing the stop_machine thing. 785 * that the migration thread is done doing the stop_machine thing.
786 * 786 *
787 * Wait for the stop thread to go away. 787 * Wait for the stop thread to go away.
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1277 * before blk_mq_queue_reinit_notify() from notify_dead(), 1277 * before blk_mq_queue_reinit_notify() from notify_dead(),
1278 * otherwise a RCU stall occurs. 1278 * otherwise a RCU stall occurs.
1279 */ 1279 */
1280 [CPUHP_TIMERS_DEAD] = { 1280 [CPUHP_TIMERS_PREPARE] = {
1281 .name = "timers:dead", 1281 .name = "timers:dead",
1282 .startup.single = NULL, 1282 .startup.single = timers_prepare_cpu,
1283 .teardown.single = timers_dead_cpu, 1283 .teardown.single = timers_dead_cpu,
1284 }, 1284 },
1285 /* Kicks the plugged cpu into life */ 1285 /* Kicks the plugged cpu into life */
@@ -1289,11 +1289,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1289 .teardown.single = NULL, 1289 .teardown.single = NULL,
1290 .cant_stop = true, 1290 .cant_stop = true,
1291 }, 1291 },
1292 [CPUHP_AP_SMPCFD_DYING] = {
1293 .name = "smpcfd:dying",
1294 .startup.single = NULL,
1295 .teardown.single = smpcfd_dying_cpu,
1296 },
1297 /* 1292 /*
 1298 * Handled on control processor until the plugged processor manages 1293 * Handled on control processor until the plugged processor manages
1299 * this itself. 1294 * this itself.
@@ -1335,6 +1330,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1335 .startup.single = NULL, 1330 .startup.single = NULL,
1336 .teardown.single = rcutree_dying_cpu, 1331 .teardown.single = rcutree_dying_cpu,
1337 }, 1332 },
1333 [CPUHP_AP_SMPCFD_DYING] = {
1334 .name = "smpcfd:dying",
1335 .startup.single = NULL,
1336 .teardown.single = smpcfd_dying_cpu,
1337 },
1338 /* Entry state on starting. Interrupts enabled from here on. Transient 1338 /* Entry state on starting. Interrupts enabled from here on. Transient
 1339 * state for synchronization */ 1339 * state for synchronization */
1340 [CPUHP_AP_ONLINE] = { 1340 [CPUHP_AP_ONLINE] = {
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b3663896278e..4f63597c824d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
410 VMCOREINFO_SYMBOL(contig_page_data); 410 VMCOREINFO_SYMBOL(contig_page_data);
411#endif 411#endif
412#ifdef CONFIG_SPARSEMEM 412#ifdef CONFIG_SPARSEMEM
413 VMCOREINFO_SYMBOL(mem_section); 413 VMCOREINFO_SYMBOL_ARRAY(mem_section);
414 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 414 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
415 VMCOREINFO_STRUCT_SIZE(mem_section); 415 VMCOREINFO_STRUCT_SIZE(mem_section);
416 VMCOREINFO_OFFSET(mem_section, section_mem_map); 416 VMCOREINFO_OFFSET(mem_section, section_mem_map);
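The switch from VMCOREINFO_SYMBOL to VMCOREINFO_SYMBOL_ARRAY matters because mem_section may be a pointer to a runtime-allocated table rather than a static array (CONFIG_SPARSEMEM_EXTREME): VMCOREINFO_SYMBOL records &mem_section, which for a pointer variable is the address of the pointer itself, whereas VMCOREINFO_SYMBOL_ARRAY records its value, i.e. the table that dump tools actually need. A minimal standalone C sketch of the &x versus x distinction (illustrative names only, not the kernel macros):

#include <stdio.h>

static long table[4];            /* stands in for a static mem_section[] */
static long *table_ptr = table;  /* stands in for the runtime-allocated case */

int main(void)
{
        /* For an array, &table and table name the same address. */
        printf("array:   &table=%p     table=%p\n", (void *)&table, (void *)table);

        /* For a pointer, &table_ptr is the pointer variable itself;
         * the table it refers to is table_ptr's value. */
        printf("pointer: &table_ptr=%p table_ptr=%p\n",
               (void *)&table_ptr, (void *)table_ptr);
        return 0;
}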
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index e74be38245ad..ed5d34925ad0 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -350,7 +350,7 @@ poll_again:
350 } 350 }
351 kdb_printf("\n"); 351 kdb_printf("\n");
352 for (i = 0; i < count; i++) { 352 for (i = 0; i < count; i++) {
353 if (kallsyms_symbol_next(p_tmp, i) < 0) 353 if (WARN_ON(!kallsyms_symbol_next(p_tmp, i)))
354 break; 354 break;
355 kdb_printf("%s ", p_tmp); 355 kdb_printf("%s ", p_tmp);
356 *(p_tmp + len) = '\0'; 356 *(p_tmp + len) = '\0';
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
51 * Finish delay accounting for a statistic using its timestamps (@start), 51 * Finish delay accounting for a statistic using its timestamps (@start),
 52 * accumulator (@total) and @count 52 * accumulator (@total) and @count
53 */ 53 */
54static void delayacct_end(u64 *start, u64 *total, u32 *count) 54static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
55{ 55{
56 s64 ns = ktime_get_ns() - *start; 56 s64 ns = ktime_get_ns() - *start;
57 unsigned long flags; 57 unsigned long flags;
58 58
59 if (ns > 0) { 59 if (ns > 0) {
60 spin_lock_irqsave(&current->delays->lock, flags); 60 spin_lock_irqsave(lock, flags);
61 *total += ns; 61 *total += ns;
62 (*count)++; 62 (*count)++;
63 spin_unlock_irqrestore(&current->delays->lock, flags); 63 spin_unlock_irqrestore(lock, flags);
64 } 64 }
65} 65}
66 66
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
69 current->delays->blkio_start = ktime_get_ns(); 69 current->delays->blkio_start = ktime_get_ns();
70} 70}
71 71
72void __delayacct_blkio_end(void) 72/*
73 * We cannot rely on the `current` macro, as we haven't yet switched back to
74 * the process being woken.
75 */
76void __delayacct_blkio_end(struct task_struct *p)
73{ 77{
74 if (current->delays->flags & DELAYACCT_PF_SWAPIN) 78 struct task_delay_info *delays = p->delays;
75 /* Swapin block I/O */ 79 u64 *total;
76 delayacct_end(&current->delays->blkio_start, 80 u32 *count;
77 &current->delays->swapin_delay, 81
78 &current->delays->swapin_count); 82 if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
79 else /* Other block I/O */ 83 total = &delays->swapin_delay;
80 delayacct_end(&current->delays->blkio_start, 84 count = &delays->swapin_count;
81 &current->delays->blkio_delay, 85 } else {
82 &current->delays->blkio_count); 86 total = &delays->blkio_delay;
87 count = &delays->blkio_count;
88 }
89
90 delayacct_end(&delays->lock, &delays->blkio_start, total, count);
83} 91}
84 92
85int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 93int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
153 161
154void __delayacct_freepages_end(void) 162void __delayacct_freepages_end(void)
155{ 163{
156 delayacct_end(&current->delays->freepages_start, 164 delayacct_end(
157 &current->delays->freepages_delay, 165 &current->delays->lock,
158 &current->delays->freepages_count); 166 &current->delays->freepages_start,
167 &current->delays->freepages_delay,
168 &current->delays->freepages_count);
159} 169}
160 170
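The new comment above is the key to this hunk: the block-I/O delay is now charged on the wake-up path, which still runs in the waker's context, so relying on current would account the delay to the wrong task. A small userspace sketch of the same idea, with a thread-local variable standing in for the kernel's current (all names here are illustrative):

#include <stdio.h>

struct task {
        const char *name;
        unsigned long long blkio_delay_ns;
};

/* stand-in for the kernel's per-CPU 'current' */
static _Thread_local struct task *current_task;

/* Charge the delay to an explicit task, not to whoever happens to run this. */
static void blkio_end(struct task *t, unsigned long long ns)
{
        t->blkio_delay_ns += ns;
}

int main(void)
{
        struct task waker = { "waker", 0 };
        struct task sleeper = { "sleeper", 0 };

        current_task = &waker;          /* the wake-up runs in the waker's context */
        blkio_end(&sleeper, 1234);      /* so the sleeping task is passed explicitly */

        printf("current is %s, delay charged to %s: %llu ns\n",
               current_task->name, sleeper.name, sleeper.blkio_delay_ns);
        return 0;
}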
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 16beab4767e1..5d8f4031f8d5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
1231 * perf_event_context::lock 1231 * perf_event_context::lock
1232 * perf_event::mmap_mutex 1232 * perf_event::mmap_mutex
1233 * mmap_sem 1233 * mmap_sem
1234 *
1235 * cpu_hotplug_lock
1236 * pmus_lock
1237 * cpuctx->mutex / perf_event_context::mutex
1234 */ 1238 */
1235static struct perf_event_context * 1239static struct perf_event_context *
1236perf_event_ctx_lock_nested(struct perf_event *event, int nesting) 1240perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
4196{ 4200{
4197 struct perf_event_context *ctx = event->ctx; 4201 struct perf_event_context *ctx = event->ctx;
4198 struct perf_event *child, *tmp; 4202 struct perf_event *child, *tmp;
4203 LIST_HEAD(free_list);
4199 4204
4200 /* 4205 /*
4201 * If we got here through err_file: fput(event_file); we will not have 4206 * If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ again:
4268 struct perf_event, child_list); 4273 struct perf_event, child_list);
4269 if (tmp == child) { 4274 if (tmp == child) {
4270 perf_remove_from_context(child, DETACH_GROUP); 4275 perf_remove_from_context(child, DETACH_GROUP);
4271 list_del(&child->child_list); 4276 list_move(&child->child_list, &free_list);
4272 free_event(child);
4273 /* 4277 /*
4274 * This matches the refcount bump in inherit_event(); 4278 * This matches the refcount bump in inherit_event();
4275 * this can't be the last reference. 4279 * this can't be the last reference.
@@ -4284,6 +4288,11 @@ again:
4284 } 4288 }
4285 mutex_unlock(&event->child_mutex); 4289 mutex_unlock(&event->child_mutex);
4286 4290
4291 list_for_each_entry_safe(child, tmp, &free_list, child_list) {
4292 list_del(&child->child_list);
4293 free_event(child);
4294 }
4295
4287no_ctx: 4296no_ctx:
4288 put_event(event); /* Must be the 'last' reference */ 4297 put_event(event); /* Must be the 'last' reference */
4289 return 0; 4298 return 0;
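The free_list change in the two hunks above is a common shape: children are unhooked onto a local list while event->child_mutex is held, and the actual free_event() calls happen only after the mutex is dropped, keeping the freeing work out of the critical section. A self-contained pthread sketch of the same collect-then-free pattern (illustrative, not the perf code):

#include <pthread.h>
#include <stdlib.h>

struct child {
        struct child *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct child *children;   /* protected by list_lock */

static void release_children(void)
{
        struct child *free_list = NULL, *c;

        /* Unhook everything while holding the lock... */
        pthread_mutex_lock(&list_lock);
        while ((c = children)) {
                children = c->next;
                c->next = free_list;
                free_list = c;
        }
        pthread_mutex_unlock(&list_lock);

        /* ...and do the potentially slow frees after dropping it. */
        while ((c = free_list)) {
                free_list = c->next;
                free(c);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct child *c = malloc(sizeof(*c));

                c->next = children;
                children = c;
        }
        release_children();
        return 0;
}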
@@ -6639,6 +6648,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
6639 struct perf_namespaces_event *namespaces_event = data; 6648 struct perf_namespaces_event *namespaces_event = data;
6640 struct perf_output_handle handle; 6649 struct perf_output_handle handle;
6641 struct perf_sample_data sample; 6650 struct perf_sample_data sample;
6651 u16 header_size = namespaces_event->event_id.header.size;
6642 int ret; 6652 int ret;
6643 6653
6644 if (!perf_event_namespaces_match(event)) 6654 if (!perf_event_namespaces_match(event))
@@ -6649,7 +6659,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
6649 ret = perf_output_begin(&handle, event, 6659 ret = perf_output_begin(&handle, event,
6650 namespaces_event->event_id.header.size); 6660 namespaces_event->event_id.header.size);
6651 if (ret) 6661 if (ret)
6652 return; 6662 goto out;
6653 6663
6654 namespaces_event->event_id.pid = perf_event_pid(event, 6664 namespaces_event->event_id.pid = perf_event_pid(event,
6655 namespaces_event->task); 6665 namespaces_event->task);
@@ -6661,6 +6671,8 @@ static void perf_event_namespaces_output(struct perf_event *event,
6661 perf_event__output_id_sample(event, &handle, &sample); 6671 perf_event__output_id_sample(event, &handle, &sample);
6662 6672
6663 perf_output_end(&handle); 6673 perf_output_end(&handle);
6674out:
6675 namespaces_event->event_id.header.size = header_size;
6664} 6676}
6665 6677
6666static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, 6678static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
@@ -7987,11 +7999,11 @@ static void bpf_overflow_handler(struct perf_event *event,
7987{ 7999{
7988 struct bpf_perf_event_data_kern ctx = { 8000 struct bpf_perf_event_data_kern ctx = {
7989 .data = data, 8001 .data = data,
7990 .regs = regs,
7991 .event = event, 8002 .event = event,
7992 }; 8003 };
7993 int ret = 0; 8004 int ret = 0;
7994 8005
8006 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
7995 preempt_disable(); 8007 preempt_disable();
7996 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) 8008 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
7997 goto out; 8009 goto out;
@@ -8513,6 +8525,29 @@ fail_clear_files:
8513 return ret; 8525 return ret;
8514} 8526}
8515 8527
8528static int
8529perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
8530{
8531 struct perf_event_context *ctx = event->ctx;
8532 int ret;
8533
8534 /*
8535 * Beware, here be dragons!!
8536 *
8537 * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
8538 * stuff does not actually need it. So temporarily drop ctx->mutex. As per
8539 * perf_event_ctx_lock() we already have a reference on ctx.
8540 *
8541 * This can result in event getting moved to a different ctx, but that
8542 * does not affect the tracepoint state.
8543 */
8544 mutex_unlock(&ctx->mutex);
8545 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
8546 mutex_lock(&ctx->mutex);
8547
8548 return ret;
8549}
8550
8516static int perf_event_set_filter(struct perf_event *event, void __user *arg) 8551static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8517{ 8552{
8518 char *filter_str; 8553 char *filter_str;
@@ -8529,8 +8564,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8529 8564
8530 if (IS_ENABLED(CONFIG_EVENT_TRACING) && 8565 if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
8531 event->attr.type == PERF_TYPE_TRACEPOINT) 8566 event->attr.type == PERF_TYPE_TRACEPOINT)
8532 ret = ftrace_profile_set_filter(event, event->attr.config, 8567 ret = perf_tracepoint_set_filter(event, filter_str);
8533 filter_str);
8534 else if (has_addr_filter(event)) 8568 else if (has_addr_filter(event))
8535 ret = perf_event_set_addr_filter(event, filter_str); 8569 ret = perf_event_set_addr_filter(event, filter_str);
8536 8570
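perf_tracepoint_set_filter(), added above, makes the lock juggling explicit: ctx->mutex is dropped around the ftrace call that would otherwise deadlock against it, which is safe only because the caller already holds a reference that keeps ctx alive. A pthread sketch of that drop-and-reacquire pattern under the same assumption (hypothetical names, not the perf code):

#include <pthread.h>

struct ctx {
        pthread_mutex_t mutex;
        /* refcounted elsewhere; the caller guarantees it stays alive */
};

/* Imagine this helper internally takes a lock that ranks above ctx->mutex. */
static int subsystem_call_with_conflicting_lock(void)
{
        return 0;
}

static int do_set_filter(struct ctx *ctx)
{
        int ret;

        /*
         * Called with ctx->mutex held. Drop it around the call that would
         * otherwise invert the lock order, then take it back. Only safe
         * because a reference pins ctx for the duration.
         */
        pthread_mutex_unlock(&ctx->mutex);
        ret = subsystem_call_with_conflicting_lock();
        pthread_mutex_lock(&ctx->mutex);

        return ret;
}

int main(void)
{
        struct ctx c = { .mutex = PTHREAD_MUTEX_INITIALIZER };

        pthread_mutex_lock(&c.mutex);
        do_set_filter(&c);
        pthread_mutex_unlock(&c.mutex);
        return 0;
}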
@@ -9165,7 +9199,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9165 if (!try_module_get(pmu->module)) 9199 if (!try_module_get(pmu->module))
9166 return -ENODEV; 9200 return -ENODEV;
9167 9201
9168 if (event->group_leader != event) { 9202 /*
9203 * A number of pmu->event_init() methods iterate the sibling_list to,
9204 * for example, validate if the group fits on the PMU. Therefore,
9205 * if this is a sibling event, acquire the ctx->mutex to protect
9206 * the sibling_list.
9207 */
9208 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
9169 /* 9209 /*
9170 * This ctx->mutex can nest when we're called through 9210 * This ctx->mutex can nest when we're called through
9171 * inheritance. See the perf_event_ctx_lock_nested() comment. 9211 * inheritance. See the perf_event_ctx_lock_nested() comment.
diff --git a/kernel/exit.c b/kernel/exit.c
index 6b4298a41167..995453d9fb55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1755,3 +1755,12 @@ Efault:
1755 return -EFAULT; 1755 return -EFAULT;
1756} 1756}
1757#endif 1757#endif
1758
1759__weak void abort(void)
1760{
1761 BUG();
1762
1763 /* if that doesn't kill us, halt */
1764 panic("Oops failed to kill thread");
1765}
1766EXPORT_SYMBOL(abort);
diff --git a/kernel/fork.c b/kernel/fork.c
index 432eadf6b58c..2295fc69717f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
721 goto out; 721 goto out;
722 } 722 }
723 /* a new mm has just been created */ 723 /* a new mm has just been created */
724 arch_dup_mmap(oldmm, mm); 724 retval = arch_dup_mmap(oldmm, mm);
725 retval = 0;
726out: 725out:
727 up_write(&mm->mmap_sem); 726 up_write(&mm->mmap_sem);
728 flush_tlb_mm(oldmm); 727 flush_tlb_mm(oldmm);
diff --git a/kernel/futex.c b/kernel/futex.c
index 76ed5921117a..7f719d110908 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1582,8 +1582,8 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
1582{ 1582{
1583 unsigned int op = (encoded_op & 0x70000000) >> 28; 1583 unsigned int op = (encoded_op & 0x70000000) >> 28;
1584 unsigned int cmp = (encoded_op & 0x0f000000) >> 24; 1584 unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
1585 int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); 1585 int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
1586 int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); 1586 int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
1587 int oldval, ret; 1587 int oldval, ret;
1588 1588
1589 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { 1589 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
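The sign_extend32() fix above is subtle: the second argument is the zero-based index of the sign bit, not the width of the field, so for the 12-bit oparg/cmparg fields the sign bit sits at index 11. A standalone sketch of the same semantics (a re-implementation for illustration, not the kernel header):

#include <stdio.h>
#include <stdint.h>

/* Sign-extend 'value' whose sign bit sits at zero-based position 'index'. */
static int32_t sign_extend32(uint32_t value, int index)
{
        uint8_t shift = 31 - index;

        return (int32_t)(value << shift) >> shift;
}

int main(void)
{
        uint32_t field = 0xfff;         /* 12-bit field holding -1 */

        /* Bit 11 is the sign bit of a 12-bit field... */
        printf("index 11: %d\n", sign_extend32(field, 11));    /* -1   */
        /* ...whereas passing 12 treats it as a wider, positive value. */
        printf("index 12: %d\n", sign_extend32(field, 12));    /* 4095 */
        return 0;
}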
@@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1878 struct futex_q *this, *next; 1878 struct futex_q *this, *next;
1879 DEFINE_WAKE_Q(wake_q); 1879 DEFINE_WAKE_Q(wake_q);
1880 1880
1881 if (nr_wake < 0 || nr_requeue < 0)
1882 return -EINVAL;
1883
1881 /* 1884 /*
1882 * When PI not supported: return -ENOSYS if requeue_pi is true, 1885 * When PI not supported: return -ENOSYS if requeue_pi is true,
1883 * consequently the compiler knows requeue_pi is always false past 1886 * consequently the compiler knows requeue_pi is always false past
@@ -2294,34 +2297,33 @@ static void unqueue_me_pi(struct futex_q *q)
2294 spin_unlock(q->lock_ptr); 2297 spin_unlock(q->lock_ptr);
2295} 2298}
2296 2299
2297/*
2298 * Fixup the pi_state owner with the new owner.
2299 *
2300 * Must be called with hash bucket lock held and mm->sem held for non
2301 * private futexes.
2302 */
2303static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 2300static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2304 struct task_struct *newowner) 2301 struct task_struct *argowner)
2305{ 2302{
2306 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2307 struct futex_pi_state *pi_state = q->pi_state; 2303 struct futex_pi_state *pi_state = q->pi_state;
2308 u32 uval, uninitialized_var(curval), newval; 2304 u32 uval, uninitialized_var(curval), newval;
2309 struct task_struct *oldowner; 2305 struct task_struct *oldowner, *newowner;
2306 u32 newtid;
2310 int ret; 2307 int ret;
2311 2308
2309 lockdep_assert_held(q->lock_ptr);
2310
2312 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2311 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2313 2312
2314 oldowner = pi_state->owner; 2313 oldowner = pi_state->owner;
2315 /* Owner died? */
2316 if (!pi_state->owner)
2317 newtid |= FUTEX_OWNER_DIED;
2318 2314
2319 /* 2315 /*
2320 * We are here either because we stole the rtmutex from the 2316 * We are here because either:
2321 * previous highest priority waiter or we are the highest priority 2317 *
2322 * waiter but have failed to get the rtmutex the first time. 2318 * - we stole the lock and pi_state->owner needs updating to reflect
2319 * that (@argowner == current),
2320 *
2321 * or:
2323 * 2322 *
2324 * We have to replace the newowner TID in the user space variable. 2323 * - someone stole our lock and we need to fix things to point to the
2324 * new owner (@argowner == NULL).
2325 *
2326 * Either way, we have to replace the TID in the user space variable.
2325 * This must be atomic as we have to preserve the owner died bit here. 2327 * This must be atomic as we have to preserve the owner died bit here.
2326 * 2328 *
2327 * Note: We write the user space value _before_ changing the pi_state 2329 * Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2336,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2334 * in the PID check in lookup_pi_state. 2336 * in the PID check in lookup_pi_state.
2335 */ 2337 */
2336retry: 2338retry:
2339 if (!argowner) {
2340 if (oldowner != current) {
2341 /*
2342 * We raced against a concurrent self; things are
2343 * already fixed up. Nothing to do.
2344 */
2345 ret = 0;
2346 goto out_unlock;
2347 }
2348
2349 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
2350 /* We got the lock after all, nothing to fix. */
2351 ret = 0;
2352 goto out_unlock;
2353 }
2354
2355 /*
2356 * Since we just failed the trylock; there must be an owner.
2357 */
2358 newowner = rt_mutex_owner(&pi_state->pi_mutex);
2359 BUG_ON(!newowner);
2360 } else {
2361 WARN_ON_ONCE(argowner != current);
2362 if (oldowner == current) {
2363 /*
2364 * We raced against a concurrent self; things are
2365 * already fixed up. Nothing to do.
2366 */
2367 ret = 0;
2368 goto out_unlock;
2369 }
2370 newowner = argowner;
2371 }
2372
2373 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2374 /* Owner died? */
2375 if (!pi_state->owner)
2376 newtid |= FUTEX_OWNER_DIED;
2377
2337 if (get_futex_value_locked(&uval, uaddr)) 2378 if (get_futex_value_locked(&uval, uaddr))
2338 goto handle_fault; 2379 goto handle_fault;
2339 2380
@@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2434 * Got the lock. We might not be the anticipated owner if we 2475 * Got the lock. We might not be the anticipated owner if we
2435 * did a lock-steal - fix up the PI-state in that case: 2476 * did a lock-steal - fix up the PI-state in that case:
2436 * 2477 *
2437 * We can safely read pi_state->owner without holding wait_lock 2478 * Speculative pi_state->owner read (we don't hold wait_lock);
2438 * because we now own the rt_mutex, only the owner will attempt 2479 * since we own the lock pi_state->owner == current is the
2439 * to change it. 2480 * stable state, anything else needs more attention.
2440 */ 2481 */
2441 if (q->pi_state->owner != current) 2482 if (q->pi_state->owner != current)
2442 ret = fixup_pi_state_owner(uaddr, q, current); 2483 ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2444 } 2485 }
2445 2486
2446 /* 2487 /*
2488 * If we didn't get the lock; check if anybody stole it from us. In
2489 * that case, we need to fix up the uval to point to them instead of
2490 * us, otherwise bad things happen. [10]
2491 *
2492 * Another speculative read; pi_state->owner == current is unstable
2493 * but needs our attention.
2494 */
2495 if (q->pi_state->owner == current) {
2496 ret = fixup_pi_state_owner(uaddr, q, NULL);
2497 goto out;
2498 }
2499
2500 /*
2447 * Paranoia check. If we did not take the lock, then we should not be 2501 * Paranoia check. If we did not take the lock, then we should not be
2448 * the owner of the rt_mutex. 2502 * the owner of the rt_mutex.
2449 */ 2503 */
diff --git a/kernel/groups.c b/kernel/groups.c
index e357bc800111..daae2f2dc6d4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b)
86 return gid_gt(a, b) - gid_lt(a, b); 86 return gid_gt(a, b) - gid_lt(a, b);
87} 87}
88 88
89static void groups_sort(struct group_info *group_info) 89void groups_sort(struct group_info *group_info)
90{ 90{
91 sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), 91 sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
92 gid_cmp, NULL); 92 gid_cmp, NULL);
93} 93}
94EXPORT_SYMBOL(groups_sort);
94 95
95/* a simple bsearch */ 96/* a simple bsearch */
96int groups_search(const struct group_info *group_info, kgid_t grp) 97int groups_search(const struct group_info *group_info, kgid_t grp)
@@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
122void set_groups(struct cred *new, struct group_info *group_info) 123void set_groups(struct cred *new, struct group_info *group_info)
123{ 124{
124 put_group_info(new->group_info); 125 put_group_info(new->group_info);
125 groups_sort(group_info);
126 get_group_info(group_info); 126 get_group_info(group_info);
127 new->group_info = group_info; 127 new->group_info = group_info;
128} 128}
@@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
206 return retval; 206 return retval;
207 } 207 }
208 208
209 groups_sort(group_info);
209 retval = set_current_groups(group_info); 210 retval = set_current_groups(group_info);
210 put_group_info(group_info); 211 put_group_info(group_info);
211 212
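The groups_sort() change above shifts responsibility to the callers: set_groups() no longer sorts, so an already-installed and possibly shared group_info is never re-sorted behind readers, and each caller instead sorts the freshly built list once before installing it, keeping groups_search()'s binary search valid. A userspace sketch of the sort-once-then-bsearch discipline (plain ints standing in for gids):

#include <stdio.h>
#include <stdlib.h>

static int gid_cmp(const void *a, const void *b)
{
        int x = *(const int *)a, y = *(const int *)b;

        return (x > y) - (x < y);
}

int main(void)
{
        int gids[] = { 1000, 4, 24, 27, 30 };
        size_t n = sizeof(gids) / sizeof(gids[0]);
        int key = 27;

        /* Sort once, before the list is "installed" for lookups... */
        qsort(gids, n, sizeof(gids[0]), gid_cmp);

        /* ...then every lookup can rely on binary search. */
        int *hit = bsearch(&key, gids, n, sizeof(gids[0]), gid_cmp);

        printf("gid %d %sfound\n", key, hit ? "" : "not ");
        return 0;
}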
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 17f05ef8f575..e4d3819a91cc 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -12,6 +12,11 @@
12 12
13static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 13static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
14{ 14{
15 static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
16
17 if (!__ratelimit(&ratelimit))
18 return;
19
15 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 20 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
16 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); 21 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
17 printk("->handle_irq(): %p, ", desc->handle_irq); 22 printk("->handle_irq(): %p, ", desc->handle_irq);
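The print_irq_desc() hunk above rate-limits the dump so a misbehaving interrupt cannot flood the console: at most 5 reports per 5*HZ window. A self-contained sketch of the same throttling idea using wall-clock seconds (not the kernel's __ratelimit() implementation):

#include <stdio.h>
#include <time.h>

/* Allow at most 'burst' messages per 'interval' seconds. */
static int ratelimit(time_t interval, int burst)
{
        static time_t window_start;
        static int printed;
        time_t now = time(NULL);

        if (now - window_start >= interval) {
                window_start = now;
                printed = 0;
        }
        if (printed >= burst)
                return 0;
        printed++;
        return 1;
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                if (ratelimit(5, 5))
                        printf("irq diagnostic %d\n", i);  /* at most 5 per window */
        return 0;
}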
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 7f608ac39653..acfaaef8672a 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = {
113 BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), 113 BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
114 BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), 114 BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
115 BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), 115 BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
116 BIT_MASK_DESCR(IRQD_CAN_RESERVE),
116 117
117 BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), 118 BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
118 119
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c26c5bb6b491..508c03dfef25 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
364EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); 364EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
365 365
366/* 366/*
367 * Separate lockdep class for interrupt chip which can nest irq_desc 367 * Separate lockdep classes for interrupt chip which can nest irq_desc
368 * lock. 368 * lock and request mutex.
369 */ 369 */
370static struct lock_class_key irq_nested_lock_class; 370static struct lock_class_key irq_nested_lock_class;
371static struct lock_class_key irq_nested_request_class;
371 372
372/* 373/*
373 * irq_map_generic_chip - Map a generic chip for an irq domain 374 * irq_map_generic_chip - Map a generic chip for an irq domain
@@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
409 set_bit(idx, &gc->installed); 410 set_bit(idx, &gc->installed);
410 411
411 if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) 412 if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
412 irq_set_lockdep_class(virq, &irq_nested_lock_class); 413 irq_set_lockdep_class(virq, &irq_nested_lock_class,
414 &irq_nested_request_class);
413 415
414 if (chip->irq_calc_mask) 416 if (chip->irq_calc_mask)
415 chip->irq_calc_mask(data); 417 chip->irq_calc_mask(data);
@@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
479 continue; 481 continue;
480 482
481 if (flags & IRQ_GC_INIT_NESTED_LOCK) 483 if (flags & IRQ_GC_INIT_NESTED_LOCK)
482 irq_set_lockdep_class(i, &irq_nested_lock_class); 484 irq_set_lockdep_class(i, &irq_nested_lock_class,
485 &irq_nested_request_class);
483 486
484 if (!(flags & IRQ_GC_NO_MASK)) { 487 if (!(flags & IRQ_GC_NO_MASK)) {
485 struct irq_data *d = irq_get_irq_data(i); 488 struct irq_data *d = irq_get_irq_data(i);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 07d08ca701ec..ab19371eab9b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
440#endif /* !CONFIG_GENERIC_PENDING_IRQ */ 440#endif /* !CONFIG_GENERIC_PENDING_IRQ */
441 441
442#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) 442#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
443static inline int irq_domain_activate_irq(struct irq_data *data, bool early) 443static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
444{ 444{
445 irqd_set_activated(data); 445 irqd_set_activated(data);
446 return 0; 446 return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4f4f60015e8a..62068ad46930 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
1693 } 1693 }
1694} 1694}
1695 1695
1696static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) 1696static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
1697{ 1697{
1698 int ret = 0; 1698 int ret = 0;
1699 1699
@@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
1702 1702
1703 if (irqd->parent_data) 1703 if (irqd->parent_data)
1704 ret = __irq_domain_activate_irq(irqd->parent_data, 1704 ret = __irq_domain_activate_irq(irqd->parent_data,
1705 early); 1705 reserve);
1706 if (!ret && domain->ops->activate) { 1706 if (!ret && domain->ops->activate) {
1707 ret = domain->ops->activate(domain, irqd, early); 1707 ret = domain->ops->activate(domain, irqd, reserve);
1708 /* Rollback in case of error */ 1708 /* Rollback in case of error */
1709 if (ret && irqd->parent_data) 1709 if (ret && irqd->parent_data)
1710 __irq_domain_deactivate_irq(irqd->parent_data); 1710 __irq_domain_deactivate_irq(irqd->parent_data);
@@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
1716/** 1716/**
1717 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate 1717 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
1718 * interrupt 1718 * interrupt
1719 * @irq_data: outermost irq_data associated with interrupt 1719 * @irq_data: Outermost irq_data associated with interrupt
1720 * @reserve: If set only reserve an interrupt vector instead of assigning one
1720 * 1721 *
1721 * This is the second step to call domain_ops->activate to program interrupt 1722 * This is the second step to call domain_ops->activate to program interrupt
1722 * controllers, so the interrupt could actually get delivered. 1723 * controllers, so the interrupt could actually get delivered.
1723 */ 1724 */
1724int irq_domain_activate_irq(struct irq_data *irq_data, bool early) 1725int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
1725{ 1726{
1726 int ret = 0; 1727 int ret = 0;
1727 1728
1728 if (!irqd_is_activated(irq_data)) 1729 if (!irqd_is_activated(irq_data))
1729 ret = __irq_domain_activate_irq(irq_data, early); 1730 ret = __irq_domain_activate_irq(irq_data, reserve);
1730 if (!ret) 1731 if (!ret)
1731 irqd_set_activated(irq_data); 1732 irqd_set_activated(irq_data);
1732 return ret; 1733 return ret;
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 7df2480005f8..5187dfe809ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m)
321int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, 321int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
322 bool reserved, unsigned int *mapped_cpu) 322 bool reserved, unsigned int *mapped_cpu)
323{ 323{
324 unsigned int cpu; 324 unsigned int cpu, best_cpu, maxavl = 0;
325 struct cpumap *cm;
326 unsigned int bit;
325 327
328 best_cpu = UINT_MAX;
326 for_each_cpu(cpu, msk) { 329 for_each_cpu(cpu, msk) {
327 struct cpumap *cm = per_cpu_ptr(m->maps, cpu); 330 cm = per_cpu_ptr(m->maps, cpu);
328 unsigned int bit;
329 331
330 if (!cm->online) 332 if (!cm->online || cm->available <= maxavl)
331 continue; 333 continue;
332 334
335 best_cpu = cpu;
336 maxavl = cm->available;
337 }
338
339 if (maxavl) {
340 cm = per_cpu_ptr(m->maps, best_cpu);
333 bit = matrix_alloc_area(m, cm, 1, false); 341 bit = matrix_alloc_area(m, cm, 1, false);
334 if (bit < m->alloc_end) { 342 if (bit < m->alloc_end) {
335 cm->allocated++; 343 cm->allocated++;
@@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
338 m->global_available--; 346 m->global_available--;
339 if (reserved) 347 if (reserved)
340 m->global_reserved--; 348 m->global_reserved--;
341 *mapped_cpu = cpu; 349 *mapped_cpu = best_cpu;
342 trace_irq_matrix_alloc(bit, cpu, m, cm); 350 trace_irq_matrix_alloc(bit, best_cpu, m, cm);
343 return bit; 351 return bit;
344 } 352 }
345 } 353 }
@@ -384,7 +392,9 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
384{ 392{
385 struct cpumap *cm = this_cpu_ptr(m->maps); 393 struct cpumap *cm = this_cpu_ptr(m->maps);
386 394
387 return (m->global_available - cpudown) ? cm->available : 0; 395 if (!cpudown)
396 return m->global_available;
397 return m->global_available - cm->available;
388} 398}
389 399
390/** 400/**
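The irq_matrix_alloc() change above turns allocation into two passes: first scan the whole mask for the online CPU with the most available vectors, then allocate from that CPU, rather than taking the first online CPU with any room. A tiny standalone sketch of that pick-the-best-then-allocate shape (made-up numbers, not the real cpumap):

#include <stdio.h>

#define NCPUS 4

struct cpumap {
        int online;
        int available;
        int allocated;
};

static struct cpumap maps[NCPUS] = {
        { 1, 3, 0 }, { 1, 17, 0 }, { 0, 50, 0 }, { 1, 9, 0 },
};

/* Pass 1: find the online CPU with the most free slots; pass 2: allocate there. */
static int alloc_from_best_cpu(void)
{
        int cpu, best_cpu = -1, maxavl = 0;

        for (cpu = 0; cpu < NCPUS; cpu++) {
                if (!maps[cpu].online || maps[cpu].available <= maxavl)
                        continue;
                best_cpu = cpu;
                maxavl = maps[cpu].available;
        }
        if (!maxavl)
                return -1;

        maps[best_cpu].available--;
        maps[best_cpu].allocated++;
        return best_cpu;
}

int main(void)
{
        printf("allocated on cpu %d\n", alloc_from_best_cpu());  /* cpu 1 */
        return 0;
}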
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index edb987b2c58d..2f3c4f5382cc 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
339 return ret; 339 return ret;
340} 340}
341 341
342/*
343 * Carefully check whether the device can use reservation mode. If
344 * reservation mode is enabled then the early activation will assign a
345 * dummy vector to the device. If the PCI/MSI device does not support
346 * masking of the entry then this can result in spurious interrupts when
347 * the device driver is not absolutely careful. But even then a malfunction
348 * of the hardware could result in a spurious interrupt on the dummy vector
349 * and render the device unusable. If the entry can be masked then the core
350 * logic will prevent the spurious interrupt and reservation mode can be
351 * used. For now reservation mode is restricted to PCI/MSI.
352 */
353static bool msi_check_reservation_mode(struct irq_domain *domain,
354 struct msi_domain_info *info,
355 struct device *dev)
356{
357 struct msi_desc *desc;
358
359 if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
360 return false;
361
362 if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
363 return false;
364
365 if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
366 return false;
367
368 /*
369 * Checking the first MSI descriptor is sufficient. MSIX supports
370 * masking and MSI does so when the maskbit is set.
371 */
372 desc = first_msi_entry(dev);
373 return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
374}
375
342/** 376/**
343 * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain 377 * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
344 * @domain: The domain to allocate from 378 * @domain: The domain to allocate from
@@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
353{ 387{
354 struct msi_domain_info *info = domain->host_data; 388 struct msi_domain_info *info = domain->host_data;
355 struct msi_domain_ops *ops = info->ops; 389 struct msi_domain_ops *ops = info->ops;
356 msi_alloc_info_t arg; 390 struct irq_data *irq_data;
357 struct msi_desc *desc; 391 struct msi_desc *desc;
392 msi_alloc_info_t arg;
358 int i, ret, virq; 393 int i, ret, virq;
394 bool can_reserve;
359 395
360 ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); 396 ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
361 if (ret) 397 if (ret)
@@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
385 if (ops->msi_finish) 421 if (ops->msi_finish)
386 ops->msi_finish(&arg, 0); 422 ops->msi_finish(&arg, 0);
387 423
424 can_reserve = msi_check_reservation_mode(domain, info, dev);
425
388 for_each_msi_entry(desc, dev) { 426 for_each_msi_entry(desc, dev) {
389 virq = desc->irq; 427 virq = desc->irq;
390 if (desc->nvec_used == 1) 428 if (desc->nvec_used == 1)
@@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
397 * the MSI entries before the PCI layer enables MSI in the 435 * the MSI entries before the PCI layer enables MSI in the
398 * card. Otherwise the card latches a random msi message. 436 * card. Otherwise the card latches a random msi message.
399 */ 437 */
400 if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { 438 if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
401 struct irq_data *irq_data; 439 continue;
402 440
441 irq_data = irq_domain_get_irq_data(domain, desc->irq);
442 if (!can_reserve)
443 irqd_clr_can_reserve(irq_data);
444 ret = irq_domain_activate_irq(irq_data, can_reserve);
445 if (ret)
446 goto cleanup;
447 }
448
449 /*
450 * If these interrupts use reservation mode, clear the activated bit
451 * so request_irq() will assign the final vector.
452 */
453 if (can_reserve) {
454 for_each_msi_entry(desc, dev) {
403 irq_data = irq_domain_get_irq_data(domain, desc->irq); 455 irq_data = irq_domain_get_irq_data(domain, desc->irq);
404 ret = irq_domain_activate_irq(irq_data, true); 456 irqd_clr_activated(irq_data);
405 if (ret)
406 goto cleanup;
407 if (info->flags & MSI_FLAG_MUST_REACTIVATE)
408 irqd_clr_activated(irq_data);
409 } 457 }
410 } 458 }
411 return 0; 459 return 0;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8594d24e4adc..b4517095db6a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,7 +79,7 @@ int static_key_count(struct static_key *key)
79} 79}
80EXPORT_SYMBOL_GPL(static_key_count); 80EXPORT_SYMBOL_GPL(static_key_count);
81 81
82static void static_key_slow_inc_cpuslocked(struct static_key *key) 82void static_key_slow_inc_cpuslocked(struct static_key *key)
83{ 83{
84 int v, v1; 84 int v, v1;
85 85
@@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key)
180} 180}
181EXPORT_SYMBOL_GPL(static_key_disable); 181EXPORT_SYMBOL_GPL(static_key_disable);
182 182
183static void static_key_slow_dec_cpuslocked(struct static_key *key, 183static void __static_key_slow_dec_cpuslocked(struct static_key *key,
184 unsigned long rate_limit, 184 unsigned long rate_limit,
185 struct delayed_work *work) 185 struct delayed_work *work)
186{ 186{
@@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key,
211 struct delayed_work *work) 211 struct delayed_work *work)
212{ 212{
213 cpus_read_lock(); 213 cpus_read_lock();
214 static_key_slow_dec_cpuslocked(key, rate_limit, work); 214 __static_key_slow_dec_cpuslocked(key, rate_limit, work);
215 cpus_read_unlock(); 215 cpus_read_unlock();
216} 216}
217 217
@@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key)
229} 229}
230EXPORT_SYMBOL_GPL(static_key_slow_dec); 230EXPORT_SYMBOL_GPL(static_key_slow_dec);
231 231
232void static_key_slow_dec_cpuslocked(struct static_key *key)
233{
234 STATIC_KEY_CHECK_USE(key);
235 __static_key_slow_dec_cpuslocked(key, 0, NULL);
236}
237
232void static_key_slow_dec_deferred(struct static_key_deferred *key) 238void static_key_slow_dec_deferred(struct static_key_deferred *key)
233{ 239{
234 STATIC_KEY_CHECK_USE(key); 240 STATIC_KEY_CHECK_USE(key);
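The jump_label hunks above expose *_cpuslocked variants for callers that already hold cpus_read_lock(), while the plain entry points keep taking that lock themselves. The general split is sketched below with a pthread mutex (generic pattern only, not the static-key code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static int key_count;   /* protected by big_lock */

/* For callers that already hold big_lock. */
static void key_inc_locked(void)
{
        key_count++;
}

/* Convenience wrapper for everyone else: takes and drops the lock itself. */
static void key_inc(void)
{
        pthread_mutex_lock(&big_lock);
        key_inc_locked();
        pthread_mutex_unlock(&big_lock);
}

int main(void)
{
        key_inc();                      /* lock taken internally */

        pthread_mutex_lock(&big_lock);  /* caller already in a locked section */
        key_inc_locked();
        pthread_mutex_unlock(&big_lock);

        printf("count = %d\n", key_count);
        return 0;
}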
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 15f33faf4013..7594c033d98a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
157} 157}
158EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); 158EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
159 159
160void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) 160void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
161{ 161{
162 write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); 162 write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
163} 163}
@@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
183} 183}
184EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); 184EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
185 185
186void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) 186void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
187{ 187{
188 write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, 188 write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
189 _RET_IP_); 189 _RET_IP_);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 9776da8db180..521659044719 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -49,6 +49,7 @@
49#include <linux/gfp.h> 49#include <linux/gfp.h>
50#include <linux/random.h> 50#include <linux/random.h>
51#include <linux/jhash.h> 51#include <linux/jhash.h>
52#include <linux/nmi.h>
52 53
53#include <asm/sections.h> 54#include <asm/sections.h>
54 55
@@ -57,10 +58,6 @@
57#define CREATE_TRACE_POINTS 58#define CREATE_TRACE_POINTS
58#include <trace/events/lock.h> 59#include <trace/events/lock.h>
59 60
60#ifdef CONFIG_LOCKDEP_CROSSRELEASE
61#include <linux/slab.h>
62#endif
63
64#ifdef CONFIG_PROVE_LOCKING 61#ifdef CONFIG_PROVE_LOCKING
65int prove_locking = 1; 62int prove_locking = 1;
66module_param(prove_locking, int, 0644); 63module_param(prove_locking, int, 0644);
@@ -75,19 +72,6 @@ module_param(lock_stat, int, 0644);
75#define lock_stat 0 72#define lock_stat 0
76#endif 73#endif
77 74
78#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
79static int crossrelease_fullstack = 1;
80#else
81static int crossrelease_fullstack;
82#endif
83static int __init allow_crossrelease_fullstack(char *str)
84{
85 crossrelease_fullstack = 1;
86 return 0;
87}
88
89early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
90
91/* 75/*
92 * lockdep_lock: protects the lockdep graph, the hashes and the 76 * lockdep_lock: protects the lockdep graph, the hashes and the
93 * class/list/hash allocators. 77 * class/list/hash allocators.
@@ -740,18 +724,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
740 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); 724 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
741} 725}
742 726
743#ifdef CONFIG_LOCKDEP_CROSSRELEASE
744static void cross_init(struct lockdep_map *lock, int cross);
745static int cross_lock(struct lockdep_map *lock);
746static int lock_acquire_crosslock(struct held_lock *hlock);
747static int lock_release_crosslock(struct lockdep_map *lock);
748#else
749static inline void cross_init(struct lockdep_map *lock, int cross) {}
750static inline int cross_lock(struct lockdep_map *lock) { return 0; }
751static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
752static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
753#endif
754
755/* 727/*
756 * Register a lock's class in the hash-table, if the class is not present 728 * Register a lock's class in the hash-table, if the class is not present
757 * yet. Otherwise we look it up. We cache the result in the lock object 729 * yet. Otherwise we look it up. We cache the result in the lock object
@@ -1151,41 +1123,22 @@ print_circular_lock_scenario(struct held_lock *src,
1151 printk(KERN_CONT "\n\n"); 1123 printk(KERN_CONT "\n\n");
1152 } 1124 }
1153 1125
1154 if (cross_lock(tgt->instance)) { 1126 printk(" Possible unsafe locking scenario:\n\n");
1155 printk(" Possible unsafe locking scenario by crosslock:\n\n"); 1127 printk(" CPU0 CPU1\n");
1156 printk(" CPU0 CPU1\n"); 1128 printk(" ---- ----\n");
1157 printk(" ---- ----\n"); 1129 printk(" lock(");
1158 printk(" lock("); 1130 __print_lock_name(target);
1159 __print_lock_name(parent); 1131 printk(KERN_CONT ");\n");
1160 printk(KERN_CONT ");\n"); 1132 printk(" lock(");
1161 printk(" lock("); 1133 __print_lock_name(parent);
1162 __print_lock_name(target); 1134 printk(KERN_CONT ");\n");
1163 printk(KERN_CONT ");\n"); 1135 printk(" lock(");
1164 printk(" lock("); 1136 __print_lock_name(target);
1165 __print_lock_name(source); 1137 printk(KERN_CONT ");\n");
1166 printk(KERN_CONT ");\n"); 1138 printk(" lock(");
1167 printk(" unlock("); 1139 __print_lock_name(source);
1168 __print_lock_name(target); 1140 printk(KERN_CONT ");\n");
1169 printk(KERN_CONT ");\n"); 1141 printk("\n *** DEADLOCK ***\n\n");
1170 printk("\n *** DEADLOCK ***\n\n");
1171 } else {
1172 printk(" Possible unsafe locking scenario:\n\n");
1173 printk(" CPU0 CPU1\n");
1174 printk(" ---- ----\n");
1175 printk(" lock(");
1176 __print_lock_name(target);
1177 printk(KERN_CONT ");\n");
1178 printk(" lock(");
1179 __print_lock_name(parent);
1180 printk(KERN_CONT ");\n");
1181 printk(" lock(");
1182 __print_lock_name(target);
1183 printk(KERN_CONT ");\n");
1184 printk(" lock(");
1185 __print_lock_name(source);
1186 printk(KERN_CONT ");\n");
1187 printk("\n *** DEADLOCK ***\n\n");
1188 }
1189} 1142}
1190 1143
1191/* 1144/*
@@ -1211,10 +1164,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1211 curr->comm, task_pid_nr(curr)); 1164 curr->comm, task_pid_nr(curr));
1212 print_lock(check_src); 1165 print_lock(check_src);
1213 1166
1214 if (cross_lock(check_tgt->instance)) 1167 pr_warn("\nbut task is already holding lock:\n");
1215 pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
1216 else
1217 pr_warn("\nbut task is already holding lock:\n");
1218 1168
1219 print_lock(check_tgt); 1169 print_lock(check_tgt);
1220 pr_warn("\nwhich lock already depends on the new lock.\n\n"); 1170 pr_warn("\nwhich lock already depends on the new lock.\n\n");
@@ -1244,9 +1194,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1244 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1194 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1245 return 0; 1195 return 0;
1246 1196
1247 if (cross_lock(check_tgt->instance)) 1197 if (!save_trace(&this->trace))
1248 this->trace = *trace;
1249 else if (!save_trace(&this->trace))
1250 return 0; 1198 return 0;
1251 1199
1252 depth = get_lock_depth(target); 1200 depth = get_lock_depth(target);
@@ -1850,9 +1798,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1850 if (nest) 1798 if (nest)
1851 return 2; 1799 return 2;
1852 1800
1853 if (cross_lock(prev->instance))
1854 continue;
1855
1856 return print_deadlock_bug(curr, prev, next); 1801 return print_deadlock_bug(curr, prev, next);
1857 } 1802 }
1858 return 1; 1803 return 1;
@@ -2018,31 +1963,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
2018 for (;;) { 1963 for (;;) {
2019 int distance = curr->lockdep_depth - depth + 1; 1964 int distance = curr->lockdep_depth - depth + 1;
2020 hlock = curr->held_locks + depth - 1; 1965 hlock = curr->held_locks + depth - 1;
1966
2021 /* 1967 /*
2022 * Only non-crosslock entries get new dependencies added. 1968 * Only non-recursive-read entries get new dependencies
2023 * Crosslock entries will be added by commit later: 1969 * added:
2024 */ 1970 */
2025 if (!cross_lock(hlock->instance)) { 1971 if (hlock->read != 2 && hlock->check) {
1972 int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
1973 if (!ret)
1974 return 0;
1975
2026 /* 1976 /*
2027 * Only non-recursive-read entries get new dependencies 1977 * Stop after the first non-trylock entry,
2028 * added: 1978 * as non-trylock entries have added their
1979 * own direct dependencies already, so this
1980 * lock is connected to them indirectly:
2029 */ 1981 */
2030 if (hlock->read != 2 && hlock->check) { 1982 if (!hlock->trylock)
2031 int ret = check_prev_add(curr, hlock, next, 1983 break;
2032 distance, &trace, save_trace);
2033 if (!ret)
2034 return 0;
2035
2036 /*
2037 * Stop after the first non-trylock entry,
2038 * as non-trylock entries have added their
2039 * own direct dependencies already, so this
2040 * lock is connected to them indirectly:
2041 */
2042 if (!hlock->trylock)
2043 break;
2044 }
2045 } 1984 }
1985
2046 depth--; 1986 depth--;
2047 /* 1987 /*
2048 * End of lock-stack? 1988 * End of lock-stack?
@@ -3292,21 +3232,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
3292void lockdep_init_map(struct lockdep_map *lock, const char *name, 3232void lockdep_init_map(struct lockdep_map *lock, const char *name,
3293 struct lock_class_key *key, int subclass) 3233 struct lock_class_key *key, int subclass)
3294{ 3234{
3295 cross_init(lock, 0);
3296 __lockdep_init_map(lock, name, key, subclass); 3235 __lockdep_init_map(lock, name, key, subclass);
3297} 3236}
3298EXPORT_SYMBOL_GPL(lockdep_init_map); 3237EXPORT_SYMBOL_GPL(lockdep_init_map);
3299 3238
3300#ifdef CONFIG_LOCKDEP_CROSSRELEASE
3301void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
3302 struct lock_class_key *key, int subclass)
3303{
3304 cross_init(lock, 1);
3305 __lockdep_init_map(lock, name, key, subclass);
3306}
3307EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
3308#endif
3309
3310struct lock_class_key __lockdep_no_validate__; 3239struct lock_class_key __lockdep_no_validate__;
3311EXPORT_SYMBOL_GPL(__lockdep_no_validate__); 3240EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
3312 3241
@@ -3362,7 +3291,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3362 int chain_head = 0; 3291 int chain_head = 0;
3363 int class_idx; 3292 int class_idx;
3364 u64 chain_key; 3293 u64 chain_key;
3365 int ret;
3366 3294
3367 if (unlikely(!debug_locks)) 3295 if (unlikely(!debug_locks))
3368 return 0; 3296 return 0;
@@ -3411,8 +3339,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3411 3339
3412 class_idx = class - lock_classes + 1; 3340 class_idx = class - lock_classes + 1;
3413 3341
3414 /* TODO: nest_lock is not implemented for crosslock yet. */ 3342 if (depth) {
3415 if (depth && !cross_lock(lock)) {
3416 hlock = curr->held_locks + depth - 1; 3343 hlock = curr->held_locks + depth - 1;
3417 if (hlock->class_idx == class_idx && nest_lock) { 3344 if (hlock->class_idx == class_idx && nest_lock) {
3418 if (hlock->references) { 3345 if (hlock->references) {
@@ -3500,14 +3427,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3500 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3427 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
3501 return 0; 3428 return 0;
3502 3429
3503 ret = lock_acquire_crosslock(hlock);
3504 /*
3505 * 2 means normal acquire operations are needed. Otherwise, it's
3506 * ok just to return with '0:fail, 1:success'.
3507 */
3508 if (ret != 2)
3509 return ret;
3510
3511 curr->curr_chain_key = chain_key; 3430 curr->curr_chain_key = chain_key;
3512 curr->lockdep_depth++; 3431 curr->lockdep_depth++;
3513 check_chain_key(curr); 3432 check_chain_key(curr);
@@ -3745,19 +3664,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
3745 struct task_struct *curr = current; 3664 struct task_struct *curr = current;
3746 struct held_lock *hlock; 3665 struct held_lock *hlock;
3747 unsigned int depth; 3666 unsigned int depth;
3748 int ret, i; 3667 int i;
3749 3668
3750 if (unlikely(!debug_locks)) 3669 if (unlikely(!debug_locks))
3751 return 0; 3670 return 0;
3752 3671
3753 ret = lock_release_crosslock(lock);
3754 /*
3755 * 2 means normal release operations are needed. Otherwise, it's
3756 * ok just to return with '0:fail, 1:success'.
3757 */
3758 if (ret != 2)
3759 return ret;
3760
3761 depth = curr->lockdep_depth; 3672 depth = curr->lockdep_depth;
3762 /* 3673 /*
3763 * So we're all set to release this lock.. wait what lock? We don't 3674 * So we're all set to release this lock.. wait what lock? We don't
@@ -4580,6 +4491,7 @@ retry:
4580 if (!unlock) 4491 if (!unlock)
4581 if (read_trylock(&tasklist_lock)) 4492 if (read_trylock(&tasklist_lock))
4582 unlock = 1; 4493 unlock = 1;
4494 touch_nmi_watchdog();
4583 } while_each_thread(g, p); 4495 } while_each_thread(g, p);
4584 4496
4585 pr_warn("\n"); 4497 pr_warn("\n");
@@ -4675,494 +4587,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4675 dump_stack(); 4587 dump_stack();
4676} 4588}
4677EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); 4589EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
4678
4679#ifdef CONFIG_LOCKDEP_CROSSRELEASE
4680
4681/*
4682 * Crossrelease works by recording a lock history for each thread and
4683 * connecting those historic locks that were taken after the
4684 * wait_for_completion() in the complete() context.
4685 *
4686 * Task-A Task-B
4687 *
4688 * mutex_lock(&A);
4689 * mutex_unlock(&A);
4690 *
4691 * wait_for_completion(&C);
4692 * lock_acquire_crosslock();
4693 * atomic_inc_return(&cross_gen_id);
4694 * |
4695 * | mutex_lock(&B);
4696 * | mutex_unlock(&B);
4697 * |
4698 * | complete(&C);
4699 * `-- lock_commit_crosslock();
4700 *
4701 * Which will then add a dependency between B and C.
4702 */
4703
4704#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
4705
4706/*
4707 * Whenever a crosslock is held, cross_gen_id will be increased.
4708 */
4709static atomic_t cross_gen_id; /* Can be wrapped */
4710
4711/*
4712 * Make an entry of the ring buffer invalid.
4713 */
4714static inline void invalidate_xhlock(struct hist_lock *xhlock)
4715{
4716 /*
4717 * Normally, xhlock->hlock.instance must be !NULL.
4718 */
4719 xhlock->hlock.instance = NULL;
4720}
4721
4722/*
4723 * Lock history stacks; we have 2 nested lock history stacks:
4724 *
4725 * HARD(IRQ)
4726 * SOFT(IRQ)
4727 *
4728 * The thing is that once we complete a HARD/SOFT IRQ the future task locks
4729 * should not depend on any of the locks observed while running the IRQ. So
4730 * what we do is rewind the history buffer and erase all our knowledge of that
4731 * temporal event.
4732 */
4733
4734void crossrelease_hist_start(enum xhlock_context_t c)
4735{
4736 struct task_struct *cur = current;
4737
4738 if (!cur->xhlocks)
4739 return;
4740
4741 cur->xhlock_idx_hist[c] = cur->xhlock_idx;
4742 cur->hist_id_save[c] = cur->hist_id;
4743}
4744
4745void crossrelease_hist_end(enum xhlock_context_t c)
4746{
4747 struct task_struct *cur = current;
4748
4749 if (cur->xhlocks) {
4750 unsigned int idx = cur->xhlock_idx_hist[c];
4751 struct hist_lock *h = &xhlock(idx);
4752
4753 cur->xhlock_idx = idx;
4754
4755 /* Check if the ring was overwritten. */
4756 if (h->hist_id != cur->hist_id_save[c])
4757 invalidate_xhlock(h);
4758 }
4759}
4760
4761/*
4762 * lockdep_invariant_state() is used to annotate independence inside a task, to
4763 * make one task look like multiple independent 'tasks'.
4764 *
4765 * Take for instance workqueues; each work is independent of the last. The
4766 * completion of a future work does not depend on the completion of a past work
4767 * (in general). Therefore we must not carry that (lock) dependency across
4768 * works.
4769 *
4770 * This is true for many things; pretty much all kthreads fall into this
4771 * pattern, where they have an invariant state and future completions do not
4772 * depend on past completions. Its just that since they all have the 'same'
4773 * form -- the kthread does the same over and over -- it doesn't typically
4774 * matter.
4775 *
4776 * The same is true for system-calls, once a system call is completed (we've
4777 * returned to userspace) the next system call does not depend on the lock
4778 * history of the previous system call.
4779 *
4780 * They key property for independence, this invariant state, is that it must be
4781 * a point where we hold no locks and have no history. Because if we were to
4782 * hold locks, the restore at _end() would not necessarily recover it's history
4783 * entry. Similarly, independence per-definition means it does not depend on
4784 * prior state.
4785 */
4786void lockdep_invariant_state(bool force)
4787{
4788 /*
4789 * We call this at an invariant point, no current state, no history.
4790 * Verify the former, enforce the latter.
4791 */
4792 WARN_ON_ONCE(!force && current->lockdep_depth);
4793 invalidate_xhlock(&xhlock(current->xhlock_idx));
4794}
4795
4796static int cross_lock(struct lockdep_map *lock)
4797{
4798 return lock ? lock->cross : 0;
4799}
4800
4801/*
4802 * This is needed to decide the relationship between wrapable variables.
4803 */
4804static inline int before(unsigned int a, unsigned int b)
4805{
4806 return (int)(a - b) < 0;
4807}
4808
4809static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
4810{
4811 return hlock_class(&xhlock->hlock);
4812}
4813
4814static inline struct lock_class *xlock_class(struct cross_lock *xlock)
4815{
4816 return hlock_class(&xlock->hlock);
4817}
4818
4819/*
4820 * Should we check a dependency with previous one?
4821 */
4822static inline int depend_before(struct held_lock *hlock)
4823{
4824 return hlock->read != 2 && hlock->check && !hlock->trylock;
4825}
4826
4827/*
4828 * Should we check a dependency with next one?
4829 */
4830static inline int depend_after(struct held_lock *hlock)
4831{
4832 return hlock->read != 2 && hlock->check;
4833}
4834
4835/*
4836 * Check if the xhlock is valid, which would be false if,
4837 *
4838 * 1. Has not used after initializaion yet.
4839 * 2. Got invalidated.
4840 *
4841 * Remind hist_lock is implemented as a ring buffer.
4842 */
4843static inline int xhlock_valid(struct hist_lock *xhlock)
4844{
4845 /*
4846 * xhlock->hlock.instance must be !NULL.
4847 */
4848 return !!xhlock->hlock.instance;
4849}
4850
4851/*
4852 * Record a hist_lock entry.
4853 *
4854 * Irq disable is only required.
4855 */
4856static void add_xhlock(struct held_lock *hlock)
4857{
4858 unsigned int idx = ++current->xhlock_idx;
4859 struct hist_lock *xhlock = &xhlock(idx);
4860
4861#ifdef CONFIG_DEBUG_LOCKDEP
4862 /*
4863 * This can be done locklessly because they are all task-local
4864 * state, we must however ensure IRQs are disabled.
4865 */
4866 WARN_ON_ONCE(!irqs_disabled());
4867#endif
4868
4869 /* Initialize hist_lock's members */
4870 xhlock->hlock = *hlock;
4871 xhlock->hist_id = ++current->hist_id;
4872
4873 xhlock->trace.nr_entries = 0;
4874 xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
4875 xhlock->trace.entries = xhlock->trace_entries;
4876
4877 if (crossrelease_fullstack) {
4878 xhlock->trace.skip = 3;
4879 save_stack_trace(&xhlock->trace);
4880 } else {
4881 xhlock->trace.nr_entries = 1;
4882 xhlock->trace.entries[0] = hlock->acquire_ip;
4883 }
4884}
4885
4886static inline int same_context_xhlock(struct hist_lock *xhlock)
4887{
4888 return xhlock->hlock.irq_context == task_irq_context(current);
4889}
4890
4891/*
4892 * This should be lockless as far as possible because this would be
4893 * called very frequently.
4894 */
4895static void check_add_xhlock(struct held_lock *hlock)
4896{
4897 /*
4898 * Record a hist_lock, only in case that acquisitions ahead
4899 * could depend on the held_lock. For example, if the held_lock
4900 * is trylock then acquisitions ahead never depends on that.
4901 * In that case, we don't need to record it. Just return.
4902 */
4903 if (!current->xhlocks || !depend_before(hlock))
4904 return;
4905
4906 add_xhlock(hlock);
4907}
4908
4909/*
4910 * For crosslock.
4911 */
4912static int add_xlock(struct held_lock *hlock)
4913{
4914 struct cross_lock *xlock;
4915 unsigned int gen_id;
4916
4917 if (!graph_lock())
4918 return 0;
4919
4920 xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
4921
4922 /*
4923 * When acquisitions for a crosslock are overlapped, we use
4924 * nr_acquire to perform commit for them, based on cross_gen_id
4925 * of the first acquisition, which allows additional
4926 * dependencies to be added.
4927 *
4928 * Moreover, when no acquisition of a crosslock is in progress,
4929 * we should not perform commit because the lock might not exist
4930 * any more, which might cause incorrect memory access. So we
4931 * have to track the number of acquisitions of a crosslock.
4932 *
4933 * depend_after() is necessary to initialize only the first
4934 * valid xlock so that the xlock can be used on its commit.
4935 */
4936 if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
4937 goto unlock;
4938
4939 gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
4940 xlock->hlock = *hlock;
4941 xlock->hlock.gen_id = gen_id;
4942unlock:
4943 graph_unlock();
4944 return 1;
4945}
4946
4947/*
4948 * Called for both normal and crosslock acquires. Normal locks will be
4949 * pushed on the hist_lock queue. Cross locks will record state and
4950 * stop regular lock_acquire() to avoid being placed on the held_lock
4951 * stack.
4952 *
4953 * Return: 0 - failure;
4954 * 1 - crosslock, done;
4955 * 2 - normal lock, continue to held_lock[] ops.
4956 */
4957static int lock_acquire_crosslock(struct held_lock *hlock)
4958{
4959 /*
4960 * CONTEXT 1 CONTEXT 2
4961 * --------- ---------
4962 * lock A (cross)
4963 * X = atomic_inc_return(&cross_gen_id)
4964 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4965 * Y = atomic_read_acquire(&cross_gen_id)
4966 * lock B
4967 *
4968 * atomic_read_acquire() is for ordering between A and B,
4969 * IOW, A happens before B, when CONTEXT 2 sees Y >= X.
4970 *
4971 * Pairs with atomic_inc_return() in add_xlock().
4972 */
4973 hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
4974
4975 if (cross_lock(hlock->instance))
4976 return add_xlock(hlock);
4977
4978 check_add_xhlock(hlock);
4979 return 2;
4980}
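The comment above relies on the pairing between the fully ordered atomic_inc_return() in add_xlock() and the acquire read here. A rough user-space analogue with C11 atomics, written as a sketch (the helper names are invented and seq_cst stands in for the kernel's full-barrier increment):

    #include <stdatomic.h>

    static atomic_uint cross_gen_id;

    /* Writer (CONTEXT 1, add_xlock() analogue): take a new generation. */
    static unsigned int xlock_take_gen(void)
    {
            return atomic_fetch_add_explicit(&cross_gen_id, 1,
                                             memory_order_seq_cst) + 1;
    }

    /* Reader (CONTEXT 2): snapshot the generation before recording a lock. */
    static unsigned int hlock_snapshot_gen(void)
    {
            /* acquire keeps the later acquisition ordered after the snapshot */
            return atomic_load_explicit(&cross_gen_id, memory_order_acquire);
    }

    int main(void)
    {
            unsigned int x = xlock_take_gen();      /* lock A (cross) */
            unsigned int y = hlock_snapshot_gen();  /* before lock B  */

            return y >= x ? 0 : 1;  /* y >= x means A is ordered before B */
    }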
4981
4982static int copy_trace(struct stack_trace *trace)
4983{
4984 unsigned long *buf = stack_trace + nr_stack_trace_entries;
4985 unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
4986 unsigned int nr = min(max_nr, trace->nr_entries);
4987
4988 trace->nr_entries = nr;
4989 memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
4990 trace->entries = buf;
4991 nr_stack_trace_entries += nr;
4992
4993 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
4994 if (!debug_locks_off_graph_unlock())
4995 return 0;
4996
4997 print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
4998 dump_stack();
4999
5000 return 0;
5001 }
5002
5003 return 1;
5004}
5005
5006static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
5007{
5008 unsigned int xid, pid;
5009 u64 chain_key;
5010
5011 xid = xlock_class(xlock) - lock_classes;
5012 chain_key = iterate_chain_key((u64)0, xid);
5013 pid = xhlock_class(xhlock) - lock_classes;
5014 chain_key = iterate_chain_key(chain_key, pid);
5015
5016 if (lookup_chain_cache(chain_key))
5017 return 1;
5018
5019 if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
5020 chain_key))
5021 return 0;
5022
5023 if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
5024 &xhlock->trace, copy_trace))
5025 return 0;
5026
5027 return 1;
5028}
5029
5030static void commit_xhlocks(struct cross_lock *xlock)
5031{
5032 unsigned int cur = current->xhlock_idx;
5033 unsigned int prev_hist_id = xhlock(cur).hist_id;
5034 unsigned int i;
5035
5036 if (!graph_lock())
5037 return;
5038
5039 if (xlock->nr_acquire) {
5040 for (i = 0; i < MAX_XHLOCKS_NR; i++) {
5041 struct hist_lock *xhlock = &xhlock(cur - i);
5042
5043 if (!xhlock_valid(xhlock))
5044 break;
5045
5046 if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
5047 break;
5048
5049 if (!same_context_xhlock(xhlock))
5050 break;
5051
5052 /*
5053 * Filter out the cases where the ring buffer was
5054 * overwritten and the current entry has a bigger
5055 * hist_id than the previous one, which is impossible
5056 * otherwise:
5057 */
5058 if (unlikely(before(prev_hist_id, xhlock->hist_id)))
5059 break;
5060
5061 prev_hist_id = xhlock->hist_id;
5062
5063 /*
5064 * commit_xhlock() returns 0 with graph_lock already
5065 * released if it fails.
5066 */
5067 if (!commit_xhlock(xlock, xhlock))
5068 return;
5069 }
5070 }
5071
5072 graph_unlock();
5073}
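The hist_id test above detects ring-buffer overwrite: walking backwards, every entry's hist_id must not be newer than the one walked before it, so seeing a newer one means the slot was recycled. A small stand-alone illustration of that invariant (the array and helper are invented for the example):

    #include <assert.h>

    static inline int before(unsigned int a, unsigned int b)
    {
            return (int)(a - b) < 0;
    }

    /* Walk ids[] backwards from 'cur' and count how many entries can be
     * trusted before the monotonicity check trips on an overwritten slot. */
    static unsigned int valid_history(const unsigned int *ids, unsigned int nr,
                                      unsigned int cur)
    {
            unsigned int prev = ids[cur], i;

            for (i = 0; i < nr; i++) {
                    unsigned int idx = (cur - i) & (nr - 1);  /* nr: power of 2 */

                    if (before(prev, ids[idx]))   /* newer than its successor? */
                            break;                /* the ring wrapped; stop */
                    prev = ids[idx];
            }
            return i;
    }

    int main(void)
    {
            /* Slots 0..2 hold old history; slot 3 was just overwritten with a
             * much newer id, so only three walked entries are valid. */
            unsigned int ids[4] = { 5, 6, 7, 100 };

            assert(valid_history(ids, 4, 2) == 3);
            return 0;
    }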
5074
5075void lock_commit_crosslock(struct lockdep_map *lock)
5076{
5077 struct cross_lock *xlock;
5078 unsigned long flags;
5079
5080 if (unlikely(!debug_locks || current->lockdep_recursion))
5081 return;
5082
5083 if (!current->xhlocks)
5084 return;
5085
5086 /*
5087 * Commit hist_locks with the cross_lock only if the
5088 * cross_lock could depend on acquisitions after it.
5089 *
5090 * For example, if the cross_lock does not have the 'check' flag
5091 * then we don't need to check dependencies and commit for that.
5092 * Just skip it. In that case, of course, the cross_lock does
5093 * not depend on acquisitions ahead, either.
5094 *
5095 * WARNING: Don't do that in add_xlock() in advance. When an
5096 * acquisition context is different from the commit context,
5097 * an invalid (skipped) cross_lock might be accessed.
5098 */
5099 if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
5100 return;
5101
5102 raw_local_irq_save(flags);
5103 check_flags(flags);
5104 current->lockdep_recursion = 1;
5105 xlock = &((struct lockdep_map_cross *)lock)->xlock;
5106 commit_xhlocks(xlock);
5107 current->lockdep_recursion = 0;
5108 raw_local_irq_restore(flags);
5109}
5110EXPORT_SYMBOL_GPL(lock_commit_crosslock);
5111
5112/*
5113 * Return: 0 - failure;
5114 * 1 - crosslock, done;
5115 * 2 - normal lock, continue to held_lock[] ops.
5116 */
5117static int lock_release_crosslock(struct lockdep_map *lock)
5118{
5119 if (cross_lock(lock)) {
5120 if (!graph_lock())
5121 return 0;
5122 ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
5123 graph_unlock();
5124 return 1;
5125 }
5126 return 2;
5127}
5128
5129static void cross_init(struct lockdep_map *lock, int cross)
5130{
5131 if (cross)
5132 ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
5133
5134 lock->cross = cross;
5135
5136 /*
5137 * Crossrelease assumes that the ring buffer size of xhlocks
5138 * is a power of 2. So enforce it at build time.
5139 */
5140 BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
5141}
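The BUILD_BUG_ON() above uses the fact that x & (x - 1) is zero exactly when x is a power of two, which is what allows the xhlock ring index to be reduced with a cheap mask instead of a modulo. A hedged sketch of the same idiom (the ring and its size here are illustrative, not the kernel's):

    #include <assert.h>

    #define RING_NR 64      /* must be a power of two */

    struct entry { unsigned long ip; };
    static struct entry ring[RING_NR];

    /* x & (x - 1) == 0 only for powers of two (and zero). */
    _Static_assert((RING_NR & (RING_NR - 1)) == 0, "RING_NR must be a power of 2");

    static struct entry *ring_slot(unsigned int idx)
    {
            return &ring[idx & (RING_NR - 1)];      /* cheap wrap, no division */
    }

    int main(void)
    {
            unsigned int idx = (unsigned int)-1;    /* mirrors xhlock_idx = UINT_MAX */

            ring_slot(++idx)->ip = 1;               /* first store lands in slot 0 */
            assert(ring_slot(0)->ip == 1);
            return 0;
    }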
5142
5143void lockdep_init_task(struct task_struct *task)
5144{
5145 int i;
5146
5147 task->xhlock_idx = UINT_MAX;
5148 task->hist_id = 0;
5149
5150 for (i = 0; i < XHLOCK_CTX_NR; i++) {
5151 task->xhlock_idx_hist[i] = UINT_MAX;
5152 task->hist_id_save[i] = 0;
5153 }
5154
5155 task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
5156 GFP_KERNEL);
5157}
5158
5159void lockdep_free_task(struct task_struct *task)
5160{
5161 if (task->xhlocks) {
5162 void *tmp = task->xhlocks;
5163 /* Disable crossrelease for current */
5164 task->xhlocks = NULL;
5165 kfree(tmp);
5166 }
5167}
5168#endif
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1290 return ret; 1290 return ret;
1291} 1291}
1292 1292
1293static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
1294{
1295 int ret = try_to_take_rt_mutex(lock, current, NULL);
1296
1297 /*
1298 * try_to_take_rt_mutex() sets the lock waiters bit
1299 * unconditionally. Clean this up.
1300 */
1301 fixup_rt_mutex_waiters(lock);
1302
1303 return ret;
1304}
1305
1293/* 1306/*
1294 * Slow path try-lock function: 1307 * Slow path try-lock function:
1295 */ 1308 */
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
1312 */ 1325 */
1313 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1326 raw_spin_lock_irqsave(&lock->wait_lock, flags);
1314 1327
1315 ret = try_to_take_rt_mutex(lock, current, NULL); 1328 ret = __rt_mutex_slowtrylock(lock);
1316
1317 /*
1318 * try_to_take_rt_mutex() sets the lock waiters bit
1319 * unconditionally. Clean this up.
1320 */
1321 fixup_rt_mutex_waiters(lock);
1322 1329
1323 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 1330 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1324 1331
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
1505 return rt_mutex_slowtrylock(lock); 1512 return rt_mutex_slowtrylock(lock);
1506} 1513}
1507 1514
1515int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
1516{
1517 return __rt_mutex_slowtrylock(lock);
1518}
1519
1508/** 1520/**
1509 * rt_mutex_timed_lock - lock a rt_mutex interruptible 1521 * rt_mutex_timed_lock - lock a rt_mutex interruptible
1510 * the timeout structure is provided 1522 * the timeout structure is provided
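The hunk above is the usual "split out a locked-context helper" refactor: __rt_mutex_slowtrylock() assumes wait_lock is already held, so a futex path that manages that lock itself can call it directly. A generic pthread-based sketch of the shape, purely illustrative and not the rtmutex code (build with -pthread):

    #include <stdbool.h>
    #include <pthread.h>

    struct obj {
            pthread_mutex_t wait_lock;
            bool owned;
    };

    /* Inner helper: the caller must already hold obj->wait_lock. */
    static bool __obj_trytake(struct obj *o)
    {
            if (o->owned)
                    return false;
            o->owned = true;
            return true;
    }

    /* Ordinary slow path: takes and drops wait_lock around the helper. */
    static bool obj_trytake(struct obj *o)
    {
            bool ret;

            pthread_mutex_lock(&o->wait_lock);
            ret = __obj_trytake(o);
            pthread_mutex_unlock(&o->wait_lock);
            return ret;
    }

    int main(void)
    {
            struct obj o = { .wait_lock = PTHREAD_MUTEX_INITIALIZER, .owned = false };

            /* the first trylock succeeds, the second one fails */
            return obj_trytake(&o) && !obj_trytake(&o) ? 0 : 1;
    }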
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
148 struct rt_mutex_waiter *waiter); 148 struct rt_mutex_waiter *waiter);
149 149
150extern int rt_mutex_futex_trylock(struct rt_mutex *l); 150extern int rt_mutex_futex_trylock(struct rt_mutex *l);
151extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
151 152
152extern void rt_mutex_futex_unlock(struct rt_mutex *lock); 153extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
153extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, 154extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 1fd1a7543cdd..936f3d14dd6b 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
66 break; \ 66 break; \
67 preempt_enable(); \ 67 preempt_enable(); \
68 \ 68 \
69 if (!(lock)->break_lock) \ 69 arch_##op##_relax(&lock->raw_lock); \
70 (lock)->break_lock = 1; \
71 while ((lock)->break_lock) \
72 arch_##op##_relax(&lock->raw_lock); \
73 } \ 70 } \
74 (lock)->break_lock = 0; \
75} \ 71} \
76 \ 72 \
77unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ 73unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
@@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
86 local_irq_restore(flags); \ 82 local_irq_restore(flags); \
87 preempt_enable(); \ 83 preempt_enable(); \
88 \ 84 \
89 if (!(lock)->break_lock) \ 85 arch_##op##_relax(&lock->raw_lock); \
90 (lock)->break_lock = 1; \
91 while ((lock)->break_lock) \
92 arch_##op##_relax(&lock->raw_lock); \
93 } \ 86 } \
94 (lock)->break_lock = 0; \ 87 \
95 return flags; \ 88 return flags; \
96} \ 89} \
97 \ 90 \
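With break_lock gone, the generic slow path above reduces to "trylock; if that fails, relax and retry". A stand-alone sketch of that loop shape using C11 atomics (cpu_relax() is a placeholder for arch_*_relax(), and the preempt_disable()/enable() of the real macro is omitted):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_flag lock_word = ATOMIC_FLAG_INIT;

    static bool raw_trylock(void)
    {
            return !atomic_flag_test_and_set_explicit(&lock_word,
                                                      memory_order_acquire);
    }

    static void cpu_relax(void)
    {
            /* stand-in for arch_*_relax(); on x86 this would be a PAUSE hint */
    }

    /* Shape of the slow path after the break_lock removal. */
    static void raw_lock(void)
    {
            while (!raw_trylock())
                    cpu_relax();
    }

    static void raw_unlock(void)
    {
            atomic_flag_clear_explicit(&lock_word, memory_order_release);
    }

    int main(void)
    {
            raw_lock();
            raw_unlock();
            return 0;
    }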
diff --git a/kernel/pid.c b/kernel/pid.c
index b13b624e2c49..1e8bb6550ec4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -193,10 +193,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
193 } 193 }
194 194
195 if (unlikely(is_child_reaper(pid))) { 195 if (unlikely(is_child_reaper(pid))) {
196 if (pid_ns_prepare_proc(ns)) { 196 if (pid_ns_prepare_proc(ns))
197 disable_pid_allocation(ns);
198 goto out_free; 197 goto out_free;
199 }
200 } 198 }
201 199
202 get_pid_ns(ns); 200 get_pid_ns(ns);
@@ -226,6 +224,10 @@ out_free:
226 while (++i <= ns->level) 224 while (++i <= ns->level)
227 idr_remove(&ns->idr, (pid->numbers + i)->nr); 225 idr_remove(&ns->idr, (pid->numbers + i)->nr);
228 226
227 /* On failure to allocate the first pid, reset the state */
228 if (ns->pid_allocated == PIDNS_ADDING)
229 idr_set_cursor(&ns->idr, 0);
230
229 spin_unlock_irq(&pidmap_lock); 231 spin_unlock_irq(&pidmap_lock);
230 232
231 kmem_cache_free(ns->pid_cachep, pid); 233 kmem_cache_free(ns->pid_cachep, pid);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5d81206a572d..b9006617710f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3141,9 +3141,6 @@ void dump_stack_print_info(const char *log_lvl)
3141void show_regs_print_info(const char *log_lvl) 3141void show_regs_print_info(const char *log_lvl)
3142{ 3142{
3143 dump_stack_print_info(log_lvl); 3143 dump_stack_print_info(log_lvl);
3144
3145 printk("%stask: %p task.stack: %p\n",
3146 log_lvl, current, task_stack_page(current));
3147} 3144}
3148 3145
3149#endif 3146#endif
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 2ddaec40956f..0926aef10dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -34,11 +34,6 @@ void complete(struct completion *x)
34 34
35 spin_lock_irqsave(&x->wait.lock, flags); 35 spin_lock_irqsave(&x->wait.lock, flags);
36 36
37 /*
38 * Perform commit of crossrelease here.
39 */
40 complete_release_commit(x);
41
42 if (x->done != UINT_MAX) 37 if (x->done != UINT_MAX)
43 x->done++; 38 x->done++;
44 __wake_up_locked(&x->wait, TASK_NORMAL, 1); 39 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 75554f366fd3..a7bf32aabfda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2056 p->state = TASK_WAKING; 2056 p->state = TASK_WAKING;
2057 2057
2058 if (p->in_iowait) { 2058 if (p->in_iowait) {
2059 delayacct_blkio_end(); 2059 delayacct_blkio_end(p);
2060 atomic_dec(&task_rq(p)->nr_iowait); 2060 atomic_dec(&task_rq(p)->nr_iowait);
2061 } 2061 }
2062 2062
@@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2069#else /* CONFIG_SMP */ 2069#else /* CONFIG_SMP */
2070 2070
2071 if (p->in_iowait) { 2071 if (p->in_iowait) {
2072 delayacct_blkio_end(); 2072 delayacct_blkio_end(p);
2073 atomic_dec(&task_rq(p)->nr_iowait); 2073 atomic_dec(&task_rq(p)->nr_iowait);
2074 } 2074 }
2075 2075
@@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2122 2122
2123 if (!task_on_rq_queued(p)) { 2123 if (!task_on_rq_queued(p)) {
2124 if (p->in_iowait) { 2124 if (p->in_iowait) {
2125 delayacct_blkio_end(); 2125 delayacct_blkio_end(p);
2126 atomic_dec(&rq->nr_iowait); 2126 atomic_dec(&rq->nr_iowait);
2127 } 2127 }
2128 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); 2128 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
@@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5097 return ret; 5097 return ret;
5098} 5098}
5099 5099
5100/**
5101 * sys_sched_rr_get_interval - return the default timeslice of a process.
5102 * @pid: pid of the process.
5103 * @interval: userspace pointer to the timeslice value.
5104 *
5105 * this syscall writes the default timeslice value of a given process
5106 * into the user-space timespec buffer. A value of '0' means infinity.
5107 *
5108 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
5109 * an error code.
5110 */
5111static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) 5100static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
5112{ 5101{
5113 struct task_struct *p; 5102 struct task_struct *p;
@@ -5144,6 +5133,17 @@ out_unlock:
5144 return retval; 5133 return retval;
5145} 5134}
5146 5135
5136/**
5137 * sys_sched_rr_get_interval - return the default timeslice of a process.
5138 * @pid: pid of the process.
5139 * @interval: userspace pointer to the timeslice value.
5140 *
5141 * this syscall writes the default timeslice value of a given process
5142 * into the user-space timespec buffer. A value of '0' means infinity.
5143 *
5144 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
5145 * an error code.
5146 */
5147SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5147SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5148 struct timespec __user *, interval) 5148 struct timespec __user *, interval)
5149{ 5149{
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2f52ec0f1539..d6717a3331a1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
244#ifdef CONFIG_NO_HZ_COMMON 244#ifdef CONFIG_NO_HZ_COMMON
245static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) 245static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
246{ 246{
247 unsigned long idle_calls = tick_nohz_get_idle_calls(); 247 unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
248 bool ret = idle_calls == sg_cpu->saved_idle_calls; 248 bool ret = idle_calls == sg_cpu->saved_idle_calls;
249 249
250 sg_cpu->saved_idle_calls = idle_calls; 250 sg_cpu->saved_idle_calls = idle_calls;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4037e19bbca2..26a71ebcd3c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3413,9 +3413,9 @@ void set_task_rq_fair(struct sched_entity *se,
3413 * _IFF_ we look at the pure running and runnable sums. Because they 3413 * _IFF_ we look at the pure running and runnable sums. Because they
3414 * represent the very same entity, just at different points in the hierarchy. 3414 * represent the very same entity, just at different points in the hierarchy.
3415 * 3415 *
3416 * 3416 * Per the above update_tg_cfs_util() is trivial and simply copies the running
3417 * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and 3417 * sum over (but still wrong, because the group entity and group rq do not have
3418 * simply copies the running sum over. 3418 * their PELT windows aligned).
3419 * 3419 *
3420 * However, update_tg_cfs_runnable() is more complex. So we have: 3420 * However, update_tg_cfs_runnable() is more complex. So we have:
3421 * 3421 *
@@ -3424,11 +3424,11 @@ void set_task_rq_fair(struct sched_entity *se,
3424 * And since, like util, the runnable part should be directly transferable, 3424 * And since, like util, the runnable part should be directly transferable,
3425 * the following would _appear_ to be the straight forward approach: 3425 * the following would _appear_ to be the straight forward approach:
3426 * 3426 *
3427 * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) 3427 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
3428 * 3428 *
3429 * And per (1) we have: 3429 * And per (1) we have:
3430 * 3430 *
3431 * ge->avg.running_avg == grq->avg.running_avg 3431 * ge->avg.runnable_avg == grq->avg.runnable_avg
3432 * 3432 *
3433 * Which gives: 3433 * Which gives:
3434 * 3434 *
@@ -3447,27 +3447,28 @@ void set_task_rq_fair(struct sched_entity *se,
3447 * to (shortly) return to us. This only works by keeping the weights as 3447 * to (shortly) return to us. This only works by keeping the weights as
3448 * integral part of the sum. We therefore cannot decompose as per (3). 3448 * integral part of the sum. We therefore cannot decompose as per (3).
3449 * 3449 *
3450 * OK, so what then? 3450 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3451 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3452 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3453 * runnable section of these tasks overlap (or not). If they were to perfectly
3454 * align the rq as a whole would be runnable 2/3 of the time. If however we
3455 * always have at least 1 runnable task, the rq as a whole is always runnable.
3451 * 3456 *
3457 * So we'll have to approximate.. :/
3452 * 3458 *
3453 * Another way to look at things is: 3459 * Given the constraint:
3454 * 3460 *
3455 * grq->avg.load_avg = \Sum se->avg.load_avg 3461 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
3456 * 3462 *
3457 * Therefore, per (2): 3463 * We can construct a rule that adds runnable to a rq by assuming minimal
3464 * overlap.
3458 * 3465 *
3459 * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg 3466 * On removal, we'll assume each task is equally runnable; which yields:
3460 * 3467 *
3461 * And the very thing we're propagating is a change in that sum (someone 3468 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
3462 * joined/left). So we can easily know the runnable change, which would be, per
3463 * (2) the already tracked se->load_avg divided by the corresponding
3464 * se->weight.
3465 * 3469 *
3466 * Basically (4) but in differential form: 3470 * XXX: only do this for the part of runnable > running ?
3467 * 3471 *
3468 * d(runnable_avg) += se->avg.load_avg / se->load.weight
3469 * (5)
3470 * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
3471 */ 3472 */
3472 3473
3473static inline void 3474static inline void
@@ -3479,6 +3480,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
3479 if (!delta) 3480 if (!delta)
3480 return; 3481 return;
3481 3482
3483 /*
3484 * The relation between sum and avg is:
3485 *
3486 * LOAD_AVG_MAX - 1024 + sa->period_contrib
3487 *
3488 * however, the PELT windows are not aligned between grq and gse.
3489 */
3490
3482 /* Set new sched_entity's utilization */ 3491 /* Set new sched_entity's utilization */
3483 se->avg.util_avg = gcfs_rq->avg.util_avg; 3492 se->avg.util_avg = gcfs_rq->avg.util_avg;
3484 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; 3493 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
@@ -3491,33 +3500,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
3491static inline void 3500static inline void
3492update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3501update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3493{ 3502{
3494 long runnable_sum = gcfs_rq->prop_runnable_sum; 3503 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3495 long runnable_load_avg, load_avg; 3504 unsigned long runnable_load_avg, load_avg;
3496 s64 runnable_load_sum, load_sum; 3505 u64 runnable_load_sum, load_sum = 0;
3506 s64 delta_sum;
3497 3507
3498 if (!runnable_sum) 3508 if (!runnable_sum)
3499 return; 3509 return;
3500 3510
3501 gcfs_rq->prop_runnable_sum = 0; 3511 gcfs_rq->prop_runnable_sum = 0;
3502 3512
3513 if (runnable_sum >= 0) {
3514 /*
3515 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3516 * the CPU is saturated running == runnable.
3517 */
3518 runnable_sum += se->avg.load_sum;
3519 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3520 } else {
3521 /*
3522 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3523 * assuming all tasks are equally runnable.
3524 */
3525 if (scale_load_down(gcfs_rq->load.weight)) {
3526 load_sum = div_s64(gcfs_rq->avg.load_sum,
3527 scale_load_down(gcfs_rq->load.weight));
3528 }
3529
3530 /* But make sure to not inflate se's runnable */
3531 runnable_sum = min(se->avg.load_sum, load_sum);
3532 }
3533
3534 /*
3535 * runnable_sum can't be lower than running_sum
3536 * As running_sum is scaled with CPU capacity whereas the runnable sum
3537 * is not, we rescale running_sum first
3538 */
3539 running_sum = se->avg.util_sum /
3540 arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
3541 runnable_sum = max(runnable_sum, running_sum);
3542
3503 load_sum = (s64)se_weight(se) * runnable_sum; 3543 load_sum = (s64)se_weight(se) * runnable_sum;
3504 load_avg = div_s64(load_sum, LOAD_AVG_MAX); 3544 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3505 3545
3506 add_positive(&se->avg.load_sum, runnable_sum); 3546 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3507 add_positive(&se->avg.load_avg, load_avg); 3547 delta_avg = load_avg - se->avg.load_avg;
3508 3548
3509 add_positive(&cfs_rq->avg.load_avg, load_avg); 3549 se->avg.load_sum = runnable_sum;
3510 add_positive(&cfs_rq->avg.load_sum, load_sum); 3550 se->avg.load_avg = load_avg;
3551 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3552 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3511 3553
3512 runnable_load_sum = (s64)se_runnable(se) * runnable_sum; 3554 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3513 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); 3555 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3556 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3557 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3514 3558
3515 add_positive(&se->avg.runnable_load_sum, runnable_sum); 3559 se->avg.runnable_load_sum = runnable_sum;
3516 add_positive(&se->avg.runnable_load_avg, runnable_load_avg); 3560 se->avg.runnable_load_avg = runnable_load_avg;
3517 3561
3518 if (se->on_rq) { 3562 if (se->on_rq) {
3519 add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); 3563 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3520 add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); 3564 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3521 } 3565 }
3522} 3566}
3523 3567
@@ -4321,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void)
4321 4365
4322void cfs_bandwidth_usage_inc(void) 4366void cfs_bandwidth_usage_inc(void)
4323{ 4367{
4324 static_key_slow_inc(&__cfs_bandwidth_used); 4368 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4325} 4369}
4326 4370
4327void cfs_bandwidth_usage_dec(void) 4371void cfs_bandwidth_usage_dec(void)
4328{ 4372{
4329 static_key_slow_dec(&__cfs_bandwidth_used); 4373 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4330} 4374}
4331#else /* HAVE_JUMP_LABEL */ 4375#else /* HAVE_JUMP_LABEL */
4332static bool cfs_bandwidth_used(void) 4376static bool cfs_bandwidth_used(void)
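A toy numeric walk-through of the two branches in update_tg_cfs_runnable() above; the values and the flat use of load_sum are made up purely for illustration (real PELT sums are decayed and weight-scaled):

    #include <stdio.h>

    #define LOAD_AVG_MAX 47742L

    static long min_l(long a, long b) { return a < b ? a : b; }

    int main(void)
    {
            /* Adding runnable: assume minimal overlap and clip at LOAD_AVG_MAX. */
            long se_load_sum = 30000, prop_runnable_sum = 25000;
            long added = min_l(prop_runnable_sum + se_load_sum, LOAD_AVG_MAX);

            /* Removing runnable: assume all tasks are equally runnable, so divide
             * the weighted group load_sum back out by the (scaled-down) weight. */
            long grq_load_sum = 40000, grq_weight = 2;
            long removed_est = grq_load_sum / grq_weight;

            printf("add:    runnable_sum = %ld (clipped at %ld)\n", added, LOAD_AVG_MAX);
            printf("remove: estimated runnable_sum = %ld\n", removed_est);
            return 0;
    }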
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..9bcbacba82a8 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void)
89 rcu_read_unlock(); 89 rcu_read_unlock();
90 } 90 }
91 if (!fallback) { 91 if (!fallback) {
92 preempt_disable();
92 smp_call_function_many(tmpmask, ipi_mb, NULL, 1); 93 smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
94 preempt_enable();
93 free_cpumask_var(tmpmask); 95 free_cpumask_var(tmpmask);
94 } 96 }
95 cpus_read_unlock(); 97 cpus_read_unlock();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4056c19ca3f0..665ace2fc558 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq)
2034 bool resched = false; 2034 bool resched = false;
2035 struct task_struct *p; 2035 struct task_struct *p;
2036 struct rq *src_rq; 2036 struct rq *src_rq;
2037 int rt_overload_count = rt_overloaded(this_rq);
2037 2038
2038 if (likely(!rt_overloaded(this_rq))) 2039 if (likely(!rt_overload_count))
2039 return; 2040 return;
2040 2041
2041 /* 2042 /*
@@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq)
2044 */ 2045 */
2045 smp_rmb(); 2046 smp_rmb();
2046 2047
2048 /* If we are the only overloaded CPU do nothing */
2049 if (rt_overload_count == 1 &&
2050 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2051 return;
2052
2047#ifdef HAVE_RT_PUSH_IPI 2053#ifdef HAVE_RT_PUSH_IPI
2048 if (sched_feat(RT_PUSH_IPI)) { 2054 if (sched_feat(RT_PUSH_IPI)) {
2049 tell_cpu_to_push(this_rq); 2055 tell_cpu_to_push(this_rq);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 98feab7933c7..929ecb7d6b78 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
27 27
28 wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; 28 wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
29 spin_lock_irqsave(&wq_head->lock, flags); 29 spin_lock_irqsave(&wq_head->lock, flags);
30 __add_wait_queue_entry_tail(wq_head, wq_entry); 30 __add_wait_queue(wq_head, wq_entry);
31 spin_unlock_irqrestore(&wq_head->lock, flags); 31 spin_unlock_irqrestore(&wq_head->lock, flags);
32} 32}
33EXPORT_SYMBOL(add_wait_queue); 33EXPORT_SYMBOL(add_wait_queue);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e776fc8cc1df..f6b5f19223d6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -95,6 +95,7 @@ config NO_HZ_FULL
95 select RCU_NOCB_CPU 95 select RCU_NOCB_CPU
96 select VIRT_CPU_ACCOUNTING_GEN 96 select VIRT_CPU_ACCOUNTING_GEN
97 select IRQ_WORK 97 select IRQ_WORK
98 select CPU_ISOLATION
98 help 99 help
99 Adaptively try to shutdown the tick whenever possible, even when 100 Adaptively try to shutdown the tick whenever possible, even when
100 the CPU is running tasks. Typically this requires running a single 101 the CPU is running tasks. Typically this requires running a single
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d32520840fde..aa9d2a2b1210 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer,
655static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) 655static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
656{ 656{
657 base->expires_next = KTIME_MAX; 657 base->expires_next = KTIME_MAX;
658 base->hang_detected = 0;
658 base->hres_active = 0; 659 base->hres_active = 0;
660 base->next_timer = NULL;
659} 661}
660 662
661/* 663/*
@@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
1589 timerqueue_init_head(&cpu_base->clock_base[i].active); 1591 timerqueue_init_head(&cpu_base->clock_base[i].active);
1590 } 1592 }
1591 1593
1594 cpu_base->active_bases = 0;
1592 cpu_base->cpu = cpu; 1595 cpu_base->cpu = cpu;
1593 hrtimer_init_hres(cpu_base); 1596 hrtimer_init_hres(cpu_base);
1594 return 0; 1597 return 0;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 13d6881f908b..ec999f32c840 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event)
434{ 434{
435 struct task_struct *rtn = current->group_leader; 435 struct task_struct *rtn = current->group_leader;
436 436
437 if ((event->sigev_notify & SIGEV_THREAD_ID ) && 437 switch (event->sigev_notify) {
438 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || 438 case SIGEV_SIGNAL | SIGEV_THREAD_ID:
439 !same_thread_group(rtn, current) || 439 rtn = find_task_by_vpid(event->sigev_notify_thread_id);
440 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) 440 if (!rtn || !same_thread_group(rtn, current))
441 return NULL;
442 /* FALLTHRU */
443 case SIGEV_SIGNAL:
444 case SIGEV_THREAD:
445 if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
446 return NULL;
447 /* FALLTHRU */
448 case SIGEV_NONE:
449 return task_pid(rtn);
450 default:
441 return NULL; 451 return NULL;
442 452 }
443 if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
444 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
445 return NULL;
446
447 return task_pid(rtn);
448} 453}
449 454
450static struct k_itimer * alloc_posix_timer(void) 455static struct k_itimer * alloc_posix_timer(void)
@@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
669 struct timespec64 ts64; 674 struct timespec64 ts64;
670 bool sig_none; 675 bool sig_none;
671 676
672 sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; 677 sig_none = timr->it_sigev_notify == SIGEV_NONE;
673 iv = timr->it_interval; 678 iv = timr->it_interval;
674 679
675 /* interval timer ? */ 680 /* interval timer ? */
@@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags,
856 861
857 timr->it_interval = timespec64_to_ktime(new_setting->it_interval); 862 timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
858 expires = timespec64_to_ktime(new_setting->it_value); 863 expires = timespec64_to_ktime(new_setting->it_value);
859 sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; 864 sigev_none = timr->it_sigev_notify == SIGEV_NONE;
860 865
861 kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); 866 kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
862 timr->it_active = !sigev_none; 867 timr->it_active = !sigev_none;
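For reference, this is roughly what a valid SIGEV_SIGNAL sigevent looks like from user space; the reworked good_sigevent() switch is what rejects combinations outside the ones it lists. A minimal sketch (link with -lrt on older glibc):

    #include <signal.h>
    #include <time.h>
    #include <stdio.h>

    int main(void)
    {
            struct sigevent sev = { 0 };
            timer_t tid;

            /* SIGEV_SIGNAL must carry a valid signal number (0 < signo <= SIGRTMAX),
             * which is exactly what the reworked switch enforces. */
            sev.sigev_notify = SIGEV_SIGNAL;
            sev.sigev_signo = SIGRTMIN;

            if (timer_create(CLOCK_MONOTONIC, &sev, &tid)) {
                    perror("timer_create");
                    return 1;
            }
            timer_delete(tid);
            return 0;
    }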
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99578f06c8d4..f7cc7abfcf25 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
650 ts->next_tick = 0; 650 ts->next_tick = 0;
651} 651}
652 652
653static inline bool local_timer_softirq_pending(void)
654{
655 return local_softirq_pending() & TIMER_SOFTIRQ;
656}
657
653static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, 658static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
654 ktime_t now, int cpu) 659 ktime_t now, int cpu)
655{ 660{
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
666 } while (read_seqretry(&jiffies_lock, seq)); 671 } while (read_seqretry(&jiffies_lock, seq));
667 ts->last_jiffies = basejiff; 672 ts->last_jiffies = basejiff;
668 673
669 if (rcu_needs_cpu(basemono, &next_rcu) || 674 /*
670 arch_needs_cpu() || irq_work_needs_cpu()) { 675 * Keep the periodic tick, when RCU, architecture or irq_work
676 * requests it.
677 * Apart from that, check whether the local timer softirq is
678 * pending. If so, it's a bad idea to call get_next_timer_interrupt()
679 * because there is an already expired timer, so it will request
680 * immediate expiry, which rearms the hardware timer with a
681 * minimal delta which brings us back to this place
682 * immediately. Lather, rinse and repeat...
683 */
684 if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
685 irq_work_needs_cpu() || local_timer_softirq_pending()) {
671 next_tick = basemono + TICK_NSEC; 686 next_tick = basemono + TICK_NSEC;
672 } else { 687 } else {
673 /* 688 /*
@@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void)
986} 1001}
987 1002
988/** 1003/**
1004 * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
1005 * for a particular CPU.
1006 *
1007 * Called from the schedutil frequency scaling governor in scheduler context.
1008 */
1009unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
1010{
1011 struct tick_sched *ts = tick_get_tick_sched(cpu);
1012
1013 return ts->idle_calls;
1014}
1015
1016/**
989 * tick_nohz_get_idle_calls - return the current idle calls counter value 1017 * tick_nohz_get_idle_calls - return the current idle calls counter value
990 * 1018 *
991 * Called from the schedutil frequency scaling governor in scheduler context. 1019 * Called from the schedutil frequency scaling governor in scheduler context.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ffebcf878fba..0bcf00e3ce48 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
823 struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); 823 struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
824 824
825 /* 825 /*
826 * If the timer is deferrable and nohz is active then we need to use 826 * If the timer is deferrable and NO_HZ_COMMON is set then we need
827 * the deferrable base. 827 * to use the deferrable base.
828 */ 828 */
829 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && 829 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
830 (tflags & TIMER_DEFERRABLE))
831 base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); 830 base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
832 return base; 831 return base;
833} 832}
@@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
837 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); 836 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
838 837
839 /* 838 /*
840 * If the timer is deferrable and nohz is active then we need to use 839 * If the timer is deferrable and NO_HZ_COMMON is set then we need
841 * the deferrable base. 840 * to use the deferrable base.
842 */ 841 */
843 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && 842 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
844 (tflags & TIMER_DEFERRABLE))
845 base = this_cpu_ptr(&timer_bases[BASE_DEF]); 843 base = this_cpu_ptr(&timer_bases[BASE_DEF]);
846 return base; 844 return base;
847} 845}
@@ -1009,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
1009 if (!ret && (options & MOD_TIMER_PENDING_ONLY)) 1007 if (!ret && (options & MOD_TIMER_PENDING_ONLY))
1010 goto out_unlock; 1008 goto out_unlock;
1011 1009
1012 debug_activate(timer, expires);
1013
1014 new_base = get_target_base(base, timer->flags); 1010 new_base = get_target_base(base, timer->flags);
1015 1011
1016 if (base != new_base) { 1012 if (base != new_base) {
@@ -1034,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
1034 } 1030 }
1035 } 1031 }
1036 1032
1033 debug_activate(timer, expires);
1034
1037 timer->expires = expires; 1035 timer->expires = expires;
1038 /* 1036 /*
1039 * If 'idx' was calculated above and the base time did not advance 1037 * If 'idx' was calculated above and the base time did not advance
@@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
1684 base->must_forward_clk = false; 1682 base->must_forward_clk = false;
1685 1683
1686 __run_timers(base); 1684 __run_timers(base);
1687 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) 1685 if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
1688 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); 1686 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
1689} 1687}
1690 1688
@@ -1698,7 +1696,7 @@ void run_local_timers(void)
1698 hrtimer_run_queues(); 1696 hrtimer_run_queues();
1699 /* Raise the softirq only if required. */ 1697 /* Raise the softirq only if required. */
1700 if (time_before(jiffies, base->clk)) { 1698 if (time_before(jiffies, base->clk)) {
1701 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) 1699 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
1702 return; 1700 return;
1703 /* CPU is awake, so check the deferrable base. */ 1701 /* CPU is awake, so check the deferrable base. */
1704 base++; 1702 base++;
@@ -1855,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
1855 } 1853 }
1856} 1854}
1857 1855
1856int timers_prepare_cpu(unsigned int cpu)
1857{
1858 struct timer_base *base;
1859 int b;
1860
1861 for (b = 0; b < NR_BASES; b++) {
1862 base = per_cpu_ptr(&timer_bases[b], cpu);
1863 base->clk = jiffies;
1864 base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
1865 base->is_idle = false;
1866 base->must_forward_clk = true;
1867 }
1868 return 0;
1869}
1870
1858int timers_dead_cpu(unsigned int cpu) 1871int timers_dead_cpu(unsigned int cpu)
1859{ 1872{
1860 struct timer_base *old_base; 1873 struct timer_base *old_base;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index af7dad126c13..f54dc62b599c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS
164 bool "Enable trace events for preempt and irq disable/enable" 164 bool "Enable trace events for preempt and irq disable/enable"
165 select TRACE_IRQFLAGS 165 select TRACE_IRQFLAGS
166 depends on DEBUG_PREEMPT || !PROVE_LOCKING 166 depends on DEBUG_PREEMPT || !PROVE_LOCKING
167 depends on TRACING
167 default n 168 default n
168 help 169 help
169 Enable tracing of disable and enable events for preemption and irqs. 170 Enable tracing of disable and enable events for preemption and irqs.
@@ -354,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
354 on if you need to profile the system's use of these macros. 355 on if you need to profile the system's use of these macros.
355 356
356config PROFILE_ALL_BRANCHES 357config PROFILE_ALL_BRANCHES
357 bool "Profile all if conditionals" 358 bool "Profile all if conditionals" if !FORTIFY_SOURCE
358 select TRACE_BRANCH_PROFILING 359 select TRACE_BRANCH_PROFILING
359 help 360 help
360 This tracer profiles all branch conditions. Every if () 361 This tracer profiles all branch conditions. Every if ()
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 27d1f4ffa3de..40207c2a4113 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
343 .arg4_type = ARG_CONST_SIZE, 343 .arg4_type = ARG_CONST_SIZE,
344}; 344};
345 345
346static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); 346static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
347 347
348static __always_inline u64 348static __always_inline u64
349__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, 349__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
350 u64 flags, struct perf_raw_record *raw) 350 u64 flags, struct perf_sample_data *sd)
351{ 351{
352 struct bpf_array *array = container_of(map, struct bpf_array, map); 352 struct bpf_array *array = container_of(map, struct bpf_array, map);
353 struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
354 unsigned int cpu = smp_processor_id(); 353 unsigned int cpu = smp_processor_id();
355 u64 index = flags & BPF_F_INDEX_MASK; 354 u64 index = flags & BPF_F_INDEX_MASK;
356 struct bpf_event_entry *ee; 355 struct bpf_event_entry *ee;
@@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
373 if (unlikely(event->oncpu != cpu)) 372 if (unlikely(event->oncpu != cpu))
374 return -EOPNOTSUPP; 373 return -EOPNOTSUPP;
375 374
376 perf_sample_data_init(sd, 0, 0);
377 sd->raw = raw;
378 perf_event_output(event, sd, regs); 375 perf_event_output(event, sd, regs);
379 return 0; 376 return 0;
380} 377}
@@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
382BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, 379BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
383 u64, flags, void *, data, u64, size) 380 u64, flags, void *, data, u64, size)
384{ 381{
382 struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
385 struct perf_raw_record raw = { 383 struct perf_raw_record raw = {
386 .frag = { 384 .frag = {
387 .size = size, 385 .size = size,
@@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
392 if (unlikely(flags & ~(BPF_F_INDEX_MASK))) 390 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
393 return -EINVAL; 391 return -EINVAL;
394 392
395 return __bpf_perf_event_output(regs, map, flags, &raw); 393 perf_sample_data_init(sd, 0, 0);
394 sd->raw = &raw;
395
396 return __bpf_perf_event_output(regs, map, flags, sd);
396} 397}
397 398
398static const struct bpf_func_proto bpf_perf_event_output_proto = { 399static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
407}; 408};
408 409
409static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); 410static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
411static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
410 412
411u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, 413u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
412 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) 414 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
413{ 415{
416 struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
414 struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); 417 struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
415 struct perf_raw_frag frag = { 418 struct perf_raw_frag frag = {
416 .copy = ctx_copy, 419 .copy = ctx_copy,
@@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
428 }; 431 };
429 432
430 perf_fetch_caller_regs(regs); 433 perf_fetch_caller_regs(regs);
434 perf_sample_data_init(sd, 0, 0);
435 sd->raw = &raw;
431 436
432 return __bpf_perf_event_output(regs, map, flags, &raw); 437 return __bpf_perf_event_output(regs, map, flags, sd);
433} 438}
434 439
435BPF_CALL_0(bpf_get_current_task) 440BPF_CALL_0(bpf_get_current_task)
@@ -759,6 +764,8 @@ const struct bpf_prog_ops perf_event_prog_ops = {
759 764
760static DEFINE_MUTEX(bpf_event_mutex); 765static DEFINE_MUTEX(bpf_event_mutex);
761 766
767#define BPF_TRACE_MAX_PROGS 64
768
762int perf_event_attach_bpf_prog(struct perf_event *event, 769int perf_event_attach_bpf_prog(struct perf_event *event,
763 struct bpf_prog *prog) 770 struct bpf_prog *prog)
764{ 771{
@@ -772,6 +779,12 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
772 goto unlock; 779 goto unlock;
773 780
774 old_array = event->tp_event->prog_array; 781 old_array = event->tp_event->prog_array;
782 if (old_array &&
783 bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
784 ret = -E2BIG;
785 goto unlock;
786 }
787
775 ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); 788 ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
776 if (ret < 0) 789 if (ret < 0)
777 goto unlock; 790 goto unlock;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ccdf3664e4a9..554b517c61a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = {
1119}; 1119};
1120 1120
1121/* 1121/*
1122 * This is used by __kernel_text_address() to return true if the 1122 * Used by the stack unwinder to know about dynamic ftrace trampolines.
1123 * address is on a dynamically allocated trampoline that would
1124 * not return true for either core_kernel_text() or
1125 * is_module_text_address().
1126 */ 1123 */
1127bool is_ftrace_trampoline(unsigned long addr) 1124struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
1128{ 1125{
1129 struct ftrace_ops *op; 1126 struct ftrace_ops *op = NULL;
1130 bool ret = false;
1131 1127
1132 /* 1128 /*
1133 * Some of the ops may be dynamically allocated, 1129 * Some of the ops may be dynamically allocated,
@@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr)
1144 if (op->trampoline && op->trampoline_size) 1140 if (op->trampoline && op->trampoline_size)
1145 if (addr >= op->trampoline && 1141 if (addr >= op->trampoline &&
1146 addr < op->trampoline + op->trampoline_size) { 1142 addr < op->trampoline + op->trampoline_size) {
1147 ret = true; 1143 preempt_enable_notrace();
1148 goto out; 1144 return op;
1149 } 1145 }
1150 } while_for_each_ftrace_op(op); 1146 } while_for_each_ftrace_op(op);
1151
1152 out:
1153 preempt_enable_notrace(); 1147 preempt_enable_notrace();
1154 1148
1155 return ret; 1149 return NULL;
1150}
1151
1152/*
1153 * This is used by __kernel_text_address() to return true if the
1154 * address is on a dynamically allocated trampoline that would
1155 * not return true for either core_kernel_text() or
1156 * is_module_text_address().
1157 */
1158bool is_ftrace_trampoline(unsigned long addr)
1159{
1160 return ftrace_ops_trampoline(addr) != NULL;
1156} 1161}
1157 1162
1158struct ftrace_page { 1163struct ftrace_page {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 91874a95060d..5af2842dea96 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
280/* Missed count stored at end */ 280/* Missed count stored at end */
281#define RB_MISSED_STORED (1 << 30) 281#define RB_MISSED_STORED (1 << 30)
282 282
283#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
284
283struct buffer_data_page { 285struct buffer_data_page {
284 u64 time_stamp; /* page time stamp */ 286 u64 time_stamp; /* page time stamp */
285 local_t commit; /* write committed index */ 287 local_t commit; /* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
331 */ 333 */
332size_t ring_buffer_page_len(void *page) 334size_t ring_buffer_page_len(void *page)
333{ 335{
334 return local_read(&((struct buffer_data_page *)page)->commit) 336 struct buffer_data_page *bpage = page;
337
338 return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
335 + BUF_PAGE_HDR_SIZE; 339 + BUF_PAGE_HDR_SIZE;
336} 340}
337 341
@@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1799} 1803}
1800EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); 1804EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1801 1805
1802static __always_inline void *
1803__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1804{
1805 return bpage->data + index;
1806}
1807
1808static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) 1806static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
1809{ 1807{
1810 return bpage->page->data + index; 1808 return bpage->page->data + index;
@@ -2536,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2536 * The lock and unlock are done within a preempt disable section. 2534 * The lock and unlock are done within a preempt disable section.
2537 * The current_context per_cpu variable can only be modified 2535 * The current_context per_cpu variable can only be modified
2538 * by the current task between lock and unlock. But it can 2536 * by the current task between lock and unlock. But it can
2539 * be modified more than once via an interrupt. There are four 2537 * be modified more than once via an interrupt. To pass this
2540 * different contexts that we need to consider. 2538 * information from the lock to the unlock without having to
2539 * access the 'in_interrupt()' functions again (which do show
2540 * a bit of overhead in something as critical as function tracing),
2541 * we use a bitmask trick.
2542 *
2543 * bit 0 = NMI context
2544 * bit 1 = IRQ context
2545 * bit 2 = SoftIRQ context
2546 * bit 3 = normal context.
2547 *
2548 * This works because this is the order of contexts that can
2549 * preempt other contexts. A SoftIRQ never preempts an IRQ
2550 * context.
2551 *
2552 * When the context is determined, the corresponding bit is
2553 * checked and set (if it was set, then a recursion of that context
2554 * happened).
2555 *
2556 * On unlock, we need to clear this bit. To do so, just subtract
2557 * 1 from the current_context and AND it to itself.
2541 * 2558 *
2542 * Normal context. 2559 * (binary)
2543 * SoftIRQ context 2560 * 101 - 1 = 100
2544 * IRQ context 2561 * 101 & 100 = 100 (clearing bit zero)
2545 * NMI context
2546 * 2562 *
2547 * If for some reason the ring buffer starts to recurse, we 2563 * 1010 - 1 = 1001
2548 * only allow that to happen at most 4 times (one for each 2564 * 1010 & 1001 = 1000 (clearing bit 1)
2549 * context). If it happens 5 times, then we consider this a 2565 *
2550 * recusive loop and do not let it go further. 2566 * The least significant bit can be cleared this way, and it
2567 * just so happens that it is the same bit corresponding to
2568 * the current context.
2551 */ 2569 */
2552 2570
2553static __always_inline int 2571static __always_inline int
2554trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) 2572trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
2555{ 2573{
2556 if (cpu_buffer->current_context >= 4) 2574 unsigned int val = cpu_buffer->current_context;
2575 unsigned long pc = preempt_count();
2576 int bit;
2577
2578 if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
2579 bit = RB_CTX_NORMAL;
2580 else
2581 bit = pc & NMI_MASK ? RB_CTX_NMI :
2582 pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
2583
2584 if (unlikely(val & (1 << bit)))
2557 return 1; 2585 return 1;
2558 2586
2559 cpu_buffer->current_context++; 2587 val |= (1 << bit);
2560 /* Interrupts must see this update */ 2588 cpu_buffer->current_context = val;
2561 barrier();
2562 2589
2563 return 0; 2590 return 0;
2564} 2591}
@@ -2566,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
2566static __always_inline void 2593static __always_inline void
2567trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) 2594trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
2568{ 2595{
2569 /* Don't let the dec leak out */ 2596 cpu_buffer->current_context &= cpu_buffer->current_context - 1;
2570 barrier();
2571 cpu_buffer->current_context--;
2572} 2597}
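The unlock above leans on two facts: x & (x - 1) clears only the lowest set bit, and contexts nest so that the lowest set bit is always the current context. A quick stand-alone check of the identity, matching the worked examples in the comment (illustrative only):

    #include <assert.h>

    static unsigned int clear_lowest_bit(unsigned int x)
    {
            return x & (x - 1);
    }

    int main(void)
    {
            /* 0b0101 -> 0b0100: clears bit 0 (the NMI bit) */
            assert(clear_lowest_bit(0x5u) == 0x4u);
            /* 0b1010 -> 0b1000: clears bit 1 (the IRQ bit) */
            assert(clear_lowest_bit(0xau) == 0x8u);
            return 0;
    }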
2573 2598
2574/** 2599/**
@@ -4406,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
4406{ 4431{
4407 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 4432 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
4408 struct buffer_data_page *bpage = data; 4433 struct buffer_data_page *bpage = data;
4434 struct page *page = virt_to_page(bpage);
4409 unsigned long flags; 4435 unsigned long flags;
4410 4436
4437 /* If the page is still in use someplace else, we can't reuse it */
4438 if (page_ref_count(page) > 1)
4439 goto out;
4440
4411 local_irq_save(flags); 4441 local_irq_save(flags);
4412 arch_spin_lock(&cpu_buffer->lock); 4442 arch_spin_lock(&cpu_buffer->lock);
4413 4443
@@ -4419,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
4419 arch_spin_unlock(&cpu_buffer->lock); 4449 arch_spin_unlock(&cpu_buffer->lock);
4420 local_irq_restore(flags); 4450 local_irq_restore(flags);
4421 4451
4452 out:
4422 free_page((unsigned long)bpage); 4453 free_page((unsigned long)bpage);
4423} 4454}
4424EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); 4455EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
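The commit field of a buffer data page doubles as storage for the RB_MISSED_* flags in its top bits, which is why ring_buffer_page_len() above now masks them off before adding the header size. A small stand-alone illustration of the pack/mask; the flag values are assumed to mirror the kernel's (1 << 31) and (1 << 30):

    #include <assert.h>

    #define RB_MISSED_EVENTS  (1u << 31)
    #define RB_MISSED_STORED  (1u << 30)
    #define RB_MISSED_FLAGS   (RB_MISSED_EVENTS | RB_MISSED_STORED)

    int main(void)
    {
            unsigned int commit = 4096;             /* bytes committed to the page */

            commit |= RB_MISSED_EVENTS;             /* reader missed events */

            assert(commit != 4096);                          /* raw value is corrupted */
            assert((commit & ~RB_MISSED_FLAGS) == 4096);     /* masked length is right */
            return 0;
    }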
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 73e67b68c53b..8e3f20a18a06 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct
362} 362}
363 363
364/** 364/**
365 * trace_pid_filter_add_remove - Add or remove a task from a pid_list 365 * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list
366 * @pid_list: The list to modify 366 * @pid_list: The list to modify
367 * @self: The current task for fork or NULL for exit 367 * @self: The current task for fork or NULL for exit
368 * @task: The task to add or remove 368 * @task: The task to add or remove
@@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr)
925} 925}
926 926
927/** 927/**
928 * trace_snapshot - take a snapshot of the current buffer. 928 * tracing_snapshot - take a snapshot of the current buffer.
929 * 929 *
930 * This causes a swap between the snapshot buffer and the current live 930 * This causes a swap between the snapshot buffer and the current live
931 * tracing buffer. You can use this to take snapshots of the live 931 * tracing buffer. You can use this to take snapshots of the live
@@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void)
1004EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); 1004EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
1005 1005
1006/** 1006/**
1007 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. 1007 * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
1008 * 1008 *
1009 * This is similar to trace_snapshot(), but it will allocate the 1009 * This is similar to tracing_snapshot(), but it will allocate the
1010 * snapshot buffer if it isn't already allocated. Use this only 1010 * snapshot buffer if it isn't already allocated. Use this only
1011 * where it is safe to sleep, as the allocation may sleep. 1011 * where it is safe to sleep, as the allocation may sleep.
1012 * 1012 *
@@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh;
1303/* 1303/*
1304 * Copy the new maximum trace into the separate maximum-trace 1304 * Copy the new maximum trace into the separate maximum-trace
1305 * structure. (this way the maximum trace is permanently saved, 1305 * structure. (this way the maximum trace is permanently saved,
1306 * for later retrieval via /sys/kernel/debug/tracing/latency_trace) 1306 * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
1307 */ 1307 */
1308static void 1308static void
1309__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 1309__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
2374} 2374}
2375EXPORT_SYMBOL_GPL(trace_event_buffer_commit); 2375EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
2376 2376
2377/*
2378 * Skip 3:
2379 *
2380 * trace_buffer_unlock_commit_regs()
2381 * trace_event_buffer_commit()
2382 * trace_event_raw_event_xxx()
2383*/
2384# define STACK_SKIP 3
2385
2377void trace_buffer_unlock_commit_regs(struct trace_array *tr, 2386void trace_buffer_unlock_commit_regs(struct trace_array *tr,
2378 struct ring_buffer *buffer, 2387 struct ring_buffer *buffer,
2379 struct ring_buffer_event *event, 2388 struct ring_buffer_event *event,
@@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
2383 __buffer_unlock_commit(buffer, event); 2392 __buffer_unlock_commit(buffer, event);
2384 2393
2385 /* 2394 /*
2386 * If regs is not set, then skip the following callers: 2395 * If regs is not set, then skip the necessary functions.
2387 * trace_buffer_unlock_commit_regs
2388 * event_trigger_unlock_commit
2389 * trace_event_buffer_commit
2390 * trace_event_raw_event_sched_switch
2391 * Note, we can still get here via blktrace, wakeup tracer 2396 * Note, we can still get here via blktrace, wakeup tracer
2392 * and mmiotrace, but that's ok if they lose a function or 2397 * and mmiotrace, but that's ok if they lose a function or
2393 * two. They are that meaningful. 2398 * two. They are not that meaningful.
2394 */ 2399 */
2395 ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); 2400 ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
2396 ftrace_trace_userstack(buffer, flags, pc); 2401 ftrace_trace_userstack(buffer, flags, pc);
2397} 2402}
2398 2403
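
The hunk above replaces a hard-coded frame count with a STACK_SKIP constant whose comment spells out exactly which internal callers are being skipped, so the recorded stack starts at the event's real caller instead of inside tracing infrastructure. A userspace analogue of the idea using glibc's backtrace(); the helper names and the skip count of 2 are illustrative only:

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>

#define STACK_SKIP 2	/* skip capture_stack() and report() themselves */

static void __attribute__((noinline)) capture_stack(void)
{
	void *frames[32];
	int n = backtrace(frames, 32);
	char **syms = backtrace_symbols(frames, n);	/* build with -rdynamic for names */

	if (!syms)
		return;
	/* Print only the frames above the two helper functions. */
	for (int i = STACK_SKIP; i < n; i++)
		printf("%s\n", syms[i]);
	free(syms);
}

static void __attribute__((noinline)) report(void)
{
	capture_stack();
}

int main(void)
{
	report();
	return 0;
}

Whether the helper frames actually appear depends on inlining and on the unwinder in use, which is why the hunks below end up with different skip values under CONFIG_UNWINDER_ORC.
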
@@ -2415,7 +2420,7 @@ trace_process_export(struct trace_export *export,
2415 2420
2416 entry = ring_buffer_event_data(event); 2421 entry = ring_buffer_event_data(event);
2417 size = ring_buffer_event_length(event); 2422 size = ring_buffer_event_length(event);
2418 export->write(entry, size); 2423 export->write(export, entry, size);
2419} 2424}
2420 2425
2421static DEFINE_MUTEX(ftrace_export_lock); 2426static DEFINE_MUTEX(ftrace_export_lock);
@@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
2579 trace.skip = skip; 2584 trace.skip = skip;
2580 2585
2581 /* 2586 /*
2582 * Add two, for this function and the call to save_stack_trace() 2587 * Add one, for this function and the call to save_stack_trace()
2583 * If regs is set, then these functions will not be in the way. 2588 * If regs is set, then these functions will not be in the way.
2584 */ 2589 */
2590#ifndef CONFIG_UNWINDER_ORC
2585 if (!regs) 2591 if (!regs)
2586 trace.skip += 2; 2592 trace.skip++;
2593#endif
2587 2594
2588 /* 2595 /*
2589 * Since events can happen in NMIs there's no safe way to 2596 * Since events can happen in NMIs there's no safe way to
@@ -2711,11 +2718,10 @@ void trace_dump_stack(int skip)
2711 2718
2712 local_save_flags(flags); 2719 local_save_flags(flags);
2713 2720
2714 /* 2721#ifndef CONFIG_UNWINDER_ORC
2715 * Skip 3 more, seems to get us at the caller of 2722 /* Skip 1 to skip this function. */
2716 * this function. 2723 skip++;
2717 */ 2724#endif
2718 skip += 3;
2719 __ftrace_trace_stack(global_trace.trace_buffer.buffer, 2725 __ftrace_trace_stack(global_trace.trace_buffer.buffer,
2720 flags, skip, preempt_count(), NULL); 2726 flags, skip, preempt_count(), NULL);
2721} 2727}
@@ -4178,37 +4184,30 @@ static const struct file_operations show_traces_fops = {
4178 .llseek = seq_lseek, 4184 .llseek = seq_lseek,
4179}; 4185};
4180 4186
4181/*
4182 * The tracer itself will not take this lock, but still we want
4183 * to provide a consistent cpumask to user-space:
4184 */
4185static DEFINE_MUTEX(tracing_cpumask_update_lock);
4186
4187/*
4188 * Temporary storage for the character representation of the
4189 * CPU bitmask (and one more byte for the newline):
4190 */
4191static char mask_str[NR_CPUS + 1];
4192
4193static ssize_t 4187static ssize_t
4194tracing_cpumask_read(struct file *filp, char __user *ubuf, 4188tracing_cpumask_read(struct file *filp, char __user *ubuf,
4195 size_t count, loff_t *ppos) 4189 size_t count, loff_t *ppos)
4196{ 4190{
4197 struct trace_array *tr = file_inode(filp)->i_private; 4191 struct trace_array *tr = file_inode(filp)->i_private;
4192 char *mask_str;
4198 int len; 4193 int len;
4199 4194
4200 mutex_lock(&tracing_cpumask_update_lock); 4195 len = snprintf(NULL, 0, "%*pb\n",
4196 cpumask_pr_args(tr->tracing_cpumask)) + 1;
4197 mask_str = kmalloc(len, GFP_KERNEL);
4198 if (!mask_str)
4199 return -ENOMEM;
4201 4200
4202 len = snprintf(mask_str, count, "%*pb\n", 4201 len = snprintf(mask_str, len, "%*pb\n",
4203 cpumask_pr_args(tr->tracing_cpumask)); 4202 cpumask_pr_args(tr->tracing_cpumask));
4204 if (len >= count) { 4203 if (len >= count) {
4205 count = -EINVAL; 4204 count = -EINVAL;
4206 goto out_err; 4205 goto out_err;
4207 } 4206 }
4208 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); 4207 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
4209 4208
4210out_err: 4209out_err:
4211 mutex_unlock(&tracing_cpumask_update_lock); 4210 kfree(mask_str);
4212 4211
4213 return count; 4212 return count;
4214} 4213}
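
tracing_cpumask_read() above drops the static NR_CPUS+1 buffer and its protecting mutex in favour of a buffer sized at run time: snprintf(NULL, 0, ...) returns the length the formatted output would need, so the allocation fits exactly. A plain-C sketch of the same two-pass pattern (userspace code with a hypothetical format_alloc() helper, not the kernel function):

#include <stdio.h>
#include <stdlib.h>

/* Format a message into a buffer sized exactly for it. */
static char *format_alloc(const char *name, int value)
{
	/* First pass: snprintf with a NULL buffer only computes the length. */
	int len = snprintf(NULL, 0, "%s=%d\n", name, value) + 1;	/* +1 for '\0' */
	char *buf = malloc(len);

	if (!buf)
		return NULL;
	/* Second pass: actually format into the exact-sized buffer. */
	snprintf(buf, len, "%s=%d\n", name, value);
	return buf;
}

int main(void)
{
	char *s = format_alloc("cpumask_bits", 255);

	if (s) {
		fputs(s, stdout);
		free(s);
	}
	return 0;
}
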
@@ -4228,8 +4227,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
4228 if (err) 4227 if (err)
4229 goto err_unlock; 4228 goto err_unlock;
4230 4229
4231 mutex_lock(&tracing_cpumask_update_lock);
4232
4233 local_irq_disable(); 4230 local_irq_disable();
4234 arch_spin_lock(&tr->max_lock); 4231 arch_spin_lock(&tr->max_lock);
4235 for_each_tracing_cpu(cpu) { 4232 for_each_tracing_cpu(cpu) {
@@ -4252,8 +4249,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
4252 local_irq_enable(); 4249 local_irq_enable();
4253 4250
4254 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); 4251 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
4255
4256 mutex_unlock(&tracing_cpumask_update_lock);
4257 free_cpumask_var(tracing_cpumask_new); 4252 free_cpumask_var(tracing_cpumask_new);
4258 4253
4259 return count; 4254 return count;
@@ -6780,7 +6775,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6780 .spd_release = buffer_spd_release, 6775 .spd_release = buffer_spd_release,
6781 }; 6776 };
6782 struct buffer_ref *ref; 6777 struct buffer_ref *ref;
6783 int entries, size, i; 6778 int entries, i;
6784 ssize_t ret = 0; 6779 ssize_t ret = 0;
6785 6780
6786#ifdef CONFIG_TRACER_MAX_TRACE 6781#ifdef CONFIG_TRACER_MAX_TRACE
@@ -6834,14 +6829,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6834 break; 6829 break;
6835 } 6830 }
6836 6831
6837 /*
6838 * zero out any left over data, this is going to
6839 * user land.
6840 */
6841 size = ring_buffer_page_len(ref->page);
6842 if (size < PAGE_SIZE)
6843 memset(ref->page + size, 0, PAGE_SIZE - size);
6844
6845 page = virt_to_page(ref->page); 6832 page = virt_to_page(ref->page);
6846 6833
6847 spd.pages[i] = page; 6834 spd.pages[i] = page;
@@ -7599,6 +7586,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
7599 buf->data = alloc_percpu(struct trace_array_cpu); 7586 buf->data = alloc_percpu(struct trace_array_cpu);
7600 if (!buf->data) { 7587 if (!buf->data) {
7601 ring_buffer_free(buf->buffer); 7588 ring_buffer_free(buf->buffer);
7589 buf->buffer = NULL;
7602 return -ENOMEM; 7590 return -ENOMEM;
7603 } 7591 }
7604 7592
@@ -7622,7 +7610,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
7622 allocate_snapshot ? size : 1); 7610 allocate_snapshot ? size : 1);
7623 if (WARN_ON(ret)) { 7611 if (WARN_ON(ret)) {
7624 ring_buffer_free(tr->trace_buffer.buffer); 7612 ring_buffer_free(tr->trace_buffer.buffer);
7613 tr->trace_buffer.buffer = NULL;
7625 free_percpu(tr->trace_buffer.data); 7614 free_percpu(tr->trace_buffer.data);
7615 tr->trace_buffer.data = NULL;
7626 return -ENOMEM; 7616 return -ENOMEM;
7627 } 7617 }
7628 tr->allocated_snapshot = allocate_snapshot; 7618 tr->allocated_snapshot = allocate_snapshot;
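
Both error paths above now clear the pointer right after freeing it, so a later teardown pass that walks the same structure sees NULL instead of a dangling pointer. A minimal sketch of that defensive pattern in plain C (hypothetical struct buf and buf_release(), not the trace code):

#include <stdlib.h>

struct buf {
	void *buffer;
	void *data;
};

/* Free both members; safe to call more than once on the same struct. */
static void buf_release(struct buf *b)
{
	free(b->buffer);
	b->buffer = NULL;	/* later release paths see NULL, not a stale pointer */
	free(b->data);
	b->data = NULL;
}

int main(void)
{
	struct buf b = { malloc(64), malloc(64) };

	buf_release(&b);
	buf_release(&b);	/* second call is harmless: free(NULL) is a no-op */
	return 0;
}
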
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ec0f9aa4e151..1b87157edbff 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
2213{ 2213{
2214 struct trace_event_call *call, *p; 2214 struct trace_event_call *call, *p;
2215 const char *last_system = NULL; 2215 const char *last_system = NULL;
2216 bool first = false;
2216 int last_i; 2217 int last_i;
2217 int i; 2218 int i;
2218 2219
@@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
2220 list_for_each_entry_safe(call, p, &ftrace_events, list) { 2221 list_for_each_entry_safe(call, p, &ftrace_events, list) {
2221 /* events are usually grouped together with systems */ 2222 /* events are usually grouped together with systems */
2222 if (!last_system || call->class->system != last_system) { 2223 if (!last_system || call->class->system != last_system) {
2224 first = true;
2223 last_i = 0; 2225 last_i = 0;
2224 last_system = call->class->system; 2226 last_system = call->class->system;
2225 } 2227 }
2226 2228
2229 /*
 2230 * Since calls are grouped by systems, the likelihood that the
2231 * next call in the iteration belongs to the same system as the
 2232 * previous call is high. As an optimization, we skip searching
2233 * for a map[] that matches the call's system if the last call
2234 * was from the same system. That's what last_i is for. If the
2235 * call has the same system as the previous call, then last_i
2236 * will be the index of the first map[] that has a matching
2237 * system.
2238 */
2227 for (i = last_i; i < len; i++) { 2239 for (i = last_i; i < len; i++) {
2228 if (call->class->system == map[i]->system) { 2240 if (call->class->system == map[i]->system) {
2229 /* Save the first system if need be */ 2241 /* Save the first system if need be */
2230 if (!last_i) 2242 if (first) {
2231 last_i = i; 2243 last_i = i;
2244 first = false;
2245 }
2232 update_event_printk(call, map[i]); 2246 update_event_printk(call, map[i]);
2233 } 2247 }
2234 } 2248 }
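
The comment added above documents the last_i optimization: because calls are grouped by system, the scan over map[] can resume at the first entry that matched the previous call's system instead of restarting from zero, and the new first flag records that starting index once per system. A self-contained plain-C sketch of the same scan, with made-up systems, calls[] and map[] entries standing in for the trace eval maps:

#include <stdio.h>
#include <string.h>

struct map_entry { const char *system; int data; };

/* map[] entries are grouped by system, like the trace map chunks. */
static const struct map_entry map[] = {
	{ "sched", 1 }, { "sched", 2 },
	{ "irq",   3 }, { "irq",   4 }, { "irq", 5 },
};

int main(void)
{
	const char *calls[] = { "sched", "sched", "irq", "irq" };
	const char *last_system = NULL;
	int last_i = 0, first = 0;

	for (unsigned c = 0; c < sizeof(calls) / sizeof(calls[0]); c++) {
		/* New system: forget the cached start index. */
		if (!last_system || strcmp(calls[c], last_system) != 0) {
			first = 1;
			last_i = 0;
			last_system = calls[c];
		}
		/* Resume the scan at the first entry of the current system. */
		for (unsigned i = last_i; i < sizeof(map) / sizeof(map[0]); i++) {
			if (strcmp(calls[c], map[i].system) == 0) {
				if (first) {	/* remember where this system starts */
					last_i = i;
					first = 0;
				}
				printf("call %u matches map[%u] (%d)\n", c, i, map[i].data);
			}
		}
	}
	return 0;
}
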
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f2ac9d44f6c4..87411482a46f 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
1123#endif /* CONFIG_TRACER_SNAPSHOT */ 1123#endif /* CONFIG_TRACER_SNAPSHOT */
1124 1124
1125#ifdef CONFIG_STACKTRACE 1125#ifdef CONFIG_STACKTRACE
1126#ifdef CONFIG_UNWINDER_ORC
1127/* Skip 2:
1128 * event_triggers_post_call()
1129 * trace_event_raw_event_xxx()
1130 */
1131# define STACK_SKIP 2
1132#else
1126/* 1133/*
1127 * Skip 3: 1134 * Skip 4:
1128 * stacktrace_trigger() 1135 * stacktrace_trigger()
1129 * event_triggers_post_call() 1136 * event_triggers_post_call()
1137 * trace_event_buffer_commit()
1130 * trace_event_raw_event_xxx() 1138 * trace_event_raw_event_xxx()
1131 */ 1139 */
1132#define STACK_SKIP 3 1140#define STACK_SKIP 4
1141#endif
1133 1142
1134static void 1143static void
1135stacktrace_trigger(struct event_trigger_data *data, void *rec) 1144stacktrace_trigger(struct event_trigger_data *data, void *rec)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 27f7ad12c4b1..b611cd36e22d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
154 preempt_enable_notrace(); 154 preempt_enable_notrace();
155} 155}
156 156
157#ifdef CONFIG_UNWINDER_ORC
158/*
159 * Skip 2:
160 *
161 * function_stack_trace_call()
162 * ftrace_call()
163 */
164#define STACK_SKIP 2
165#else
166/*
167 * Skip 3:
168 * __trace_stack()
169 * function_stack_trace_call()
170 * ftrace_call()
171 */
172#define STACK_SKIP 3
173#endif
174
157static void 175static void
158function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 176function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
159 struct ftrace_ops *op, struct pt_regs *pt_regs) 177 struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
180 if (likely(disabled == 1)) { 198 if (likely(disabled == 1)) {
181 pc = preempt_count(); 199 pc = preempt_count();
182 trace_function(tr, ip, parent_ip, flags, pc); 200 trace_function(tr, ip, parent_ip, flags, pc);
183 /* 201 __trace_stack(tr, flags, STACK_SKIP, pc);
184 * skip over 5 funcs:
185 * __ftrace_trace_stack,
186 * __trace_stack,
187 * function_stack_trace_call
188 * ftrace_list_func
189 * ftrace_call
190 */
191 __trace_stack(tr, flags, 5, pc);
192 } 202 }
193 203
194 atomic_dec(&data->disabled); 204 atomic_dec(&data->disabled);
@@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
367 tracer_tracing_off(tr); 377 tracer_tracing_off(tr);
368} 378}
369 379
380#ifdef CONFIG_UNWINDER_ORC
370/* 381/*
371 * Skip 4: 382 * Skip 3:
383 *
384 * function_trace_probe_call()
385 * ftrace_ops_assist_func()
386 * ftrace_call()
387 */
388#define FTRACE_STACK_SKIP 3
389#else
390/*
391 * Skip 5:
392 *
393 * __trace_stack()
372 * ftrace_stacktrace() 394 * ftrace_stacktrace()
373 * function_trace_probe_call() 395 * function_trace_probe_call()
374 * ftrace_ops_list_func() 396 * ftrace_ops_assist_func()
375 * ftrace_call() 397 * ftrace_call()
376 */ 398 */
377#define STACK_SKIP 4 399#define FTRACE_STACK_SKIP 5
400#endif
378 401
379static __always_inline void trace_stack(struct trace_array *tr) 402static __always_inline void trace_stack(struct trace_array *tr)
380{ 403{
@@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr)
384 local_save_flags(flags); 407 local_save_flags(flags);
385 pc = preempt_count(); 408 pc = preempt_count();
386 409
387 __trace_stack(tr, flags, STACK_SKIP, pc); 410 __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
388} 411}
389 412
390static void 413static void
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 734accc02418..3c7bfc4bf5e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
209 if (__this_cpu_read(disable_stack_tracer) != 1) 209 if (__this_cpu_read(disable_stack_tracer) != 1)
210 goto out; 210 goto out;
211 211
 212 /* If RCU is not watching, then saving a stack trace can fail */
213 if (!rcu_is_watching())
214 goto out;
215
212 ip += MCOUNT_INSN_SIZE; 216 ip += MCOUNT_INSN_SIZE;
213 217
214 check_stack(ip, &stack); 218 check_stack(ip, &stack);
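
The added check above makes the stack tracer bail out when RCU is not watching, because saving a stack trace from such a context can fail. A minimal kernel-style sketch of the same early-return guard; maybe_record_stack() and demo_entries are hypothetical, not the stack tracer itself:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/stacktrace.h>

static unsigned long demo_entries[32];

static void maybe_record_stack(void)
{
	struct stack_trace trace = {
		.entries	= demo_entries,
		.max_entries	= ARRAY_SIZE(demo_entries),
		.skip		= 1,	/* skip maybe_record_stack() itself */
	};

	/*
	 * Mirroring the check above: saving a stack trace from a context
	 * where RCU is not watching can fail, so simply skip recording.
	 */
	if (!rcu_is_watching())
		return;

	save_stack_trace(&trace);
}
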
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ce74a4901d2b..ef1da2a5f9bd 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
192 return retval; 192 return retval;
193 } 193 }
194 194
195 groups_sort(group_info);
195 retval = set_current_groups(group_info); 196 retval = set_current_groups(group_info);
196 put_group_info(group_info); 197 put_group_info(group_info);
197 198
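
setgroups16() above gains a groups_sort() call before the list is installed with set_current_groups(); the group list has to be sorted because later membership checks search it with a binary search. A userspace sketch of why the sort matters, using qsort()/bsearch() on a plain array of gids (the array and cmp_gid() are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

static int cmp_gid(const void *a, const void *b)
{
	unsigned int x = *(const unsigned int *)a;
	unsigned int y = *(const unsigned int *)b;

	return (x > y) - (x < y);
}

int main(void)
{
	unsigned int groups[] = { 1000, 20, 4, 27, 100 };
	size_t n = sizeof(groups) / sizeof(groups[0]);
	unsigned int key = 27;

	/* Without this sort, the bsearch() below may miss members entirely. */
	qsort(groups, n, sizeof(groups[0]), cmp_gid);

	if (bsearch(&key, groups, n, sizeof(groups[0]), cmp_gid))
		printf("gid %u is a member\n", key);
	return 0;
}
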
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8fdb710bfdd7..f699122dab32 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -38,7 +38,6 @@
38#include <linux/hardirq.h> 38#include <linux/hardirq.h>
39#include <linux/mempolicy.h> 39#include <linux/mempolicy.h>
40#include <linux/freezer.h> 40#include <linux/freezer.h>
41#include <linux/kallsyms.h>
42#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
43#include <linux/lockdep.h> 42#include <linux/lockdep.h>
44#include <linux/idr.h> 43#include <linux/idr.h>
@@ -48,6 +47,8 @@
48#include <linux/nodemask.h> 47#include <linux/nodemask.h>
49#include <linux/moduleparam.h> 48#include <linux/moduleparam.h>
50#include <linux/uaccess.h> 49#include <linux/uaccess.h>
50#include <linux/sched/isolation.h>
51#include <linux/nmi.h>
51 52
52#include "workqueue_internal.h" 53#include "workqueue_internal.h"
53 54
@@ -1634,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker)
1634 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1635 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1635 1636
1636 /* 1637 /*
1637 * Sanity check nr_running. Because wq_unbind_fn() releases 1638 * Sanity check nr_running. Because unbind_workers() releases
1638 * pool->lock between setting %WORKER_UNBOUND and zapping 1639 * pool->lock between setting %WORKER_UNBOUND and zapping
1639 * nr_running, the warning may trigger spuriously. Check iff 1640 * nr_running, the warning may trigger spuriously. Check iff
1640 * unbind is not in progress. 1641 * unbind is not in progress.
@@ -4463,6 +4464,12 @@ void show_workqueue_state(void)
4463 if (pwq->nr_active || !list_empty(&pwq->delayed_works)) 4464 if (pwq->nr_active || !list_empty(&pwq->delayed_works))
4464 show_pwq(pwq); 4465 show_pwq(pwq);
4465 spin_unlock_irqrestore(&pwq->pool->lock, flags); 4466 spin_unlock_irqrestore(&pwq->pool->lock, flags);
4467 /*
4468 * We could be printing a lot from atomic context, e.g.
4469 * sysrq-t -> show_workqueue_state(). Avoid triggering
4470 * hard lockup.
4471 */
4472 touch_nmi_watchdog();
4466 } 4473 }
4467 } 4474 }
4468 4475
@@ -4490,6 +4497,12 @@ void show_workqueue_state(void)
4490 pr_cont("\n"); 4497 pr_cont("\n");
4491 next_pool: 4498 next_pool:
4492 spin_unlock_irqrestore(&pool->lock, flags); 4499 spin_unlock_irqrestore(&pool->lock, flags);
4500 /*
4501 * We could be printing a lot from atomic context, e.g.
4502 * sysrq-t -> show_workqueue_state(). Avoid triggering
4503 * hard lockup.
4504 */
4505 touch_nmi_watchdog();
4493 } 4506 }
4494 4507
4495 rcu_read_unlock_sched(); 4508 rcu_read_unlock_sched();
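
Both hunks above add a touch_nmi_watchdog() call to the loops that dump workqueue state: the dump can be triggered from atomic context (e.g. sysrq-t) and print for a long time, which would otherwise look like a hard lockup. A hedged kernel-style sketch of the pattern; struct item and dump_items() are hypothetical, not the workqueue code:

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/nmi.h>

struct item {			/* hypothetical per-entry structure */
	struct list_head node;
	int id;
	int state;
};

static void dump_items(struct list_head *items)
{
	struct item *it;

	list_for_each_entry(it, items, node) {
		pr_info("item %d state %d\n", it->id, it->state);
		/*
		 * A long dump from atomic context keeps this CPU busy, possibly
		 * with interrupts disabled; poke the hard-lockup detector so it
		 * knows the CPU is still making progress.
		 */
		touch_nmi_watchdog();
	}
}
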
@@ -4510,9 +4523,8 @@ void show_workqueue_state(void)
4510 * cpu comes back online. 4523 * cpu comes back online.
4511 */ 4524 */
4512 4525
4513static void wq_unbind_fn(struct work_struct *work) 4526static void unbind_workers(int cpu)
4514{ 4527{
4515 int cpu = smp_processor_id();
4516 struct worker_pool *pool; 4528 struct worker_pool *pool;
4517 struct worker *worker; 4529 struct worker *worker;
4518 4530
@@ -4589,16 +4601,6 @@ static void rebind_workers(struct worker_pool *pool)
4589 4601
4590 spin_lock_irq(&pool->lock); 4602 spin_lock_irq(&pool->lock);
4591 4603
4592 /*
4593 * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
4594 * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
4595 * being reworked and this can go away in time.
4596 */
4597 if (!(pool->flags & POOL_DISASSOCIATED)) {
4598 spin_unlock_irq(&pool->lock);
4599 return;
4600 }
4601
4602 pool->flags &= ~POOL_DISASSOCIATED; 4604 pool->flags &= ~POOL_DISASSOCIATED;
4603 4605
4604 for_each_pool_worker(worker, pool) { 4606 for_each_pool_worker(worker, pool) {
@@ -4709,12 +4711,13 @@ int workqueue_online_cpu(unsigned int cpu)
4709 4711
4710int workqueue_offline_cpu(unsigned int cpu) 4712int workqueue_offline_cpu(unsigned int cpu)
4711{ 4713{
4712 struct work_struct unbind_work;
4713 struct workqueue_struct *wq; 4714 struct workqueue_struct *wq;
4714 4715
4715 /* unbinding per-cpu workers should happen on the local CPU */ 4716 /* unbinding per-cpu workers should happen on the local CPU */
4716 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); 4717 if (WARN_ON(cpu != smp_processor_id()))
4717 queue_work_on(cpu, system_highpri_wq, &unbind_work); 4718 return -1;
4719
4720 unbind_workers(cpu);
4718 4721
4719 /* update NUMA affinity of unbound workqueues */ 4722 /* update NUMA affinity of unbound workqueues */
4720 mutex_lock(&wq_pool_mutex); 4723 mutex_lock(&wq_pool_mutex);
@@ -4722,9 +4725,6 @@ int workqueue_offline_cpu(unsigned int cpu)
4722 wq_update_unbound_numa(wq, cpu, false); 4725 wq_update_unbound_numa(wq, cpu, false);
4723 mutex_unlock(&wq_pool_mutex); 4726 mutex_unlock(&wq_pool_mutex);
4724 4727
4725 /* wait for per-cpu unbinding to finish */
4726 flush_work(&unbind_work);
4727 destroy_work_on_stack(&unbind_work);
4728 return 0; 4728 return 0;
4729} 4729}
4730 4730
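
workqueue_offline_cpu() above stops bouncing the unbind through a work item queued on the outgoing CPU and calls unbind_workers() directly, relying on the hotplug teardown callback already running on that CPU (hence the WARN_ON(cpu != smp_processor_id())). A hypothetical module sketch of registering such per-CPU hotplug callbacks with cpuhp_setup_state(); all "demo" names are made up:

#include <linux/module.h>
#include <linux/cpuhotplug.h>
#include <linux/smp.h>

static enum cpuhp_state demo_state;

static int demo_cpu_online(unsigned int cpu)
{
	/* Callbacks registered in the AP range run on the hotplugged CPU itself. */
	pr_info("demo: online(%u) running on cpu %d\n", cpu, smp_processor_id());
	return 0;
}

static int demo_cpu_offline(unsigned int cpu)
{
	/* ... and the teardown runs on the CPU going down, before it is dead. */
	pr_info("demo: offline(%u) running on cpu %d\n", cpu, smp_processor_id());
	return 0;
}

static int __init demo_init(void)
{
	int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
				    demo_cpu_online, demo_cpu_offline);
	if (ret < 0)
		return ret;
	demo_state = ret;	/* dynamic states return the allocated state number */
	return 0;
}

static void __exit demo_exit(void)
{
	cpuhp_remove_state(demo_state);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
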
@@ -4957,6 +4957,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
4957 if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) 4957 if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
4958 return -ENOMEM; 4958 return -ENOMEM;
4959 4959
4960 /*
4961 * Not excluding isolated cpus on purpose.
4962 * If the user wishes to include them, we allow that.
4963 */
4960 cpumask_and(cpumask, cpumask, cpu_possible_mask); 4964 cpumask_and(cpumask, cpumask, cpu_possible_mask);
4961 if (!cpumask_empty(cpumask)) { 4965 if (!cpumask_empty(cpumask)) {
4962 apply_wqattrs_lock(); 4966 apply_wqattrs_lock();
@@ -5555,7 +5559,7 @@ int __init workqueue_init_early(void)
5555 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5559 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5556 5560
5557 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); 5561 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
5558 cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); 5562 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
5559 5563
5560 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5564 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
5561 5565