Merge 4.15-rc6 into char-misc-next

We want the fixes in here as well.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2018-01-02 08:46:35 -0500
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2018-01-02 08:46:35 -0500
commit: b6a09416e83ffe4eccfb4ef1b91b3b66483fa810 (patch)
tree: b30f266e85047244dcdb47d5afc134e76aec530d /kernel
parent: db809859c8cee415293b830e67178f526d1eb2be (diff)
parent: 30a7acd573899fd8b8ac39236eff6468b195ac7d (diff)
31 files changed, 462 insertions, 887 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index e469e05c8e83..3905d4bc5b80 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab)
                pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
                                         htab->map.key_size);
                free_percpu(pptr);
+                cond_resched();
        }
 free_elems:
        bpf_map_area_free(htab->elems);
@@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab)
                        goto free_elems;
                htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
                                  pptr);
+                cond_resched();
        }
 skip_percpu_elems:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4593571c404..04b24876cd23 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1059,6 +1059,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
                break;
        case PTR_TO_STACK:
                pointer_desc = "stack ";
+                /* The stack spill tracking logic in check_stack_write()
+                 * and check_stack_read() relies on stack accesses being
+                 * aligned.
+                 */
+                strict = true;
                break;
        default:
                break;
@@ -1067,6 +1072,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
                                           strict);
 }
+/* truncate register to smaller size (in bytes)
+ * must be called with size < BPF_REG_SIZE
+ */
+static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
+{
+        u64 mask;
+        /* clear high bits in bit representation */
+        reg->var_off = tnum_cast(reg->var_off, size);
+        /* fix arithmetic bounds */
+        mask = ((u64)1 << (size * 8)) - 1;
+        if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
+                reg->umin_value &= mask;
+                reg->umax_value &= mask;
+        } else {
+                reg->umin_value = 0;
+                reg->umax_value = mask;
+        }
+        reg->smin_value = reg->umin_value;
+        reg->smax_value = reg->umax_value;
+}
 /* check whether memory at (regno + off) is accessible for t = (read | write)
 * if t==write, value_regno is a register which value is stored into memory
 * if t==read, value_regno is a register which will receive the value from memory
@@ -1200,9 +1228,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
        if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
            regs[value_regno].type == SCALAR_VALUE) {
                /* b/h/w load zero-extends, mark upper bits as known 0 */
-                regs[value_regno].var_off =
+                coerce_reg_to_size(&regs[value_regno], size);
-                        tnum_cast(regs[value_regno].var_off, size);
-                __update_reg_bounds(&regs[value_regno]);
        }
        return err;
 }
@@ -1282,6 +1308,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
                tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
                verbose(env, "invalid variable stack read R%d var_off=%s\n",
                        regno, tn_buf);
+                return -EACCES;
        }
        off = regs[regno].off + regs[regno].var_off.value;
        if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
@@ -1674,7 +1701,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
                return -EINVAL;
        }
+        /* With LD_ABS/IND some JITs save/restore skb from r1. */
        changes_data = bpf_helper_changes_pkt_data(fn->func);
+        if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
+                verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
+                        func_id_name(func_id), func_id);
+                return -EINVAL;
+        }
        memset(&meta, 0, sizeof(meta));
        meta.pkt_access = fn->pkt_access;
@@ -1766,14 +1799,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
        return 0;
 }
-static void coerce_reg_to_32(struct bpf_reg_state *reg)
-{
-        /* clear high 32 bits */
-        reg->var_off = tnum_cast(reg->var_off, 4);
-        /* Update bounds */
-        __update_reg_bounds(reg);
-}
 static bool signed_add_overflows(s64 a, s64 b)
 {
        /* Do the add in u64, where overflow is well-defined */
@@ -1794,6 +1819,41 @@ static bool signed_sub_overflows(s64 a, s64 b)
        return res > a;
 }
+static bool check_reg_sane_offset(struct bpf_verifier_env *env,
+                                  const struct bpf_reg_state *reg,
+                                  enum bpf_reg_type type)
+{
+        bool known = tnum_is_const(reg->var_off);
+        s64 val = reg->var_off.value;
+        s64 smin = reg->smin_value;
+        if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+                verbose(env, "math between %s pointer and %lld is not allowed\n",
+                        reg_type_str[type], val);
+                return false;
+        }
+        if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
+                verbose(env, "%s pointer offset %d is not allowed\n",
+                        reg_type_str[type], reg->off);
+                return false;
+        }
+        if (smin == S64_MIN) {
+                verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
+                        reg_type_str[type]);
+                return false;
+        }
+        if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+                verbose(env, "value %lld makes %s pointer be out of bounds\n",
+                        smin, reg_type_str[type]);
+                return false;
+        }
+        return true;
+}
 /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
 * Caller should also handle BPF_MOV case separately.
 * If we return -EACCES, caller may want to try again treating pointer as a
@@ -1830,29 +1890,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
        if (BPF_CLASS(insn->code) != BPF_ALU64) {
                /* 32-bit ALU ops on pointers produce (meaningless) scalars */
-                if (!env->allow_ptr_leaks)
+                verbose(env,
-                        verbose(env,
+                        "R%d 32-bit pointer arithmetic prohibited\n",
-                                "R%d 32-bit pointer arithmetic prohibited\n",
+                        dst);
-                                dst);
                return -EACCES;
        }
        if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-                if (!env->allow_ptr_leaks)
+                verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
-                        verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+                        dst);
-                                dst);
                return -EACCES;
        }
        if (ptr_reg->type == CONST_PTR_TO_MAP) {
-                if (!env->allow_ptr_leaks)
+                verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
-                        verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+                        dst);
-                                dst);
                return -EACCES;
        }
        if (ptr_reg->type == PTR_TO_PACKET_END) {
-                if (!env->allow_ptr_leaks)
+                verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
-                        verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+                        dst);
-                                dst);
                return -EACCES;
        }
@@ -1862,6 +1918,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
        dst_reg->type = ptr_reg->type;
        dst_reg->id = ptr_reg->id;
+        if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
+            !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+                return -EINVAL;
        switch (opcode) {
        case BPF_ADD:
                /* We can take a fixed offset as long as it doesn't overflow
@@ -1915,9 +1975,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
        case BPF_SUB:
                if (dst_reg == off_reg) {
                        /* scalar -= pointer.  Creates an unknown scalar */
-                        if (!env->allow_ptr_leaks)
+                        verbose(env, "R%d tried to subtract pointer from scalar\n",
-                                verbose(env, "R%d tried to subtract pointer from scalar\n",
+                                dst);
-                                        dst);
                        return -EACCES;
                }
                /* We don't allow subtraction from FP, because (according to
@@ -1925,9 +1984,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                 * be able to deal with it.
                 */
                if (ptr_reg->type == PTR_TO_STACK) {
-                        if (!env->allow_ptr_leaks)
+                        verbose(env, "R%d subtraction from stack pointer prohibited\n",
-                                verbose(env, "R%d subtraction from stack pointer prohibited\n",
+                                dst);
-                                        dst);
                        return -EACCES;
                }
                if (known && (ptr_reg->off - smin_val ==
@@ -1976,28 +2034,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
        case BPF_AND:
        case BPF_OR:
        case BPF_XOR:
-                /* bitwise ops on pointers are troublesome, prohibit for now.
+                /* bitwise ops on pointers are troublesome, prohibit. */
-                 * (However, in principle we could allow some cases, e.g.
+                verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
-                 * ptr &= ~3 which would reduce min_value by 3.)
+                        dst, bpf_alu_string[opcode >> 4]);
-                 */
-                if (!env->allow_ptr_leaks)
-                        verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
-                                dst, bpf_alu_string[opcode >> 4]);
                return -EACCES;
        default:
                /* other operators (e.g. MUL,LSH) produce non-pointer results */
-                if (!env->allow_ptr_leaks)
+                verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
-                        verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
+                        dst, bpf_alu_string[opcode >> 4]);
-                                dst, bpf_alu_string[opcode >> 4]);
                return -EACCES;
        }
+        if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
+                return -EINVAL;
        __update_reg_bounds(dst_reg);
        __reg_deduce_bounds(dst_reg);
        __reg_bound_offset(dst_reg);
        return 0;
 }
+/* WARNING: This function does calculations on 64-bit values, but the actual
+ * execution may occur on 32-bit values. Therefore, things like bitshifts
+ * need extra checks in the 32-bit case.
+ */
 static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                                      struct bpf_insn *insn,
                                      struct bpf_reg_state *dst_reg,
@@ -2008,12 +2068,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
        bool src_known, dst_known;
        s64 smin_val, smax_val;
        u64 umin_val, umax_val;
+        u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
-        if (BPF_CLASS(insn->code) != BPF_ALU64) {
-                /* 32-bit ALU ops are (32,32)->64 */
-                coerce_reg_to_32(dst_reg);
-                coerce_reg_to_32(&src_reg);
-        }
        smin_val = src_reg.smin_value;
        smax_val = src_reg.smax_value;
        umin_val = src_reg.umin_value;
@@ -2021,6 +2077,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
        src_known = tnum_is_const(src_reg.var_off);
        dst_known = tnum_is_const(dst_reg->var_off);
+        if (!src_known &&
+            opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+                __mark_reg_unknown(dst_reg);
+                return 0;
+        }
        switch (opcode) {
        case BPF_ADD:
                if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
@@ -2149,9 +2211,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                __update_reg_bounds(dst_reg);
                break;
        case BPF_LSH:
-                if (umax_val > 63) {
+                if (umax_val >= insn_bitness) {
-                        /* Shifts greater than 63 are undefined.  This includes
+                        /* Shifts greater than 31 or 63 are undefined.
-                         * shifts by a negative number.
+                         * This includes shifts by a negative number.
                         */
                        mark_reg_unknown(env, regs, insn->dst_reg);
                        break;
@@ -2177,27 +2239,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                __update_reg_bounds(dst_reg);
                break;
        case BPF_RSH:
-                if (umax_val > 63) {
+                if (umax_val >= insn_bitness) {
-                        /* Shifts greater than 63 are undefined.  This includes
+                        /* Shifts greater than 31 or 63 are undefined.
-                         * shifts by a negative number.
+                         * This includes shifts by a negative number.
                         */
                        mark_reg_unknown(env, regs, insn->dst_reg);
                        break;
                }
-                /* BPF_RSH is an unsigned shift, so make the appropriate casts */
+                /* BPF_RSH is an unsigned shift.  If the value in dst_reg might
-                if (dst_reg->smin_value < 0) {
+                 * be negative, then either:
-                        if (umin_val) {
+                 * 1) src_reg might be zero, so the sign bit of the result is
-                                /* Sign bit will be cleared */
+                 *    unknown, so we lose our signed bounds
-                                dst_reg->smin_value = 0;
+                 * 2) it's known negative, thus the unsigned bounds capture the
-                        } else {
+                 *    signed bounds
-                                /* Lost sign bit information */
+                 * 3) the signed bounds cross zero, so they tell us nothing
-                                dst_reg->smin_value = S64_MIN;
+                 *    about the result
-                                dst_reg->smax_value = S64_MAX;
+                 * If the value in dst_reg is known nonnegative, then again the
-                        }
+                 * unsigned bounds capture the signed bounds.
-                } else {
+                 * Thus, in all cases it suffices to blow away our signed bounds
-                        dst_reg->smin_value =
+                 * and rely on inferring new ones from the unsigned bounds and
-                                (u64)(dst_reg->smin_value) >> umax_val;
+                 * var_off of the result.
-                }
+                 */
+                dst_reg->smin_value = S64_MIN;
+                dst_reg->smax_value = S64_MAX;
                if (src_known)
                        dst_reg->var_off = tnum_rshift(dst_reg->var_off,
                                                       umin_val);
@@ -2213,6 +2277,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                break;
        }
+        if (BPF_CLASS(insn->code) != BPF_ALU64) {
+                /* 32-bit ALU ops are (32,32)->32 */
+                coerce_reg_to_size(dst_reg, 4);
+                coerce_reg_to_size(&src_reg, 4);
+        }
        __reg_deduce_bounds(dst_reg);
        __reg_bound_offset(dst_reg);
        return 0;
@@ -2227,7 +2297,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
        struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
        struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
        u8 opcode = BPF_OP(insn->code);
-        int rc;
        dst_reg = &regs[insn->dst_reg];
        src_reg = NULL;
@@ -2238,43 +2307,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
                if (src_reg->type != SCALAR_VALUE) {
                        if (dst_reg->type != SCALAR_VALUE) {
                                /* Combining two pointers by any ALU op yields
-                                 * an arbitrary scalar.
+                                 * an arbitrary scalar. Disallow all math except
+                                 * pointer subtraction
                                 */
-                                if (!env->allow_ptr_leaks) {
+                                if (opcode == BPF_SUB){
-                                        verbose(env, "R%d pointer %s pointer prohibited\n",
+                                        mark_reg_unknown(env, regs, insn->dst_reg);
-                                                insn->dst_reg,
+                                        return 0;
-                                                bpf_alu_string[opcode >> 4]);
-                                        return -EACCES;
                                }
-                                mark_reg_unknown(env, regs, insn->dst_reg);
+                                verbose(env, "R%d pointer %s pointer prohibited\n",
-                                return 0;
+                                        insn->dst_reg,
+                                        bpf_alu_string[opcode >> 4]);
+                                return -EACCES;
                        } else {
                                /* scalar += pointer
                                 * This is legal, but we have to reverse our
                                 * src/dest handling in computing the range
                                 */
-                                rc = adjust_ptr_min_max_vals(env, insn,
+                                return adjust_ptr_min_max_vals(env, insn,
-                                                             src_reg, dst_reg);
+                                                               src_reg, dst_reg);
-                                if (rc == -EACCES && env->allow_ptr_leaks) {
-                                        /* scalar += unknown scalar */
-                                        __mark_reg_unknown(&off_reg);
-                                        return adjust_scalar_min_max_vals(
-                                                        env, insn,
-                                                        dst_reg, off_reg);
-                                }
-                                return rc;
                        }
                } else if (ptr_reg) {
                        /* pointer += scalar */
-                        rc = adjust_ptr_min_max_vals(env, insn,
+                        return adjust_ptr_min_max_vals(env, insn,
-                                                     dst_reg, src_reg);
+                                                       dst_reg, src_reg);
-                        if (rc == -EACCES && env->allow_ptr_leaks) {
-                                /* unknown scalar += scalar */
-                                __mark_reg_unknown(dst_reg);
-                                return adjust_scalar_min_max_vals(
-                                                env, insn, dst_reg, *src_reg);
-                        }
-                        return rc;
                }
        } else {
                /* Pretend the src is a reg with a known value, since we only
@@ -2283,17 +2338,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
                off_reg.type = SCALAR_VALUE;
                __mark_reg_known(&off_reg, insn->imm);
                src_reg = &off_reg;
-                if (ptr_reg) { /* pointer += K */
+                if (ptr_reg) /* pointer += K */
-                        rc = adjust_ptr_min_max_vals(env, insn,
+                        return adjust_ptr_min_max_vals(env, insn,
-                                                     ptr_reg, src_reg);
+                                                       ptr_reg, src_reg);
-                        if (rc == -EACCES && env->allow_ptr_leaks) {
-                                /* unknown scalar += K */
-                                __mark_reg_unknown(dst_reg);
-                                return adjust_scalar_min_max_vals(
-                                                env, insn, dst_reg, off_reg);
-                        }
-                        return rc;
-                }
        }
        /* Got here implies adding two SCALAR_VALUEs */
@@ -2390,17 +2437,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                                        return -EACCES;
                                }
                                mark_reg_unknown(env, regs, insn->dst_reg);
-                                /* high 32 bits are known zero. */
+                                coerce_reg_to_size(&regs[insn->dst_reg], 4);
-                                regs[insn->dst_reg].var_off = tnum_cast(
-                                                regs[insn->dst_reg].var_off, 4);
-                                __update_reg_bounds(&regs[insn->dst_reg]);
                        }
                } else {
                        /* case: R = imm
                         * remember the value we stored into this reg
                         */
                        regs[insn->dst_reg].type = SCALAR_VALUE;
-                        __mark_reg_known(regs + insn->dst_reg, insn->imm);
+                        if (BPF_CLASS(insn->code) == BPF_ALU64) {
+                                __mark_reg_known(regs + insn->dst_reg,
+                                                 insn->imm);
+                        } else {
+                                __mark_reg_known(regs + insn->dst_reg,
+                                                 (u32)insn->imm);
+                        }
                }
        } else if (opcode > BPF_END) {
@@ -3431,15 +3481,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
                        return range_within(rold, rcur) &&
                               tnum_in(rold->var_off, rcur->var_off);
                } else {
-                        /* if we knew anything about the old value, we're not
+                        /* We're trying to use a pointer in place of a scalar.
-                         * equal, because we can't know anything about the
+                         * Even if the scalar was unbounded, this could lead to
-                         * scalar value of the pointer in the new value.
+                         * pointer leaks because scalars are allowed to leak
+                         * while pointers are not. We could make this safe in
+                         * special cases if root is calling us, but it's
+                         * probably not worth the hassle.
                         */
-                        return rold->umin_value == 0 &&
+                        return false;
-                               rold->umax_value == U64_MAX &&
-                               rold->smin_value == S64_MIN &&
-                               rold->smax_value == S64_MAX &&
-                               tnum_is_unknown(rold->var_off);
                }
        case PTR_TO_MAP_VALUE:
                /* If the new min/max/var_off satisfy the old ones and
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 5f780d8f6a9d..9caeda610249 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
        spin_lock_irq(&css_set_lock);
        rcu_read_lock();
-        cset = rcu_dereference(current->cgroups);
+        cset = task_css_set(current);
        refcnt = refcount_read(&cset->refcount);
        seq_printf(seq, "css_set %pK %d", cset, refcnt);
        if (refcnt > cset->nr_tasks)
@@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
        spin_lock_irq(&css_set_lock);
        rcu_read_lock();
-        cset = rcu_dereference(current->cgroups);
+        cset = task_css_set(current);
        list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                struct cgroup *c = link->cgrp;
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
index 133b465691d6..1e111dd455c4 100644
--- a/kernel/cgroup/stat.c
+++ b/kernel/cgroup/stat.c
@@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp)
        }
        /* ->updated_children list is self terminated */
-        for_each_possible_cpu(cpu)
+        for_each_possible_cpu(cpu) {
-                cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+                struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+                cstat->updated_children = cgrp;
+                u64_stats_init(&cstat->sync);
+        }
        prev_cputime_init(&cgrp->stat.prev_cputime);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 41376c3ac93b..53f7dc65f9a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
-static void inline cpuhp_lock_acquire(bool bringup)
+static inline void cpuhp_lock_acquire(bool bringup)
 {
        lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
 }
-static void inline cpuhp_lock_release(bool bringup)
+static inline void cpuhp_lock_release(bool bringup)
 {
        lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
 }
 #else
-static void inline cpuhp_lock_acquire(bool bringup) { }
+static inline void cpuhp_lock_acquire(bool bringup) { }
-static void inline cpuhp_lock_release(bool bringup) { }
+static inline void cpuhp_lock_release(bool bringup) { }
 #endif
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
         * before blk_mq_queue_reinit_notify() from notify_dead(),
         * otherwise a RCU stall occurs.
         */
-        [CPUHP_TIMERS_DEAD] = {
+        [CPUHP_TIMERS_PREPARE] = {
                .name                   = "timers:dead",
-                .startup.single         = NULL,
+                .startup.single         = timers_prepare_cpu,
                .teardown.single        = timers_dead_cpu,
        },
        /* Kicks the plugged cpu into life */
diff --git a/kernel/exit.c b/kernel/exit.c
index 6b4298a41167..df0c91d5606c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1755,3 +1755,11 @@ Efault:
        return -EFAULT;
 }
 #endif
+__weak void abort(void)
+{
+        BUG();
+        /* if that doesn't kill us, halt */
+        panic("Oops failed to kill thread");
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 432eadf6b58c..2295fc69717f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                        goto out;
        }
        /* a new mm has just been created */
-        arch_dup_mmap(oldmm, mm);
+        retval = arch_dup_mmap(oldmm, mm);
-        retval = 0;
 out:
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
diff --git a/kernel/groups.c b/kernel/groups.c
index e357bc800111..daae2f2dc6d4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b)
        return gid_gt(a, b) - gid_lt(a, b);
 }
-static void groups_sort(struct group_info *group_info)
+void groups_sort(struct group_info *group_info)
 {
        sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
             gid_cmp, NULL);
 }
+EXPORT_SYMBOL(groups_sort);
 /* a simple bsearch */
 int groups_search(const struct group_info *group_info, kgid_t grp)
@@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
 void set_groups(struct cred *new, struct group_info *group_info)
 {
        put_group_info(new->group_info);
-        groups_sort(group_info);
        get_group_info(group_info);
        new->group_info = group_info;
 }
@@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
                return retval;
        }
+        groups_sort(group_info);
        retval = set_current_groups(group_info);
        put_group_info(group_info);
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 17f05ef8f575..e4d3819a91cc 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -12,6 +12,11 @@
 static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
 {
+        static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
+        if (!__ratelimit(&ratelimit))
+                return;
        printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
                irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
        printk("->handle_irq():  %p, ", desc->handle_irq);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 7f608ac39653..acfaaef8672a 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = {
        BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
        BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
        BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
+        BIT_MASK_DESCR(IRQD_CAN_RESERVE),
        BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c26c5bb6b491..508c03dfef25 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
 EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
 /*
- * Separate lockdep class for interrupt chip which can nest irq_desc
+ * Separate lockdep classes for interrupt chip which can nest irq_desc
- * lock.
+ * lock and request mutex.
 */
 static struct lock_class_key irq_nested_lock_class;
+static struct lock_class_key irq_nested_request_class;
 /*
 * irq_map_generic_chip - Map a generic chip for an irq domain
@@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
        set_bit(idx, &gc->installed);
        if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
-                irq_set_lockdep_class(virq, &irq_nested_lock_class);
+                irq_set_lockdep_class(virq, &irq_nested_lock_class,
+                                      &irq_nested_request_class);
        if (chip->irq_calc_mask)
                chip->irq_calc_mask(data);
@@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
                        continue;
                if (flags & IRQ_GC_INIT_NESTED_LOCK)
-                        irq_set_lockdep_class(i, &irq_nested_lock_class);
+                        irq_set_lockdep_class(i, &irq_nested_lock_class,
+                                              &irq_nested_request_class);
                if (!(flags & IRQ_GC_NO_MASK)) {
                        struct irq_data *d = irq_get_irq_data(i);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 07d08ca701ec..ab19371eab9b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
-static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
 {
        irqd_set_activated(data);
        return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4f4f60015e8a..62068ad46930 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
        }
 }
-static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
 {
        int ret = 0;
@@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
                if (irqd->parent_data)
                        ret = __irq_domain_activate_irq(irqd->parent_data,
-                                                        early);
+                                                        reserve);
                if (!ret && domain->ops->activate) {
-                        ret = domain->ops->activate(domain, irqd, early);
+                        ret = domain->ops->activate(domain, irqd, reserve);
                        /* Rollback in case of error */
                        if (ret && irqd->parent_data)
                                __irq_domain_deactivate_irq(irqd->parent_data);
@@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
 /**
 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
 *                           interrupt
- * @irq_data:   outermost irq_data associated with interrupt
+ * @irq_data:   Outermost irq_data associated with interrupt
+ * @reserve:    If set only reserve an interrupt vector instead of assigning one
 *
 * This is the second step to call domain_ops->activate to program interrupt
 * controllers, so the interrupt could actually get delivered.
 */
-int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
 {
        int ret = 0;
        if (!irqd_is_activated(irq_data))
-                ret = __irq_domain_activate_irq(irq_data, early);
+                ret = __irq_domain_activate_irq(irq_data, reserve);
        if (!ret)
                irqd_set_activated(irq_data);
        return ret;
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index edb987b2c58d..2f3c4f5382cc 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
        return ret;
 }
+/*
+ * Carefully check whether the device can use reservation mode. If
+ * reservation mode is enabled then the early activation will assign a
+ * dummy vector to the device. If the PCI/MSI device does not support
+ * masking of the entry then this can result in spurious interrupts when
+ * the device driver is not absolutely careful. But even then a malfunction
+ * of the hardware could result in a spurious interrupt on the dummy vector
+ * and render the device unusable. If the entry can be masked then the core
+ * logic will prevent the spurious interrupt and reservation mode can be
+ * used. For now reservation mode is restricted to PCI/MSI.
+ */
+static bool msi_check_reservation_mode(struct irq_domain *domain,
+                                       struct msi_domain_info *info,
+                                       struct device *dev)
+{
+        struct msi_desc *desc;
+        if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
+                return false;
+        if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
+                return false;
+        if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+                return false;
+        /*
+         * Checking the first MSI descriptor is sufficient. MSIX supports
+         * masking and MSI does so when the maskbit is set.
+         */
+        desc = first_msi_entry(dev);
+        return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
+}
 /**
 * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
 * @domain:     The domain to allocate from
@@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 {
        struct msi_domain_info *info = domain->host_data;
        struct msi_domain_ops *ops = info->ops;
-        msi_alloc_info_t arg;
+        struct irq_data *irq_data;
        struct msi_desc *desc;
+        msi_alloc_info_t arg;
        int i, ret, virq;
+        bool can_reserve;
        ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
        if (ret)
@@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
        if (ops->msi_finish)
                ops->msi_finish(&arg, 0);
+        can_reserve = msi_check_reservation_mode(domain, info, dev);
        for_each_msi_entry(desc, dev) {
                virq = desc->irq;
                if (desc->nvec_used == 1)
@@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
                 * the MSI entries before the PCI layer enables MSI in the
                 * card. Otherwise the card latches a random msi message.
                 */
-                if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+                if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
-                        struct irq_data *irq_data;
+                        continue;
+                irq_data = irq_domain_get_irq_data(domain, desc->irq);
+                if (!can_reserve)
+                        irqd_clr_can_reserve(irq_data);
+                ret = irq_domain_activate_irq(irq_data, can_reserve);
+                if (ret)
+                        goto cleanup;
+        }
+        /*
+         * If these interrupts use reservation mode, clear the activated bit
+         * so request_irq() will assign the final vector.
+         */
+        if (can_reserve) {
+                for_each_msi_entry(desc, dev) {
                        irq_data = irq_domain_get_irq_data(domain, desc->irq);
-                        ret = irq_domain_activate_irq(irq_data, true);
+                        irqd_clr_activated(irq_data);
-                        if (ret)
-                                goto cleanup;
-                        if (info->flags & MSI_FLAG_MUST_REACTIVATE)
-                                irqd_clr_activated(irq_data);
                }
        }
        return 0;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 15f33faf4013..7594c033d98a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
 }
 EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
-void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2)
+void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
 {
        write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
 }
@@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
 }
 EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
-void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2)
+void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
 {
        write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
                        _RET_IP_);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 670d8d7d8087..5fa1324a4f29 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -57,10 +57,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/lock.h>
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-#include <linux/slab.h>
-#endif
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
 module_param(prove_locking, int, 0644);
@@ -75,19 +71,6 @@ module_param(lock_stat, int, 0644);
 #define lock_stat 0
 #endif
-#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
-static int crossrelease_fullstack = 1;
-#else
-static int crossrelease_fullstack;
-#endif
-static int __init allow_crossrelease_fullstack(char *str)
-{
-        crossrelease_fullstack = 1;
-        return 0;
-}
-early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
 /*
 * lockdep_lock: protects the lockdep graph, the hashes and the
 *               class/list/hash allocators.
@@ -740,18 +723,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
        return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
 }
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-static void cross_init(struct lockdep_map *lock, int cross);
-static int cross_lock(struct lockdep_map *lock);
-static int lock_acquire_crosslock(struct held_lock *hlock);
-static int lock_release_crosslock(struct lockdep_map *lock);
-#else
-static inline void cross_init(struct lockdep_map *lock, int cross) {}
-static inline int cross_lock(struct lockdep_map *lock) { return 0; }
-static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
-static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
-#endif
 /*
 * Register a lock's class in the hash-table, if the class is not present
 * yet. Otherwise we look it up. We cache the result in the lock object
@@ -1151,41 +1122,22 @@ print_circular_lock_scenario(struct held_lock *src,
                printk(KERN_CONT "\n\n");
        }
-        if (cross_lock(tgt->instance)) {
+        printk(" Possible unsafe locking scenario:\n\n");
-                printk(" Possible unsafe locking scenario by crosslock:\n\n");
+        printk("       CPU0                    CPU1\n");
-                printk("       CPU0                    CPU1\n");
+        printk("       ----                    ----\n");
-                printk("       ----                    ----\n");
+        printk("  lock(");
-                printk("  lock(");
+        __print_lock_name(target);
-                __print_lock_name(parent);
+        printk(KERN_CONT ");\n");
-                printk(KERN_CONT ");\n");
+        printk("                               lock(");
-                printk("  lock(");
+        __print_lock_name(parent);
-                __print_lock_name(target);
+        printk(KERN_CONT ");\n");
-                printk(KERN_CONT ");\n");
+        printk("                               lock(");
-                printk("                               lock(");
+        __print_lock_name(target);
-                __print_lock_name(source);
+        printk(KERN_CONT ");\n");
-                printk(KERN_CONT ");\n");
+        printk("  lock(");
-                printk("                               unlock(");
+        __print_lock_name(source);
-                __print_lock_name(target);
+        printk(KERN_CONT ");\n");
-                printk(KERN_CONT ");\n");
+        printk("\n *** DEADLOCK ***\n\n");
-                printk("\n *** DEADLOCK ***\n\n");
-        } else {
-                printk(" Possible unsafe locking scenario:\n\n");
-                printk("       CPU0                    CPU1\n");
-                printk("       ----                    ----\n");
-                printk("  lock(");
-                __print_lock_name(target);
-                printk(KERN_CONT ");\n");
-                printk("                               lock(");
-                __print_lock_name(parent);
-                printk(KERN_CONT ");\n");
-                printk("                               lock(");
-                __print_lock_name(target);
-                printk(KERN_CONT ");\n");
-                printk("  lock(");
-                __print_lock_name(source);
-                printk(KERN_CONT ");\n");
-                printk("\n *** DEADLOCK ***\n\n");
-        }
 }
 /*
@@ -1211,10 +1163,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
                curr->comm, task_pid_nr(curr));
        print_lock(check_src);
-        if (cross_lock(check_tgt->instance))
+        pr_warn("\nbut task is already holding lock:\n");
-                pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
-        else
-                pr_warn("\nbut task is already holding lock:\n");
        print_lock(check_tgt);
        pr_warn("\nwhich lock already depends on the new lock.\n\n");
@@ -1244,9 +1193,7 @@ static noinline int print_circular_bug(struct lock_list *this,
        if (!debug_locks_off_graph_unlock() || debug_locks_silent)
                return 0;
-        if (cross_lock(check_tgt->instance))
+        if (!save_trace(&this->trace))
-                this->trace = *trace;
-        else if (!save_trace(&this->trace))
                return 0;
        depth = get_lock_depth(target);
@@ -1850,9 +1797,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
                if (nest)
                        return 2;
-                if (cross_lock(prev->instance))
-                        continue;
                return print_deadlock_bug(curr, prev, next);
        }
        return 1;
@@ -2018,31 +1962,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
        for (;;) {
                int distance = curr->lockdep_depth - depth + 1;
                hlock = curr->held_locks + depth - 1;
                /*
-                 * Only non-crosslock entries get new dependencies added.
+                 * Only non-recursive-read entries get new dependencies
-                 * Crosslock entries will be added by commit later:
+                 * added:
                 */
-                if (!cross_lock(hlock->instance)) {
+                if (hlock->read != 2 && hlock->check) {
+                        int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
+                        if (!ret)
+                                return 0;
                        /*
-                         * Only non-recursive-read entries get new dependencies
+                         * Stop after the first non-trylock entry,
-                         * added:
+                         * as non-trylock entries have added their
+                         * own direct dependencies already, so this
+                         * lock is connected to them indirectly:
                         */
-                        if (hlock->read != 2 && hlock->check) {
+                        if (!hlock->trylock)
-                                int ret = check_prev_add(curr, hlock, next,
+                                break;
-                                                         distance, &trace, save_trace);
-                                if (!ret)
-                                        return 0;
-                                /*
-                                 * Stop after the first non-trylock entry,
-                                 * as non-trylock entries have added their
-                                 * own direct dependencies already, so this
-                                 * lock is connected to them indirectly:
-                                 */
-                                if (!hlock->trylock)
-                                        break;
-                        }
                }
                depth--;
                /*
                 * End of lock-stack?
@@ -3292,21 +3231,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
 void lockdep_init_map(struct lockdep_map *lock, const char *name,
                      struct lock_class_key *key, int subclass)
 {
-        cross_init(lock, 0);
        __lockdep_init_map(lock, name, key, subclass);
 }
 EXPORT_SYMBOL_GPL(lockdep_init_map);
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
-                      struct lock_class_key *key, int subclass)
-{
-        cross_init(lock, 1);
-        __lockdep_init_map(lock, name, key, subclass);
-}
-EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
-#endif
 struct lock_class_key __lockdep_no_validate__;
 EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
@@ -3362,7 +3290,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        int chain_head = 0;
        int class_idx;
        u64 chain_key;
-        int ret;
        if (unlikely(!debug_locks))
                return 0;
@@ -3411,8 +3338,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        class_idx = class - lock_classes + 1;
-        /* TODO: nest_lock is not implemented for crosslock yet. */
+        if (depth) {
-        if (depth && !cross_lock(lock)) {
                hlock = curr->held_locks + depth - 1;
                if (hlock->class_idx == class_idx && nest_lock) {
                        if (hlock->references) {
@@ -3500,14 +3426,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
                return 0;
-        ret = lock_acquire_crosslock(hlock);
-        /*
-         * 2 means normal acquire operations are needed. Otherwise, it's
-         * ok just to return with '0:fail, 1:success'.
-         */
-        if (ret != 2)
-                return ret;
        curr->curr_chain_key = chain_key;
        curr->lockdep_depth++;
        check_chain_key(curr);
@@ -3745,19 +3663,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
        struct task_struct *curr = current;
        struct held_lock *hlock;
        unsigned int depth;
-        int ret, i;
+        int i;
        if (unlikely(!debug_locks))
                return 0;
-        ret = lock_release_crosslock(lock);
-        /*
-         * 2 means normal release operations are needed. Otherwise, it's
-         * ok just to return with '0:fail, 1:success'.
-         */
-        if (ret != 2)
-                return ret;
        depth = curr->lockdep_depth;
        /*
         * So we're all set to release this lock.. wait what lock? We don't
@@ -4675,495 +4585,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
        dump_stack();
 }
 EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-/*
- * Crossrelease works by recording a lock history for each thread and
- * connecting those historic locks that were taken after the
- * wait_for_completion() in the complete() context.
- *
- * Task-A                               Task-B
- *
- *                                      mutex_lock(&A);
- *                                      mutex_unlock(&A);
- *
- * wait_for_completion(&C);
- *   lock_acquire_crosslock();
- *     atomic_inc_return(&cross_gen_id);
- *                                |
- *                                |     mutex_lock(&B);
- *                                |     mutex_unlock(&B);
- *                                |
- *                                |     complete(&C);
- *                                `--     lock_commit_crosslock();
- *
- * Which will then add a dependency between B and C.
- */
-#define xhlock(i)         (current->xhlocks[(i) % MAX_XHLOCKS_NR])
-/*
- * Whenever a crosslock is held, cross_gen_id will be increased.
- */
-static atomic_t cross_gen_id; /* Can be wrapped */
-/*
- * Make an entry of the ring buffer invalid.
- */
-static inline void invalidate_xhlock(struct hist_lock *xhlock)
-{
-        /*
-         * Normally, xhlock->hlock.instance must be !NULL.
-         */
-        xhlock->hlock.instance = NULL;
-}
-/*
- * Lock history stacks; we have 2 nested lock history stacks:
- *
- *   HARD(IRQ)
- *   SOFT(IRQ)
- *
- * The thing is that once we complete a HARD/SOFT IRQ the future task locks
- * should not depend on any of the locks observed while running the IRQ.  So
- * what we do is rewind the history buffer and erase all our knowledge of that
- * temporal event.
- */
-void crossrelease_hist_start(enum xhlock_context_t c)
-{
-        struct task_struct *cur = current;
-        if (!cur->xhlocks)
-                return;
-        cur->xhlock_idx_hist[c] = cur->xhlock_idx;
-        cur->hist_id_save[c]    = cur->hist_id;
-}
-void crossrelease_hist_end(enum xhlock_context_t c)
-{
-        struct task_struct *cur = current;
-        if (cur->xhlocks) {
-                unsigned int idx = cur->xhlock_idx_hist[c];
-                struct hist_lock *h = &xhlock(idx);
-                cur->xhlock_idx = idx;
-                /* Check if the ring was overwritten. */
-                if (h->hist_id != cur->hist_id_save[c])
-                        invalidate_xhlock(h);
-        }
-}
-/*
- * lockdep_invariant_state() is used to annotate independence inside a task, to
- * make one task look like multiple independent 'tasks'.
- *
- * Take for instance workqueues; each work is independent of the last. The
- * completion of a future work does not depend on the completion of a past work
- * (in general). Therefore we must not carry that (lock) dependency across
- * works.
- *
- * This is true for many things; pretty much all kthreads fall into this
- * pattern, where they have an invariant state and future completions do not
- * depend on past completions. Its just that since they all have the 'same'
- * form -- the kthread does the same over and over -- it doesn't typically
- * matter.
- *
- * The same is true for system-calls, once a system call is completed (we've
- * returned to userspace) the next system call does not depend on the lock
- * history of the previous system call.
- *
- * They key property for independence, this invariant state, is that it must be
- * a point where we hold no locks and have no history. Because if we were to
- * hold locks, the restore at _end() would not necessarily recover it's history
- * entry. Similarly, independence per-definition means it does not depend on
- * prior state.
- */
-void lockdep_invariant_state(bool force)
-{
-        /*
-         * We call this at an invariant point, no current state, no history.
-         * Verify the former, enforce the latter.
-         */
-        WARN_ON_ONCE(!force && current->lockdep_depth);
-        if (current->xhlocks)
-                invalidate_xhlock(&xhlock(current->xhlock_idx));
-}
-static int cross_lock(struct lockdep_map *lock)
-{
-        return lock ? lock->cross : 0;
-}
-/*
- * This is needed to decide the relationship between wrapable variables.
- */
-static inline int before(unsigned int a, unsigned int b)
-{
-        return (int)(a - b) < 0;
-}
-static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
-{
-        return hlock_class(&xhlock->hlock);
-}
-static inline struct lock_class *xlock_class(struct cross_lock *xlock)
-{
-        return hlock_class(&xlock->hlock);
-}
-/*
- * Should we check a dependency with previous one?
- */
-static inline int depend_before(struct held_lock *hlock)
-{
-        return hlock->read != 2 && hlock->check && !hlock->trylock;
-}
-/*
- * Should we check a dependency with next one?
- */
-static inline int depend_after(struct held_lock *hlock)
-{
-        return hlock->read != 2 && hlock->check;
-}
-/*
- * Check if the xhlock is valid, which would be false if,
- *
- *    1. Has not used after initializaion yet.
- *    2. Got invalidated.
- *
- * Remind hist_lock is implemented as a ring buffer.
- */
-static inline int xhlock_valid(struct hist_lock *xhlock)
-{
-        /*
-         * xhlock->hlock.instance must be !NULL.
-         */
-        return !!xhlock->hlock.instance;
-}
-/*
- * Record a hist_lock entry.
- *
- * Irq disable is only required.
- */
-static void add_xhlock(struct held_lock *hlock)
-{
-        unsigned int idx = ++current->xhlock_idx;
-        struct hist_lock *xhlock = &xhlock(idx);
-#ifdef CONFIG_DEBUG_LOCKDEP
-        /*
-         * This can be done locklessly because they are all task-local
-         * state, we must however ensure IRQs are disabled.
-         */
-        WARN_ON_ONCE(!irqs_disabled());
-#endif
-        /* Initialize hist_lock's members */
-        xhlock->hlock = *hlock;
-        xhlock->hist_id = ++current->hist_id;
-        xhlock->trace.nr_entries = 0;
-        xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
-        xhlock->trace.entries = xhlock->trace_entries;
-        if (crossrelease_fullstack) {
-                xhlock->trace.skip = 3;
-                save_stack_trace(&xhlock->trace);
-        } else {
-                xhlock->trace.nr_entries = 1;
-                xhlock->trace.entries[0] = hlock->acquire_ip;
-        }
-}
-static inline int same_context_xhlock(struct hist_lock *xhlock)
-{
-        return xhlock->hlock.irq_context == task_irq_context(current);
-}
-/*
- * This should be lockless as far as possible because this would be
- * called very frequently.
- */
-static void check_add_xhlock(struct held_lock *hlock)
-{
-        /*
-         * Record a hist_lock, only in case that acquisitions ahead
-         * could depend on the held_lock. For example, if the held_lock
-         * is trylock then acquisitions ahead never depends on that.
-         * In that case, we don't need to record it. Just return.
-         */
-        if (!current->xhlocks || !depend_before(hlock))
-                return;
-        add_xhlock(hlock);
-}
-/*
- * For crosslock.
- */
-static int add_xlock(struct held_lock *hlock)
-{
-        struct cross_lock *xlock;
-        unsigned int gen_id;
-        if (!graph_lock())
-                return 0;
-        xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
-        /*
-         * When acquisitions for a crosslock are overlapped, we use
-         * nr_acquire to perform commit for them, based on cross_gen_id
-         * of the first acquisition, which allows to add additional
-         * dependencies.
-         *
-         * Moreover, when no acquisition of a crosslock is in progress,
-         * we should not perform commit because the lock might not exist
-         * any more, which might cause incorrect memory access. So we
-         * have to track the number of acquisitions of a crosslock.
-         *
-         * depend_after() is necessary to initialize only the first
-         * valid xlock so that the xlock can be used on its commit.
-         */
-        if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
-                goto unlock;
-        gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
-        xlock->hlock = *hlock;
-        xlock->hlock.gen_id = gen_id;
-unlock:
-        graph_unlock();
-        return 1;
-}
-/*
- * Called for both normal and crosslock acquires. Normal locks will be
- * pushed on the hist_lock queue. Cross locks will record state and
- * stop regular lock_acquire() to avoid being placed on the held_lock
- * stack.
- *
- * Return: 0 - failure;
- *         1 - crosslock, done;
- *         2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_acquire_crosslock(struct held_lock *hlock)
-{
-        /*
-         *      CONTEXT 1               CONTEXT 2
-         *      ---------               ---------
-         *      lock A (cross)
-         *      X = atomic_inc_return(&cross_gen_id)
-         *      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-         *                              Y = atomic_read_acquire(&cross_gen_id)
-         *                              lock B
-         *
-         * atomic_read_acquire() is for ordering between A and B,
-         * IOW, A happens before B, when CONTEXT 2 see Y >= X.
-         *
-         * Pairs with atomic_inc_return() in add_xlock().
-         */
-        hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
-        if (cross_lock(hlock->instance))
-                return add_xlock(hlock);
-        check_add_xhlock(hlock);
-        return 2;
-}
-static int copy_trace(struct stack_trace *trace)
-{
-        unsigned long *buf = stack_trace + nr_stack_trace_entries;
-        unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
-        unsigned int nr = min(max_nr, trace->nr_entries);
-        trace->nr_entries = nr;
-        memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
-        trace->entries = buf;
-        nr_stack_trace_entries += nr;
-        if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
-                if (!debug_locks_off_graph_unlock())
-                        return 0;
-                print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
-                dump_stack();
-                return 0;
-        }
-        return 1;
-}
-static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
-{
-        unsigned int xid, pid;
-        u64 chain_key;
-        xid = xlock_class(xlock) - lock_classes;
-        chain_key = iterate_chain_key((u64)0, xid);
-        pid = xhlock_class(xhlock) - lock_classes;
-        chain_key = iterate_chain_key(chain_key, pid);
-        if (lookup_chain_cache(chain_key))
-                return 1;
-        if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
-                                chain_key))
-                return 0;
-        if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
-                            &xhlock->trace, copy_trace))
-                return 0;
-        return 1;
-}
-static void commit_xhlocks(struct cross_lock *xlock)
-{
-        unsigned int cur = current->xhlock_idx;
-        unsigned int prev_hist_id = xhlock(cur).hist_id;
-        unsigned int i;
-        if (!graph_lock())
-                return;
-        if (xlock->nr_acquire) {
-                for (i = 0; i < MAX_XHLOCKS_NR; i++) {
-                        struct hist_lock *xhlock = &xhlock(cur - i);
-                        if (!xhlock_valid(xhlock))
-                                break;
-                        if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
-                                break;
-                        if (!same_context_xhlock(xhlock))
-                                break;
-                        /*
-                         * Filter out the cases where the ring buffer was
-                         * overwritten and the current entry has a bigger
-                         * hist_id than the previous one, which is impossible
-                         * otherwise:
-                         */
-                        if (unlikely(before(prev_hist_id, xhlock->hist_id)))
-                                break;
-                        prev_hist_id = xhlock->hist_id;
-                        /*
-                         * commit_xhlock() returns 0 with graph_lock already
-                         * released if fail.
-                         */
-                        if (!commit_xhlock(xlock, xhlock))
-                                return;
-                }
-        }
-        graph_unlock();
-}
-void lock_commit_crosslock(struct lockdep_map *lock)
-{
-        struct cross_lock *xlock;
-        unsigned long flags;
-        if (unlikely(!debug_locks || current->lockdep_recursion))
-                return;
-        if (!current->xhlocks)
-                return;
-        /*
-         * Do commit hist_locks with the cross_lock, only in case that
-         * the cross_lock could depend on acquisitions after that.
-         *
-         * For example, if the cross_lock does not have the 'check' flag
-         * then we don't need to check dependencies and commit for that.
-         * Just skip it. In that case, of course, the cross_lock does
-         * not depend on acquisitions ahead, either.
-         *
-         * WARNING: Don't do that in add_xlock() in advance. When an
-         * acquisition context is different from the commit context,
-         * invalid(skipped) cross_lock might be accessed.
-         */
-        if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
-                return;
-        raw_local_irq_save(flags);
-        check_flags(flags);
-        current->lockdep_recursion = 1;
-        xlock = &((struct lockdep_map_cross *)lock)->xlock;
-        commit_xhlocks(xlock);
-        current->lockdep_recursion = 0;
-        raw_local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(lock_commit_crosslock);
-/*
- * Return: 0 - failure;
- *         1 - crosslock, done;
- *         2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_release_crosslock(struct lockdep_map *lock)
-{
-        if (cross_lock(lock)) {
-                if (!graph_lock())
-                        return 0;
-                ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
-                graph_unlock();
-                return 1;
-        }
-        return 2;
-}
-static void cross_init(struct lockdep_map *lock, int cross)
-{
-        if (cross)
-                ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
-        lock->cross = cross;
-        /*
-         * Crossrelease assumes that the ring buffer size of xhlocks
-         * is aligned with power of 2. So force it on build.
-         */
-        BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
-}
-void lockdep_init_task(struct task_struct *task)
-{
-        int i;
-        task->xhlock_idx = UINT_MAX;
-        task->hist_id = 0;
-        for (i = 0; i < XHLOCK_CTX_NR; i++) {
-                task->xhlock_idx_hist[i] = UINT_MAX;
-                task->hist_id_save[i] = 0;
-        }
-        task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
-                                GFP_KERNEL);
-}
-void lockdep_free_task(struct task_struct *task)
-{
-        if (task->xhlocks) {
-                void *tmp = task->xhlocks;
-                /* Diable crossrelease for current */
-                task->xhlocks = NULL;
-                kfree(tmp);
-        }
-}
-#endif
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 1fd1a7543cdd..936f3d14dd6b 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock)			\
                        break;                                          \
                preempt_enable();                                       \
                                                                        \
-                if (!(lock)->break_lock)                                \
+                arch_##op##_relax(&lock->raw_lock);                     \
-                        (lock)->break_lock = 1;                         \
-                while ((lock)->break_lock)                              \
-                        arch_##op##_relax(&lock->raw_lock);             \
        }                                                               \
-        (lock)->break_lock = 0;                                         \
 }                                                                       \
                                                                        \
 unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock)  \
@@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock)	\
                local_irq_restore(flags);                               \
                preempt_enable();                                       \
                                                                        \
-                if (!(lock)->break_lock)                                \
+                arch_##op##_relax(&lock->raw_lock);                     \
-                        (lock)->break_lock = 1;                         \
-                while ((lock)->break_lock)                              \
-                        arch_##op##_relax(&lock->raw_lock);             \
        }                                                               \
-        (lock)->break_lock = 0;                                         \
+                                                                        \
        return flags;                                                   \
 }                                                                       \
                                                                        \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 75554f366fd3..644fa2e3d993 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
        return ret;
 }
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
 {
        struct task_struct *p;
@@ -5144,6 +5133,17 @@ out_unlock:
        return retval;
 }
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
                struct timespec __user *, interval)
 {
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2f52ec0f1539..d6717a3331a1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
 #ifdef CONFIG_NO_HZ_COMMON
 static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
 {
-        unsigned long idle_calls = tick_nohz_get_idle_calls();
+        unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
        bool ret = idle_calls == sg_cpu->saved_idle_calls;
        sg_cpu->saved_idle_calls = idle_calls;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4056c19ca3f0..665ace2fc558 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq)
        bool resched = false;
        struct task_struct *p;
        struct rq *src_rq;
+        int rt_overload_count = rt_overloaded(this_rq);
-        if (likely(!rt_overloaded(this_rq)))
+        if (likely(!rt_overload_count))
                return;
        /*
@@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq)
         */
        smp_rmb();
+        /* If we are the only overloaded CPU do nothing */
+        if (rt_overload_count == 1 &&
+            cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+                return;
 #ifdef HAVE_RT_PUSH_IPI
        if (sched_feat(RT_PUSH_IPI)) {
                tell_cpu_to_push(this_rq);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e776fc8cc1df..f6b5f19223d6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -95,6 +95,7 @@ config NO_HZ_FULL
        select RCU_NOCB_CPU
        select VIRT_CPU_ACCOUNTING_GEN
        select IRQ_WORK
+        select CPU_ISOLATION
        help
         Adaptively try to shutdown the tick whenever possible, even when
         the CPU is running tasks. Typically this requires running a single
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 13d6881f908b..ec999f32c840 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event)
 {
        struct task_struct *rtn = current->group_leader;
-        if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
+        switch (event->sigev_notify) {
-                (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
+        case SIGEV_SIGNAL | SIGEV_THREAD_ID:
-                 !same_thread_group(rtn, current) ||
+                rtn = find_task_by_vpid(event->sigev_notify_thread_id);
-                 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+                if (!rtn || !same_thread_group(rtn, current))
+                        return NULL;
+                /* FALLTHRU */
+        case SIGEV_SIGNAL:
+        case SIGEV_THREAD:
+                if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
+                        return NULL;
+                /* FALLTHRU */
+        case SIGEV_NONE:
+                return task_pid(rtn);
+        default:
                return NULL;
+        }
-        if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
-            ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
-                return NULL;
-        return task_pid(rtn);
 }
 static struct k_itimer * alloc_posix_timer(void)
@@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
        struct timespec64 ts64;
        bool sig_none;
-        sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
+        sig_none = timr->it_sigev_notify == SIGEV_NONE;
        iv = timr->it_interval;
        /* interval timer ? */
@@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags,
        timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
        expires = timespec64_to_ktime(new_setting->it_value);
-        sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
+        sigev_none = timr->it_sigev_notify == SIGEV_NONE;
        kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
        timr->it_active = !sigev_none;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99578f06c8d4..f7cc7abfcf25 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
        ts->next_tick = 0;
 }
+/*
+ * Report whether the timer softirq is pending on the local CPU.
+ *
+ * local_softirq_pending() yields a bitmask with one bit per softirq,
+ * while TIMER_SOFTIRQ is the softirq number (a bit position), not a
+ * mask. It therefore has to be shifted into a mask before testing;
+ * ANDing with the raw number would test an unrelated bit.
+ */
+static inline bool local_timer_softirq_pending(void)
+{
+        return local_softirq_pending() & (1UL << TIMER_SOFTIRQ);
+}
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
                                         ktime_t now, int cpu)
 {
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
        } while (read_seqretry(&jiffies_lock, seq));
        ts->last_jiffies = basejiff;
-        if (rcu_needs_cpu(basemono, &next_rcu) ||
+        /*
-            arch_needs_cpu() || irq_work_needs_cpu()) {
+         * Keep the periodic tick, when RCU, architecture or irq_work
+         * requests it.
+         * Aside from that, check whether the local timer softirq is
+         * pending. If so, it's a bad idea to call get_next_timer_interrupt()
+         * because there is an already expired timer, so it will request
+         * immediate expiry, which rearms the hardware timer with a
+         * minimal delta which brings us back to this place
+         * immediately. Lather, rinse and repeat...
+         */
+        if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+            irq_work_needs_cpu() || local_timer_softirq_pending()) {
                next_tick = basemono + TICK_NSEC;
        } else {
                /*
@@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void)
 }
 /**
+ * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
+ * for a particular CPU.
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
+{
+        /* Fetch @cpu's tick_sched and hand back its idle_calls counter. */
+        return tick_get_tick_sched(cpu)->idle_calls;
+}
+/**
 * tick_nohz_get_idle_calls - return the current idle calls counter value
 *
 * Called from the schedutil frequency scaling governor in scheduler context.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ffebcf878fba..89a9e1b4264a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
        struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
        /*
-         * If the timer is deferrable and nohz is active then we need to use
+         * If the timer is deferrable and NO_HZ_COMMON is set then we need
-         * the deferrable base.
+         * to use the deferrable base.
         */
-        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
-            (tflags & TIMER_DEFERRABLE))
                base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
        return base;
 }
@@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
        /*
-         * If the timer is deferrable and nohz is active then we need to use
+         * If the timer is deferrable and NO_HZ_COMMON is set then we need
-         * the deferrable base.
+         * to use the deferrable base.
         */
-        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
-            (tflags & TIMER_DEFERRABLE))
                base = this_cpu_ptr(&timer_bases[BASE_DEF]);
        return base;
 }
@@ -1009,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
        if (!ret && (options & MOD_TIMER_PENDING_ONLY))
                goto out_unlock;
-        debug_activate(timer, expires);
        new_base = get_target_base(base, timer->flags);
        if (base != new_base) {
@@ -1034,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
                }
        }
+        debug_activate(timer, expires);
        timer->expires = expires;
        /*
         * If 'idx' was calculated above and the base time did not advance
@@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
        base->must_forward_clk = false;
        __run_timers(base);
-        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+        if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
                __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
 }
@@ -1855,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
        }
 }
+/*
+ * Reinitialise every timer base (all NR_BASES of them) belonging to @cpu:
+ * the base clock is set to the current jiffies, next_expiry is pushed out
+ * by NEXT_TIMER_MAX_DELTA from that clock, the idle marker is cleared and
+ * must_forward_clk is set.
+ *
+ * Always returns 0.
+ */
+int timers_prepare_cpu(unsigned int cpu)
+{
+        int idx;
+        for (idx = 0; idx < NR_BASES; idx++) {
+                struct timer_base *tb = per_cpu_ptr(&timer_bases[idx], cpu);
+                tb->clk = jiffies;
+                tb->next_expiry = tb->clk + NEXT_TIMER_MAX_DELTA;
+                tb->must_forward_clk = true;
+                tb->is_idle = false;
+        }
+        return 0;
+}
 int timers_dead_cpu(unsigned int cpu)
 {
        struct timer_base *old_base;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index af7dad126c13..904c952ac383 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS
        bool "Enable trace events for preempt and irq disable/enable"
        select TRACE_IRQFLAGS
        depends on DEBUG_PREEMPT || !PROVE_LOCKING
+        depends on TRACING
        default n
        help
          Enable tracing of disable and enable events for preemption and irqs.
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0ce99c379c30..40207c2a4113 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
        .arg4_type      = ARG_CONST_SIZE,
 };
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
-                        u64 flags, struct perf_raw_record *raw)
+                        u64 flags, struct perf_sample_data *sd)
 {
        struct bpf_array *array = container_of(map, struct bpf_array, map);
-        struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
        unsigned int cpu = smp_processor_id();
        u64 index = flags & BPF_F_INDEX_MASK;
        struct bpf_event_entry *ee;
@@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
        if (unlikely(event->oncpu != cpu))
                return -EOPNOTSUPP;
-        perf_sample_data_init(sd, 0, 0);
-        sd->raw = raw;
        perf_event_output(event, sd, regs);
        return 0;
 }
@@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
           u64, flags, void *, data, u64, size)
 {
+        struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
        struct perf_raw_record raw = {
                .frag = {
                        .size = size,
@@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
        if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
                return -EINVAL;
-        return __bpf_perf_event_output(regs, map, flags, &raw);
+        perf_sample_data_init(sd, 0, 0);
+        sd->raw = &raw;
+        return __bpf_perf_event_output(regs, map, flags, sd);
 }
 static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 };
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
                     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
+        struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
        struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
        struct perf_raw_frag frag = {
                .copy           = ctx_copy,
@@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
        };
        perf_fetch_caller_regs(regs);
+        perf_sample_data_init(sd, 0, 0);
+        sd->raw = &raw;
-        return __bpf_perf_event_output(regs, map, flags, &raw);
+        return __bpf_perf_event_output(regs, map, flags, sd);
 }
 BPF_CALL_0(bpf_get_current_task)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 91874a95060d..9ab18995ff1e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 /* Missed count stored at end */
 #define RB_MISSED_STORED        (1 << 30)
+#define RB_MISSED_FLAGS         (RB_MISSED_EVENTS|RB_MISSED_STORED)
 struct buffer_data_page {
        u64              time_stamp;    /* page time stamp */
        local_t          commit;        /* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
 */
 size_t ring_buffer_page_len(void *page)
 {
-        return local_read(&((struct buffer_data_page *)page)->commit)
+        struct buffer_data_page *bpage = page;
+        return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
                + BUF_PAGE_HDR_SIZE;
 }
@@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
-static __always_inline void *
-__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
-{
-        return bpage->data + index;
-}
 static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
 {
        return bpage->page->data + index;
@@ -4406,8 +4404,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
 {
        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
        struct buffer_data_page *bpage = data;
+        struct page *page = virt_to_page(bpage);
        unsigned long flags;
+        /* If the page is still in use someplace else, we can't reuse it */
+        if (page_ref_count(page) > 1)
+                goto out;
        local_irq_save(flags);
        arch_spin_lock(&cpu_buffer->lock);
@@ -4419,6 +4422,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
        arch_spin_unlock(&cpu_buffer->lock);
        local_irq_restore(flags);
+ out:
        free_page((unsigned long)bpage);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 73e67b68c53b..2a8d8a294345 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct
 }
 /**
- * trace_pid_filter_add_remove - Add or remove a task from a pid_list
+ * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list
 * @pid_list: The list to modify
 * @self: The current task for fork or NULL for exit
 * @task: The task to add or remove
@@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr)
 }
 /**
- * trace_snapshot - take a snapshot of the current buffer.
+ * tracing_snapshot - take a snapshot of the current buffer.
 *
 * This causes a swap between the snapshot buffer and the current live
 * tracing buffer. You can use this to take snapshots of the live
@@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void)
 EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
 /**
- * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
 *
- * This is similar to trace_snapshot(), but it will allocate the
+ * This is similar to tracing_snapshot(), but it will allocate the
 * snapshot buffer if it isn't already allocated. Use this only
 * where it is safe to sleep, as the allocation may sleep.
 *
@@ -1303,7 +1303,7 @@ unsigned long __read_mostly	tracing_thresh;
 /*
 * Copy the new maximum trace into the separate maximum-trace
 * structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
+ * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
 */
 static void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2415,7 +2415,7 @@ trace_process_export(struct trace_export *export,
        entry = ring_buffer_event_data(event);
        size = ring_buffer_event_length(event);
-        export->write(entry, size);
+        export->write(export, entry, size);
 }
 static DEFINE_MUTEX(ftrace_export_lock);
@@ -4178,37 +4178,30 @@ static const struct file_operations show_traces_fops = {
        .llseek         = seq_lseek,
 };
-/*
- * The tracer itself will not take this lock, but still we want
- * to provide a consistent cpumask to user-space:
- */
-static DEFINE_MUTEX(tracing_cpumask_update_lock);
-/*
- * Temporary storage for the character representation of the
- * CPU bitmask (and one more byte for the newline):
- */
-static char mask_str[NR_CPUS + 1];
 static ssize_t
 tracing_cpumask_read(struct file *filp, char __user *ubuf,
                     size_t count, loff_t *ppos)
 {
        struct trace_array *tr = file_inode(filp)->i_private;
+        char *mask_str;
        int len;
-        mutex_lock(&tracing_cpumask_update_lock);
+        len = snprintf(NULL, 0, "%*pb\n",
+                       cpumask_pr_args(tr->tracing_cpumask)) + 1;
+        mask_str = kmalloc(len, GFP_KERNEL);
+        if (!mask_str)
+                return -ENOMEM;
-        len = snprintf(mask_str, count, "%*pb\n",
+        len = snprintf(mask_str, len, "%*pb\n",
                       cpumask_pr_args(tr->tracing_cpumask));
        if (len >= count) {
                count = -EINVAL;
                goto out_err;
        }
-        count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+        count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
 out_err:
-        mutex_unlock(&tracing_cpumask_update_lock);
+        kfree(mask_str);
        return count;
 }
@@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
        if (err)
                goto err_unlock;
-        mutex_lock(&tracing_cpumask_update_lock);
        local_irq_disable();
        arch_spin_lock(&tr->max_lock);
        for_each_tracing_cpu(cpu) {
@@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
        local_irq_enable();
        cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
-        mutex_unlock(&tracing_cpumask_update_lock);
        free_cpumask_var(tracing_cpumask_new);
        return count;
@@ -6780,7 +6769,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                .spd_release    = buffer_spd_release,
        };
        struct buffer_ref *ref;
-        int entries, size, i;
+        int entries, i;
        ssize_t ret = 0;
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -6834,14 +6823,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                        break;
                }
-                /*
-                 * zero out any left over data, this is going to
-                 * user land.
-                 */
-                size = ring_buffer_page_len(ref->page);
-                if (size < PAGE_SIZE)
-                        memset(ref->page + size, 0, PAGE_SIZE - size);
                page = virt_to_page(ref->page);
                spd.pages[i] = page;
@@ -7599,6 +7580,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
        buf->data = alloc_percpu(struct trace_array_cpu);
        if (!buf->data) {
                ring_buffer_free(buf->buffer);
+                buf->buffer = NULL;
                return -ENOMEM;
        }
@@ -7622,7 +7604,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
                                    allocate_snapshot ? size : 1);
        if (WARN_ON(ret)) {
                ring_buffer_free(tr->trace_buffer.buffer);
+                tr->trace_buffer.buffer = NULL;
                free_percpu(tr->trace_buffer.data);
+                tr->trace_buffer.data = NULL;
                return -ENOMEM;
        }
        tr->allocated_snapshot = allocate_snapshot;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 734accc02418..3c7bfc4bf5e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
        if (__this_cpu_read(disable_stack_tracer) != 1)
                goto out;
+        /* If rcu is not watching, then save stack trace can fail */
+        if (!rcu_is_watching())
+                goto out;
        ip += MCOUNT_INSN_SIZE;
        check_stack(ip, &stack);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ce74a4901d2b..ef1da2a5f9bd 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
                return retval;
        }
+        groups_sort(group_info);
        retval = set_current_groups(group_info);
        put_group_info(group_info);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8fdb710bfdd7..43d18cb46308 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -38,7 +38,6 @@
 #include <linux/hardirq.h>
 #include <linux/mempolicy.h>
 #include <linux/freezer.h>
-#include <linux/kallsyms.h>
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
 #include <linux/idr.h>
@@ -48,6 +47,7 @@
 #include <linux/nodemask.h>
 #include <linux/moduleparam.h>
 #include <linux/uaccess.h>
+#include <linux/sched/isolation.h>
 #include "workqueue_internal.h"
@@ -1634,7 +1634,7 @@ static void worker_enter_idle(struct worker *worker)
                mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
        /*
-         * Sanity check nr_running.  Because wq_unbind_fn() releases
+         * Sanity check nr_running.  Because unbind_workers() releases
         * pool->lock between setting %WORKER_UNBOUND and zapping
         * nr_running, the warning may trigger spuriously.  Check iff
         * unbind is not in progress.
@@ -4510,9 +4510,8 @@ void show_workqueue_state(void)
 * cpu comes back online.
 */
-static void wq_unbind_fn(struct work_struct *work)
+static void unbind_workers(int cpu)
 {
-        int cpu = smp_processor_id();
        struct worker_pool *pool;
        struct worker *worker;
@@ -4589,16 +4588,6 @@ static void rebind_workers(struct worker_pool *pool)
        spin_lock_irq(&pool->lock);
-        /*
-         * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
-         * w/o preceding DOWN_PREPARE.  Work around it.  CPU hotplug is
-         * being reworked and this can go away in time.
-         */
-        if (!(pool->flags & POOL_DISASSOCIATED)) {
-                spin_unlock_irq(&pool->lock);
-                return;
-        }
        pool->flags &= ~POOL_DISASSOCIATED;
        for_each_pool_worker(worker, pool) {
@@ -4709,12 +4698,13 @@ int workqueue_online_cpu(unsigned int cpu)
 int workqueue_offline_cpu(unsigned int cpu)
 {
-        struct work_struct unbind_work;
        struct workqueue_struct *wq;
        /* unbinding per-cpu workers should happen on the local CPU */
-        INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+        if (WARN_ON(cpu != smp_processor_id()))
-        queue_work_on(cpu, system_highpri_wq, &unbind_work);
+                return -1;
+        unbind_workers(cpu);
        /* update NUMA affinity of unbound workqueues */
        mutex_lock(&wq_pool_mutex);
@@ -4722,9 +4712,6 @@ int workqueue_offline_cpu(unsigned int cpu)
                wq_update_unbound_numa(wq, cpu, false);
        mutex_unlock(&wq_pool_mutex);
-        /* wait for per-cpu unbinding to finish */
-        flush_work(&unbind_work);
-        destroy_work_on_stack(&unbind_work);
        return 0;
 }
@@ -4957,6 +4944,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
        if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
                return -ENOMEM;
+        /*
+         * Not excluding isolated cpus on purpose.
+         * If the user wishes to include them, we allow that.
+         */
        cpumask_and(cpumask, cpumask, cpu_possible_mask);
        if (!cpumask_empty(cpumask)) {
                apply_wqattrs_lock();
@@ -5555,7 +5546,7 @@ int __init workqueue_init_early(void)
        WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
        BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-        cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+        cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
        pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2018-01-02 08:46:35 -0500
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2018-01-02 08:46:35 -0500
commit	b6a09416e83ffe4eccfb4ef1b91b3b66483fa810 (patch)
tree	b30f266e85047244dcdb47d5afc134e76aec530d /kernel
parent	db809859c8cee415293b830e67178f526d1eb2be (diff)
parent	30a7acd573899fd8b8ac39236eff6468b195ac7d (diff)