Diffstat (limited to 'kernel')
38 files changed, 465 insertions(+), 164 deletions(-)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 715f9fcf4712..c57bd10340ed 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
| @@ -467,7 +467,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) | |||
| 467 | return kind_ops[BTF_INFO_KIND(t->info)]; | 467 | return kind_ops[BTF_INFO_KIND(t->info)]; |
| 468 | } | 468 | } |
| 469 | 469 | ||
| 470 | bool btf_name_offset_valid(const struct btf *btf, u32 offset) | 470 | static bool btf_name_offset_valid(const struct btf *btf, u32 offset) |
| 471 | { | 471 | { |
| 472 | return BTF_STR_OFFSET_VALID(offset) && | 472 | return BTF_STR_OFFSET_VALID(offset) && |
| 473 | offset < btf->hdr.str_len; | 473 | offset < btf->hdr.str_len; |
| @@ -1219,8 +1219,6 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, | |||
| 1219 | u8 nr_copy_bits; | 1219 | u8 nr_copy_bits; |
| 1220 | u64 print_num; | 1220 | u64 print_num; |
| 1221 | 1221 | ||
| 1222 | data += BITS_ROUNDDOWN_BYTES(bits_offset); | ||
| 1223 | bits_offset = BITS_PER_BYTE_MASKED(bits_offset); | ||
| 1224 | nr_copy_bits = nr_bits + bits_offset; | 1222 | nr_copy_bits = nr_bits + bits_offset; |
| 1225 | nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); | 1223 | nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); |
| 1226 | 1224 | ||
| @@ -1255,7 +1253,9 @@ static void btf_int_bits_seq_show(const struct btf *btf, | |||
| 1255 | * BTF_INT_OFFSET() cannot exceed 64 bits. | 1253 | * BTF_INT_OFFSET() cannot exceed 64 bits. |
| 1256 | */ | 1254 | */ |
| 1257 | total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); | 1255 | total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); |
| 1258 | btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m); | 1256 | data += BITS_ROUNDDOWN_BYTES(total_bits_offset); |
| 1257 | bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); | ||
| 1258 | btf_bitfield_seq_show(data, bits_offset, nr_bits, m); | ||
| 1259 | } | 1259 | } |
| 1260 | 1260 | ||
| 1261 | static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, | 1261 | static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, |
| @@ -1459,7 +1459,8 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, | |||
| 1459 | 1459 | ||
| 1460 | /* "typedef void new_void", "const void"...etc */ | 1460 | /* "typedef void new_void", "const void"...etc */ |
| 1461 | if (!btf_type_is_void(next_type) && | 1461 | if (!btf_type_is_void(next_type) && |
| 1462 | !btf_type_is_fwd(next_type)) { | 1462 | !btf_type_is_fwd(next_type) && |
| 1463 | !btf_type_is_func_proto(next_type)) { | ||
| 1463 | btf_verifier_log_type(env, v->t, "Invalid type_id"); | 1464 | btf_verifier_log_type(env, v->t, "Invalid type_id"); |
| 1464 | return -EINVAL; | 1465 | return -EINVAL; |
| 1465 | } | 1466 | } |
| @@ -2001,12 +2002,12 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, | |||
| 2001 | 2002 | ||
| 2002 | member_offset = btf_member_bit_offset(t, member); | 2003 | member_offset = btf_member_bit_offset(t, member); |
| 2003 | bitfield_size = btf_member_bitfield_size(t, member); | 2004 | bitfield_size = btf_member_bitfield_size(t, member); |
| 2005 | bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); | ||
| 2006 | bits8_offset = BITS_PER_BYTE_MASKED(member_offset); | ||
| 2004 | if (bitfield_size) { | 2007 | if (bitfield_size) { |
| 2005 | btf_bitfield_seq_show(data, member_offset, | 2008 | btf_bitfield_seq_show(data + bytes_offset, bits8_offset, |
| 2006 | bitfield_size, m); | 2009 | bitfield_size, m); |
| 2007 | } else { | 2010 | } else { |
| 2008 | bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); | ||
| 2009 | bits8_offset = BITS_PER_BYTE_MASKED(member_offset); | ||
| 2010 | ops = btf_type_ops(member_type); | 2011 | ops = btf_type_ops(member_type); |
| 2011 | ops->seq_show(btf, member_type, member->type, | 2012 | ops->seq_show(btf, member_type, member->type, |
| 2012 | data + bytes_offset, bits8_offset, m); | 2013 | data + bytes_offset, bits8_offset, m); |
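The btf.c hunks above fold whole bytes of a member's bit offset into the data pointer before calling btf_bitfield_seq_show(), which only takes a u8 bits_offset. A minimal userspace sketch of that byte/bit split, assuming nothing beyond standard C (the helpers below are illustrative stand-ins for the kernel's BITS_ROUNDDOWN_BYTES()/BITS_PER_BYTE_MASKED() macros, not the kernel code):

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE 8

/* Stand-ins for BITS_ROUNDDOWN_BYTES() and BITS_PER_BYTE_MASKED(). */
static uint32_t bits_rounddown_bytes(uint32_t bits) { return bits / BITS_PER_BYTE; }
static uint8_t  bits_per_byte_masked(uint32_t bits) { return bits % BITS_PER_BYTE; }

int main(void)
{
	/* A member at bit offset 1000 cannot be passed as a u8 bits_offset,
	 * so the caller folds whole bytes into the data pointer and hands
	 * the show helper only the residual 0..7 bit offset.
	 */
	uint32_t member_bit_offset = 1000;
	uint32_t bytes_offset = bits_rounddown_bytes(member_bit_offset);
	uint8_t  bits8_offset = bits_per_byte_masked(member_bit_offset);

	printf("advance data by %u bytes, show bitfield at bit %u\n",
	       (unsigned)bytes_offset, (unsigned)bits8_offset);
	return 0;
}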
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9425c2fb872f..d17d05570a3f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
| @@ -572,7 +572,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, | |||
| 572 | bpf_compute_and_save_data_end(skb, &saved_data_end); | 572 | bpf_compute_and_save_data_end(skb, &saved_data_end); |
| 573 | 573 | ||
| 574 | ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, | 574 | ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, |
| 575 | bpf_prog_run_save_cb); | 575 | __bpf_prog_run_save_cb); |
| 576 | bpf_restore_data_end(skb, saved_data_end); | 576 | bpf_restore_data_end(skb, saved_data_end); |
| 577 | __skb_pull(skb, offset); | 577 | __skb_pull(skb, offset); |
| 578 | skb->sk = save_sk; | 578 | skb->sk = save_sk; |
| @@ -718,6 +718,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | |||
| 718 | case BPF_FUNC_trace_printk: | 718 | case BPF_FUNC_trace_printk: |
| 719 | if (capable(CAP_SYS_ADMIN)) | 719 | if (capable(CAP_SYS_ADMIN)) |
| 720 | return bpf_get_trace_printk_proto(); | 720 | return bpf_get_trace_printk_proto(); |
| 721 | /* fall through */ | ||
| 721 | default: | 722 | default: |
| 722 | return NULL; | 723 | return NULL; |
| 723 | } | 724 | } |
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..f9274114c88d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
| @@ -686,7 +686,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) | |||
| 686 | } | 686 | } |
| 687 | 687 | ||
| 688 | if (htab_is_prealloc(htab)) { | 688 | if (htab_is_prealloc(htab)) { |
| 689 | pcpu_freelist_push(&htab->freelist, &l->fnode); | 689 | __pcpu_freelist_push(&htab->freelist, &l->fnode); |
| 690 | } else { | 690 | } else { |
| 691 | atomic_dec(&htab->count); | 691 | atomic_dec(&htab->count); |
| 692 | l->htab = htab; | 692 | l->htab = htab; |
| @@ -748,7 +748,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | |||
| 748 | } else { | 748 | } else { |
| 749 | struct pcpu_freelist_node *l; | 749 | struct pcpu_freelist_node *l; |
| 750 | 750 | ||
| 751 | l = pcpu_freelist_pop(&htab->freelist); | 751 | l = __pcpu_freelist_pop(&htab->freelist); |
| 752 | if (!l) | 752 | if (!l) |
| 753 | return ERR_PTR(-E2BIG); | 753 | return ERR_PTR(-E2BIG); |
| 754 | l_new = container_of(l, struct htab_elem, fnode); | 754 | l_new = container_of(l, struct htab_elem, fnode); |
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index abf1002080df..93a5cbbde421 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
| @@ -471,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) | |||
| 471 | } | 471 | } |
| 472 | 472 | ||
| 473 | if (!node || node->prefixlen != key->prefixlen || | 473 | if (!node || node->prefixlen != key->prefixlen || |
| 474 | node->prefixlen != matchlen || | ||
| 474 | (node->flags & LPM_TREE_NODE_FLAG_IM)) { | 475 | (node->flags & LPM_TREE_NODE_FLAG_IM)) { |
| 475 | ret = -ENOENT; | 476 | ret = -ENOENT; |
| 476 | goto out; | 477 | goto out; |
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 99d243e1ad6e..52378d3e34b3 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
| @@ -12,6 +12,7 @@ | |||
| 12 | struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) | 12 | struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) |
| 13 | { | 13 | { |
| 14 | struct bpf_map *inner_map, *inner_map_meta; | 14 | struct bpf_map *inner_map, *inner_map_meta; |
| 15 | u32 inner_map_meta_size; | ||
| 15 | struct fd f; | 16 | struct fd f; |
| 16 | 17 | ||
| 17 | f = fdget(inner_map_ufd); | 18 | f = fdget(inner_map_ufd); |
| @@ -36,7 +37,12 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) | |||
| 36 | return ERR_PTR(-EINVAL); | 37 | return ERR_PTR(-EINVAL); |
| 37 | } | 38 | } |
| 38 | 39 | ||
| 39 | inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); | 40 | inner_map_meta_size = sizeof(*inner_map_meta); |
| 41 | /* In some cases verifier needs to access beyond just base map. */ | ||
| 42 | if (inner_map->ops == &array_map_ops) | ||
| 43 | inner_map_meta_size = sizeof(struct bpf_array); | ||
| 44 | |||
| 45 | inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); | ||
| 40 | if (!inner_map_meta) { | 46 | if (!inner_map_meta) { |
| 41 | fdput(f); | 47 | fdput(f); |
| 42 | return ERR_PTR(-ENOMEM); | 48 | return ERR_PTR(-ENOMEM); |
| @@ -46,9 +52,16 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) | |||
| 46 | inner_map_meta->key_size = inner_map->key_size; | 52 | inner_map_meta->key_size = inner_map->key_size; |
| 47 | inner_map_meta->value_size = inner_map->value_size; | 53 | inner_map_meta->value_size = inner_map->value_size; |
| 48 | inner_map_meta->map_flags = inner_map->map_flags; | 54 | inner_map_meta->map_flags = inner_map->map_flags; |
| 49 | inner_map_meta->ops = inner_map->ops; | ||
| 50 | inner_map_meta->max_entries = inner_map->max_entries; | 55 | inner_map_meta->max_entries = inner_map->max_entries; |
| 51 | 56 | ||
| 57 | /* Misc members not needed in bpf_map_meta_equal() check. */ | ||
| 58 | inner_map_meta->ops = inner_map->ops; | ||
| 59 | if (inner_map->ops == &array_map_ops) { | ||
| 60 | inner_map_meta->unpriv_array = inner_map->unpriv_array; | ||
| 61 | container_of(inner_map_meta, struct bpf_array, map)->index_mask = | ||
| 62 | container_of(inner_map, struct bpf_array, map)->index_mask; | ||
| 63 | } | ||
| 64 | |||
| 52 | fdput(f); | 65 | fdput(f); |
| 53 | return inner_map_meta; | 66 | return inner_map_meta; |
| 54 | } | 67 | } |
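The map_in_map change above sizes the meta map like a struct bpf_array when the inner map is an array, because the verifier later uses container_of() on the meta map and reads array-only fields such as index_mask. A simplified, self-contained sketch of why the allocation must cover the containing struct (the struct names below are stand-ins, not the kernel definitions):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-ins for struct bpf_map embedded in struct bpf_array. */
struct map { unsigned int key_size, value_size; };
struct array_map { struct map map; unsigned int index_mask; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	/* Allocating only sizeof(struct map) here would let the later
	 * container_of() access read and write past the allocation,
	 * which is why the meta map is sized like the containing struct.
	 */
	struct map *meta = calloc(1, sizeof(struct array_map));

	if (!meta)
		return 1;
	container_of(meta, struct array_map, map)->index_mask = 0x3f;
	printf("index_mask=%#x\n",
	       container_of(meta, struct array_map, map)->index_mask);
	free(meta);
	return 0;
}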
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 673fa6fe2d73..0c1b4ba9e90e 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
| @@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s) | |||
| 28 | free_percpu(s->freelist); | 28 | free_percpu(s->freelist); |
| 29 | } | 29 | } |
| 30 | 30 | ||
| 31 | static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, | 31 | static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, |
| 32 | struct pcpu_freelist_node *node) | 32 | struct pcpu_freelist_node *node) |
| 33 | { | 33 | { |
| 34 | raw_spin_lock(&head->lock); | 34 | raw_spin_lock(&head->lock); |
| 35 | node->next = head->first; | 35 | node->next = head->first; |
| @@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, | |||
| 37 | raw_spin_unlock(&head->lock); | 37 | raw_spin_unlock(&head->lock); |
| 38 | } | 38 | } |
| 39 | 39 | ||
| 40 | void pcpu_freelist_push(struct pcpu_freelist *s, | 40 | void __pcpu_freelist_push(struct pcpu_freelist *s, |
| 41 | struct pcpu_freelist_node *node) | 41 | struct pcpu_freelist_node *node) |
| 42 | { | 42 | { |
| 43 | struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); | 43 | struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); |
| 44 | 44 | ||
| 45 | __pcpu_freelist_push(head, node); | 45 | ___pcpu_freelist_push(head, node); |
| 46 | } | ||
| 47 | |||
| 48 | void pcpu_freelist_push(struct pcpu_freelist *s, | ||
| 49 | struct pcpu_freelist_node *node) | ||
| 50 | { | ||
| 51 | unsigned long flags; | ||
| 52 | |||
| 53 | local_irq_save(flags); | ||
| 54 | __pcpu_freelist_push(s, node); | ||
| 55 | local_irq_restore(flags); | ||
| 46 | } | 56 | } |
| 47 | 57 | ||
| 48 | void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, | 58 | void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, |
| @@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, | |||
| 63 | for_each_possible_cpu(cpu) { | 73 | for_each_possible_cpu(cpu) { |
| 64 | again: | 74 | again: |
| 65 | head = per_cpu_ptr(s->freelist, cpu); | 75 | head = per_cpu_ptr(s->freelist, cpu); |
| 66 | __pcpu_freelist_push(head, buf); | 76 | ___pcpu_freelist_push(head, buf); |
| 67 | i++; | 77 | i++; |
| 68 | buf += elem_size; | 78 | buf += elem_size; |
| 69 | if (i == nr_elems) | 79 | if (i == nr_elems) |
| @@ -74,14 +84,12 @@ again: | |||
| 74 | local_irq_restore(flags); | 84 | local_irq_restore(flags); |
| 75 | } | 85 | } |
| 76 | 86 | ||
| 77 | struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) | 87 | struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) |
| 78 | { | 88 | { |
| 79 | struct pcpu_freelist_head *head; | 89 | struct pcpu_freelist_head *head; |
| 80 | struct pcpu_freelist_node *node; | 90 | struct pcpu_freelist_node *node; |
| 81 | unsigned long flags; | ||
| 82 | int orig_cpu, cpu; | 91 | int orig_cpu, cpu; |
| 83 | 92 | ||
| 84 | local_irq_save(flags); | ||
| 85 | orig_cpu = cpu = raw_smp_processor_id(); | 93 | orig_cpu = cpu = raw_smp_processor_id(); |
| 86 | while (1) { | 94 | while (1) { |
| 87 | head = per_cpu_ptr(s->freelist, cpu); | 95 | head = per_cpu_ptr(s->freelist, cpu); |
| @@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) | |||
| 89 | node = head->first; | 97 | node = head->first; |
| 90 | if (node) { | 98 | if (node) { |
| 91 | head->first = node->next; | 99 | head->first = node->next; |
| 92 | raw_spin_unlock_irqrestore(&head->lock, flags); | 100 | raw_spin_unlock(&head->lock); |
| 93 | return node; | 101 | return node; |
| 94 | } | 102 | } |
| 95 | raw_spin_unlock(&head->lock); | 103 | raw_spin_unlock(&head->lock); |
| 96 | cpu = cpumask_next(cpu, cpu_possible_mask); | 104 | cpu = cpumask_next(cpu, cpu_possible_mask); |
| 97 | if (cpu >= nr_cpu_ids) | 105 | if (cpu >= nr_cpu_ids) |
| 98 | cpu = 0; | 106 | cpu = 0; |
| 99 | if (cpu == orig_cpu) { | 107 | if (cpu == orig_cpu) |
| 100 | local_irq_restore(flags); | ||
| 101 | return NULL; | 108 | return NULL; |
| 102 | } | ||
| 103 | } | 109 | } |
| 104 | } | 110 | } |
| 111 | |||
| 112 | struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) | ||
| 113 | { | ||
| 114 | struct pcpu_freelist_node *ret; | ||
| 115 | unsigned long flags; | ||
| 116 | |||
| 117 | local_irq_save(flags); | ||
| 118 | ret = __pcpu_freelist_pop(s); | ||
| 119 | local_irq_restore(flags); | ||
| 120 | return ret; | ||
| 121 | } | ||
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3049aae8ea1e..c3960118e617 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
| @@ -22,8 +22,12 @@ struct pcpu_freelist_node { | |||
| 22 | struct pcpu_freelist_node *next; | 22 | struct pcpu_freelist_node *next; |
| 23 | }; | 23 | }; |
| 24 | 24 | ||
| 25 | /* pcpu_freelist_* do spin_lock_irqsave. */ | ||
| 25 | void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); | 26 | void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); |
| 26 | struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); | 27 | struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); |
| 28 | /* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */ | ||
| 29 | void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); | ||
| 30 | struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *); | ||
| 27 | void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, | 31 | void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, |
| 28 | u32 nr_elems); | 32 | u32 nr_elems); |
| 29 | int pcpu_freelist_init(struct pcpu_freelist *); | 33 | int pcpu_freelist_init(struct pcpu_freelist *); |
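The freelist rework above splits every operation into a lock-only __pcpu_freelist_* helper and a self-contained pcpu_freelist_* wrapper that saves and restores IRQs itself; per the new header comment, the lock-only variants (now used by hashtab) require the caller to have IRQs disabled. A rough userspace sketch of that split, with blocked signals standing in for disabled interrupts (illustrative only, not kernel code):

#include <pthread.h>
#include <signal.h>
#include <stdio.h>

struct node { struct node *next; };
struct freelist { pthread_spinlock_t lock; struct node *first; };

/* Lock-only helper: assumes the caller already blocked "interrupts"
 * (signals stand in for IRQs here), mirroring __pcpu_freelist_push().
 */
static void __freelist_push(struct freelist *s, struct node *n)
{
	pthread_spin_lock(&s->lock);
	n->next = s->first;
	s->first = n;
	pthread_spin_unlock(&s->lock);
}

/* Self-contained variant, mirroring pcpu_freelist_push(): block signals
 * around the lock-only helper, like local_irq_save()/local_irq_restore().
 */
static void freelist_push(struct freelist *s, struct node *n)
{
	sigset_t all, old;

	sigfillset(&all);
	pthread_sigmask(SIG_BLOCK, &all, &old);
	__freelist_push(s, n);
	pthread_sigmask(SIG_SETMASK, &old, NULL);
}

int main(void)
{
	struct freelist s = { .first = NULL };
	struct node n;

	pthread_spin_init(&s.lock, PTHREAD_PROCESS_PRIVATE);
	freelist_push(&s, &n);
	printf("pushed one node: %d\n", s.first == &n);
	return 0;
}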
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 90daf285de03..950ab2f28922 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
| @@ -44,7 +44,7 @@ static void do_up_read(struct irq_work *entry) | |||
| 44 | struct stack_map_irq_work *work; | 44 | struct stack_map_irq_work *work; |
| 45 | 45 | ||
| 46 | work = container_of(entry, struct stack_map_irq_work, irq_work); | 46 | work = container_of(entry, struct stack_map_irq_work, irq_work); |
| 47 | up_read(work->sem); | 47 | up_read_non_owner(work->sem); |
| 48 | work->sem = NULL; | 48 | work->sem = NULL; |
| 49 | } | 49 | } |
| 50 | 50 | ||
| @@ -180,11 +180,14 @@ static inline int stack_map_parse_build_id(void *page_addr, | |||
| 180 | 180 | ||
| 181 | if (nhdr->n_type == BPF_BUILD_ID && | 181 | if (nhdr->n_type == BPF_BUILD_ID && |
| 182 | nhdr->n_namesz == sizeof("GNU") && | 182 | nhdr->n_namesz == sizeof("GNU") && |
| 183 | nhdr->n_descsz == BPF_BUILD_ID_SIZE) { | 183 | nhdr->n_descsz > 0 && |
| 184 | nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { | ||
| 184 | memcpy(build_id, | 185 | memcpy(build_id, |
| 185 | note_start + note_offs + | 186 | note_start + note_offs + |
| 186 | ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), | 187 | ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), |
| 187 | BPF_BUILD_ID_SIZE); | 188 | nhdr->n_descsz); |
| 189 | memset(build_id + nhdr->n_descsz, 0, | ||
| 190 | BPF_BUILD_ID_SIZE - nhdr->n_descsz); | ||
| 188 | return 0; | 191 | return 0; |
| 189 | } | 192 | } |
| 190 | new_offs = note_offs + sizeof(Elf32_Nhdr) + | 193 | new_offs = note_offs + sizeof(Elf32_Nhdr) + |
| @@ -260,7 +263,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, | |||
| 260 | return -EFAULT; /* page not mapped */ | 263 | return -EFAULT; /* page not mapped */ |
| 261 | 264 | ||
| 262 | ret = -EINVAL; | 265 | ret = -EINVAL; |
| 263 | page_addr = page_address(page); | 266 | page_addr = kmap_atomic(page); |
| 264 | ehdr = (Elf32_Ehdr *)page_addr; | 267 | ehdr = (Elf32_Ehdr *)page_addr; |
| 265 | 268 | ||
| 266 | /* compare magic x7f "ELF" */ | 269 | /* compare magic x7f "ELF" */ |
| @@ -276,6 +279,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, | |||
| 276 | else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) | 279 | else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) |
| 277 | ret = stack_map_get_build_id_64(page_addr, build_id); | 280 | ret = stack_map_get_build_id_64(page_addr, build_id); |
| 278 | out: | 281 | out: |
| 282 | kunmap_atomic(page_addr); | ||
| 279 | put_page(page); | 283 | put_page(page); |
| 280 | return ret; | 284 | return ret; |
| 281 | } | 285 | } |
| @@ -310,6 +314,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, | |||
| 310 | for (i = 0; i < trace_nr; i++) { | 314 | for (i = 0; i < trace_nr; i++) { |
| 311 | id_offs[i].status = BPF_STACK_BUILD_ID_IP; | 315 | id_offs[i].status = BPF_STACK_BUILD_ID_IP; |
| 312 | id_offs[i].ip = ips[i]; | 316 | id_offs[i].ip = ips[i]; |
| 317 | memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); | ||
| 313 | } | 318 | } |
| 314 | return; | 319 | return; |
| 315 | } | 320 | } |
| @@ -320,6 +325,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, | |||
| 320 | /* per entry fall back to ips */ | 325 | /* per entry fall back to ips */ |
| 321 | id_offs[i].status = BPF_STACK_BUILD_ID_IP; | 326 | id_offs[i].status = BPF_STACK_BUILD_ID_IP; |
| 322 | id_offs[i].ip = ips[i]; | 327 | id_offs[i].ip = ips[i]; |
| 328 | memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); | ||
| 323 | continue; | 329 | continue; |
| 324 | } | 330 | } |
| 325 | id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] | 331 | id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] |
| @@ -332,6 +338,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, | |||
| 332 | } else { | 338 | } else { |
| 333 | work->sem = ¤t->mm->mmap_sem; | 339 | work->sem = ¤t->mm->mmap_sem; |
| 334 | irq_work_queue(&work->irq_work); | 340 | irq_work_queue(&work->irq_work); |
| 341 | /* | ||
| 342 | * The irq_work will release the mmap_sem with | ||
| 343 | * up_read_non_owner(). The rwsem_release() is called | ||
| 344 | * here to release the lock from lockdep's perspective. | ||
| 345 | */ | ||
| 346 | rwsem_release(¤t->mm->mmap_sem.dep_map, 1, _RET_IP_); | ||
| 335 | } | 347 | } |
| 336 | } | 348 | } |
| 337 | 349 | ||
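Among other things, the stackmap changes accept build-ID notes whose descriptor is shorter than BPF_BUILD_ID_SIZE and zero-pad the remainder instead of requiring an exact match. A small sketch of that copy-and-pad step under the same bounds check (buffer size and names simplified, not the kernel code):

#include <stdio.h>
#include <string.h>

#define BUILD_ID_SIZE_MAX 20		/* size of the destination buffer */

static int copy_build_id(unsigned char *dst, const unsigned char *desc,
			 size_t descsz)
{
	if (descsz == 0 || descsz > BUILD_ID_SIZE_MAX)
		return -1;		/* reject sizes the buffer cannot hold */
	memcpy(dst, desc, descsz);
	memset(dst + descsz, 0, BUILD_ID_SIZE_MAX - descsz);	/* zero-pad */
	return 0;
}

int main(void)
{
	unsigned char id[BUILD_ID_SIZE_MAX];
	unsigned char short_desc[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

	if (!copy_build_id(id, short_desc, sizeof(short_desc)))
		printf("copied %zu bytes, zero-padded %zu\n",
		       sizeof(short_desc), sizeof(id) - sizeof(short_desc));
	return 0;
}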
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..8577bb7f8be6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
| @@ -713,8 +713,13 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 713 | 713 | ||
| 714 | if (bpf_map_is_dev_bound(map)) { | 714 | if (bpf_map_is_dev_bound(map)) { |
| 715 | err = bpf_map_offload_lookup_elem(map, key, value); | 715 | err = bpf_map_offload_lookup_elem(map, key, value); |
| 716 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | 716 | goto done; |
| 717 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { | 717 | } |
| 718 | |||
| 719 | preempt_disable(); | ||
| 720 | this_cpu_inc(bpf_prog_active); | ||
| 721 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | ||
| 722 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { | ||
| 718 | err = bpf_percpu_hash_copy(map, key, value); | 723 | err = bpf_percpu_hash_copy(map, key, value); |
| 719 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { | 724 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { |
| 720 | err = bpf_percpu_array_copy(map, key, value); | 725 | err = bpf_percpu_array_copy(map, key, value); |
| @@ -744,7 +749,10 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 744 | } | 749 | } |
| 745 | rcu_read_unlock(); | 750 | rcu_read_unlock(); |
| 746 | } | 751 | } |
| 752 | this_cpu_dec(bpf_prog_active); | ||
| 753 | preempt_enable(); | ||
| 747 | 754 | ||
| 755 | done: | ||
| 748 | if (err) | 756 | if (err) |
| 749 | goto free_value; | 757 | goto free_value; |
| 750 | 758 | ||
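The map_lookup_elem() change above brackets the per-CPU copy with preempt_disable() and a bpf_prog_active bump, and lets device-bound maps skip the bracket via the new done label. A very loose userspace analogue of that recursion-guard bracket, using a thread-local counter in place of the per-CPU one (illustrative only):

#include <stdio.h>

/* Thread-local counter standing in for the per-CPU bpf_prog_active. */
static _Thread_local int prog_active;

static int may_run_prog(void)
{
	return prog_active == 0;	/* a tracing hook would skip the program */
}

static void lookup_percpu_value(void)
{
	prog_active++;			/* this_cpu_inc(bpf_prog_active) analogue */
	/* ... copy the value from every CPU's slot while guarded ... */
	printf("during copy, may_run_prog() = %d\n", may_run_prog());
	prog_active--;			/* this_cpu_dec() analogue */
}

int main(void)
{
	lookup_percpu_value();
	printf("after copy, may_run_prog() = %d\n", may_run_prog());
	return 0;
}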
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f6bc62a9ee8e..8f295b790297 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
| @@ -1617,12 +1617,13 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, | |||
| 1617 | return 0; | 1617 | return 0; |
| 1618 | } | 1618 | } |
| 1619 | 1619 | ||
| 1620 | static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, | 1620 | static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, |
| 1621 | int size, enum bpf_access_type t) | 1621 | u32 regno, int off, int size, |
| 1622 | enum bpf_access_type t) | ||
| 1622 | { | 1623 | { |
| 1623 | struct bpf_reg_state *regs = cur_regs(env); | 1624 | struct bpf_reg_state *regs = cur_regs(env); |
| 1624 | struct bpf_reg_state *reg = ®s[regno]; | 1625 | struct bpf_reg_state *reg = ®s[regno]; |
| 1625 | struct bpf_insn_access_aux info; | 1626 | struct bpf_insn_access_aux info = {}; |
| 1626 | 1627 | ||
| 1627 | if (reg->smin_value < 0) { | 1628 | if (reg->smin_value < 0) { |
| 1628 | verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", | 1629 | verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", |
| @@ -1636,6 +1637,8 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 1636 | return -EACCES; | 1637 | return -EACCES; |
| 1637 | } | 1638 | } |
| 1638 | 1639 | ||
| 1640 | env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; | ||
| 1641 | |||
| 1639 | return 0; | 1642 | return 0; |
| 1640 | } | 1643 | } |
| 1641 | 1644 | ||
| @@ -2032,7 +2035,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn | |||
| 2032 | verbose(env, "cannot write into socket\n"); | 2035 | verbose(env, "cannot write into socket\n"); |
| 2033 | return -EACCES; | 2036 | return -EACCES; |
| 2034 | } | 2037 | } |
| 2035 | err = check_sock_access(env, regno, off, size, t); | 2038 | err = check_sock_access(env, insn_idx, regno, off, size, t); |
| 2036 | if (!err && value_regno >= 0) | 2039 | if (!err && value_regno >= 0) |
| 2037 | mark_reg_unknown(env, regs, value_regno); | 2040 | mark_reg_unknown(env, regs, value_regno); |
| 2038 | } else { | 2041 | } else { |
| @@ -3103,6 +3106,40 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, | |||
| 3103 | } | 3106 | } |
| 3104 | } | 3107 | } |
| 3105 | 3108 | ||
| 3109 | static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, | ||
| 3110 | const struct bpf_insn *insn) | ||
| 3111 | { | ||
| 3112 | return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; | ||
| 3113 | } | ||
| 3114 | |||
| 3115 | static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, | ||
| 3116 | u32 alu_state, u32 alu_limit) | ||
| 3117 | { | ||
| 3118 | /* If we arrived here from different branches with different | ||
| 3119 | * state or limits to sanitize, then this won't work. | ||
| 3120 | */ | ||
| 3121 | if (aux->alu_state && | ||
| 3122 | (aux->alu_state != alu_state || | ||
| 3123 | aux->alu_limit != alu_limit)) | ||
| 3124 | return -EACCES; | ||
| 3125 | |||
| 3126 | /* Corresponding fixup done in fixup_bpf_calls(). */ | ||
| 3127 | aux->alu_state = alu_state; | ||
| 3128 | aux->alu_limit = alu_limit; | ||
| 3129 | return 0; | ||
| 3130 | } | ||
| 3131 | |||
| 3132 | static int sanitize_val_alu(struct bpf_verifier_env *env, | ||
| 3133 | struct bpf_insn *insn) | ||
| 3134 | { | ||
| 3135 | struct bpf_insn_aux_data *aux = cur_aux(env); | ||
| 3136 | |||
| 3137 | if (can_skip_alu_sanitation(env, insn)) | ||
| 3138 | return 0; | ||
| 3139 | |||
| 3140 | return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); | ||
| 3141 | } | ||
| 3142 | |||
| 3106 | static int sanitize_ptr_alu(struct bpf_verifier_env *env, | 3143 | static int sanitize_ptr_alu(struct bpf_verifier_env *env, |
| 3107 | struct bpf_insn *insn, | 3144 | struct bpf_insn *insn, |
| 3108 | const struct bpf_reg_state *ptr_reg, | 3145 | const struct bpf_reg_state *ptr_reg, |
| @@ -3117,7 +3154,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, | |||
| 3117 | struct bpf_reg_state tmp; | 3154 | struct bpf_reg_state tmp; |
| 3118 | bool ret; | 3155 | bool ret; |
| 3119 | 3156 | ||
| 3120 | if (env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K) | 3157 | if (can_skip_alu_sanitation(env, insn)) |
| 3121 | return 0; | 3158 | return 0; |
| 3122 | 3159 | ||
| 3123 | /* We already marked aux for masking from non-speculative | 3160 | /* We already marked aux for masking from non-speculative |
| @@ -3133,19 +3170,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, | |||
| 3133 | 3170 | ||
| 3134 | if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) | 3171 | if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) |
| 3135 | return 0; | 3172 | return 0; |
| 3136 | 3173 | if (update_alu_sanitation_state(aux, alu_state, alu_limit)) | |
| 3137 | /* If we arrived here from different branches with different | ||
| 3138 | * limits to sanitize, then this won't work. | ||
| 3139 | */ | ||
| 3140 | if (aux->alu_state && | ||
| 3141 | (aux->alu_state != alu_state || | ||
| 3142 | aux->alu_limit != alu_limit)) | ||
| 3143 | return -EACCES; | 3174 | return -EACCES; |
| 3144 | |||
| 3145 | /* Corresponding fixup done in fixup_bpf_calls(). */ | ||
| 3146 | aux->alu_state = alu_state; | ||
| 3147 | aux->alu_limit = alu_limit; | ||
| 3148 | |||
| 3149 | do_sim: | 3175 | do_sim: |
| 3150 | /* Simulate and find potential out-of-bounds access under | 3176 | /* Simulate and find potential out-of-bounds access under |
| 3151 | * speculative execution from truncation as a result of | 3177 | * speculative execution from truncation as a result of |
| @@ -3418,6 +3444,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | |||
| 3418 | s64 smin_val, smax_val; | 3444 | s64 smin_val, smax_val; |
| 3419 | u64 umin_val, umax_val; | 3445 | u64 umin_val, umax_val; |
| 3420 | u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; | 3446 | u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; |
| 3447 | u32 dst = insn->dst_reg; | ||
| 3448 | int ret; | ||
| 3421 | 3449 | ||
| 3422 | if (insn_bitness == 32) { | 3450 | if (insn_bitness == 32) { |
| 3423 | /* Relevant for 32-bit RSH: Information can propagate towards | 3451 | /* Relevant for 32-bit RSH: Information can propagate towards |
| @@ -3452,6 +3480,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | |||
| 3452 | 3480 | ||
| 3453 | switch (opcode) { | 3481 | switch (opcode) { |
| 3454 | case BPF_ADD: | 3482 | case BPF_ADD: |
| 3483 | ret = sanitize_val_alu(env, insn); | ||
| 3484 | if (ret < 0) { | ||
| 3485 | verbose(env, "R%d tried to add from different pointers or scalars\n", dst); | ||
| 3486 | return ret; | ||
| 3487 | } | ||
| 3455 | if (signed_add_overflows(dst_reg->smin_value, smin_val) || | 3488 | if (signed_add_overflows(dst_reg->smin_value, smin_val) || |
| 3456 | signed_add_overflows(dst_reg->smax_value, smax_val)) { | 3489 | signed_add_overflows(dst_reg->smax_value, smax_val)) { |
| 3457 | dst_reg->smin_value = S64_MIN; | 3490 | dst_reg->smin_value = S64_MIN; |
| @@ -3471,6 +3504,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | |||
| 3471 | dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); | 3504 | dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); |
| 3472 | break; | 3505 | break; |
| 3473 | case BPF_SUB: | 3506 | case BPF_SUB: |
| 3507 | ret = sanitize_val_alu(env, insn); | ||
| 3508 | if (ret < 0) { | ||
| 3509 | verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); | ||
| 3510 | return ret; | ||
| 3511 | } | ||
| 3474 | if (signed_sub_overflows(dst_reg->smin_value, smax_val) || | 3512 | if (signed_sub_overflows(dst_reg->smin_value, smax_val) || |
| 3475 | signed_sub_overflows(dst_reg->smax_value, smin_val)) { | 3513 | signed_sub_overflows(dst_reg->smax_value, smin_val)) { |
| 3476 | /* Overflow possible, we know nothing */ | 3514 | /* Overflow possible, we know nothing */ |
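The verifier hunks factor the "record once, reject on mismatch" bookkeeping into update_alu_sanitation_state() and apply it to scalar ADD/SUB via sanitize_val_alu(). A compact sketch of just that bookkeeping (types and values simplified from the kernel's bpf_insn_aux_data):

#include <errno.h>
#include <stdio.h>

/* Simplified per-instruction aux data; the real one lives in the verifier. */
struct insn_aux { unsigned int alu_state; unsigned int alu_limit; };

static int update_alu_sanitation_state(struct insn_aux *aux,
				       unsigned int alu_state,
				       unsigned int alu_limit)
{
	/* If this instruction was reached via branches that need different
	 * masking, a single rewrite cannot sanitize it: reject.
	 */
	if (aux->alu_state &&
	    (aux->alu_state != alu_state || aux->alu_limit != alu_limit))
		return -EACCES;

	aux->alu_state = alu_state;
	aux->alu_limit = alu_limit;
	return 0;
}

int main(void)
{
	struct insn_aux aux = { 0, 0 };

	printf("%d\n", update_alu_sanitation_state(&aux, 1, 16));	/* 0 */
	printf("%d\n", update_alu_sanitation_state(&aux, 1, 16));	/* 0, same state */
	printf("%d\n", update_alu_sanitation_state(&aux, 1, 32));	/* -EACCES */
	return 0;
}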
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 91d5c38eb7e5..d1c6d152da89 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
| @@ -376,9 +376,6 @@ void __weak arch_smt_update(void) { } | |||
| 376 | 376 | ||
| 377 | #ifdef CONFIG_HOTPLUG_SMT | 377 | #ifdef CONFIG_HOTPLUG_SMT |
| 378 | enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; | 378 | enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; |
| 379 | EXPORT_SYMBOL_GPL(cpu_smt_control); | ||
| 380 | |||
| 381 | static bool cpu_smt_available __read_mostly; | ||
| 382 | 379 | ||
| 383 | void __init cpu_smt_disable(bool force) | 380 | void __init cpu_smt_disable(bool force) |
| 384 | { | 381 | { |
| @@ -397,25 +394,11 @@ void __init cpu_smt_disable(bool force) | |||
| 397 | 394 | ||
| 398 | /* | 395 | /* |
| 399 | * The decision whether SMT is supported can only be done after the full | 396 | * The decision whether SMT is supported can only be done after the full |
| 400 | * CPU identification. Called from architecture code before non boot CPUs | 397 | * CPU identification. Called from architecture code. |
| 401 | * are brought up. | ||
| 402 | */ | ||
| 403 | void __init cpu_smt_check_topology_early(void) | ||
| 404 | { | ||
| 405 | if (!topology_smt_supported()) | ||
| 406 | cpu_smt_control = CPU_SMT_NOT_SUPPORTED; | ||
| 407 | } | ||
| 408 | |||
| 409 | /* | ||
| 410 | * If SMT was disabled by BIOS, detect it here, after the CPUs have been | ||
| 411 | * brought online. This ensures the smt/l1tf sysfs entries are consistent | ||
| 412 | * with reality. cpu_smt_available is set to true during the bringup of non | ||
| 413 | * boot CPUs when a SMT sibling is detected. Note, this may overwrite | ||
| 414 | * cpu_smt_control's previous setting. | ||
| 415 | */ | 398 | */ |
| 416 | void __init cpu_smt_check_topology(void) | 399 | void __init cpu_smt_check_topology(void) |
| 417 | { | 400 | { |
| 418 | if (!cpu_smt_available) | 401 | if (!topology_smt_supported()) |
| 419 | cpu_smt_control = CPU_SMT_NOT_SUPPORTED; | 402 | cpu_smt_control = CPU_SMT_NOT_SUPPORTED; |
| 420 | } | 403 | } |
| 421 | 404 | ||
| @@ -428,18 +411,10 @@ early_param("nosmt", smt_cmdline_disable); | |||
| 428 | 411 | ||
| 429 | static inline bool cpu_smt_allowed(unsigned int cpu) | 412 | static inline bool cpu_smt_allowed(unsigned int cpu) |
| 430 | { | 413 | { |
| 431 | if (topology_is_primary_thread(cpu)) | 414 | if (cpu_smt_control == CPU_SMT_ENABLED) |
| 432 | return true; | 415 | return true; |
| 433 | 416 | ||
| 434 | /* | 417 | if (topology_is_primary_thread(cpu)) |
| 435 | * If the CPU is not a 'primary' thread and the booted_once bit is | ||
| 436 | * set then the processor has SMT support. Store this information | ||
| 437 | * for the late check of SMT support in cpu_smt_check_topology(). | ||
| 438 | */ | ||
| 439 | if (per_cpu(cpuhp_state, cpu).booted_once) | ||
| 440 | cpu_smt_available = true; | ||
| 441 | |||
| 442 | if (cpu_smt_control == CPU_SMT_ENABLED) | ||
| 443 | return true; | 418 | return true; |
| 444 | 419 | ||
| 445 | /* | 420 | /* |
| @@ -2090,10 +2065,8 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) | |||
| 2090 | */ | 2065 | */ |
| 2091 | cpuhp_offline_cpu_device(cpu); | 2066 | cpuhp_offline_cpu_device(cpu); |
| 2092 | } | 2067 | } |
| 2093 | if (!ret) { | 2068 | if (!ret) |
| 2094 | cpu_smt_control = ctrlval; | 2069 | cpu_smt_control = ctrlval; |
| 2095 | arch_smt_update(); | ||
| 2096 | } | ||
| 2097 | cpu_maps_update_done(); | 2070 | cpu_maps_update_done(); |
| 2098 | return ret; | 2071 | return ret; |
| 2099 | } | 2072 | } |
| @@ -2104,7 +2077,6 @@ static int cpuhp_smt_enable(void) | |||
| 2104 | 2077 | ||
| 2105 | cpu_maps_update_begin(); | 2078 | cpu_maps_update_begin(); |
| 2106 | cpu_smt_control = CPU_SMT_ENABLED; | 2079 | cpu_smt_control = CPU_SMT_ENABLED; |
| 2107 | arch_smt_update(); | ||
| 2108 | for_each_present_cpu(cpu) { | 2080 | for_each_present_cpu(cpu) { |
| 2109 | /* Skip online CPUs and CPUs on offline nodes */ | 2081 | /* Skip online CPUs and CPUs on offline nodes */ |
| 2110 | if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) | 2082 | if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) |
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d6361776dc5c..1fb6fd68b9c7 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
| @@ -378,6 +378,8 @@ void __init swiotlb_exit(void) | |||
| 378 | memblock_free_late(io_tlb_start, | 378 | memblock_free_late(io_tlb_start, |
| 379 | PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); | 379 | PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); |
| 380 | } | 380 | } |
| 381 | io_tlb_start = 0; | ||
| 382 | io_tlb_end = 0; | ||
| 381 | io_tlb_nslabs = 0; | 383 | io_tlb_nslabs = 0; |
| 382 | max_segment = 0; | 384 | max_segment = 0; |
| 383 | } | 385 | } |
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3cd13a30f732..26d6edab051a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
| @@ -436,18 +436,18 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
| 436 | void __user *buffer, size_t *lenp, | 436 | void __user *buffer, size_t *lenp, |
| 437 | loff_t *ppos) | 437 | loff_t *ppos) |
| 438 | { | 438 | { |
| 439 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 439 | int ret; |
| 440 | 440 | int perf_cpu = sysctl_perf_cpu_time_max_percent; | |
| 441 | if (ret || !write) | ||
| 442 | return ret; | ||
| 443 | |||
| 444 | /* | 441 | /* |
| 445 | * If throttling is disabled don't allow the write: | 442 | * If throttling is disabled don't allow the write: |
| 446 | */ | 443 | */ |
| 447 | if (sysctl_perf_cpu_time_max_percent == 100 || | 444 | if (write && (perf_cpu == 100 || perf_cpu == 0)) |
| 448 | sysctl_perf_cpu_time_max_percent == 0) | ||
| 449 | return -EINVAL; | 445 | return -EINVAL; |
| 450 | 446 | ||
| 447 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 448 | if (ret || !write) | ||
| 449 | return ret; | ||
| 450 | |||
| 451 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | 451 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); |
| 452 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | 452 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; |
| 453 | update_perf_cpu_limits(); | 453 | update_perf_cpu_limits(); |
| @@ -4963,6 +4963,11 @@ static void __perf_event_period(struct perf_event *event, | |||
| 4963 | } | 4963 | } |
| 4964 | } | 4964 | } |
| 4965 | 4965 | ||
| 4966 | static int perf_event_check_period(struct perf_event *event, u64 value) | ||
| 4967 | { | ||
| 4968 | return event->pmu->check_period(event, value); | ||
| 4969 | } | ||
| 4970 | |||
| 4966 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | 4971 | static int perf_event_period(struct perf_event *event, u64 __user *arg) |
| 4967 | { | 4972 | { |
| 4968 | u64 value; | 4973 | u64 value; |
| @@ -4979,6 +4984,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
| 4979 | if (event->attr.freq && value > sysctl_perf_event_sample_rate) | 4984 | if (event->attr.freq && value > sysctl_perf_event_sample_rate) |
| 4980 | return -EINVAL; | 4985 | return -EINVAL; |
| 4981 | 4986 | ||
| 4987 | if (perf_event_check_period(event, value)) | ||
| 4988 | return -EINVAL; | ||
| 4989 | |||
| 4982 | event_function_call(event, __perf_event_period, &value); | 4990 | event_function_call(event, __perf_event_period, &value); |
| 4983 | 4991 | ||
| 4984 | return 0; | 4992 | return 0; |
| @@ -9391,6 +9399,11 @@ static int perf_pmu_nop_int(struct pmu *pmu) | |||
| 9391 | return 0; | 9399 | return 0; |
| 9392 | } | 9400 | } |
| 9393 | 9401 | ||
| 9402 | static int perf_event_nop_int(struct perf_event *event, u64 value) | ||
| 9403 | { | ||
| 9404 | return 0; | ||
| 9405 | } | ||
| 9406 | |||
| 9394 | static DEFINE_PER_CPU(unsigned int, nop_txn_flags); | 9407 | static DEFINE_PER_CPU(unsigned int, nop_txn_flags); |
| 9395 | 9408 | ||
| 9396 | static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) | 9409 | static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) |
| @@ -9691,6 +9704,9 @@ got_cpu_context: | |||
| 9691 | pmu->pmu_disable = perf_pmu_nop_void; | 9704 | pmu->pmu_disable = perf_pmu_nop_void; |
| 9692 | } | 9705 | } |
| 9693 | 9706 | ||
| 9707 | if (!pmu->check_period) | ||
| 9708 | pmu->check_period = perf_event_nop_int; | ||
| 9709 | |||
| 9694 | if (!pmu->event_idx) | 9710 | if (!pmu->event_idx) |
| 9695 | pmu->event_idx = perf_event_idx_default; | 9711 | pmu->event_idx = perf_event_idx_default; |
| 9696 | 9712 | ||
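The perf changes add an optional check_period PMU callback and install a default that accepts every value, so perf_event_period() can call it unconditionally. A small sketch of that optional-callback pattern (struct layout and names simplified, not the kernel definitions):

#include <stdio.h>

struct event;				/* opaque for this sketch */

struct pmu {
	int (*check_period)(struct event *event, unsigned long long value);
};

static int nop_check_period(struct event *event, unsigned long long value)
{
	(void)event; (void)value;
	return 0;			/* accept any period */
}

static void pmu_register(struct pmu *pmu)
{
	if (!pmu->check_period)		/* fill the optional hook with a nop */
		pmu->check_period = nop_check_period;
}

int main(void)
{
	struct pmu pmu = { 0 };

	pmu_register(&pmu);
	printf("period accepted: %d\n", pmu.check_period(NULL, 1000) == 0);
	return 0;
}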
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4a9937076331..5ab4fe3b1dcc 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
| @@ -734,6 +734,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) | |||
| 734 | size = sizeof(struct ring_buffer); | 734 | size = sizeof(struct ring_buffer); |
| 735 | size += nr_pages * sizeof(void *); | 735 | size += nr_pages * sizeof(void *); |
| 736 | 736 | ||
| 737 | if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) | ||
| 738 | goto fail; | ||
| 739 | |||
| 737 | rb = kzalloc(size, GFP_KERNEL); | 740 | rb = kzalloc(size, GFP_KERNEL); |
| 738 | if (!rb) | 741 | if (!rb) |
| 739 | goto fail; | 742 | goto fail; |
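rb_alloc() now refuses requests whose allocation order would reach MAX_ORDER before calling kzalloc(). A standalone sketch of that guard; the PAGE_SHIFT and MAX_ORDER values below are typical defaults assumed only for illustration:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages */
#define MAX_ORDER  11	/* assumed buddy-allocator limit */

/* Smallest order such that (1UL << order) >= n, like order_base_2(). */
static unsigned int order_base_2(unsigned long n)
{
	unsigned int order = 0;

	while ((1UL << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned long size = (1UL << 23) + 64;	/* oversized metadata request */

	if (order_base_2(size) >= PAGE_SHIFT + MAX_ORDER)
		puts("refuse: request exceeds the allocator's MAX_ORDER");
	else
		puts("ok to allocate");
	return 0;
}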
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d14979577ee..2639a30a8aa5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
| @@ -307,7 +307,7 @@ void rcuwait_wake_up(struct rcuwait *w) | |||
| 307 | * MB (A) MB (B) | 307 | * MB (A) MB (B) |
| 308 | * [L] cond [L] tsk | 308 | * [L] cond [L] tsk |
| 309 | */ | 309 | */ |
| 310 | smp_rmb(); /* (B) */ | 310 | smp_mb(); /* (B) */ |
| 311 | 311 | ||
| 312 | /* | 312 | /* |
| 313 | * Avoid using task_rcu_dereference() magic as long as we are careful, | 313 | * Avoid using task_rcu_dereference() magic as long as we are careful, |
| @@ -558,12 +558,14 @@ static struct task_struct *find_alive_thread(struct task_struct *p) | |||
| 558 | return NULL; | 558 | return NULL; |
| 559 | } | 559 | } |
| 560 | 560 | ||
| 561 | static struct task_struct *find_child_reaper(struct task_struct *father) | 561 | static struct task_struct *find_child_reaper(struct task_struct *father, |
| 562 | struct list_head *dead) | ||
| 562 | __releases(&tasklist_lock) | 563 | __releases(&tasklist_lock) |
| 563 | __acquires(&tasklist_lock) | 564 | __acquires(&tasklist_lock) |
| 564 | { | 565 | { |
| 565 | struct pid_namespace *pid_ns = task_active_pid_ns(father); | 566 | struct pid_namespace *pid_ns = task_active_pid_ns(father); |
| 566 | struct task_struct *reaper = pid_ns->child_reaper; | 567 | struct task_struct *reaper = pid_ns->child_reaper; |
| 568 | struct task_struct *p, *n; | ||
| 567 | 569 | ||
| 568 | if (likely(reaper != father)) | 570 | if (likely(reaper != father)) |
| 569 | return reaper; | 571 | return reaper; |
| @@ -579,6 +581,12 @@ static struct task_struct *find_child_reaper(struct task_struct *father) | |||
| 579 | panic("Attempted to kill init! exitcode=0x%08x\n", | 581 | panic("Attempted to kill init! exitcode=0x%08x\n", |
| 580 | father->signal->group_exit_code ?: father->exit_code); | 582 | father->signal->group_exit_code ?: father->exit_code); |
| 581 | } | 583 | } |
| 584 | |||
| 585 | list_for_each_entry_safe(p, n, dead, ptrace_entry) { | ||
| 586 | list_del_init(&p->ptrace_entry); | ||
| 587 | release_task(p); | ||
| 588 | } | ||
| 589 | |||
| 582 | zap_pid_ns_processes(pid_ns); | 590 | zap_pid_ns_processes(pid_ns); |
| 583 | write_lock_irq(&tasklist_lock); | 591 | write_lock_irq(&tasklist_lock); |
| 584 | 592 | ||
| @@ -668,7 +676,7 @@ static void forget_original_parent(struct task_struct *father, | |||
| 668 | exit_ptrace(father, dead); | 676 | exit_ptrace(father, dead); |
| 669 | 677 | ||
| 670 | /* Can drop and reacquire tasklist_lock */ | 678 | /* Can drop and reacquire tasklist_lock */ |
| 671 | reaper = find_child_reaper(father); | 679 | reaper = find_child_reaper(father, dead); |
| 672 | if (list_empty(&father->children)) | 680 | if (list_empty(&father->children)) |
| 673 | return; | 681 | return; |
| 674 | 682 | ||
| @@ -866,6 +874,7 @@ void __noreturn do_exit(long code) | |||
| 866 | exit_task_namespaces(tsk); | 874 | exit_task_namespaces(tsk); |
| 867 | exit_task_work(tsk); | 875 | exit_task_work(tsk); |
| 868 | exit_thread(tsk); | 876 | exit_thread(tsk); |
| 877 | exit_umh(tsk); | ||
| 869 | 878 | ||
| 870 | /* | 879 | /* |
| 871 | * Flush inherited counters to the parent - before the parent | 880 | * Flush inherited counters to the parent - before the parent |
diff --git a/kernel/fork.c b/kernel/fork.c
index a60459947f18..b69248e6f0e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -217,6 +217,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | |||
| 217 | memset(s->addr, 0, THREAD_SIZE); | 217 | memset(s->addr, 0, THREAD_SIZE); |
| 218 | 218 | ||
| 219 | tsk->stack_vm_area = s; | 219 | tsk->stack_vm_area = s; |
| 220 | tsk->stack = s->addr; | ||
| 220 | return s->addr; | 221 | return s->addr; |
| 221 | } | 222 | } |
| 222 | 223 | ||
| @@ -1833,8 +1834,6 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 1833 | 1834 | ||
| 1834 | posix_cpu_timers_init(p); | 1835 | posix_cpu_timers_init(p); |
| 1835 | 1836 | ||
| 1836 | p->start_time = ktime_get_ns(); | ||
| 1837 | p->real_start_time = ktime_get_boot_ns(); | ||
| 1838 | p->io_context = NULL; | 1837 | p->io_context = NULL; |
| 1839 | audit_set_context(p, NULL); | 1838 | audit_set_context(p, NULL); |
| 1840 | cgroup_fork(p); | 1839 | cgroup_fork(p); |
| @@ -2001,6 +2000,17 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 2001 | goto bad_fork_free_pid; | 2000 | goto bad_fork_free_pid; |
| 2002 | 2001 | ||
| 2003 | /* | 2002 | /* |
| 2003 | * From this point on we must avoid any synchronous user-space | ||
| 2004 | * communication until we take the tasklist-lock. In particular, we do | ||
| 2005 | * not want user-space to be able to predict the process start-time by | ||
| 2006 | * stalling fork(2) after we recorded the start_time but before it is | ||
| 2007 | * visible to the system. | ||
| 2008 | */ | ||
| 2009 | |||
| 2010 | p->start_time = ktime_get_ns(); | ||
| 2011 | p->real_start_time = ktime_get_boot_ns(); | ||
| 2012 | |||
| 2013 | /* | ||
| 2004 | * Make it visible to the rest of the system, but dont wake it up yet. | 2014 | * Make it visible to the rest of the system, but dont wake it up yet. |
| 2005 | * Need tasklist lock for parent etc handling! | 2015 | * Need tasklist lock for parent etc handling! |
| 2006 | */ | 2016 | */ |
diff --git a/kernel/futex.c b/kernel/futex.c
index be3bff2315ff..a0514e01c3eb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
| @@ -1452,11 +1452,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) | |||
| 1452 | if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) | 1452 | if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) |
| 1453 | return; | 1453 | return; |
| 1454 | 1454 | ||
| 1455 | /* | 1455 | get_task_struct(p); |
| 1456 | * Queue the task for later wakeup for after we've released | ||
| 1457 | * the hb->lock. wake_q_add() grabs reference to p. | ||
| 1458 | */ | ||
| 1459 | wake_q_add(wake_q, p); | ||
| 1460 | __unqueue_futex(q); | 1456 | __unqueue_futex(q); |
| 1461 | /* | 1457 | /* |
| 1462 | * The waiting task can free the futex_q as soon as q->lock_ptr = NULL | 1458 | * The waiting task can free the futex_q as soon as q->lock_ptr = NULL |
| @@ -1466,6 +1462,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) | |||
| 1466 | * plist_del in __unqueue_futex(). | 1462 | * plist_del in __unqueue_futex(). |
| 1467 | */ | 1463 | */ |
| 1468 | smp_store_release(&q->lock_ptr, NULL); | 1464 | smp_store_release(&q->lock_ptr, NULL); |
| 1465 | |||
| 1466 | /* | ||
| 1467 | * Queue the task for later wakeup for after we've released | ||
| 1468 | * the hb->lock. wake_q_add() grabs reference to p. | ||
| 1469 | */ | ||
| 1470 | wake_q_add(wake_q, p); | ||
| 1471 | put_task_struct(p); | ||
| 1469 | } | 1472 | } |
| 1470 | 1473 | ||
| 1471 | /* | 1474 | /* |
| @@ -2218,11 +2221,11 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
| 2218 | * decrement the counter at queue_unlock() when some error has | 2221 | * decrement the counter at queue_unlock() when some error has |
| 2219 | * occurred and we don't end up adding the task to the list. | 2222 | * occurred and we don't end up adding the task to the list. |
| 2220 | */ | 2223 | */ |
| 2221 | hb_waiters_inc(hb); | 2224 | hb_waiters_inc(hb); /* implies smp_mb(); (A) */ |
| 2222 | 2225 | ||
| 2223 | q->lock_ptr = &hb->lock; | 2226 | q->lock_ptr = &hb->lock; |
| 2224 | 2227 | ||
| 2225 | spin_lock(&hb->lock); /* implies smp_mb(); (A) */ | 2228 | spin_lock(&hb->lock); |
| 2226 | return hb; | 2229 | return hb; |
| 2227 | } | 2230 | } |
| 2228 | 2231 | ||
| @@ -2858,35 +2861,39 @@ retry_private: | |||
| 2858 | * and BUG when futex_unlock_pi() interleaves with this. | 2861 | * and BUG when futex_unlock_pi() interleaves with this. |
| 2859 | * | 2862 | * |
| 2860 | * Therefore acquire wait_lock while holding hb->lock, but drop the | 2863 | * Therefore acquire wait_lock while holding hb->lock, but drop the |
| 2861 | * latter before calling rt_mutex_start_proxy_lock(). This still fully | 2864 | * latter before calling __rt_mutex_start_proxy_lock(). This |
| 2862 | * serializes against futex_unlock_pi() as that does the exact same | 2865 | * interleaves with futex_unlock_pi() -- which does a similar lock |
| 2863 | * lock handoff sequence. | 2866 | * handoff -- such that the latter can observe the futex_q::pi_state |
| 2867 | * before __rt_mutex_start_proxy_lock() is done. | ||
| 2864 | */ | 2868 | */ |
| 2865 | raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); | 2869 | raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); |
| 2866 | spin_unlock(q.lock_ptr); | 2870 | spin_unlock(q.lock_ptr); |
| 2871 | /* | ||
| 2872 | * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter | ||
| 2873 | * such that futex_unlock_pi() is guaranteed to observe the waiter when | ||
| 2874 | * it sees the futex_q::pi_state. | ||
| 2875 | */ | ||
| 2867 | ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); | 2876 | ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); |
| 2868 | raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); | 2877 | raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); |
| 2869 | 2878 | ||
| 2870 | if (ret) { | 2879 | if (ret) { |
| 2871 | if (ret == 1) | 2880 | if (ret == 1) |
| 2872 | ret = 0; | 2881 | ret = 0; |
| 2873 | 2882 | goto cleanup; | |
| 2874 | spin_lock(q.lock_ptr); | ||
| 2875 | goto no_block; | ||
| 2876 | } | 2883 | } |
| 2877 | 2884 | ||
| 2878 | |||
| 2879 | if (unlikely(to)) | 2885 | if (unlikely(to)) |
| 2880 | hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); | 2886 | hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); |
| 2881 | 2887 | ||
| 2882 | ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); | 2888 | ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); |
| 2883 | 2889 | ||
| 2890 | cleanup: | ||
| 2884 | spin_lock(q.lock_ptr); | 2891 | spin_lock(q.lock_ptr); |
| 2885 | /* | 2892 | /* |
| 2886 | * If we failed to acquire the lock (signal/timeout), we must | 2893 | * If we failed to acquire the lock (deadlock/signal/timeout), we must |
| 2887 | * first acquire the hb->lock before removing the lock from the | 2894 | * first acquire the hb->lock before removing the lock from the |
| 2888 | * rt_mutex waitqueue, such that we can keep the hb and rt_mutex | 2895 | * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait |
| 2889 | * wait lists consistent. | 2896 | * lists consistent. |
| 2890 | * | 2897 | * |
| 2891 | * In particular; it is important that futex_unlock_pi() can not | 2898 | * In particular; it is important that futex_unlock_pi() can not |
| 2892 | * observe this inconsistency. | 2899 | * observe this inconsistency. |
| @@ -3010,6 +3017,10 @@ retry: | |||
| 3010 | * there is no point where we hold neither; and therefore | 3017 | * there is no point where we hold neither; and therefore |
| 3011 | * wake_futex_pi() must observe a state consistent with what we | 3018 | * wake_futex_pi() must observe a state consistent with what we |
| 3012 | * observed. | 3019 | * observed. |
| 3020 | * | ||
| 3021 | * In particular; this forces __rt_mutex_start_proxy() to | ||
| 3022 | * complete such that we're guaranteed to observe the | ||
| 3023 | * rt_waiter. Also see the WARN in wake_futex_pi(). | ||
| 3013 | */ | 3024 | */ |
| 3014 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | 3025 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); |
| 3015 | spin_unlock(&hb->lock); | 3026 | spin_unlock(&hb->lock); |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index ee062b7939d3..ef8ad36cadcf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
| @@ -457,7 +457,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, | |||
| 457 | 457 | ||
| 458 | /* Validate affinity mask(s) */ | 458 | /* Validate affinity mask(s) */ |
| 459 | if (affinity) { | 459 | if (affinity) { |
| 460 | for (i = 0; i < cnt; i++, i++) { | 460 | for (i = 0; i < cnt; i++) { |
| 461 | if (cpumask_empty(&affinity[i].mask)) | 461 | if (cpumask_empty(&affinity[i].mask)) |
| 462 | return -EINVAL; | 462 | return -EINVAL; |
| 463 | } | 463 | } |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4888ce4667a..84b54a17b95d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
| @@ -393,6 +393,9 @@ int irq_setup_affinity(struct irq_desc *desc) | |||
| 393 | } | 393 | } |
| 394 | 394 | ||
| 395 | cpumask_and(&mask, cpu_online_mask, set); | 395 | cpumask_and(&mask, cpu_online_mask, set); |
| 396 | if (cpumask_empty(&mask)) | ||
| 397 | cpumask_copy(&mask, cpu_online_mask); | ||
| 398 | |||
| 396 | if (node != NUMA_NO_NODE) { | 399 | if (node != NUMA_NO_NODE) { |
| 397 | const struct cpumask *nodemask = cpumask_of_node(node); | 400 | const struct cpumask *nodemask = cpumask_of_node(node); |
| 398 | 401 | ||
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 581edcc63c26..978d63a8261c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
| @@ -1726,12 +1726,33 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
| 1726 | rt_mutex_set_owner(lock, NULL); | 1726 | rt_mutex_set_owner(lock, NULL); |
| 1727 | } | 1727 | } |
| 1728 | 1728 | ||
| 1729 | /** | ||
| 1730 | * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task | ||
| 1731 | * @lock: the rt_mutex to take | ||
| 1732 | * @waiter: the pre-initialized rt_mutex_waiter | ||
| 1733 | * @task: the task to prepare | ||
| 1734 | * | ||
| 1735 | * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock | ||
| 1736 | * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. | ||
| 1737 | * | ||
| 1738 | * NOTE: does _NOT_ remove the @waiter on failure; must either call | ||
| 1739 | * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. | ||
| 1740 | * | ||
| 1741 | * Returns: | ||
| 1742 | * 0 - task blocked on lock | ||
| 1743 | * 1 - acquired the lock for task, caller should wake it up | ||
| 1744 | * <0 - error | ||
| 1745 | * | ||
| 1746 | * Special API call for PI-futex support. | ||
| 1747 | */ | ||
| 1729 | int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, | 1748 | int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, |
| 1730 | struct rt_mutex_waiter *waiter, | 1749 | struct rt_mutex_waiter *waiter, |
| 1731 | struct task_struct *task) | 1750 | struct task_struct *task) |
| 1732 | { | 1751 | { |
| 1733 | int ret; | 1752 | int ret; |
| 1734 | 1753 | ||
| 1754 | lockdep_assert_held(&lock->wait_lock); | ||
| 1755 | |||
| 1735 | if (try_to_take_rt_mutex(lock, task, NULL)) | 1756 | if (try_to_take_rt_mutex(lock, task, NULL)) |
| 1736 | return 1; | 1757 | return 1; |
| 1737 | 1758 | ||
| @@ -1749,9 +1770,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
| 1749 | ret = 0; | 1770 | ret = 0; |
| 1750 | } | 1771 | } |
| 1751 | 1772 | ||
| 1752 | if (unlikely(ret)) | ||
| 1753 | remove_waiter(lock, waiter); | ||
| 1754 | |||
| 1755 | debug_rt_mutex_print_deadlock(waiter); | 1773 | debug_rt_mutex_print_deadlock(waiter); |
| 1756 | 1774 | ||
| 1757 | return ret; | 1775 | return ret; |
| @@ -1763,12 +1781,18 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
| 1763 | * @waiter: the pre-initialized rt_mutex_waiter | 1781 | * @waiter: the pre-initialized rt_mutex_waiter |
| 1764 | * @task: the task to prepare | 1782 | * @task: the task to prepare |
| 1765 | * | 1783 | * |
| 1784 | * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock | ||
| 1785 | * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. | ||
| 1786 | * | ||
| 1787 | * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter | ||
| 1788 | * on failure. | ||
| 1789 | * | ||
| 1766 | * Returns: | 1790 | * Returns: |
| 1767 | * 0 - task blocked on lock | 1791 | * 0 - task blocked on lock |
| 1768 | * 1 - acquired the lock for task, caller should wake it up | 1792 | * 1 - acquired the lock for task, caller should wake it up |
| 1769 | * <0 - error | 1793 | * <0 - error |
| 1770 | * | 1794 | * |
| 1771 | * Special API call for FUTEX_REQUEUE_PI support. | 1795 | * Special API call for PI-futex support. |
| 1772 | */ | 1796 | */ |
| 1773 | int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | 1797 | int rt_mutex_start_proxy_lock(struct rt_mutex *lock, |
| 1774 | struct rt_mutex_waiter *waiter, | 1798 | struct rt_mutex_waiter *waiter, |
| @@ -1778,6 +1802,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
| 1778 | 1802 | ||
| 1779 | raw_spin_lock_irq(&lock->wait_lock); | 1803 | raw_spin_lock_irq(&lock->wait_lock); |
| 1780 | ret = __rt_mutex_start_proxy_lock(lock, waiter, task); | 1804 | ret = __rt_mutex_start_proxy_lock(lock, waiter, task); |
| 1805 | if (unlikely(ret)) | ||
| 1806 | remove_waiter(lock, waiter); | ||
| 1781 | raw_spin_unlock_irq(&lock->wait_lock); | 1807 | raw_spin_unlock_irq(&lock->wait_lock); |
| 1782 | 1808 | ||
| 1783 | return ret; | 1809 | return ret; |
| @@ -1845,7 +1871,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, | |||
| 1845 | * @lock: the rt_mutex we were woken on | 1871 | * @lock: the rt_mutex we were woken on |
| 1846 | * @waiter: the pre-initialized rt_mutex_waiter | 1872 | * @waiter: the pre-initialized rt_mutex_waiter |
| 1847 | * | 1873 | * |
| 1848 | * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). | 1874 | * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or |
| 1875 | * rt_mutex_wait_proxy_lock(). | ||
| 1849 | * | 1876 | * |
| 1850 | * Unless we acquired the lock; we're still enqueued on the wait-list and can | 1877 | * Unless we acquired the lock; we're still enqueued on the wait-list and can |
| 1851 | * in fact still be granted ownership until we're removed. Therefore we can | 1878 | * in fact still be granted ownership until we're removed. Therefore we can |
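As context for the kerneldoc above: the proxy-lock helpers are consumed by the PI-futex code in a start/wait/cleanup sequence. The sketch below is illustrative only; it assumes the rt_mutex_wait_proxy_lock()/rt_mutex_cleanup_proxy_lock() signatures from elsewhere in rtmutex.c, and the example_* function names are invented.

/*
 * Illustrative proxy-lock lifecycle (not part of this diff).  With this
 * change, a failed __rt_mutex_start_proxy_lock() leaves the waiter
 * enqueued; rt_mutex_start_proxy_lock() or a later
 * rt_mutex_cleanup_proxy_lock() is what removes it.
 */
static int example_start(struct rt_mutex *pi_mutex,
			 struct rt_mutex_waiter *waiter,
			 struct task_struct *task)
{
	int ret = rt_mutex_start_proxy_lock(pi_mutex, waiter, task);

	if (ret == 1)
		return 0;	/* acquired for @task; caller wakes it */
	return ret;		/* 0: @task blocked, <0: deadlock/error */
}

/* Later, in @task's own context, once it has been requeued and resumes: */
static int example_finish(struct rt_mutex *pi_mutex,
			  struct hrtimer_sleeper *to,
			  struct rt_mutex_waiter *waiter)
{
	int ret = rt_mutex_wait_proxy_lock(pi_mutex, to, waiter);

	/* Even on timeout/signal we may have become owner meanwhile. */
	if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, waiter))
		ret = 0;
	return ret;
}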
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09b180063ee1..50d9af615dc4 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -198,15 +198,22 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, | |||
| 198 | woken++; | 198 | woken++; |
| 199 | tsk = waiter->task; | 199 | tsk = waiter->task; |
| 200 | 200 | ||
| 201 | wake_q_add(wake_q, tsk); | 201 | get_task_struct(tsk); |
| 202 | list_del(&waiter->list); | 202 | list_del(&waiter->list); |
| 203 | /* | 203 | /* |
| 204 | * Ensure that the last operation is setting the reader | 204 | * Ensure calling get_task_struct() before setting the reader |
| 205 | * waiter to nil such that rwsem_down_read_failed() cannot | 205 | * waiter to nil such that rwsem_down_read_failed() cannot |
| 206 | * race with do_exit() by always holding a reference count | 206 | * race with do_exit() by always holding a reference count |
| 207 | * to the task to wakeup. | 207 | * to the task to wakeup. |
| 208 | */ | 208 | */ |
| 209 | smp_store_release(&waiter->task, NULL); | 209 | smp_store_release(&waiter->task, NULL); |
| 210 | /* | ||
| 211 | * Ensure issuing the wakeup (either by us or someone else) | ||
| 212 | * after setting the reader waiter to nil. | ||
| 213 | */ | ||
| 214 | wake_q_add(wake_q, tsk); | ||
| 215 | /* wake_q_add() already takes the task ref */ | ||
| 216 | put_task_struct(tsk); | ||
| 210 | } | 217 | } |
| 211 | 218 | ||
| 212 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; | 219 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; |
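The new ordering comments pair with the reader slow path that waits for waiter->task to be cleared. A condensed, paraphrased sketch of that consumer side (struct rwsem_waiter details and error handling elided to the essentials):

/* Condensed, paraphrased reader slow path; not verbatim kernel code. */
static void example_wait_for_wakeup(struct rwsem_waiter *waiter)
{
	while (true) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		/*
		 * Pairs with smp_store_release(&waiter->task, NULL) above;
		 * once this reads NULL the task may return and exit, which
		 * is why the waker takes its own reference first.
		 */
		if (!waiter->task)
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}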
diff --git a/kernel/relay.c b/kernel/relay.c index 04f248644e06..9e0f52375487 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -428,6 +428,8 @@ static struct dentry *relay_create_buf_file(struct rchan *chan, | |||
| 428 | dentry = chan->cb->create_buf_file(tmpname, chan->parent, | 428 | dentry = chan->cb->create_buf_file(tmpname, chan->parent, |
| 429 | S_IRUSR, buf, | 429 | S_IRUSR, buf, |
| 430 | &chan->is_global); | 430 | &chan->is_global); |
| 431 | if (IS_ERR(dentry)) | ||
| 432 | dentry = NULL; | ||
| 431 | 433 | ||
| 432 | kfree(tmpname); | 434 | kfree(tmpname); |
| 433 | 435 | ||
| @@ -461,7 +463,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) | |||
| 461 | dentry = chan->cb->create_buf_file(NULL, NULL, | 463 | dentry = chan->cb->create_buf_file(NULL, NULL, |
| 462 | S_IRUSR, buf, | 464 | S_IRUSR, buf, |
| 463 | &chan->is_global); | 465 | &chan->is_global); |
| 464 | if (WARN_ON(dentry)) | 466 | if (IS_ERR_OR_NULL(dentry)) |
| 465 | goto free_buf; | 467 | goto free_buf; |
| 466 | } | 468 | } |
| 467 | 469 | ||
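The relay change above tolerates client callbacks that hand back an error pointer, for example when they simply forward a debugfs_create_file() result. A hedged sketch of such a callback (the function name is invented; the fops choice mirrors what relay clients like blktrace typically do):

#include <linux/debugfs.h>
#include <linux/relay.h>

/*
 * Example create_buf_file() callback: debugfs_create_file() may return
 * an ERR_PTR(), which relay_create_buf_file() above now normalizes to
 * NULL instead of caching a bogus dentry.
 */
static struct dentry *example_create_buf_file(const char *filename,
					      struct dentry *parent,
					      umode_t mode,
					      struct rchan_buf *buf,
					      int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}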
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a674c7db2f29..d8d76a65cfdd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -396,6 +396,18 @@ static bool set_nr_if_polling(struct task_struct *p) | |||
| 396 | #endif | 396 | #endif |
| 397 | #endif | 397 | #endif |
| 398 | 398 | ||
| 399 | /** | ||
| 400 | * wake_q_add() - queue a wakeup for 'later' waking. | ||
| 401 | * @head: the wake_q_head to add @task to | ||
| 402 | * @task: the task to queue for 'later' wakeup | ||
| 403 | * | ||
| 404 | * Queue a task for later wakeup, most likely by the wake_up_q() call in the | ||
| 405 | * same context, _HOWEVER_ this is not guaranteed, the wakeup can come | ||
| 406 | * instantly. | ||
| 407 | * | ||
| 408 | * This function must be used as-if it were wake_up_process(); IOW the task | ||
| 409 | * must be ready to be woken at this location. | ||
| 410 | */ | ||
| 399 | void wake_q_add(struct wake_q_head *head, struct task_struct *task) | 411 | void wake_q_add(struct wake_q_head *head, struct task_struct *task) |
| 400 | { | 412 | { |
| 401 | struct wake_q_node *node = &task->wake_q; | 413 | struct wake_q_node *node = &task->wake_q; |
| @@ -405,10 +417,11 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) | |||
| 405 | * it's already queued (either by us or someone else) and will get the | 417 | * it's already queued (either by us or someone else) and will get the |
| 406 | * wakeup due to that. | 418 | * wakeup due to that. |
| 407 | * | 419 | * |
| 408 | * This cmpxchg() executes a full barrier, which pairs with the full | 420 | * In order to ensure that a pending wakeup will observe our pending |
| 409 | * barrier executed by the wakeup in wake_up_q(). | 421 | * state, even in the failed case, an explicit smp_mb() must be used. |
| 410 | */ | 422 | */ |
| 411 | if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) | 423 | smp_mb__before_atomic(); |
| 424 | if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) | ||
| 412 | return; | 425 | return; |
| 413 | 426 | ||
| 414 | get_task_struct(task); | 427 | get_task_struct(task); |
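For readers unfamiliar with the wake_q API the new kerneldoc documents, a minimal usage sketch follows (the example_waiter structure and function are invented for illustration):

#include <linux/list.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

struct example_waiter {
	struct list_head	list;
	struct task_struct	*task;
};

/*
 * Typical pattern: collect wakeups under a lock, issue them after the
 * lock is dropped.  Per the kerneldoc above, each task passed to
 * wake_q_add() must already be in a state where wake_up_process() would
 * be valid, because the wakeup may happen before wake_up_q() runs.
 */
static void example_wake_all(spinlock_t *lock, struct list_head *waiters)
{
	DEFINE_WAKE_Q(wake_q);
	struct example_waiter *w, *tmp;

	spin_lock(lock);
	list_for_each_entry_safe(w, tmp, waiters, list) {
		list_del_init(&w->list);
		wake_q_add(&wake_q, w->task);
	}
	spin_unlock(lock);

	wake_up_q(&wake_q);
}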
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50aa2aba69bd..310d0637fe4b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -5980,6 +5980,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
| 5980 | 5980 | ||
| 5981 | #ifdef CONFIG_SCHED_SMT | 5981 | #ifdef CONFIG_SCHED_SMT |
| 5982 | DEFINE_STATIC_KEY_FALSE(sched_smt_present); | 5982 | DEFINE_STATIC_KEY_FALSE(sched_smt_present); |
| 5983 | EXPORT_SYMBOL_GPL(sched_smt_present); | ||
| 5983 | 5984 | ||
| 5984 | static inline void set_idle_cores(int cpu, int val) | 5985 | static inline void set_idle_cores(int cpu, int val) |
| 5985 | { | 5986 | { |
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index fe24de3fbc93..0e97ca9306ef 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c | |||
| @@ -124,6 +124,7 @@ | |||
| 124 | * sampling of the aggregate task states would be. | 124 | * sampling of the aggregate task states would be. |
| 125 | */ | 125 | */ |
| 126 | 126 | ||
| 127 | #include "../workqueue_internal.h" | ||
| 127 | #include <linux/sched/loadavg.h> | 128 | #include <linux/sched/loadavg.h> |
| 128 | #include <linux/seq_file.h> | 129 | #include <linux/seq_file.h> |
| 129 | #include <linux/proc_fs.h> | 130 | #include <linux/proc_fs.h> |
| @@ -321,7 +322,7 @@ static bool update_stats(struct psi_group *group) | |||
| 321 | expires = group->next_update; | 322 | expires = group->next_update; |
| 322 | if (now < expires) | 323 | if (now < expires) |
| 323 | goto out; | 324 | goto out; |
| 324 | if (now - expires > psi_period) | 325 | if (now - expires >= psi_period) |
| 325 | missed_periods = div_u64(now - expires, psi_period); | 326 | missed_periods = div_u64(now - expires, psi_period); |
| 326 | 327 | ||
| 327 | /* | 328 | /* |
| @@ -480,9 +481,6 @@ static void psi_group_change(struct psi_group *group, int cpu, | |||
| 480 | groupc->tasks[t]++; | 481 | groupc->tasks[t]++; |
| 481 | 482 | ||
| 482 | write_seqcount_end(&groupc->seq); | 483 | write_seqcount_end(&groupc->seq); |
| 483 | |||
| 484 | if (!delayed_work_pending(&group->clock_work)) | ||
| 485 | schedule_delayed_work(&group->clock_work, PSI_FREQ); | ||
| 486 | } | 484 | } |
| 487 | 485 | ||
| 488 | static struct psi_group *iterate_groups(struct task_struct *task, void **iter) | 486 | static struct psi_group *iterate_groups(struct task_struct *task, void **iter) |
| @@ -513,6 +511,7 @@ void psi_task_change(struct task_struct *task, int clear, int set) | |||
| 513 | { | 511 | { |
| 514 | int cpu = task_cpu(task); | 512 | int cpu = task_cpu(task); |
| 515 | struct psi_group *group; | 513 | struct psi_group *group; |
| 514 | bool wake_clock = true; | ||
| 516 | void *iter = NULL; | 515 | void *iter = NULL; |
| 517 | 516 | ||
| 518 | if (!task->pid) | 517 | if (!task->pid) |
| @@ -530,8 +529,22 @@ void psi_task_change(struct task_struct *task, int clear, int set) | |||
| 530 | task->psi_flags &= ~clear; | 529 | task->psi_flags &= ~clear; |
| 531 | task->psi_flags |= set; | 530 | task->psi_flags |= set; |
| 532 | 531 | ||
| 533 | while ((group = iterate_groups(task, &iter))) | 532 | /* |
| 533 | * Periodic aggregation shuts off if there is a period of no | ||
| 534 | * task changes, so we wake it back up if necessary. However, | ||
| 535 | * don't do this if the task change is the aggregation worker | ||
| 536 | * itself going to sleep, or we'll ping-pong forever. | ||
| 537 | */ | ||
| 538 | if (unlikely((clear & TSK_RUNNING) && | ||
| 539 | (task->flags & PF_WQ_WORKER) && | ||
| 540 | wq_worker_last_func(task) == psi_update_work)) | ||
| 541 | wake_clock = false; | ||
| 542 | |||
| 543 | while ((group = iterate_groups(task, &iter))) { | ||
| 534 | psi_group_change(group, cpu, clear, set); | 544 | psi_group_change(group, cpu, clear, set); |
| 545 | if (wake_clock && !delayed_work_pending(&group->clock_work)) | ||
| 546 | schedule_delayed_work(&group->clock_work, PSI_FREQ); | ||
| 547 | } | ||
| 535 | } | 548 | } |
| 536 | 549 | ||
| 537 | void psi_memstall_tick(struct task_struct *task, int cpu) | 550 | void psi_memstall_tick(struct task_struct *task, int cpu) |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d7f538847b84..e815781ed751 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -976,6 +976,9 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) | |||
| 976 | struct seccomp_filter *filter = file->private_data; | 976 | struct seccomp_filter *filter = file->private_data; |
| 977 | struct seccomp_knotif *knotif; | 977 | struct seccomp_knotif *knotif; |
| 978 | 978 | ||
| 979 | if (!filter) | ||
| 980 | return 0; | ||
| 981 | |||
| 979 | mutex_lock(&filter->notify_lock); | 982 | mutex_lock(&filter->notify_lock); |
| 980 | 983 | ||
| 981 | /* | 984 | /* |
| @@ -1300,6 +1303,7 @@ out: | |||
| 1300 | out_put_fd: | 1303 | out_put_fd: |
| 1301 | if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { | 1304 | if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { |
| 1302 | if (ret < 0) { | 1305 | if (ret < 0) { |
| 1306 | listener_f->private_data = NULL; | ||
| 1303 | fput(listener_f); | 1307 | fput(listener_f); |
| 1304 | put_unused_fd(listener); | 1308 | put_unused_fd(listener); |
| 1305 | } else { | 1309 | } else { |
diff --git a/kernel/signal.c b/kernel/signal.c index e1d7ad8e6ab1..57b7771e20d7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -688,6 +688,48 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in | |||
| 688 | } | 688 | } |
| 689 | EXPORT_SYMBOL_GPL(dequeue_signal); | 689 | EXPORT_SYMBOL_GPL(dequeue_signal); |
| 690 | 690 | ||
| 691 | static int dequeue_synchronous_signal(kernel_siginfo_t *info) | ||
| 692 | { | ||
| 693 | struct task_struct *tsk = current; | ||
| 694 | struct sigpending *pending = &tsk->pending; | ||
| 695 | struct sigqueue *q, *sync = NULL; | ||
| 696 | |||
| 697 | /* | ||
| 698 | * Might a synchronous signal be in the queue? | ||
| 699 | */ | ||
| 700 | if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK)) | ||
| 701 | return 0; | ||
| 702 | |||
| 703 | /* | ||
| 704 | * Return the first synchronous signal in the queue. | ||
| 705 | */ | ||
| 706 | list_for_each_entry(q, &pending->list, list) { | ||
| 707 | /* Synchronous signals have a positive si_code */ | ||
| 708 | if ((q->info.si_code > SI_USER) && | ||
| 709 | (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { | ||
| 710 | sync = q; | ||
| 711 | goto next; | ||
| 712 | } | ||
| 713 | } | ||
| 714 | return 0; | ||
| 715 | next: | ||
| 716 | /* | ||
| 717 | * Check if there is another siginfo for the same signal. | ||
| 718 | */ | ||
| 719 | list_for_each_entry_continue(q, &pending->list, list) { | ||
| 720 | if (q->info.si_signo == sync->info.si_signo) | ||
| 721 | goto still_pending; | ||
| 722 | } | ||
| 723 | |||
| 724 | sigdelset(&pending->signal, sync->info.si_signo); | ||
| 725 | recalc_sigpending(); | ||
| 726 | still_pending: | ||
| 727 | list_del_init(&sync->list); | ||
| 728 | copy_siginfo(info, &sync->info); | ||
| 729 | __sigqueue_free(sync); | ||
| 730 | return info->si_signo; | ||
| 731 | } | ||
| 732 | |||
| 691 | /* | 733 | /* |
| 692 | * Tell a process that it has a new active signal.. | 734 | * Tell a process that it has a new active signal.. |
| 693 | * | 735 | * |
| @@ -1057,10 +1099,9 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc | |||
| 1057 | 1099 | ||
| 1058 | result = TRACE_SIGNAL_DELIVERED; | 1100 | result = TRACE_SIGNAL_DELIVERED; |
| 1059 | /* | 1101 | /* |
| 1060 | * Skip useless siginfo allocation for SIGKILL SIGSTOP, | 1102 | * Skip useless siginfo allocation for SIGKILL and kernel threads. |
| 1061 | * and kernel threads. | ||
| 1062 | */ | 1103 | */ |
| 1063 | if (sig_kernel_only(sig) || (t->flags & PF_KTHREAD)) | 1104 | if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) |
| 1064 | goto out_set; | 1105 | goto out_set; |
| 1065 | 1106 | ||
| 1066 | /* | 1107 | /* |
| @@ -2394,6 +2435,14 @@ relock: | |||
| 2394 | goto relock; | 2435 | goto relock; |
| 2395 | } | 2436 | } |
| 2396 | 2437 | ||
| 2438 | /* Has this task already been marked for death? */ | ||
| 2439 | if (signal_group_exit(signal)) { | ||
| 2440 | ksig->info.si_signo = signr = SIGKILL; | ||
| 2441 | sigdelset(¤t->pending.signal, SIGKILL); | ||
| 2442 | recalc_sigpending(); | ||
| 2443 | goto fatal; | ||
| 2444 | } | ||
| 2445 | |||
| 2397 | for (;;) { | 2446 | for (;;) { |
| 2398 | struct k_sigaction *ka; | 2447 | struct k_sigaction *ka; |
| 2399 | 2448 | ||
| @@ -2407,7 +2456,15 @@ relock: | |||
| 2407 | goto relock; | 2456 | goto relock; |
| 2408 | } | 2457 | } |
| 2409 | 2458 | ||
| 2410 | signr = dequeue_signal(current, ¤t->blocked, &ksig->info); | 2459 | /* |
| 2460 | * Signals generated by the execution of an instruction | ||
| 2461 | * need to be delivered before any other pending signals | ||
| 2462 | * so that the instruction pointer in the signal stack | ||
| 2463 | * frame points to the faulting instruction. | ||
| 2464 | */ | ||
| 2465 | signr = dequeue_synchronous_signal(&ksig->info); | ||
| 2466 | if (!signr) | ||
| 2467 | signr = dequeue_signal(current, ¤t->blocked, &ksig->info); | ||
| 2411 | 2468 | ||
| 2412 | if (!signr) | 2469 | if (!signr) |
| 2413 | break; /* will return 0 */ | 2470 | break; /* will return 0 */ |
| @@ -2489,6 +2546,7 @@ relock: | |||
| 2489 | continue; | 2546 | continue; |
| 2490 | } | 2547 | } |
| 2491 | 2548 | ||
| 2549 | fatal: | ||
| 2492 | spin_unlock_irq(&sighand->siglock); | 2550 | spin_unlock_irq(&sighand->siglock); |
| 2493 | 2551 | ||
| 2494 | /* | 2552 | /* |
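For reference, the "synchronous" set that dequeue_synchronous_signal() filters on is the mask of instruction-generated faults defined near the top of kernel/signal.c; from memory (verify against the tree) it is roughly:

/*
 * Signals generated by executing an instruction.  Delivering these
 * first keeps the signal frame's instruction pointer on the faulting
 * instruction, which is the point of the hunk above.
 */
#define SYNCHRONOUS_MASK \
	(sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
	 sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))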
diff --git a/kernel/smp.c b/kernel/smp.c index 163c451af42e..f4cf1b0bb3b8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -584,8 +584,6 @@ void __init smp_init(void) | |||
| 584 | num_nodes, (num_nodes > 1 ? "s" : ""), | 584 | num_nodes, (num_nodes > 1 ? "s" : ""), |
| 585 | num_cpus, (num_cpus > 1 ? "s" : "")); | 585 | num_cpus, (num_cpus > 1 ? "s" : "")); |
| 586 | 586 | ||
| 587 | /* Final decision about SMT support */ | ||
| 588 | cpu_smt_check_topology(); | ||
| 589 | /* Any cleanup work */ | 587 | /* Any cleanup work */ |
| 590 | smp_cpus_done(setup_max_cpus); | 588 | smp_cpus_done(setup_max_cpus); |
| 591 | } | 589 | } |
diff --git a/kernel/sys.c b/kernel/sys.c index a48cbf1414b8..f7eb62eceb24 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -1207,7 +1207,8 @@ DECLARE_RWSEM(uts_sem); | |||
| 1207 | /* | 1207 | /* |
| 1208 | * Work around broken programs that cannot handle "Linux 3.0". | 1208 | * Work around broken programs that cannot handle "Linux 3.0". |
| 1209 | * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 | 1209 | * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 |
| 1210 | * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60. | 1210 | * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be |
| 1211 | * 2.6.60. | ||
| 1211 | */ | 1212 | */ |
| 1212 | static int override_release(char __user *release, size_t len) | 1213 | static int override_release(char __user *release, size_t len) |
| 1213 | { | 1214 | { |
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8f0644af40be..80f955210861 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
| @@ -685,6 +685,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
| 685 | * set up the signal and overrun bookkeeping. | 685 | * set up the signal and overrun bookkeeping. |
| 686 | */ | 686 | */ |
| 687 | timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); | 687 | timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); |
| 688 | timer->it_interval = ns_to_ktime(timer->it.cpu.incr); | ||
| 688 | 689 | ||
| 689 | /* | 690 | /* |
| 690 | * This acts as a modification timestamp for the timer, | 691 | * This acts as a modification timestamp for the timer, |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8b068adb9da1..f1a86a0d881d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
| @@ -1204,22 +1204,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * | |||
| 1204 | 1204 | ||
| 1205 | int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) | 1205 | int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) |
| 1206 | { | 1206 | { |
| 1207 | int err; | 1207 | return __bpf_probe_register(btp, prog); |
| 1208 | |||
| 1209 | mutex_lock(&bpf_event_mutex); | ||
| 1210 | err = __bpf_probe_register(btp, prog); | ||
| 1211 | mutex_unlock(&bpf_event_mutex); | ||
| 1212 | return err; | ||
| 1213 | } | 1208 | } |
| 1214 | 1209 | ||
| 1215 | int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) | 1210 | int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) |
| 1216 | { | 1211 | { |
| 1217 | int err; | 1212 | return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); |
| 1218 | |||
| 1219 | mutex_lock(&bpf_event_mutex); | ||
| 1220 | err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); | ||
| 1221 | mutex_unlock(&bpf_event_mutex); | ||
| 1222 | return err; | ||
| 1223 | } | 1213 | } |
| 1224 | 1214 | ||
| 1225 | int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, | 1215 | int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c521b7347482..c4238b441624 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -3384,6 +3384,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file | |||
| 3384 | const char tgid_space[] = " "; | 3384 | const char tgid_space[] = " "; |
| 3385 | const char space[] = " "; | 3385 | const char space[] = " "; |
| 3386 | 3386 | ||
| 3387 | print_event_info(buf, m); | ||
| 3388 | |||
| 3387 | seq_printf(m, "# %s _-----=> irqs-off\n", | 3389 | seq_printf(m, "# %s _-----=> irqs-off\n", |
| 3388 | tgid ? tgid_space : space); | 3390 | tgid ? tgid_space : space); |
| 3389 | seq_printf(m, "# %s / _----=> need-resched\n", | 3391 | seq_printf(m, "# %s / _----=> need-resched\n", |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5c19b8c41c7e..9eaf07f99212 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -607,11 +607,17 @@ static int trace_kprobe_create(int argc, const char *argv[]) | |||
| 607 | char buf[MAX_EVENT_NAME_LEN]; | 607 | char buf[MAX_EVENT_NAME_LEN]; |
| 608 | unsigned int flags = TPARG_FL_KERNEL; | 608 | unsigned int flags = TPARG_FL_KERNEL; |
| 609 | 609 | ||
| 610 | /* argc must be >= 1 */ | 610 | switch (argv[0][0]) { |
| 611 | if (argv[0][0] == 'r') { | 611 | case 'r': |
| 612 | is_return = true; | 612 | is_return = true; |
| 613 | flags |= TPARG_FL_RETURN; | 613 | flags |= TPARG_FL_RETURN; |
| 614 | } else if (argv[0][0] != 'p' || argc < 2) | 614 | break; |
| 615 | case 'p': | ||
| 616 | break; | ||
| 617 | default: | ||
| 618 | return -ECANCELED; | ||
| 619 | } | ||
| 620 | if (argc < 2) | ||
| 615 | return -ECANCELED; | 621 | return -ECANCELED; |
| 616 | 622 | ||
| 617 | event = strchr(&argv[0][1], ':'); | 623 | event = strchr(&argv[0][1], ':'); |
| @@ -855,22 +861,14 @@ static const struct file_operations kprobe_profile_ops = { | |||
| 855 | static nokprobe_inline int | 861 | static nokprobe_inline int |
| 856 | fetch_store_strlen(unsigned long addr) | 862 | fetch_store_strlen(unsigned long addr) |
| 857 | { | 863 | { |
| 858 | mm_segment_t old_fs; | ||
| 859 | int ret, len = 0; | 864 | int ret, len = 0; |
| 860 | u8 c; | 865 | u8 c; |
| 861 | 866 | ||
| 862 | old_fs = get_fs(); | ||
| 863 | set_fs(KERNEL_DS); | ||
| 864 | pagefault_disable(); | ||
| 865 | |||
| 866 | do { | 867 | do { |
| 867 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | 868 | ret = probe_mem_read(&c, (u8 *)addr + len, 1); |
| 868 | len++; | 869 | len++; |
| 869 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | 870 | } while (c && ret == 0 && len < MAX_STRING_SIZE); |
| 870 | 871 | ||
| 871 | pagefault_enable(); | ||
| 872 | set_fs(old_fs); | ||
| 873 | |||
| 874 | return (ret < 0) ? ret : len; | 872 | return (ret < 0) ? ret : len; |
| 875 | } | 873 | } |
| 876 | 874 | ||
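The open-coded set_fs()/pagefault_disable() sequence can be dropped because probe_mem_read() bottoms out in probe_kernel_read(), which performs the equivalent steps itself. Roughly, paraphrasing mm/maccess.c of this era (not the exact source):

/* Paraphrase of __probe_kernel_read(), mm/maccess.c (this era). */
long __probe_kernel_read(void *dst, const void *src, size_t size)
{
	long ret;
	mm_segment_t old_fs = get_fs();

	set_fs(KERNEL_DS);
	pagefault_disable();
	ret = __copy_from_user_inatomic(dst,
			(__force const void __user *)src, size);
	pagefault_enable();
	set_fs(old_fs);

	return ret ? -EFAULT : 0;	/* 0 on success, matching the loop above */
}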
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 5c56afc17cf8..4737bb8c07a3 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h | |||
| @@ -180,10 +180,12 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, | |||
| 180 | if (unlikely(arg->dynamic)) | 180 | if (unlikely(arg->dynamic)) |
| 181 | *dl = make_data_loc(maxlen, dyndata - base); | 181 | *dl = make_data_loc(maxlen, dyndata - base); |
| 182 | ret = process_fetch_insn(arg->code, regs, dl, base); | 182 | ret = process_fetch_insn(arg->code, regs, dl, base); |
| 183 | if (unlikely(ret < 0 && arg->dynamic)) | 183 | if (unlikely(ret < 0 && arg->dynamic)) { |
| 184 | *dl = make_data_loc(0, dyndata - base); | 184 | *dl = make_data_loc(0, dyndata - base); |
| 185 | else | 185 | } else { |
| 186 | dyndata += ret; | 186 | dyndata += ret; |
| 187 | maxlen -= ret; | ||
| 188 | } | ||
| 187 | } | 189 | } |
| 188 | } | 190 | } |
| 189 | 191 | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e335576b9411..9bde07c06362 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | * Copyright (C) IBM Corporation, 2010-2012 | 5 | * Copyright (C) IBM Corporation, 2010-2012 |
| 6 | * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> | 6 | * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> |
| 7 | */ | 7 | */ |
| 8 | #define pr_fmt(fmt) "trace_kprobe: " fmt | 8 | #define pr_fmt(fmt) "trace_uprobe: " fmt |
| 9 | 9 | ||
| 10 | #include <linux/ctype.h> | 10 | #include <linux/ctype.h> |
| 11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| @@ -160,6 +160,13 @@ fetch_store_string(unsigned long addr, void *dest, void *base) | |||
| 160 | if (ret >= 0) { | 160 | if (ret >= 0) { |
| 161 | if (ret == maxlen) | 161 | if (ret == maxlen) |
| 162 | dst[ret - 1] = '\0'; | 162 | dst[ret - 1] = '\0'; |
| 163 | else | ||
| 164 | /* | ||
| 165 | * Include the terminating null byte. In this case it | ||
| 166 | * was copied by strncpy_from_user but not accounted | ||
| 167 | * for in ret. | ||
| 168 | */ | ||
| 169 | ret++; | ||
| 163 | *(u32 *)dest = make_data_loc(ret, (void *)dst - base); | 170 | *(u32 *)dest = make_data_loc(ret, (void *)dst - base); |
| 164 | } | 171 | } |
| 165 | 172 | ||
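The extra ret++ follows from strncpy_from_user()'s return convention: on success it returns the string length excluding the terminating NUL even though the NUL is copied into the buffer. A minimal illustration with an invented helper that mirrors the corrected accounting:

/* Hypothetical helper: return the number of bytes stored in @dst. */
static long example_stored_len(void *dst, const void __user *src, long maxlen)
{
	long ret = strncpy_from_user(dst, src, maxlen);
	/* e.g. user string "abc", maxlen 32: 4 bytes written, ret == 3 */

	if (ret >= 0 && ret < maxlen)
		ret++;		/* count the NUL that was also copied */
	return ret;
}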
diff --git a/kernel/umh.c b/kernel/umh.c index 0baa672e023c..d937cbad903a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c | |||
| @@ -37,6 +37,8 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; | |||
| 37 | static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; | 37 | static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; |
| 38 | static DEFINE_SPINLOCK(umh_sysctl_lock); | 38 | static DEFINE_SPINLOCK(umh_sysctl_lock); |
| 39 | static DECLARE_RWSEM(umhelper_sem); | 39 | static DECLARE_RWSEM(umhelper_sem); |
| 40 | static LIST_HEAD(umh_list); | ||
| 41 | static DEFINE_MUTEX(umh_list_lock); | ||
| 40 | 42 | ||
| 41 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) | 43 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) |
| 42 | { | 44 | { |
| @@ -100,10 +102,12 @@ static int call_usermodehelper_exec_async(void *data) | |||
| 100 | commit_creds(new); | 102 | commit_creds(new); |
| 101 | 103 | ||
| 102 | sub_info->pid = task_pid_nr(current); | 104 | sub_info->pid = task_pid_nr(current); |
| 103 | if (sub_info->file) | 105 | if (sub_info->file) { |
| 104 | retval = do_execve_file(sub_info->file, | 106 | retval = do_execve_file(sub_info->file, |
| 105 | sub_info->argv, sub_info->envp); | 107 | sub_info->argv, sub_info->envp); |
| 106 | else | 108 | if (!retval) |
| 109 | current->flags |= PF_UMH; | ||
| 110 | } else | ||
| 107 | retval = do_execve(getname_kernel(sub_info->path), | 111 | retval = do_execve(getname_kernel(sub_info->path), |
| 108 | (const char __user *const __user *)sub_info->argv, | 112 | (const char __user *const __user *)sub_info->argv, |
| 109 | (const char __user *const __user *)sub_info->envp); | 113 | (const char __user *const __user *)sub_info->envp); |
| @@ -517,6 +521,11 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) | |||
| 517 | goto out; | 521 | goto out; |
| 518 | 522 | ||
| 519 | err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); | 523 | err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); |
| 524 | if (!err) { | ||
| 525 | mutex_lock(&umh_list_lock); | ||
| 526 | list_add(&info->list, &umh_list); | ||
| 527 | mutex_unlock(&umh_list_lock); | ||
| 528 | } | ||
| 520 | out: | 529 | out: |
| 521 | fput(file); | 530 | fput(file); |
| 522 | return err; | 531 | return err; |
| @@ -679,6 +688,26 @@ static int proc_cap_handler(struct ctl_table *table, int write, | |||
| 679 | return 0; | 688 | return 0; |
| 680 | } | 689 | } |
| 681 | 690 | ||
| 691 | void __exit_umh(struct task_struct *tsk) | ||
| 692 | { | ||
| 693 | struct umh_info *info; | ||
| 694 | pid_t pid = tsk->pid; | ||
| 695 | |||
| 696 | mutex_lock(&umh_list_lock); | ||
| 697 | list_for_each_entry(info, &umh_list, list) { | ||
| 698 | if (info->pid == pid) { | ||
| 699 | list_del(&info->list); | ||
| 700 | mutex_unlock(&umh_list_lock); | ||
| 701 | goto out; | ||
| 702 | } | ||
| 703 | } | ||
| 704 | mutex_unlock(&umh_list_lock); | ||
| 705 | return; | ||
| 706 | out: | ||
| 707 | if (info->cleanup) | ||
| 708 | info->cleanup(info); | ||
| 709 | } | ||
| 710 | |||
| 682 | struct ctl_table usermodehelper_table[] = { | 711 | struct ctl_table usermodehelper_table[] = { |
| 683 | { | 712 | { |
| 684 | .procname = "bset", | 713 | .procname = "bset", |
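__exit_umh() only does anything if the exit path calls it; the companion piece is not visible in this hunk, but presumably amounts to a cheap inline gate on the PF_UMH flag set earlier, along the lines of the sketch below (name and placement assumed, not taken from this diff):

/* Sketch of the include/linux/umh.h side pairing with __exit_umh(). */
static inline void exit_umh(struct task_struct *tsk)
{
	/* Only helpers started via fork_usermode_blob() pay for the walk. */
	if (unlikely(tsk->flags & PF_UMH))
		__exit_umh(tsk);
}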
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 392be4b252f6..fc5d23d752a5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -910,6 +910,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) | |||
| 910 | } | 910 | } |
| 911 | 911 | ||
| 912 | /** | 912 | /** |
| 913 | * wq_worker_last_func - retrieve worker's last work function | ||
| 914 | * | ||
| 915 | * Determine the last function a worker executed. This is called from | ||
| 916 | * the scheduler to get a worker's last known identity. | ||
| 917 | * | ||
| 918 | * CONTEXT: | ||
| 919 | * spin_lock_irq(rq->lock) | ||
| 920 | * | ||
| 921 | * Return: | ||
| 922 | * The last work function %current executed as a worker, NULL if it | ||
| 923 | * hasn't executed any work yet. | ||
| 924 | */ | ||
| 925 | work_func_t wq_worker_last_func(struct task_struct *task) | ||
| 926 | { | ||
| 927 | struct worker *worker = kthread_data(task); | ||
| 928 | |||
| 929 | return worker->last_func; | ||
| 930 | } | ||
| 931 | |||
| 932 | /** | ||
| 913 | * worker_set_flags - set worker flags and adjust nr_running accordingly | 933 | * worker_set_flags - set worker flags and adjust nr_running accordingly |
| 914 | * @worker: self | 934 | * @worker: self |
| 915 | * @flags: flags to set | 935 | * @flags: flags to set |
| @@ -2184,6 +2204,9 @@ __acquires(&pool->lock) | |||
| 2184 | if (unlikely(cpu_intensive)) | 2204 | if (unlikely(cpu_intensive)) |
| 2185 | worker_clr_flags(worker, WORKER_CPU_INTENSIVE); | 2205 | worker_clr_flags(worker, WORKER_CPU_INTENSIVE); |
| 2186 | 2206 | ||
| 2207 | /* tag the worker for identification in schedule() */ | ||
| 2208 | worker->last_func = worker->current_func; | ||
| 2209 | |||
| 2187 | /* we're done with it, release */ | 2210 | /* we're done with it, release */ |
| 2188 | hash_del(&worker->hentry); | 2211 | hash_del(&worker->hentry); |
| 2189 | worker->current_work = NULL; | 2212 | worker->current_work = NULL; |
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 66fbb5a9e633..cb68b03ca89a 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h | |||
| @@ -53,6 +53,9 @@ struct worker { | |||
| 53 | 53 | ||
| 54 | /* used only by rescuers to point to the target workqueue */ | 54 | /* used only by rescuers to point to the target workqueue */ |
| 55 | struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ | 55 | struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ |
| 56 | |||
| 57 | /* used by the scheduler to determine a worker's last known identity */ | ||
| 58 | work_func_t last_func; | ||
| 56 | }; | 59 | }; |
| 57 | 60 | ||
| 58 | /** | 61 | /** |
| @@ -67,9 +70,10 @@ static inline struct worker *current_wq_worker(void) | |||
| 67 | 70 | ||
| 68 | /* | 71 | /* |
| 69 | * Scheduler hooks for concurrency managed workqueue. Only to be used from | 72 | * Scheduler hooks for concurrency managed workqueue. Only to be used from |
| 70 | * sched/core.c and workqueue.c. | 73 | * sched/ and workqueue.c. |
| 71 | */ | 74 | */ |
| 72 | void wq_worker_waking_up(struct task_struct *task, int cpu); | 75 | void wq_worker_waking_up(struct task_struct *task, int cpu); |
| 73 | struct task_struct *wq_worker_sleeping(struct task_struct *task); | 76 | struct task_struct *wq_worker_sleeping(struct task_struct *task); |
| 77 | work_func_t wq_worker_last_func(struct task_struct *task); | ||
| 74 | 78 | ||
| 75 | #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ | 79 | #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |
