aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore1
-rw-r--r--kernel/Kconfig.locks9
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/async.c4
-rw-r--r--kernel/backtracetest.c11
-rw-r--r--kernel/bpf/core.c1
-rw-r--r--kernel/bpf/cpumap.c13
-rw-r--r--kernel/bpf/inode.c26
-rw-r--r--kernel/bpf/syscall.c22
-rw-r--r--kernel/bpf/verifier.c235
-rw-r--r--kernel/cgroup/cpuset.c11
-rw-r--r--kernel/cpu.c111
-rw-r--r--kernel/dma/debug.c14
-rw-r--r--kernel/dma/swiotlb.c6
-rw-r--r--kernel/events/core.c103
-rw-r--r--kernel/events/ring_buffer.c40
-rw-r--r--kernel/events/uprobes.c10
-rw-r--r--kernel/fail_function.c2
-rw-r--r--kernel/fork.c132
-rw-r--r--kernel/futex.c192
-rwxr-xr-xkernel/gen_ikh_data.sh89
-rw-r--r--kernel/iomem.c4
-rw-r--r--kernel/irq/chip.c4
-rw-r--r--kernel/irq/debugfs.c2
-rw-r--r--kernel/irq/devres.c5
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/irqdesc.c4
-rw-r--r--kernel/irq/manage.c7
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/irq/timings.c522
-rw-r--r--kernel/irq_work.c75
-rw-r--r--kernel/jump_label.c63
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c1
-rw-r--r--kernel/kheaders.c74
-rw-r--r--kernel/kprobes.c6
-rw-r--r--kernel/latencytop.c29
-rw-r--r--kernel/livepatch/core.c94
-rw-r--r--kernel/livepatch/transition.c22
-rw-r--r--kernel/locking/Makefile5
-rw-r--r--kernel/locking/lock_events.c179
-rw-r--r--kernel/locking/lock_events.h59
-rw-r--r--kernel/locking/lock_events_list.h67
-rw-r--r--kernel/locking/lockdep.c377
-rw-r--r--kernel/locking/lockdep_internals.h34
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/percpu-rwsem.c2
-rw-r--r--kernel/locking/qspinlock.c8
-rw-r--r--kernel/locking/qspinlock_paravirt.h19
-rw-r--r--kernel/locking/qspinlock_stat.h242
-rw-r--r--kernel/locking/rwsem-spinlock.c339
-rw-r--r--kernel/locking/rwsem-xadd.c204
-rw-r--r--kernel/locking/rwsem.c25
-rw-r--r--kernel/locking/rwsem.h174
-rw-r--r--kernel/locking/spinlock.c7
-rw-r--r--kernel/locking/spinlock_debug.c6
-rw-r--r--kernel/module.c82
-rw-r--r--kernel/padata.c3
-rw-r--r--kernel/panic.c7
-rw-r--r--kernel/power/Kconfig9
-rw-r--r--kernel/power/hibernate.c17
-rw-r--r--kernel/power/main.c14
-rw-r--r--kernel/power/snapshot.c5
-rw-r--r--kernel/power/suspend.c17
-rw-r--r--kernel/power/user.c5
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/rcuperf.c5
-rw-r--r--kernel/rcu/rcutorture.c21
-rw-r--r--kernel/rcu/srcutiny.c9
-rw-r--r--kernel/rcu/srcutree.c32
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c510
-rw-r--r--kernel/rcu/tree.h14
-rw-r--r--kernel/rcu/tree_exp.h36
-rw-r--r--kernel/rcu/tree_plugin.h257
-rw-r--r--kernel/rcu/tree_stall.h709
-rw-r--r--kernel/rcu/update.c59
-rw-r--r--kernel/resource.c11
-rw-r--r--kernel/rseq.c9
-rw-r--r--kernel/sched/core.c130
-rw-r--r--kernel/sched/cpufreq.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c72
-rw-r--r--kernel/sched/deadline.c3
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c144
-rw-r--r--kernel/sched/isolation.c18
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h18
-rw-r--r--kernel/sched/topology.c31
-rw-r--r--kernel/seccomp.c21
-rw-r--r--kernel/signal.c27
-rw-r--r--kernel/softirq.c51
-rw-r--r--kernel/stacktrace.c333
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clockevents.c18
-rw-r--r--kernel/time/jiffies.c4
-rw-r--r--kernel/time/sched_clock.c10
-rw-r--r--kernel/time/tick-broadcast.c48
-rw-r--r--kernel/time/tick-common.c54
-rw-r--r--kernel/time/tick-internal.h10
-rw-r--r--kernel/time/tick-sched.c49
-rw-r--r--kernel/time/tick-sched.h13
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/time/timekeeping.c24
-rw-r--r--kernel/time/timekeeping.h7
-rw-r--r--kernel/time/timer.c32
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/bpf_trace.c8
-rw-r--r--kernel/trace/ftrace.c18
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c144
-rw-r--r--kernel/trace/trace.h8
-rw-r--r--kernel/trace/trace_branch.c4
-rw-r--r--kernel/trace/trace_dynevent.c2
-rw-r--r--kernel/trace/trace_events_hist.c15
-rw-r--r--kernel/trace/trace_stack.c85
-rw-r--r--kernel/trace/trace_syscalls.c9
-rw-r--r--kernel/watchdog.c12
-rw-r--r--kernel/watchdog_hld.c3
-rw-r--r--kernel/workqueue.c71
-rw-r--r--kernel/workqueue_internal.h5
126 files changed, 4138 insertions, 2955 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 6e699100872f..34d1e77ee9df 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,5 +1,6 @@
1# 1#
2# Generated files 2# Generated files
3# 3#
4kheaders.md5
4timeconst.h 5timeconst.h
5hz.bc 6hz.bc
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index fbba478ae522..bf770d7556f7 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -229,7 +229,7 @@ config MUTEX_SPIN_ON_OWNER
229 229
230config RWSEM_SPIN_ON_OWNER 230config RWSEM_SPIN_ON_OWNER
231 def_bool y 231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW 232 depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
233 233
234config LOCK_SPIN_ON_OWNER 234config LOCK_SPIN_ON_OWNER
235 def_bool y 235 def_bool y
@@ -251,3 +251,10 @@ config ARCH_USE_QUEUED_RWLOCKS
251config QUEUED_RWLOCKS 251config QUEUED_RWLOCKS
252 def_bool y if ARCH_USE_QUEUED_RWLOCKS 252 def_bool y if ARCH_USE_QUEUED_RWLOCKS
253 depends on SMP 253 depends on SMP
254
255config ARCH_HAS_MMIOWB
256 bool
257
258config MMIOWB
259 def_bool y if ARCH_HAS_MMIOWB
260 depends on SMP
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c57e78817da..298437bb2c6a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n
30# Don't self-instrument. 30# Don't self-instrument.
31KCOV_INSTRUMENT_kcov.o := n 31KCOV_INSTRUMENT_kcov.o := n
32KASAN_SANITIZE_kcov.o := n 32KASAN_SANITIZE_kcov.o := n
33CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
33 34
34# cond_syscall is currently not LTO compatible 35# cond_syscall is currently not LTO compatible
35CFLAGS_sys_ni.o = $(DISABLE_LTO) 36CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -70,6 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
70obj-$(CONFIG_USER_NS) += user_namespace.o 71obj-$(CONFIG_USER_NS) += user_namespace.o
71obj-$(CONFIG_PID_NS) += pid_namespace.o 72obj-$(CONFIG_PID_NS) += pid_namespace.o
72obj-$(CONFIG_IKCONFIG) += configs.o 73obj-$(CONFIG_IKCONFIG) += configs.o
74obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o
73obj-$(CONFIG_SMP) += stop_machine.o 75obj-$(CONFIG_SMP) += stop_machine.o
74obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 76obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
75obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 77obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -121,3 +123,12 @@ $(obj)/configs.o: $(obj)/config_data.gz
121targets += config_data.gz 123targets += config_data.gz
122$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE 124$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
123 $(call if_changed,gzip) 125 $(call if_changed,gzip)
126
127$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
128
129quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
130cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@
131$(obj)/kheaders_data.tar.xz: FORCE
132 $(call cmd,genikh)
133
134clean-files := kheaders_data.tar.xz kheaders.md5
diff --git a/kernel/acct.c b/kernel/acct.c
index addf7732fb56..81f9831a7859 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname)
227 filp_close(file, NULL); 227 filp_close(file, NULL);
228 return PTR_ERR(internal); 228 return PTR_ERR(internal);
229 } 229 }
230 err = mnt_want_write(internal); 230 err = __mnt_want_write(internal);
231 if (err) { 231 if (err) {
232 mntput(internal); 232 mntput(internal);
233 kfree(acct); 233 kfree(acct);
@@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname)
252 old = xchg(&ns->bacct, &acct->pin); 252 old = xchg(&ns->bacct, &acct->pin);
253 mutex_unlock(&acct->lock); 253 mutex_unlock(&acct->lock);
254 pin_kill(old); 254 pin_kill(old);
255 mnt_drop_write(mnt); 255 __mnt_drop_write(mnt);
256 mntput(mnt); 256 mntput(mnt);
257 return 0; 257 return 0;
258} 258}
diff --git a/kernel/async.c b/kernel/async.c
index f6bd0d9885e1..12c332e4e13e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work)
119 119
120 /* 1) run (and print duration) */ 120 /* 1) run (and print duration) */
121 if (initcall_debug && system_state < SYSTEM_RUNNING) { 121 if (initcall_debug && system_state < SYSTEM_RUNNING) {
122 pr_debug("calling %lli_%pF @ %i\n", 122 pr_debug("calling %lli_%pS @ %i\n",
123 (long long)entry->cookie, 123 (long long)entry->cookie,
124 entry->func, task_pid_nr(current)); 124 entry->func, task_pid_nr(current));
125 calltime = ktime_get(); 125 calltime = ktime_get();
@@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work)
128 if (initcall_debug && system_state < SYSTEM_RUNNING) { 128 if (initcall_debug && system_state < SYSTEM_RUNNING) {
129 rettime = ktime_get(); 129 rettime = ktime_get();
130 delta = ktime_sub(rettime, calltime); 130 delta = ktime_sub(rettime, calltime);
131 pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", 131 pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
132 (long long)entry->cookie, 132 (long long)entry->cookie,
133 entry->func, 133 entry->func,
134 (long long)ktime_to_ns(delta) >> 10); 134 (long long)ktime_to_ns(delta) >> 10);
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 1323360d90e3..a563c8fdad0d 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -48,19 +48,14 @@ static void backtrace_test_irq(void)
48#ifdef CONFIG_STACKTRACE 48#ifdef CONFIG_STACKTRACE
49static void backtrace_test_saved(void) 49static void backtrace_test_saved(void)
50{ 50{
51 struct stack_trace trace;
52 unsigned long entries[8]; 51 unsigned long entries[8];
52 unsigned int nr_entries;
53 53
54 pr_info("Testing a saved backtrace.\n"); 54 pr_info("Testing a saved backtrace.\n");
55 pr_info("The following trace is a kernel self test and not a bug!\n"); 55 pr_info("The following trace is a kernel self test and not a bug!\n");
56 56
57 trace.nr_entries = 0; 57 nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
58 trace.max_entries = ARRAY_SIZE(entries); 58 stack_trace_print(entries, nr_entries, 0);
59 trace.entries = entries;
60 trace.skip = 0;
61
62 save_stack_trace(&trace);
63 print_stack_trace(&trace, 0);
64} 59}
65#else 60#else
66static void backtrace_test_saved(void) 61static void backtrace_test_saved(void)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ff09d32a8a1b..c605397c79f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
848 if (fp->jited) { 848 if (fp->jited) {
849 struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); 849 struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
850 850
851 bpf_jit_binary_unlock_ro(hdr);
852 bpf_jit_binary_free(hdr); 851 bpf_jit_binary_free(hdr);
853 852
854 WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); 853 WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8974b3755670..3c18260403dd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -162,10 +162,14 @@ static void cpu_map_kthread_stop(struct work_struct *work)
162static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, 162static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
163 struct xdp_frame *xdpf) 163 struct xdp_frame *xdpf)
164{ 164{
165 unsigned int hard_start_headroom;
165 unsigned int frame_size; 166 unsigned int frame_size;
166 void *pkt_data_start; 167 void *pkt_data_start;
167 struct sk_buff *skb; 168 struct sk_buff *skb;
168 169
170 /* Part of headroom was reserved to xdpf */
171 hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
172
169 /* build_skb need to place skb_shared_info after SKB end, and 173 /* build_skb need to place skb_shared_info after SKB end, and
170 * also want to know the memory "truesize". Thus, need to 174 * also want to know the memory "truesize". Thus, need to
171 * know the memory frame size backing xdp_buff. 175 * know the memory frame size backing xdp_buff.
@@ -183,15 +187,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
183 * is not at a fixed memory location, with mixed length 187 * is not at a fixed memory location, with mixed length
184 * packets, which is bad for cache-line hotness. 188 * packets, which is bad for cache-line hotness.
185 */ 189 */
186 frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + 190 frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
187 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 191 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
188 192
189 pkt_data_start = xdpf->data - xdpf->headroom; 193 pkt_data_start = xdpf->data - hard_start_headroom;
190 skb = build_skb(pkt_data_start, frame_size); 194 skb = build_skb(pkt_data_start, frame_size);
191 if (!skb) 195 if (!skb)
192 return NULL; 196 return NULL;
193 197
194 skb_reserve(skb, xdpf->headroom); 198 skb_reserve(skb, hard_start_headroom);
195 __skb_put(skb, xdpf->len); 199 __skb_put(skb, xdpf->len);
196 if (xdpf->metasize) 200 if (xdpf->metasize)
197 skb_metadata_set(skb, xdpf->metasize); 201 skb_metadata_set(skb, xdpf->metasize);
@@ -205,6 +209,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
205 * - RX ring dev queue index (skb_record_rx_queue) 209 * - RX ring dev queue index (skb_record_rx_queue)
206 */ 210 */
207 211
212 /* Allow SKB to reuse area used by xdp_frame */
213 xdp_scrub_frame(xdpf);
214
208 return skb; 215 return skb;
209} 216}
210 217
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 2ada5e21dfa6..bc53e5b20ddc 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -554,19 +554,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
554} 554}
555EXPORT_SYMBOL(bpf_prog_get_type_path); 555EXPORT_SYMBOL(bpf_prog_get_type_path);
556 556
557static void bpf_evict_inode(struct inode *inode)
558{
559 enum bpf_type type;
560
561 truncate_inode_pages_final(&inode->i_data);
562 clear_inode(inode);
563
564 if (S_ISLNK(inode->i_mode))
565 kfree(inode->i_link);
566 if (!bpf_inode_type(inode, &type))
567 bpf_any_put(inode->i_private, type);
568}
569
570/* 557/*
571 * Display the mount options in /proc/mounts. 558 * Display the mount options in /proc/mounts.
572 */ 559 */
@@ -579,11 +566,22 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
579 return 0; 566 return 0;
580} 567}
581 568
569static void bpf_free_inode(struct inode *inode)
570{
571 enum bpf_type type;
572
573 if (S_ISLNK(inode->i_mode))
574 kfree(inode->i_link);
575 if (!bpf_inode_type(inode, &type))
576 bpf_any_put(inode->i_private, type);
577 free_inode_nonrcu(inode);
578}
579
582static const struct super_operations bpf_super_ops = { 580static const struct super_operations bpf_super_ops = {
583 .statfs = simple_statfs, 581 .statfs = simple_statfs,
584 .drop_inode = generic_delete_inode, 582 .drop_inode = generic_delete_inode,
585 .show_options = bpf_show_options, 583 .show_options = bpf_show_options,
586 .evict_inode = bpf_evict_inode, 584 .free_inode = bpf_free_inode,
587}; 585};
588 586
589enum { 587enum {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 62f6bced3a3c..afca36f53c49 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -136,21 +136,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
136 136
137void *bpf_map_area_alloc(size_t size, int numa_node) 137void *bpf_map_area_alloc(size_t size, int numa_node)
138{ 138{
139 /* We definitely need __GFP_NORETRY, so OOM killer doesn't 139 /* We really just want to fail instead of triggering OOM killer
140 * trigger under memory pressure as we really just want to 140 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
141 * fail instead. 141 * which is used for lower order allocation requests.
142 *
143 * It has been observed that higher order allocation requests done by
144 * vmalloc with __GFP_NORETRY being set might fail due to not trying
145 * to reclaim memory from the page cache, thus we set
146 * __GFP_RETRY_MAYFAIL to avoid such situations.
142 */ 147 */
143 const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO; 148
149 const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
144 void *area; 150 void *area;
145 151
146 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 152 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
147 area = kmalloc_node(size, GFP_USER | flags, numa_node); 153 area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
154 numa_node);
148 if (area != NULL) 155 if (area != NULL)
149 return area; 156 return area;
150 } 157 }
151 158
152 return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, 159 return __vmalloc_node_flags_caller(size, numa_node,
153 __builtin_return_address(0)); 160 GFP_KERNEL | __GFP_RETRY_MAYFAIL |
161 flags, __builtin_return_address(0));
154} 162}
155 163
156void bpf_map_area_free(void *area) 164void bpf_map_area_free(void *area)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce166a002d16..09d5d972c9ff 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -212,7 +212,7 @@ struct bpf_call_arg_meta {
212 int access_size; 212 int access_size;
213 s64 msize_smax_value; 213 s64 msize_smax_value;
214 u64 msize_umax_value; 214 u64 msize_umax_value;
215 int ptr_id; 215 int ref_obj_id;
216 int func_id; 216 int func_id;
217}; 217};
218 218
@@ -346,35 +346,23 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
346 type == PTR_TO_TCP_SOCK_OR_NULL; 346 type == PTR_TO_TCP_SOCK_OR_NULL;
347} 347}
348 348
349static bool type_is_refcounted(enum bpf_reg_type type)
350{
351 return type == PTR_TO_SOCKET;
352}
353
354static bool type_is_refcounted_or_null(enum bpf_reg_type type)
355{
356 return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL;
357}
358
359static bool reg_is_refcounted(const struct bpf_reg_state *reg)
360{
361 return type_is_refcounted(reg->type);
362}
363
364static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) 349static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
365{ 350{
366 return reg->type == PTR_TO_MAP_VALUE && 351 return reg->type == PTR_TO_MAP_VALUE &&
367 map_value_has_spin_lock(reg->map_ptr); 352 map_value_has_spin_lock(reg->map_ptr);
368} 353}
369 354
370static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) 355static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
371{ 356{
372 return type_is_refcounted_or_null(reg->type); 357 return type == PTR_TO_SOCKET ||
358 type == PTR_TO_SOCKET_OR_NULL ||
359 type == PTR_TO_TCP_SOCK ||
360 type == PTR_TO_TCP_SOCK_OR_NULL;
373} 361}
374 362
375static bool arg_type_is_refcounted(enum bpf_arg_type type) 363static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
376{ 364{
377 return type == ARG_PTR_TO_SOCKET; 365 return type == ARG_PTR_TO_SOCK_COMMON;
378} 366}
379 367
380/* Determine whether the function releases some resources allocated by another 368/* Determine whether the function releases some resources allocated by another
@@ -392,6 +380,12 @@ static bool is_acquire_function(enum bpf_func_id func_id)
392 func_id == BPF_FUNC_sk_lookup_udp; 380 func_id == BPF_FUNC_sk_lookup_udp;
393} 381}
394 382
383static bool is_ptr_cast_function(enum bpf_func_id func_id)
384{
385 return func_id == BPF_FUNC_tcp_sock ||
386 func_id == BPF_FUNC_sk_fullsock;
387}
388
395/* string representation of 'enum bpf_reg_type' */ 389/* string representation of 'enum bpf_reg_type' */
396static const char * const reg_type_str[] = { 390static const char * const reg_type_str[] = {
397 [NOT_INIT] = "?", 391 [NOT_INIT] = "?",
@@ -466,6 +460,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
466 verbose(env, ",call_%d", func(env, reg)->callsite); 460 verbose(env, ",call_%d", func(env, reg)->callsite);
467 } else { 461 } else {
468 verbose(env, "(id=%d", reg->id); 462 verbose(env, "(id=%d", reg->id);
463 if (reg_type_may_be_refcounted_or_null(t))
464 verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
469 if (t != SCALAR_VALUE) 465 if (t != SCALAR_VALUE)
470 verbose(env, ",off=%d", reg->off); 466 verbose(env, ",off=%d", reg->off);
471 if (type_is_pkt_pointer(t)) 467 if (type_is_pkt_pointer(t))
@@ -1901,8 +1897,9 @@ continue_func:
1901 } 1897 }
1902 frame++; 1898 frame++;
1903 if (frame >= MAX_CALL_FRAMES) { 1899 if (frame >= MAX_CALL_FRAMES) {
1904 WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); 1900 verbose(env, "the call stack of %d frames is too deep !\n",
1905 return -EFAULT; 1901 frame);
1902 return -E2BIG;
1906 } 1903 }
1907 goto process_func; 1904 goto process_func;
1908 } 1905 }
@@ -2414,16 +2411,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
2414 /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ 2411 /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
2415 if (!type_is_sk_pointer(type)) 2412 if (!type_is_sk_pointer(type))
2416 goto err_type; 2413 goto err_type;
2417 } else if (arg_type == ARG_PTR_TO_SOCKET) { 2414 if (reg->ref_obj_id) {
2418 expected_type = PTR_TO_SOCKET; 2415 if (meta->ref_obj_id) {
2419 if (type != expected_type) 2416 verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
2420 goto err_type; 2417 regno, reg->ref_obj_id,
2421 if (meta->ptr_id || !reg->id) { 2418 meta->ref_obj_id);
2422 verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", 2419 return -EFAULT;
2423 meta->ptr_id, reg->id); 2420 }
2424 return -EFAULT; 2421 meta->ref_obj_id = reg->ref_obj_id;
2425 } 2422 }
2426 meta->ptr_id = reg->id;
2427 } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { 2423 } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
2428 if (meta->func_id == BPF_FUNC_spin_lock) { 2424 if (meta->func_id == BPF_FUNC_spin_lock) {
2429 if (process_spin_lock(env, regno, true)) 2425 if (process_spin_lock(env, regno, true))
@@ -2740,32 +2736,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
2740 return true; 2736 return true;
2741} 2737}
2742 2738
2743static bool check_refcount_ok(const struct bpf_func_proto *fn) 2739static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
2744{ 2740{
2745 int count = 0; 2741 int count = 0;
2746 2742
2747 if (arg_type_is_refcounted(fn->arg1_type)) 2743 if (arg_type_may_be_refcounted(fn->arg1_type))
2748 count++; 2744 count++;
2749 if (arg_type_is_refcounted(fn->arg2_type)) 2745 if (arg_type_may_be_refcounted(fn->arg2_type))
2750 count++; 2746 count++;
2751 if (arg_type_is_refcounted(fn->arg3_type)) 2747 if (arg_type_may_be_refcounted(fn->arg3_type))
2752 count++; 2748 count++;
2753 if (arg_type_is_refcounted(fn->arg4_type)) 2749 if (arg_type_may_be_refcounted(fn->arg4_type))
2754 count++; 2750 count++;
2755 if (arg_type_is_refcounted(fn->arg5_type)) 2751 if (arg_type_may_be_refcounted(fn->arg5_type))
2756 count++; 2752 count++;
2757 2753
2754 /* A reference acquiring function cannot acquire
2755 * another refcounted ptr.
2756 */
2757 if (is_acquire_function(func_id) && count)
2758 return false;
2759
2758 /* We only support one arg being unreferenced at the moment, 2760 /* We only support one arg being unreferenced at the moment,
2759 * which is sufficient for the helper functions we have right now. 2761 * which is sufficient for the helper functions we have right now.
2760 */ 2762 */
2761 return count <= 1; 2763 return count <= 1;
2762} 2764}
2763 2765
2764static int check_func_proto(const struct bpf_func_proto *fn) 2766static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
2765{ 2767{
2766 return check_raw_mode_ok(fn) && 2768 return check_raw_mode_ok(fn) &&
2767 check_arg_pair_ok(fn) && 2769 check_arg_pair_ok(fn) &&
2768 check_refcount_ok(fn) ? 0 : -EINVAL; 2770 check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
2769} 2771}
2770 2772
2771/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] 2773/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2799,19 +2801,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
2799} 2801}
2800 2802
2801static void release_reg_references(struct bpf_verifier_env *env, 2803static void release_reg_references(struct bpf_verifier_env *env,
2802 struct bpf_func_state *state, int id) 2804 struct bpf_func_state *state,
2805 int ref_obj_id)
2803{ 2806{
2804 struct bpf_reg_state *regs = state->regs, *reg; 2807 struct bpf_reg_state *regs = state->regs, *reg;
2805 int i; 2808 int i;
2806 2809
2807 for (i = 0; i < MAX_BPF_REG; i++) 2810 for (i = 0; i < MAX_BPF_REG; i++)
2808 if (regs[i].id == id) 2811 if (regs[i].ref_obj_id == ref_obj_id)
2809 mark_reg_unknown(env, regs, i); 2812 mark_reg_unknown(env, regs, i);
2810 2813
2811 bpf_for_each_spilled_reg(i, state, reg) { 2814 bpf_for_each_spilled_reg(i, state, reg) {
2812 if (!reg) 2815 if (!reg)
2813 continue; 2816 continue;
2814 if (reg_is_refcounted(reg) && reg->id == id) 2817 if (reg->ref_obj_id == ref_obj_id)
2815 __mark_reg_unknown(reg); 2818 __mark_reg_unknown(reg);
2816 } 2819 }
2817} 2820}
@@ -2820,15 +2823,20 @@ static void release_reg_references(struct bpf_verifier_env *env,
2820 * resources. Identify all copies of the same pointer and clear the reference. 2823 * resources. Identify all copies of the same pointer and clear the reference.
2821 */ 2824 */
2822static int release_reference(struct bpf_verifier_env *env, 2825static int release_reference(struct bpf_verifier_env *env,
2823 struct bpf_call_arg_meta *meta) 2826 int ref_obj_id)
2824{ 2827{
2825 struct bpf_verifier_state *vstate = env->cur_state; 2828 struct bpf_verifier_state *vstate = env->cur_state;
2829 int err;
2826 int i; 2830 int i;
2827 2831
2832 err = release_reference_state(cur_func(env), ref_obj_id);
2833 if (err)
2834 return err;
2835
2828 for (i = 0; i <= vstate->curframe; i++) 2836 for (i = 0; i <= vstate->curframe; i++)
2829 release_reg_references(env, vstate->frame[i], meta->ptr_id); 2837 release_reg_references(env, vstate->frame[i], ref_obj_id);
2830 2838
2831 return release_reference_state(cur_func(env), meta->ptr_id); 2839 return 0;
2832} 2840}
2833 2841
2834static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, 2842static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -3047,7 +3055,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3047 memset(&meta, 0, sizeof(meta)); 3055 memset(&meta, 0, sizeof(meta));
3048 meta.pkt_access = fn->pkt_access; 3056 meta.pkt_access = fn->pkt_access;
3049 3057
3050 err = check_func_proto(fn); 3058 err = check_func_proto(fn, func_id);
3051 if (err) { 3059 if (err) {
3052 verbose(env, "kernel subsystem misconfigured func %s#%d\n", 3060 verbose(env, "kernel subsystem misconfigured func %s#%d\n",
3053 func_id_name(func_id), func_id); 3061 func_id_name(func_id), func_id);
@@ -3093,7 +3101,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3093 return err; 3101 return err;
3094 } 3102 }
3095 } else if (is_release_function(func_id)) { 3103 } else if (is_release_function(func_id)) {
3096 err = release_reference(env, &meta); 3104 err = release_reference(env, meta.ref_obj_id);
3097 if (err) { 3105 if (err) {
3098 verbose(env, "func %s#%d reference has not been acquired before\n", 3106 verbose(env, "func %s#%d reference has not been acquired before\n",
3099 func_id_name(func_id), func_id); 3107 func_id_name(func_id), func_id);
@@ -3154,8 +3162,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3154 3162
3155 if (id < 0) 3163 if (id < 0)
3156 return id; 3164 return id;
3157 /* For release_reference() */ 3165 /* For mark_ptr_or_null_reg() */
3158 regs[BPF_REG_0].id = id; 3166 regs[BPF_REG_0].id = id;
3167 /* For release_reference() */
3168 regs[BPF_REG_0].ref_obj_id = id;
3159 } else { 3169 } else {
3160 /* For mark_ptr_or_null_reg() */ 3170 /* For mark_ptr_or_null_reg() */
3161 regs[BPF_REG_0].id = ++env->id_gen; 3171 regs[BPF_REG_0].id = ++env->id_gen;
@@ -3170,6 +3180,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3170 return -EINVAL; 3180 return -EINVAL;
3171 } 3181 }
3172 3182
3183 if (is_ptr_cast_function(func_id))
3184 /* For release_reference() */
3185 regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
3186
3173 do_refine_retval_range(regs, fn->ret_type, func_id, &meta); 3187 do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
3174 3188
3175 err = check_map_func_compatibility(env, meta.map_ptr, func_id); 3189 err = check_map_func_compatibility(env, meta.map_ptr, func_id);
@@ -3368,7 +3382,7 @@ do_sim:
3368 *dst_reg = *ptr_reg; 3382 *dst_reg = *ptr_reg;
3369 } 3383 }
3370 ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); 3384 ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
3371 if (!ptr_is_dst_reg) 3385 if (!ptr_is_dst_reg && ret)
3372 *dst_reg = tmp; 3386 *dst_reg = tmp;
3373 return !ret ? -EFAULT : 0; 3387 return !ret ? -EFAULT : 0;
3374} 3388}
@@ -4124,15 +4138,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
4124 return 0; 4138 return 0;
4125} 4139}
4126 4140
4141static void __find_good_pkt_pointers(struct bpf_func_state *state,
4142 struct bpf_reg_state *dst_reg,
4143 enum bpf_reg_type type, u16 new_range)
4144{
4145 struct bpf_reg_state *reg;
4146 int i;
4147
4148 for (i = 0; i < MAX_BPF_REG; i++) {
4149 reg = &state->regs[i];
4150 if (reg->type == type && reg->id == dst_reg->id)
4151 /* keep the maximum range already checked */
4152 reg->range = max(reg->range, new_range);
4153 }
4154
4155 bpf_for_each_spilled_reg(i, state, reg) {
4156 if (!reg)
4157 continue;
4158 if (reg->type == type && reg->id == dst_reg->id)
4159 reg->range = max(reg->range, new_range);
4160 }
4161}
4162
4127static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, 4163static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
4128 struct bpf_reg_state *dst_reg, 4164 struct bpf_reg_state *dst_reg,
4129 enum bpf_reg_type type, 4165 enum bpf_reg_type type,
4130 bool range_right_open) 4166 bool range_right_open)
4131{ 4167{
4132 struct bpf_func_state *state = vstate->frame[vstate->curframe];
4133 struct bpf_reg_state *regs = state->regs, *reg;
4134 u16 new_range; 4168 u16 new_range;
4135 int i, j; 4169 int i;
4136 4170
4137 if (dst_reg->off < 0 || 4171 if (dst_reg->off < 0 ||
4138 (dst_reg->off == 0 && range_right_open)) 4172 (dst_reg->off == 0 && range_right_open))
@@ -4197,20 +4231,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
4197 * the range won't allow anything. 4231 * the range won't allow anything.
4198 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 4232 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
4199 */ 4233 */
4200 for (i = 0; i < MAX_BPF_REG; i++) 4234 for (i = 0; i <= vstate->curframe; i++)
4201 if (regs[i].type == type && regs[i].id == dst_reg->id) 4235 __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
4202 /* keep the maximum range already checked */ 4236 new_range);
4203 regs[i].range = max(regs[i].range, new_range);
4204
4205 for (j = 0; j <= vstate->curframe; j++) {
4206 state = vstate->frame[j];
4207 bpf_for_each_spilled_reg(i, state, reg) {
4208 if (!reg)
4209 continue;
4210 if (reg->type == type && reg->id == dst_reg->id)
4211 reg->range = max(reg->range, new_range);
4212 }
4213 }
4214} 4237}
4215 4238
4216/* compute branch direction of the expression "if (reg opcode val) goto target;" 4239/* compute branch direction of the expression "if (reg opcode val) goto target;"
@@ -4665,17 +4688,41 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
4665 } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { 4688 } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
4666 reg->type = PTR_TO_TCP_SOCK; 4689 reg->type = PTR_TO_TCP_SOCK;
4667 } 4690 }
4668 if (is_null || !(reg_is_refcounted(reg) || 4691 if (is_null) {
4669 reg_may_point_to_spin_lock(reg))) { 4692 /* We don't need id and ref_obj_id from this point
4670 /* We don't need id from this point onwards anymore, 4693 * onwards anymore, thus we should better reset it,
4671 * thus we should better reset it, so that state 4694 * so that state pruning has chances to take effect.
4672 * pruning has chances to take effect. 4695 */
4696 reg->id = 0;
4697 reg->ref_obj_id = 0;
4698 } else if (!reg_may_point_to_spin_lock(reg)) {
4699 /* For not-NULL ptr, reg->ref_obj_id will be reset
4700 * in release_reg_references().
4701 *
4702 * reg->id is still used by spin_lock ptr. Other
4703 * than spin_lock ptr type, reg->id can be reset.
4673 */ 4704 */
4674 reg->id = 0; 4705 reg->id = 0;
4675 } 4706 }
4676 } 4707 }
4677} 4708}
4678 4709
4710static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
4711 bool is_null)
4712{
4713 struct bpf_reg_state *reg;
4714 int i;
4715
4716 for (i = 0; i < MAX_BPF_REG; i++)
4717 mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
4718
4719 bpf_for_each_spilled_reg(i, state, reg) {
4720 if (!reg)
4721 continue;
4722 mark_ptr_or_null_reg(state, reg, id, is_null);
4723 }
4724}
4725
4679/* The logic is similar to find_good_pkt_pointers(), both could eventually 4726/* The logic is similar to find_good_pkt_pointers(), both could eventually
4680 * be folded together at some point. 4727 * be folded together at some point.
4681 */ 4728 */
@@ -4683,24 +4730,20 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
4683 bool is_null) 4730 bool is_null)
4684{ 4731{
4685 struct bpf_func_state *state = vstate->frame[vstate->curframe]; 4732 struct bpf_func_state *state = vstate->frame[vstate->curframe];
4686 struct bpf_reg_state *reg, *regs = state->regs; 4733 struct bpf_reg_state *regs = state->regs;
4734 u32 ref_obj_id = regs[regno].ref_obj_id;
4687 u32 id = regs[regno].id; 4735 u32 id = regs[regno].id;
4688 int i, j; 4736 int i;
4689
4690 if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
4691 release_reference_state(state, id);
4692 4737
4693 for (i = 0; i < MAX_BPF_REG; i++) 4738 if (ref_obj_id && ref_obj_id == id && is_null)
4694 mark_ptr_or_null_reg(state, &regs[i], id, is_null); 4739 /* regs[regno] is in the " == NULL" branch.
4740 * No one could have freed the reference state before
4741 * doing the NULL check.
4742 */
4743 WARN_ON_ONCE(release_reference_state(state, id));
4695 4744
4696 for (j = 0; j <= vstate->curframe; j++) { 4745 for (i = 0; i <= vstate->curframe; i++)
4697 state = vstate->frame[j]; 4746 __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
4698 bpf_for_each_spilled_reg(i, state, reg) {
4699 if (!reg)
4700 continue;
4701 mark_ptr_or_null_reg(state, reg, id, is_null);
4702 }
4703 }
4704} 4747}
4705 4748
4706static bool try_match_pkt_pointers(const struct bpf_insn *insn, 4749static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -6052,15 +6095,17 @@ static int propagate_liveness(struct bpf_verifier_env *env,
6052 } 6095 }
6053 /* Propagate read liveness of registers... */ 6096 /* Propagate read liveness of registers... */
6054 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); 6097 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
6055 /* We don't need to worry about FP liveness because it's read-only */ 6098 for (frame = 0; frame <= vstate->curframe; frame++) {
6056 for (i = 0; i < BPF_REG_FP; i++) { 6099 /* We don't need to worry about FP liveness, it's read-only */
6057 if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) 6100 for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
6058 continue; 6101 if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ)
6059 if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { 6102 continue;
6060 err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], 6103 if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) {
6061 &vparent->frame[vstate->curframe]->regs[i]); 6104 err = mark_reg_read(env, &vstate->frame[frame]->regs[i],
6062 if (err) 6105 &vparent->frame[frame]->regs[i]);
6063 return err; 6106 if (err)
6107 return err;
6108 }
6064 } 6109 }
6065 } 6110 }
6066 6111
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4834c4214e9c..6a1942ed781c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -740,11 +740,10 @@ static inline int nr_cpusets(void)
740 * Must be called with cpuset_mutex held. 740 * Must be called with cpuset_mutex held.
741 * 741 *
742 * The three key local variables below are: 742 * The three key local variables below are:
743 * q - a linked-list queue of cpuset pointers, used to implement a 743 * cp - cpuset pointer, used (together with pos_css) to perform a
744 * top-down scan of all cpusets. This scan loads a pointer 744 * top-down scan of all cpusets. For our purposes, rebuilding
745 * to each cpuset marked is_sched_load_balance into the 745 * the schedulers sched domains, we can ignore !is_sched_load_
746 * array 'csa'. For our purposes, rebuilding the schedulers 746 * balance cpusets.
747 * sched domains, we can ignore !is_sched_load_balance cpusets.
748 * csa - (for CpuSet Array) Array of pointers to all the cpusets 747 * csa - (for CpuSet Array) Array of pointers to all the cpusets
749 * that need to be load balanced, for convenient iterative 748 * that need to be load balanced, for convenient iterative
750 * access by the subsequent code that finds the best partition, 749 * access by the subsequent code that finds the best partition,
@@ -775,7 +774,7 @@ static inline int nr_cpusets(void)
775static int generate_sched_domains(cpumask_var_t **domains, 774static int generate_sched_domains(cpumask_var_t **domains,
776 struct sched_domain_attr **attributes) 775 struct sched_domain_attr **attributes)
777{ 776{
778 struct cpuset *cp; /* scans q */ 777 struct cpuset *cp; /* top-down scan of cpusets */
779 struct cpuset **csa; /* array of all cpuset ptrs */ 778 struct cpuset **csa; /* array of all cpuset ptrs */
780 int csn; /* how many cpuset ptrs in csa so far */ 779 int csn; /* how many cpuset ptrs in csa so far */
781 int i, j, k; /* indices for partition finding loops */ 780 int i, j, k; /* indices for partition finding loops */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 025f419d16f6..f2ef10460698 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -9,6 +9,7 @@
9#include <linux/notifier.h> 9#include <linux/notifier.h>
10#include <linux/sched/signal.h> 10#include <linux/sched/signal.h>
11#include <linux/sched/hotplug.h> 11#include <linux/sched/hotplug.h>
12#include <linux/sched/isolation.h>
12#include <linux/sched/task.h> 13#include <linux/sched/task.h>
13#include <linux/sched/smt.h> 14#include <linux/sched/smt.h>
14#include <linux/unistd.h> 15#include <linux/unistd.h>
@@ -564,6 +565,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
564 cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); 565 cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
565} 566}
566 567
568static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
569{
570 if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
571 return true;
572 /*
573 * When CPU hotplug is disabled, then taking the CPU down is not
574 * possible because takedown_cpu() and the architecture and
575 * subsystem specific mechanisms are not available. So the CPU
576 * which would be completely unplugged again needs to stay around
577 * in the current state.
578 */
579 return st->state <= CPUHP_BRINGUP_CPU;
580}
581
567static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, 582static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
568 enum cpuhp_state target) 583 enum cpuhp_state target)
569{ 584{
@@ -574,8 +589,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
574 st->state++; 589 st->state++;
575 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); 590 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
576 if (ret) { 591 if (ret) {
577 st->target = prev_state; 592 if (can_rollback_cpu(st)) {
578 undo_cpu_up(cpu, st); 593 st->target = prev_state;
594 undo_cpu_up(cpu, st);
595 }
579 break; 596 break;
580 } 597 }
581 } 598 }
@@ -844,6 +861,8 @@ static int take_cpu_down(void *_param)
844 861
845 /* Give up timekeeping duties */ 862 /* Give up timekeeping duties */
846 tick_handover_do_timer(); 863 tick_handover_do_timer();
864 /* Remove CPU from timer broadcasting */
865 tick_offline_cpu(cpu);
847 /* Park the stopper thread */ 866 /* Park the stopper thread */
848 stop_machine_park(cpu); 867 stop_machine_park(cpu);
849 return 0; 868 return 0;
@@ -1183,8 +1202,15 @@ int freeze_secondary_cpus(int primary)
1183 int cpu, error = 0; 1202 int cpu, error = 0;
1184 1203
1185 cpu_maps_update_begin(); 1204 cpu_maps_update_begin();
1186 if (!cpu_online(primary)) 1205 if (primary == -1) {
1187 primary = cpumask_first(cpu_online_mask); 1206 primary = cpumask_first(cpu_online_mask);
1207 if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
1208 primary = housekeeping_any_cpu(HK_FLAG_TIMER);
1209 } else {
1210 if (!cpu_online(primary))
1211 primary = cpumask_first(cpu_online_mask);
1212 }
1213
1188 /* 1214 /*
1189 * We take down all of the non-boot CPUs in one shot to avoid races 1215 * We take down all of the non-boot CPUs in one shot to avoid races
1190 * with the userspace trying to use the CPU hotplug at the same time 1216 * with the userspace trying to use the CPU hotplug at the same time
@@ -2017,19 +2043,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
2017 2043
2018#ifdef CONFIG_HOTPLUG_SMT 2044#ifdef CONFIG_HOTPLUG_SMT
2019 2045
2020static const char *smt_states[] = {
2021 [CPU_SMT_ENABLED] = "on",
2022 [CPU_SMT_DISABLED] = "off",
2023 [CPU_SMT_FORCE_DISABLED] = "forceoff",
2024 [CPU_SMT_NOT_SUPPORTED] = "notsupported",
2025};
2026
2027static ssize_t
2028show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
2029{
2030 return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
2031}
2032
2033static void cpuhp_offline_cpu_device(unsigned int cpu) 2046static void cpuhp_offline_cpu_device(unsigned int cpu)
2034{ 2047{
2035 struct device *dev = get_cpu_device(cpu); 2048 struct device *dev = get_cpu_device(cpu);
@@ -2100,9 +2113,10 @@ static int cpuhp_smt_enable(void)
2100 return ret; 2113 return ret;
2101} 2114}
2102 2115
2116
2103static ssize_t 2117static ssize_t
2104store_smt_control(struct device *dev, struct device_attribute *attr, 2118__store_smt_control(struct device *dev, struct device_attribute *attr,
2105 const char *buf, size_t count) 2119 const char *buf, size_t count)
2106{ 2120{
2107 int ctrlval, ret; 2121 int ctrlval, ret;
2108 2122
@@ -2140,14 +2154,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr,
2140 unlock_device_hotplug(); 2154 unlock_device_hotplug();
2141 return ret ? ret : count; 2155 return ret ? ret : count;
2142} 2156}
2157
2158#else /* !CONFIG_HOTPLUG_SMT */
2159static ssize_t
2160__store_smt_control(struct device *dev, struct device_attribute *attr,
2161 const char *buf, size_t count)
2162{
2163 return -ENODEV;
2164}
2165#endif /* CONFIG_HOTPLUG_SMT */
2166
2167static const char *smt_states[] = {
2168 [CPU_SMT_ENABLED] = "on",
2169 [CPU_SMT_DISABLED] = "off",
2170 [CPU_SMT_FORCE_DISABLED] = "forceoff",
2171 [CPU_SMT_NOT_SUPPORTED] = "notsupported",
2172 [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
2173};
2174
2175static ssize_t
2176show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
2177{
2178 const char *state = smt_states[cpu_smt_control];
2179
2180 return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
2181}
2182
2183static ssize_t
2184store_smt_control(struct device *dev, struct device_attribute *attr,
2185 const char *buf, size_t count)
2186{
2187 return __store_smt_control(dev, attr, buf, count);
2188}
2143static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); 2189static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
2144 2190
2145static ssize_t 2191static ssize_t
2146show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) 2192show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
2147{ 2193{
2148 bool active = topology_max_smt_threads() > 1; 2194 return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
2149
2150 return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
2151} 2195}
2152static DEVICE_ATTR(active, 0444, show_smt_active, NULL); 2196static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
2153 2197
@@ -2163,21 +2207,17 @@ static const struct attribute_group cpuhp_smt_attr_group = {
2163 NULL 2207 NULL
2164}; 2208};
2165 2209
2166static int __init cpu_smt_state_init(void) 2210static int __init cpu_smt_sysfs_init(void)
2167{ 2211{
2168 return sysfs_create_group(&cpu_subsys.dev_root->kobj, 2212 return sysfs_create_group(&cpu_subsys.dev_root->kobj,
2169 &cpuhp_smt_attr_group); 2213 &cpuhp_smt_attr_group);
2170} 2214}
2171 2215
2172#else
2173static inline int cpu_smt_state_init(void) { return 0; }
2174#endif
2175
2176static int __init cpuhp_sysfs_init(void) 2216static int __init cpuhp_sysfs_init(void)
2177{ 2217{
2178 int cpu, ret; 2218 int cpu, ret;
2179 2219
2180 ret = cpu_smt_state_init(); 2220 ret = cpu_smt_sysfs_init();
2181 if (ret) 2221 if (ret)
2182 return ret; 2222 return ret;
2183 2223
@@ -2198,7 +2238,7 @@ static int __init cpuhp_sysfs_init(void)
2198 return 0; 2238 return 0;
2199} 2239}
2200device_initcall(cpuhp_sysfs_init); 2240device_initcall(cpuhp_sysfs_init);
2201#endif 2241#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
2202 2242
2203/* 2243/*
2204 * cpu_bit_bitmap[] is a special, "compressed" data structure that 2244 * cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -2288,3 +2328,18 @@ void __init boot_cpu_hotplug_init(void)
2288#endif 2328#endif
2289 this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); 2329 this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
2290} 2330}
2331
2332enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
2333
2334static int __init mitigations_parse_cmdline(char *arg)
2335{
2336 if (!strcmp(arg, "off"))
2337 cpu_mitigations = CPU_MITIGATIONS_OFF;
2338 else if (!strcmp(arg, "auto"))
2339 cpu_mitigations = CPU_MITIGATIONS_AUTO;
2340 else if (!strcmp(arg, "auto,nosmt"))
2341 cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
2342
2343 return 0;
2344}
2345early_param("mitigations", mitigations_parse_cmdline);
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 45d51e8e26f6..badd77670d00 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -89,8 +89,8 @@ struct dma_debug_entry {
89 int sg_mapped_ents; 89 int sg_mapped_ents;
90 enum map_err_types map_err_type; 90 enum map_err_types map_err_type;
91#ifdef CONFIG_STACKTRACE 91#ifdef CONFIG_STACKTRACE
92 struct stack_trace stacktrace; 92 unsigned int stack_len;
93 unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; 93 unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
94#endif 94#endif
95}; 95};
96 96
@@ -174,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
174#ifdef CONFIG_STACKTRACE 174#ifdef CONFIG_STACKTRACE
175 if (entry) { 175 if (entry) {
176 pr_warning("Mapped at:\n"); 176 pr_warning("Mapped at:\n");
177 print_stack_trace(&entry->stacktrace, 0); 177 stack_trace_print(entry->stack_entries, entry->stack_len, 0);
178 } 178 }
179#endif 179#endif
180} 180}
@@ -704,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void)
704 spin_unlock_irqrestore(&free_entries_lock, flags); 704 spin_unlock_irqrestore(&free_entries_lock, flags);
705 705
706#ifdef CONFIG_STACKTRACE 706#ifdef CONFIG_STACKTRACE
707 entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; 707 entry->stack_len = stack_trace_save(entry->stack_entries,
708 entry->stacktrace.entries = entry->st_entries; 708 ARRAY_SIZE(entry->stack_entries),
709 entry->stacktrace.skip = 2; 709 1);
710 save_stack_trace(&entry->stacktrace);
711#endif 710#endif
712
713 return entry; 711 return entry;
714} 712}
715 713
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 53012db1e53c..6f7619c1f877 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -452,6 +452,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
452 unsigned long mask; 452 unsigned long mask;
453 unsigned long offset_slots; 453 unsigned long offset_slots;
454 unsigned long max_slots; 454 unsigned long max_slots;
455 unsigned long tmp_io_tlb_used;
455 456
456 if (no_iotlb_memory) 457 if (no_iotlb_memory)
457 panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); 458 panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
@@ -538,9 +539,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
538 } while (index != wrap); 539 } while (index != wrap);
539 540
540not_found: 541not_found:
542 tmp_io_tlb_used = io_tlb_used;
543
541 spin_unlock_irqrestore(&io_tlb_lock, flags); 544 spin_unlock_irqrestore(&io_tlb_lock, flags);
542 if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) 545 if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
543 dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); 546 dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
547 size, io_tlb_nslabs, tmp_io_tlb_used);
544 return DMA_MAPPING_ERROR; 548 return DMA_MAPPING_ERROR;
545found: 549found:
546 io_tlb_used += nslots; 550 io_tlb_used += nslots;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1032a16bd186..abbd4b3b96c2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2009,8 +2009,8 @@ event_sched_out(struct perf_event *event,
2009 event->pmu->del(event, 0); 2009 event->pmu->del(event, 0);
2010 event->oncpu = -1; 2010 event->oncpu = -1;
2011 2011
2012 if (event->pending_disable) { 2012 if (READ_ONCE(event->pending_disable) >= 0) {
2013 event->pending_disable = 0; 2013 WRITE_ONCE(event->pending_disable, -1);
2014 state = PERF_EVENT_STATE_OFF; 2014 state = PERF_EVENT_STATE_OFF;
2015 } 2015 }
2016 perf_event_set_state(event, state); 2016 perf_event_set_state(event, state);
@@ -2198,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
2198 2198
2199void perf_event_disable_inatomic(struct perf_event *event) 2199void perf_event_disable_inatomic(struct perf_event *event)
2200{ 2200{
2201 event->pending_disable = 1; 2201 WRITE_ONCE(event->pending_disable, smp_processor_id());
2202 /* can fail, see perf_pending_event_disable() */
2202 irq_work_queue(&event->pending); 2203 irq_work_queue(&event->pending);
2203} 2204}
2204 2205
@@ -2477,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
2477 perf_pmu_enable(cpuctx->ctx.pmu); 2478 perf_pmu_enable(cpuctx->ctx.pmu);
2478} 2479}
2479 2480
2481void perf_pmu_resched(struct pmu *pmu)
2482{
2483 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2484 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2485
2486 perf_ctx_lock(cpuctx, task_ctx);
2487 ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2488 perf_ctx_unlock(cpuctx, task_ctx);
2489}
2490
2480/* 2491/*
2481 * Cross CPU call to install and enable a performance event 2492 * Cross CPU call to install and enable a performance event
2482 * 2493 *
@@ -5810,10 +5821,45 @@ void perf_event_wakeup(struct perf_event *event)
5810 } 5821 }
5811} 5822}
5812 5823
5824static void perf_pending_event_disable(struct perf_event *event)
5825{
5826 int cpu = READ_ONCE(event->pending_disable);
5827
5828 if (cpu < 0)
5829 return;
5830
5831 if (cpu == smp_processor_id()) {
5832 WRITE_ONCE(event->pending_disable, -1);
5833 perf_event_disable_local(event);
5834 return;
5835 }
5836
5837 /*
5838 * CPU-A CPU-B
5839 *
5840 * perf_event_disable_inatomic()
5841 * @pending_disable = CPU-A;
5842 * irq_work_queue();
5843 *
5844 * sched-out
5845 * @pending_disable = -1;
5846 *
5847 * sched-in
5848 * perf_event_disable_inatomic()
5849 * @pending_disable = CPU-B;
5850 * irq_work_queue(); // FAILS
5851 *
5852 * irq_work_run()
5853 * perf_pending_event()
5854 *
5855 * But the event runs on CPU-B and wants disabling there.
5856 */
5857 irq_work_queue_on(&event->pending, cpu);
5858}
5859
5813static void perf_pending_event(struct irq_work *entry) 5860static void perf_pending_event(struct irq_work *entry)
5814{ 5861{
5815 struct perf_event *event = container_of(entry, 5862 struct perf_event *event = container_of(entry, struct perf_event, pending);
5816 struct perf_event, pending);
5817 int rctx; 5863 int rctx;
5818 5864
5819 rctx = perf_swevent_get_recursion_context(); 5865 rctx = perf_swevent_get_recursion_context();
@@ -5822,10 +5868,7 @@ static void perf_pending_event(struct irq_work *entry)
5822 * and we won't recurse 'further'. 5868 * and we won't recurse 'further'.
5823 */ 5869 */
5824 5870
5825 if (event->pending_disable) { 5871 perf_pending_event_disable(event);
5826 event->pending_disable = 0;
5827 perf_event_disable_local(event);
5828 }
5829 5872
5830 if (event->pending_wakeup) { 5873 if (event->pending_wakeup) {
5831 event->pending_wakeup = 0; 5874 event->pending_wakeup = 0;
@@ -7189,6 +7232,7 @@ static void perf_event_mmap_output(struct perf_event *event,
7189 struct perf_output_handle handle; 7232 struct perf_output_handle handle;
7190 struct perf_sample_data sample; 7233 struct perf_sample_data sample;
7191 int size = mmap_event->event_id.header.size; 7234 int size = mmap_event->event_id.header.size;
7235 u32 type = mmap_event->event_id.header.type;
7192 int ret; 7236 int ret;
7193 7237
7194 if (!perf_event_mmap_match(event, data)) 7238 if (!perf_event_mmap_match(event, data))
@@ -7232,6 +7276,7 @@ static void perf_event_mmap_output(struct perf_event *event,
7232 perf_output_end(&handle); 7276 perf_output_end(&handle);
7233out: 7277out:
7234 mmap_event->event_id.header.size = size; 7278 mmap_event->event_id.header.size = size;
7279 mmap_event->event_id.header.type = type;
7235} 7280}
7236 7281
7237static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 7282static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -9042,26 +9087,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
9042 if (task == TASK_TOMBSTONE) 9087 if (task == TASK_TOMBSTONE)
9043 return; 9088 return;
9044 9089
9045 if (!ifh->nr_file_filters) 9090 if (ifh->nr_file_filters) {
9046 return; 9091 mm = get_task_mm(event->ctx->task);
9047 9092 if (!mm)
9048 mm = get_task_mm(event->ctx->task); 9093 goto restart;
9049 if (!mm)
9050 goto restart;
9051 9094
9052 down_read(&mm->mmap_sem); 9095 down_read(&mm->mmap_sem);
9096 }
9053 9097
9054 raw_spin_lock_irqsave(&ifh->lock, flags); 9098 raw_spin_lock_irqsave(&ifh->lock, flags);
9055 list_for_each_entry(filter, &ifh->list, entry) { 9099 list_for_each_entry(filter, &ifh->list, entry) {
9056 event->addr_filter_ranges[count].start = 0; 9100 if (filter->path.dentry) {
9057 event->addr_filter_ranges[count].size = 0; 9101 /*
9102 * Adjust base offset if the filter is associated to a
9103 * binary that needs to be mapped:
9104 */
9105 event->addr_filter_ranges[count].start = 0;
9106 event->addr_filter_ranges[count].size = 0;
9058 9107
9059 /*
9060 * Adjust base offset if the filter is associated to a binary
9061 * that needs to be mapped:
9062 */
9063 if (filter->path.dentry)
9064 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); 9108 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
9109 } else {
9110 event->addr_filter_ranges[count].start = filter->offset;
9111 event->addr_filter_ranges[count].size = filter->size;
9112 }
9065 9113
9066 count++; 9114 count++;
9067 } 9115 }
@@ -9069,9 +9117,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
9069 event->addr_filters_gen++; 9117 event->addr_filters_gen++;
9070 raw_spin_unlock_irqrestore(&ifh->lock, flags); 9118 raw_spin_unlock_irqrestore(&ifh->lock, flags);
9071 9119
9072 up_read(&mm->mmap_sem); 9120 if (ifh->nr_file_filters) {
9121 up_read(&mm->mmap_sem);
9073 9122
9074 mmput(mm); 9123 mmput(mm);
9124 }
9075 9125
9076restart: 9126restart:
9077 perf_event_stop(event, 1); 9127 perf_event_stop(event, 1);
@@ -10234,6 +10284,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
10234 10284
10235 10285
10236 init_waitqueue_head(&event->waitq); 10286 init_waitqueue_head(&event->waitq);
10287 event->pending_disable = -1;
10237 init_irq_work(&event->pending, perf_pending_event); 10288 init_irq_work(&event->pending, perf_pending_event);
10238 10289
10239 mutex_init(&event->mmap_mutex); 10290 mutex_init(&event->mmap_mutex);
@@ -11876,7 +11927,7 @@ static void __init perf_event_init_all_cpus(void)
11876 } 11927 }
11877} 11928}
11878 11929
11879void perf_swevent_init_cpu(unsigned int cpu) 11930static void perf_swevent_init_cpu(unsigned int cpu)
11880{ 11931{
11881 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 11932 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11882 11933
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a4047321d7d8..674b35383491 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -392,7 +392,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
392 * store that will be enabled on successful return 392 * store that will be enabled on successful return
393 */ 393 */
394 if (!handle->size) { /* A, matches D */ 394 if (!handle->size) { /* A, matches D */
395 event->pending_disable = 1; 395 event->pending_disable = smp_processor_id();
396 perf_output_wakeup(handle); 396 perf_output_wakeup(handle);
397 local_set(&rb->aux_nest, 0); 397 local_set(&rb->aux_nest, 0);
398 goto err_put; 398 goto err_put;
@@ -455,24 +455,21 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
455 rb->aux_head += size; 455 rb->aux_head += size;
456 } 456 }
457 457
458 if (size || handle->aux_flags) { 458 /*
459 /* 459 * Only send RECORD_AUX if we have something useful to communicate
460 * Only send RECORD_AUX if we have something useful to communicate 460 *
461 * 461 * Note: the OVERWRITE records by themselves are not considered
462 * Note: the OVERWRITE records by themselves are not considered 462 * useful, as they don't communicate any *new* information,
463 * useful, as they don't communicate any *new* information, 463 * aside from the short-lived offset, that becomes history at
464 * aside from the short-lived offset, that becomes history at 464 * the next event sched-in and therefore isn't useful.
465 * the next event sched-in and therefore isn't useful. 465 * The userspace that needs to copy out AUX data in overwrite
466 * The userspace that needs to copy out AUX data in overwrite 466 * mode should know to use user_page::aux_head for the actual
467 * mode should know to use user_page::aux_head for the actual 467 * offset. So, from now on we don't output AUX records that
468 * offset. So, from now on we don't output AUX records that 468 * have *only* OVERWRITE flag set.
469 * have *only* OVERWRITE flag set. 469 */
470 */ 470 if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
471 471 perf_event_aux_event(handle->event, aux_head, size,
472 if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE) 472 handle->aux_flags);
473 perf_event_aux_event(handle->event, aux_head, size,
474 handle->aux_flags);
475 }
476 473
477 rb->user_page->aux_head = rb->aux_head; 474 rb->user_page->aux_head = rb->aux_head;
478 if (rb_need_aux_wakeup(rb)) 475 if (rb_need_aux_wakeup(rb))
@@ -480,7 +477,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
480 477
481 if (wakeup) { 478 if (wakeup) {
482 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) 479 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
483 handle->event->pending_disable = 1; 480 handle->event->pending_disable = smp_processor_id();
484 perf_output_wakeup(handle); 481 perf_output_wakeup(handle);
485 } 482 }
486 483
@@ -613,8 +610,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
613 * PMU requests more than one contiguous chunks of memory 610 * PMU requests more than one contiguous chunks of memory
614 * for SW double buffering 611 * for SW double buffering
615 */ 612 */
616 if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && 613 if (!overwrite) {
617 !overwrite) {
618 if (!max_order) 614 if (!max_order)
619 return -EINVAL; 615 return -EINVAL;
620 616
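Taken together with the perf_event_alloc() hunk above that initializes event->pending_disable to -1, these ring_buffer.c changes turn pending_disable from a boolean into the id of the CPU that requested the disable. A minimal kernel-context sketch of that encoding, for illustration only (the example_* helpers are hypothetical names, not functions from this patch):

	/* -1: nothing pending; >= 0: CPU that requested the disable. */
	static void example_request_disable(struct perf_event *event)
	{
		event->pending_disable = smp_processor_id();
		irq_work_queue(&event->pending);
	}

	static void example_handle_pending(struct perf_event *event)
	{
		int cpu = READ_ONCE(event->pending_disable);

		if (cpu < 0)
			return;

		event->pending_disable = -1;
		/* Only disable directly when the irq_work runs on that CPU. */
		if (cpu == smp_processor_id())
			perf_event_disable_local(event);
	}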
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c5cde87329c7..4ca7364c956d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2028,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
2028 if (uc->handler) { 2028 if (uc->handler) {
2029 rc = uc->handler(uc, regs); 2029 rc = uc->handler(uc, regs);
2030 WARN(rc & ~UPROBE_HANDLER_MASK, 2030 WARN(rc & ~UPROBE_HANDLER_MASK,
2031 "bad rc=0x%x from %pf()\n", rc, uc->handler); 2031 "bad rc=0x%x from %ps()\n", rc, uc->handler);
2032 } 2032 }
2033 2033
2034 if (uc->ret_handler) 2034 if (uc->ret_handler)
@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
2294 .priority = INT_MAX-1, /* notified after kprobes, kgdb */ 2294 .priority = INT_MAX-1, /* notified after kprobes, kgdb */
2295}; 2295};
2296 2296
2297static int __init init_uprobes(void) 2297void __init uprobes_init(void)
2298{ 2298{
2299 int i; 2299 int i;
2300 2300
2301 for (i = 0; i < UPROBES_HASH_SZ; i++) 2301 for (i = 0; i < UPROBES_HASH_SZ; i++)
2302 mutex_init(&uprobes_mmap_mutex[i]); 2302 mutex_init(&uprobes_mmap_mutex[i]);
2303 2303
2304 if (percpu_init_rwsem(&dup_mmap_sem)) 2304 BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
2305 return -ENOMEM;
2306 2305
2307 return register_die_notifier(&uprobe_exception_nb); 2306 BUG_ON(register_die_notifier(&uprobe_exception_nb));
2308} 2307}
2309__initcall(init_uprobes);
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 17f75b545f66..feb80712b913 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v)
210{ 210{
211 struct fei_attr *attr = list_entry(v, struct fei_attr, list); 211 struct fei_attr *attr = list_entry(v, struct fei_attr, list);
212 212
213 seq_printf(m, "%pf\n", attr->kp.addr); 213 seq_printf(m, "%ps\n", attr->kp.addr);
214 return 0; 214 return 0;
215} 215}
216 216
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa210b..8b03d93ba068 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -11,6 +11,7 @@
11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
12 */ 12 */
13 13
14#include <linux/anon_inodes.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/sched/autogroup.h> 16#include <linux/sched/autogroup.h>
16#include <linux/sched/mm.h> 17#include <linux/sched/mm.h>
@@ -21,6 +22,7 @@
21#include <linux/sched/task.h> 22#include <linux/sched/task.h>
22#include <linux/sched/task_stack.h> 23#include <linux/sched/task_stack.h>
23#include <linux/sched/cputime.h> 24#include <linux/sched/cputime.h>
25#include <linux/seq_file.h>
24#include <linux/rtmutex.h> 26#include <linux/rtmutex.h>
25#include <linux/init.h> 27#include <linux/init.h>
26#include <linux/unistd.h> 28#include <linux/unistd.h>
@@ -815,6 +817,7 @@ void __init fork_init(void)
815#endif 817#endif
816 818
817 lockdep_init_task(&init_task); 819 lockdep_init_task(&init_task);
820 uprobes_init();
818} 821}
819 822
820int __weak arch_dup_task_struct(struct task_struct *dst, 823int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1298,13 +1301,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1298 complete_vfork_done(tsk); 1301 complete_vfork_done(tsk);
1299} 1302}
1300 1303
1301/* 1304/**
1302 * Allocate a new mm structure and copy contents from the 1305 * dup_mm() - duplicates an existing mm structure
1303 * mm structure of the passed in task structure. 1306 * @tsk: the task_struct with which the new mm will be associated.
1307 * @oldmm: the mm to duplicate.
1308 *
1309 * Allocates a new mm structure and duplicates the provided @oldmm structure
1310 * content into it.
1311 *
1312 * Return: the duplicated mm or NULL on failure.
1304 */ 1313 */
1305static struct mm_struct *dup_mm(struct task_struct *tsk) 1314static struct mm_struct *dup_mm(struct task_struct *tsk,
1315 struct mm_struct *oldmm)
1306{ 1316{
1307 struct mm_struct *mm, *oldmm = current->mm; 1317 struct mm_struct *mm;
1308 int err; 1318 int err;
1309 1319
1310 mm = allocate_mm(); 1320 mm = allocate_mm();
@@ -1371,7 +1381,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1371 } 1381 }
1372 1382
1373 retval = -ENOMEM; 1383 retval = -ENOMEM;
1374 mm = dup_mm(tsk); 1384 mm = dup_mm(tsk, current->mm);
1375 if (!mm) 1385 if (!mm)
1376 goto fail_nomem; 1386 goto fail_nomem;
1377 1387
@@ -1662,6 +1672,58 @@ static inline void rcu_copy_process(struct task_struct *p)
1662#endif /* #ifdef CONFIG_TASKS_RCU */ 1672#endif /* #ifdef CONFIG_TASKS_RCU */
1663} 1673}
1664 1674
1675static int pidfd_release(struct inode *inode, struct file *file)
1676{
1677 struct pid *pid = file->private_data;
1678
1679 file->private_data = NULL;
1680 put_pid(pid);
1681 return 0;
1682}
1683
1684#ifdef CONFIG_PROC_FS
1685static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1686{
1687 struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
1688 struct pid *pid = f->private_data;
1689
1690 seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
1691 seq_putc(m, '\n');
1692}
1693#endif
1694
1695const struct file_operations pidfd_fops = {
1696 .release = pidfd_release,
1697#ifdef CONFIG_PROC_FS
1698 .show_fdinfo = pidfd_show_fdinfo,
1699#endif
1700};
1701
1702/**
1703 * pidfd_create() - Create a new pid file descriptor.
1704 *
1705 * @pid: struct pid that the pidfd will reference
1706 *
1707 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
1708 *
 1709 * Note that this function can only be called after the fd table has
1710 * been unshared to avoid leaking the pidfd to the new process.
1711 *
1712 * Return: On success, a cloexec pidfd is returned.
1713 * On error, a negative errno number will be returned.
1714 */
1715static int pidfd_create(struct pid *pid)
1716{
1717 int fd;
1718
1719 fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
1720 O_RDWR | O_CLOEXEC);
1721 if (fd < 0)
1722 put_pid(pid);
1723
1724 return fd;
1725}
1726
1665/* 1727/*
1666 * This creates a new process as a copy of the old one, 1728 * This creates a new process as a copy of the old one,
1667 * but does not actually start it yet. 1729 * but does not actually start it yet.
@@ -1674,13 +1736,14 @@ static __latent_entropy struct task_struct *copy_process(
1674 unsigned long clone_flags, 1736 unsigned long clone_flags,
1675 unsigned long stack_start, 1737 unsigned long stack_start,
1676 unsigned long stack_size, 1738 unsigned long stack_size,
1739 int __user *parent_tidptr,
1677 int __user *child_tidptr, 1740 int __user *child_tidptr,
1678 struct pid *pid, 1741 struct pid *pid,
1679 int trace, 1742 int trace,
1680 unsigned long tls, 1743 unsigned long tls,
1681 int node) 1744 int node)
1682{ 1745{
1683 int retval; 1746 int pidfd = -1, retval;
1684 struct task_struct *p; 1747 struct task_struct *p;
1685 struct multiprocess_signals delayed; 1748 struct multiprocess_signals delayed;
1686 1749
@@ -1730,6 +1793,31 @@ static __latent_entropy struct task_struct *copy_process(
1730 return ERR_PTR(-EINVAL); 1793 return ERR_PTR(-EINVAL);
1731 } 1794 }
1732 1795
1796 if (clone_flags & CLONE_PIDFD) {
1797 int reserved;
1798
1799 /*
1800 * - CLONE_PARENT_SETTID is useless for pidfds and also
1801 * parent_tidptr is used to return pidfds.
1802 * - CLONE_DETACHED is blocked so that we can potentially
1803 * reuse it later for CLONE_PIDFD.
1804 * - CLONE_THREAD is blocked until someone really needs it.
1805 */
1806 if (clone_flags &
1807 (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1808 return ERR_PTR(-EINVAL);
1809
1810 /*
1811 * Verify that parent_tidptr is sane so we can potentially
1812 * reuse it later.
1813 */
1814 if (get_user(reserved, parent_tidptr))
1815 return ERR_PTR(-EFAULT);
1816
1817 if (reserved != 0)
1818 return ERR_PTR(-EINVAL);
1819 }
1820
1733 /* 1821 /*
1734 * Force any signals received before this point to be delivered 1822 * Force any signals received before this point to be delivered
1735 * before the fork happens. Collect up signals sent to multiple 1823 * before the fork happens. Collect up signals sent to multiple
@@ -1936,6 +2024,22 @@ static __latent_entropy struct task_struct *copy_process(
1936 } 2024 }
1937 } 2025 }
1938 2026
2027 /*
2028 * This has to happen after we've potentially unshared the file
2029 * descriptor table (so that the pidfd doesn't leak into the child
2030 * if the fd table isn't shared).
2031 */
2032 if (clone_flags & CLONE_PIDFD) {
2033 retval = pidfd_create(pid);
2034 if (retval < 0)
2035 goto bad_fork_free_pid;
2036
2037 pidfd = retval;
2038 retval = put_user(pidfd, parent_tidptr);
2039 if (retval)
2040 goto bad_fork_put_pidfd;
2041 }
2042
1939#ifdef CONFIG_BLOCK 2043#ifdef CONFIG_BLOCK
1940 p->plug = NULL; 2044 p->plug = NULL;
1941#endif 2045#endif
@@ -1996,7 +2100,7 @@ static __latent_entropy struct task_struct *copy_process(
1996 */ 2100 */
1997 retval = cgroup_can_fork(p); 2101 retval = cgroup_can_fork(p);
1998 if (retval) 2102 if (retval)
1999 goto bad_fork_free_pid; 2103 goto bad_fork_put_pidfd;
2000 2104
2001 /* 2105 /*
2002 * From this point on we must avoid any synchronous user-space 2106 * From this point on we must avoid any synchronous user-space
@@ -2111,6 +2215,9 @@ bad_fork_cancel_cgroup:
2111 spin_unlock(&current->sighand->siglock); 2215 spin_unlock(&current->sighand->siglock);
2112 write_unlock_irq(&tasklist_lock); 2216 write_unlock_irq(&tasklist_lock);
2113 cgroup_cancel_fork(p); 2217 cgroup_cancel_fork(p);
2218bad_fork_put_pidfd:
2219 if (clone_flags & CLONE_PIDFD)
2220 ksys_close(pidfd);
2114bad_fork_free_pid: 2221bad_fork_free_pid:
2115 cgroup_threadgroup_change_end(current); 2222 cgroup_threadgroup_change_end(current);
2116 if (pid != &init_struct_pid) 2223 if (pid != &init_struct_pid)
@@ -2176,7 +2283,7 @@ static inline void init_idle_pids(struct task_struct *idle)
2176struct task_struct *fork_idle(int cpu) 2283struct task_struct *fork_idle(int cpu)
2177{ 2284{
2178 struct task_struct *task; 2285 struct task_struct *task;
2179 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, 2286 task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
2180 cpu_to_node(cpu)); 2287 cpu_to_node(cpu));
2181 if (!IS_ERR(task)) { 2288 if (!IS_ERR(task)) {
2182 init_idle_pids(task); 2289 init_idle_pids(task);
@@ -2186,6 +2293,11 @@ struct task_struct *fork_idle(int cpu)
2186 return task; 2293 return task;
2187} 2294}
2188 2295
2296struct mm_struct *copy_init_mm(void)
2297{
2298 return dup_mm(NULL, &init_mm);
2299}
2300
2189/* 2301/*
2190 * Ok, this is the main fork-routine. 2302 * Ok, this is the main fork-routine.
2191 * 2303 *
@@ -2223,7 +2335,7 @@ long _do_fork(unsigned long clone_flags,
2223 trace = 0; 2335 trace = 0;
2224 } 2336 }
2225 2337
2226 p = copy_process(clone_flags, stack_start, stack_size, 2338 p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
2227 child_tidptr, NULL, trace, tls, NUMA_NO_NODE); 2339 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2228 add_latent_entropy(); 2340 add_latent_entropy();
2229 2341
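The CLONE_PIDFD path added above returns the new descriptor through parent_tidptr, which must point to a zero value beforehand and must not be combined with CLONE_PARENT_SETTID, CLONE_DETACHED or CLONE_THREAD. A minimal userspace sketch of how a caller would use it; the x86_64 raw clone() argument order and the CLONE_PIDFD flag value are assumptions, not taken from this hunk:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef CLONE_PIDFD
	#define CLONE_PIDFD 0x00001000	/* assumed flag value for this series */
	#endif

	int main(void)
	{
		int pidfd = 0;	/* must point to 0; the kernel verifies this */
		long pid;

		/* x86_64 raw clone argument order: flags, stack, ptid, ctid, tls */
		pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, NULL, &pidfd, NULL, 0);
		if (pid < 0) {
			perror("clone");
			return 1;
		}
		if (pid == 0)		/* child: continues here, fork() style */
			_exit(0);

		/* parent: pidfd is a cloexec fd referring to the child */
		printf("child pid %ld, pidfd %d\n", pid, pidfd);
		close(pidfd);
		return 0;
	}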
diff --git a/kernel/futex.c b/kernel/futex.c
index c3b73b0311bc..6262f1534ac9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1311,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
1311 1311
1312static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) 1312static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1313{ 1313{
1314 int err;
1314 u32 uninitialized_var(curval); 1315 u32 uninitialized_var(curval);
1315 1316
1316 if (unlikely(should_fail_futex(true))) 1317 if (unlikely(should_fail_futex(true)))
1317 return -EFAULT; 1318 return -EFAULT;
1318 1319
1319 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1320 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1320 return -EFAULT; 1321 if (unlikely(err))
1322 return err;
1321 1323
1322 /* If user space value changed, let the caller retry */ 1324 /* If user space value changed, let the caller retry */
1323 return curval != uval ? -EAGAIN : 0; 1325 return curval != uval ? -EAGAIN : 0;
@@ -1502,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
1502 if (unlikely(should_fail_futex(true))) 1504 if (unlikely(should_fail_futex(true)))
1503 ret = -EFAULT; 1505 ret = -EFAULT;
1504 1506
1505 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { 1507 ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1506 ret = -EFAULT; 1508 if (!ret && (curval != uval)) {
1507
1508 } else if (curval != uval) {
1509 /* 1509 /*
1510 * If a unconditional UNLOCK_PI operation (user space did not 1510 * If a unconditional UNLOCK_PI operation (user space did not
1511 * try the TID->0 transition) raced with a waiter setting the 1511 * try the TID->0 transition) raced with a waiter setting the
@@ -1700,32 +1700,32 @@ retry_private:
1700 double_lock_hb(hb1, hb2); 1700 double_lock_hb(hb1, hb2);
1701 op_ret = futex_atomic_op_inuser(op, uaddr2); 1701 op_ret = futex_atomic_op_inuser(op, uaddr2);
1702 if (unlikely(op_ret < 0)) { 1702 if (unlikely(op_ret < 0)) {
1703
1704 double_unlock_hb(hb1, hb2); 1703 double_unlock_hb(hb1, hb2);
1705 1704
1706#ifndef CONFIG_MMU 1705 if (!IS_ENABLED(CONFIG_MMU) ||
1707 /* 1706 unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
1708 * we don't get EFAULT from MMU faults if we don't have an MMU, 1707 /*
1709 * but we might get them from range checking 1708 * we don't get EFAULT from MMU faults if we don't have
1710 */ 1709 * an MMU, but we might get them from range checking
1711 ret = op_ret; 1710 */
1712 goto out_put_keys;
1713#endif
1714
1715 if (unlikely(op_ret != -EFAULT)) {
1716 ret = op_ret; 1711 ret = op_ret;
1717 goto out_put_keys; 1712 goto out_put_keys;
1718 } 1713 }
1719 1714
1720 ret = fault_in_user_writeable(uaddr2); 1715 if (op_ret == -EFAULT) {
1721 if (ret) 1716 ret = fault_in_user_writeable(uaddr2);
1722 goto out_put_keys; 1717 if (ret)
1718 goto out_put_keys;
1719 }
1723 1720
1724 if (!(flags & FLAGS_SHARED)) 1721 if (!(flags & FLAGS_SHARED)) {
1722 cond_resched();
1725 goto retry_private; 1723 goto retry_private;
1724 }
1726 1725
1727 put_futex_key(&key2); 1726 put_futex_key(&key2);
1728 put_futex_key(&key1); 1727 put_futex_key(&key1);
1728 cond_resched();
1729 goto retry; 1729 goto retry;
1730 } 1730 }
1731 1731
@@ -2350,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2350 u32 uval, uninitialized_var(curval), newval; 2350 u32 uval, uninitialized_var(curval), newval;
2351 struct task_struct *oldowner, *newowner; 2351 struct task_struct *oldowner, *newowner;
2352 u32 newtid; 2352 u32 newtid;
2353 int ret; 2353 int ret, err = 0;
2354 2354
2355 lockdep_assert_held(q->lock_ptr); 2355 lockdep_assert_held(q->lock_ptr);
2356 2356
@@ -2421,14 +2421,17 @@ retry:
2421 if (!pi_state->owner) 2421 if (!pi_state->owner)
2422 newtid |= FUTEX_OWNER_DIED; 2422 newtid |= FUTEX_OWNER_DIED;
2423 2423
2424 if (get_futex_value_locked(&uval, uaddr)) 2424 err = get_futex_value_locked(&uval, uaddr);
2425 goto handle_fault; 2425 if (err)
2426 goto handle_err;
2426 2427
2427 for (;;) { 2428 for (;;) {
2428 newval = (uval & FUTEX_OWNER_DIED) | newtid; 2429 newval = (uval & FUTEX_OWNER_DIED) | newtid;
2429 2430
2430 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 2431 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
2431 goto handle_fault; 2432 if (err)
2433 goto handle_err;
2434
2432 if (curval == uval) 2435 if (curval == uval)
2433 break; 2436 break;
2434 uval = curval; 2437 uval = curval;
@@ -2456,23 +2459,37 @@ retry:
2456 return 0; 2459 return 0;
2457 2460
2458 /* 2461 /*
2459 * To handle the page fault we need to drop the locks here. That gives 2462 * In order to reschedule or handle a page fault, we need to drop the
2460 * the other task (either the highest priority waiter itself or the 2463 * locks here. In the case of a fault, this gives the other task
2461 * task which stole the rtmutex) the chance to try the fixup of the 2464 * (either the highest priority waiter itself or the task which stole
2462 * pi_state. So once we are back from handling the fault we need to 2465 * the rtmutex) the chance to try the fixup of the pi_state. So once we
2463 * check the pi_state after reacquiring the locks and before trying to 2466 * are back from handling the fault we need to check the pi_state after
2464 * do another fixup. When the fixup has been done already we simply 2467 * reacquiring the locks and before trying to do another fixup. When
2465 * return. 2468 * the fixup has been done already we simply return.
2466 * 2469 *
2467 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely 2470 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2468 * drop hb->lock since the caller owns the hb -> futex_q relation. 2471 * drop hb->lock since the caller owns the hb -> futex_q relation.
2469 * Dropping the pi_mutex->wait_lock requires the state revalidate. 2472 * Dropping the pi_mutex->wait_lock requires the state revalidate.
2470 */ 2473 */
2471handle_fault: 2474handle_err:
2472 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2475 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2473 spin_unlock(q->lock_ptr); 2476 spin_unlock(q->lock_ptr);
2474 2477
2475 ret = fault_in_user_writeable(uaddr); 2478 switch (err) {
2479 case -EFAULT:
2480 ret = fault_in_user_writeable(uaddr);
2481 break;
2482
2483 case -EAGAIN:
2484 cond_resched();
2485 ret = 0;
2486 break;
2487
2488 default:
2489 WARN_ON_ONCE(1);
2490 ret = err;
2491 break;
2492 }
2476 2493
2477 spin_lock(q->lock_ptr); 2494 spin_lock(q->lock_ptr);
2478 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2495 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
@@ -3041,10 +3058,8 @@ retry:
3041 * A unconditional UNLOCK_PI op raced against a waiter 3058 * A unconditional UNLOCK_PI op raced against a waiter
3042 * setting the FUTEX_WAITERS bit. Try again. 3059 * setting the FUTEX_WAITERS bit. Try again.
3043 */ 3060 */
3044 if (ret == -EAGAIN) { 3061 if (ret == -EAGAIN)
3045 put_futex_key(&key); 3062 goto pi_retry;
3046 goto retry;
3047 }
3048 /* 3063 /*
3049 * wake_futex_pi has detected invalid state. Tell user 3064 * wake_futex_pi has detected invalid state. Tell user
3050 * space. 3065 * space.
@@ -3059,9 +3074,19 @@ retry:
3059 * preserve the WAITERS bit not the OWNER_DIED one. We are the 3074 * preserve the WAITERS bit not the OWNER_DIED one. We are the
3060 * owner. 3075 * owner.
3061 */ 3076 */
3062 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { 3077 if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
3063 spin_unlock(&hb->lock); 3078 spin_unlock(&hb->lock);
3064 goto pi_faulted; 3079 switch (ret) {
3080 case -EFAULT:
3081 goto pi_faulted;
3082
3083 case -EAGAIN:
3084 goto pi_retry;
3085
3086 default:
3087 WARN_ON_ONCE(1);
3088 goto out_putkey;
3089 }
3065 } 3090 }
3066 3091
3067 /* 3092 /*
@@ -3075,6 +3100,11 @@ out_putkey:
3075 put_futex_key(&key); 3100 put_futex_key(&key);
3076 return ret; 3101 return ret;
3077 3102
3103pi_retry:
3104 put_futex_key(&key);
3105 cond_resched();
3106 goto retry;
3107
3078pi_faulted: 3108pi_faulted:
3079 put_futex_key(&key); 3109 put_futex_key(&key);
3080 3110
@@ -3435,47 +3465,67 @@ err_unlock:
3435static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 3465static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
3436{ 3466{
3437 u32 uval, uninitialized_var(nval), mval; 3467 u32 uval, uninitialized_var(nval), mval;
3468 int err;
3469
3470 /* Futex address must be 32bit aligned */
3471 if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
3472 return -1;
3438 3473
3439retry: 3474retry:
3440 if (get_user(uval, uaddr)) 3475 if (get_user(uval, uaddr))
3441 return -1; 3476 return -1;
3442 3477
3443 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { 3478 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
3444 /* 3479 return 0;
3445 * Ok, this dying thread is truly holding a futex 3480
3446 * of interest. Set the OWNER_DIED bit atomically 3481 /*
3447 * via cmpxchg, and if the value had FUTEX_WAITERS 3482 * Ok, this dying thread is truly holding a futex
3448 * set, wake up a waiter (if any). (We have to do a 3483 * of interest. Set the OWNER_DIED bit atomically
3449 * futex_wake() even if OWNER_DIED is already set - 3484 * via cmpxchg, and if the value had FUTEX_WAITERS
3450 * to handle the rare but possible case of recursive 3485 * set, wake up a waiter (if any). (We have to do a
3451 * thread-death.) The rest of the cleanup is done in 3486 * futex_wake() even if OWNER_DIED is already set -
3452 * userspace. 3487 * to handle the rare but possible case of recursive
3453 */ 3488 * thread-death.) The rest of the cleanup is done in
3454 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 3489 * userspace.
3455 /* 3490 */
3456 * We are not holding a lock here, but we want to have 3491 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
3457 * the pagefault_disable/enable() protection because 3492
3458 * we want to handle the fault gracefully. If the 3493 /*
3459 * access fails we try to fault in the futex with R/W 3494 * We are not holding a lock here, but we want to have
3460 * verification via get_user_pages. get_user() above 3495 * the pagefault_disable/enable() protection because
3461 * does not guarantee R/W access. If that fails we 3496 * we want to handle the fault gracefully. If the
3462 * give up and leave the futex locked. 3497 * access fails we try to fault in the futex with R/W
3463 */ 3498 * verification via get_user_pages. get_user() above
3464 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { 3499 * does not guarantee R/W access. If that fails we
3500 * give up and leave the futex locked.
3501 */
3502 if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
3503 switch (err) {
3504 case -EFAULT:
3465 if (fault_in_user_writeable(uaddr)) 3505 if (fault_in_user_writeable(uaddr))
3466 return -1; 3506 return -1;
3467 goto retry; 3507 goto retry;
3468 } 3508
3469 if (nval != uval) 3509 case -EAGAIN:
3510 cond_resched();
3470 goto retry; 3511 goto retry;
3471 3512
3472 /* 3513 default:
3473 * Wake robust non-PI futexes here. The wakeup of 3514 WARN_ON_ONCE(1);
3474 * PI futexes happens in exit_pi_state(): 3515 return err;
3475 */ 3516 }
3476 if (!pi && (uval & FUTEX_WAITERS))
3477 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3478 } 3517 }
3518
3519 if (nval != uval)
3520 goto retry;
3521
3522 /*
3523 * Wake robust non-PI futexes here. The wakeup of
3524 * PI futexes happens in exit_pi_state():
3525 */
3526 if (!pi && (uval & FUTEX_WAITERS))
3527 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3528
3479 return 0; 3529 return 0;
3480} 3530}
3481 3531
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh
new file mode 100755
index 000000000000..591a94f7b387
--- /dev/null
+++ b/kernel/gen_ikh_data.sh
@@ -0,0 +1,89 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4# This script generates an archive consisting of kernel headers
5# for CONFIG_IKHEADERS_PROC.
6set -e
7spath="$(dirname "$(readlink -f "$0")")"
8kroot="$spath/.."
9outdir="$(pwd)"
10tarfile=$1
11cpio_dir=$outdir/$tarfile.tmp
12
13# Script filename relative to the kernel source root
14# We add it to the archive because it is small and any changes
15# to this script will also cause a rebuild of the archive.
16sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")"
17
18src_file_list="
19include/
20arch/$SRCARCH/include/
21$sfile
22"
23
24obj_file_list="
25include/
26arch/$SRCARCH/include/
27"
28
29# Support incremental builds by skipping archive generation
30# if timestamps of files being archived are not changed.
31
32# This block is useful for debugging the incremental builds.
33# Uncomment it for debugging.
34# iter=1
35# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter;
36# else; iter=$(($(cat /tmp/iter) + 1)); fi
37# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter
38# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter
39
40# include/generated/compile.h is ignored because it is touched even when none
41# of the source files changed. This causes pointless regeneration, so let us
42# ignore them for md5 calculation.
43pushd $kroot > /dev/null
44src_files_md5="$(find $src_file_list -type f |
45 grep -v "include/generated/compile.h" |
46 xargs ls -lR | md5sum | cut -d ' ' -f1)"
47popd > /dev/null
48obj_files_md5="$(find $obj_file_list -type f |
49 grep -v "include/generated/compile.h" |
50 xargs ls -lR | md5sum | cut -d ' ' -f1)"
51
52if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
53if [ -f kernel/kheaders.md5 ] &&
54 [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
55 [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
56 [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
57 exit
58fi
59
60if [ "${quiet}" != "silent_" ]; then
61 echo " GEN $tarfile"
62fi
63
64rm -rf $cpio_dir
65mkdir $cpio_dir
66
67pushd $kroot > /dev/null
68for f in $src_file_list;
69 do find "$f" ! -name "*.cmd" ! -name ".*";
70done | cpio --quiet -pd $cpio_dir
71popd > /dev/null
72
73# The second CPIO can complain if files already exist which can
74# happen with out of tree builds. Just silence CPIO for now.
75for f in $obj_file_list;
76 do find "$f" ! -name "*.cmd" ! -name ".*";
77done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
78
 79# Remove comments except SPDX lines
80find $cpio_dir -type f -print0 |
81 xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
82
83tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
84
85echo "$src_files_md5" > kernel/kheaders.md5
86echo "$obj_files_md5" >> kernel/kheaders.md5
87echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
88
89rm -rf $cpio_dir
diff --git a/kernel/iomem.c b/kernel/iomem.c
index f7525e14ebc6..93c264444510 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size,
55 * 55 *
56 * MEMREMAP_WB - matches the default mapping for System RAM on 56 * MEMREMAP_WB - matches the default mapping for System RAM on
57 * the architecture. This is usually a read-allocate write-back cache. 57 * the architecture. This is usually a read-allocate write-back cache.
58 * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM 58 * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
59 * memremap() will bypass establishing a new mapping and instead return 59 * memremap() will bypass establishing a new mapping and instead return
60 * a pointer into the direct map. 60 * a pointer into the direct map.
61 * 61 *
@@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
86 /* Try all mapping types requested until one returns non-NULL */ 86 /* Try all mapping types requested until one returns non-NULL */
87 if (flags & MEMREMAP_WB) { 87 if (flags & MEMREMAP_WB) {
88 /* 88 /*
89 * MEMREMAP_WB is special in that it can be satisifed 89 * MEMREMAP_WB is special in that it can be satisfied
90 * from the direct map. Some archs depend on the 90 * from the direct map. Some archs depend on the
91 * capability of memremap() to autodetect cases where 91 * capability of memremap() to autodetect cases where
92 * the requested range is potentially in System RAM. 92 * the requested range is potentially in System RAM.
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3faef4a77f71..51128bea3846 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1449,6 +1449,10 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
1449int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) 1449int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
1450{ 1450{
1451 data = data->parent_data; 1451 data = data->parent_data;
1452
1453 if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
1454 return 0;
1455
1452 if (data->chip->irq_set_wake) 1456 if (data->chip->irq_set_wake)
1453 return data->chip->irq_set_wake(data, on); 1457 return data->chip->irq_set_wake(data, on);
1454 1458
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 516c00a5e867..c1eccd4f6520 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
152 152
153 raw_spin_lock_irq(&desc->lock); 153 raw_spin_lock_irq(&desc->lock);
154 data = irq_desc_get_irq_data(desc); 154 data = irq_desc_get_irq_data(desc);
155 seq_printf(m, "handler: %pf\n", desc->handle_irq); 155 seq_printf(m, "handler: %ps\n", desc->handle_irq);
156 seq_printf(m, "device: %s\n", desc->dev_name); 156 seq_printf(m, "device: %s\n", desc->dev_name);
157 seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); 157 seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
158 irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, 158 irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 5d5378ea0afe..f6e5515ee077 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -84,8 +84,6 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
84 * @dev: device to request interrupt for 84 * @dev: device to request interrupt for
85 * @irq: Interrupt line to allocate 85 * @irq: Interrupt line to allocate
86 * @handler: Function to be called when the IRQ occurs 86 * @handler: Function to be called when the IRQ occurs
87 * @thread_fn: function to be called in a threaded interrupt context. NULL
88 * for devices which handle everything in @handler
89 * @irqflags: Interrupt type flags 87 * @irqflags: Interrupt type flags
90 * @devname: An ascii name for the claiming device, dev_name(dev) if NULL 88 * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
91 * @dev_id: A cookie passed back to the handler function 89 * @dev_id: A cookie passed back to the handler function
@@ -222,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
222 irq_flow_handler_t handler) 220 irq_flow_handler_t handler)
223{ 221{
224 struct irq_chip_generic *gc; 222 struct irq_chip_generic *gc;
225 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
226 223
227 gc = devm_kzalloc(dev, sz, GFP_KERNEL); 224 gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);
228 if (gc) 225 if (gc)
229 irq_init_generic_chip(gc, name, num_ct, 226 irq_init_generic_chip(gc, name, num_ct,
230 irq_base, reg_base, handler); 227 irq_base, reg_base, handler);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6df5ddfdb0f8..a4ace611f47f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
149 res = action->handler(irq, action->dev_id); 149 res = action->handler(irq, action->dev_id);
150 trace_irq_handler_exit(irq, action, res); 150 trace_irq_handler_exit(irq, action, res);
151 151
152 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", 152 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
153 irq, action->handler)) 153 irq, action->handler))
154 local_irq_disable(); 154 local_irq_disable();
155 155
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 13539e12cd80..c52b737ab8e3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -275,11 +275,12 @@ static struct attribute *irq_attrs[] = {
275 &actions_attr.attr, 275 &actions_attr.attr,
276 NULL 276 NULL
277}; 277};
278ATTRIBUTE_GROUPS(irq);
278 279
279static struct kobj_type irq_kobj_type = { 280static struct kobj_type irq_kobj_type = {
280 .release = irq_kobj_release, 281 .release = irq_kobj_release,
281 .sysfs_ops = &kobj_sysfs_ops, 282 .sysfs_ops = &kobj_sysfs_ops,
282 .default_attrs = irq_attrs, 283 .default_groups = irq_groups,
283}; 284};
284 285
285static void irq_sysfs_add(int irq, struct irq_desc *desc) 286static void irq_sysfs_add(int irq, struct irq_desc *desc)
@@ -558,6 +559,7 @@ int __init early_irq_init(void)
558 alloc_masks(&desc[i], node); 559 alloc_masks(&desc[i], node);
559 raw_spin_lock_init(&desc[i].lock); 560 raw_spin_lock_init(&desc[i].lock);
560 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 561 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
562 mutex_init(&desc[i].request_mutex);
561 desc_set_defaults(i, &desc[i], node, NULL, NULL); 563 desc_set_defaults(i, &desc[i], node, NULL, NULL);
562 } 564 }
563 return arch_early_irq_init(); 565 return arch_early_irq_init();
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9ec34a2a6638..78f3ddeb7fe4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -196,6 +196,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
196 case IRQ_SET_MASK_OK: 196 case IRQ_SET_MASK_OK:
197 case IRQ_SET_MASK_OK_DONE: 197 case IRQ_SET_MASK_OK_DONE:
198 cpumask_copy(desc->irq_common_data.affinity, mask); 198 cpumask_copy(desc->irq_common_data.affinity, mask);
199 /* fall through */
199 case IRQ_SET_MASK_OK_NOCOPY: 200 case IRQ_SET_MASK_OK_NOCOPY:
200 irq_validate_effective_affinity(data); 201 irq_validate_effective_affinity(data);
201 irq_set_thread_affinity(desc); 202 irq_set_thread_affinity(desc);
@@ -356,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
356 desc->affinity_notify = notify; 357 desc->affinity_notify = notify;
357 raw_spin_unlock_irqrestore(&desc->lock, flags); 358 raw_spin_unlock_irqrestore(&desc->lock, flags);
358 359
359 if (old_notify) 360 if (old_notify) {
361 cancel_work_sync(&old_notify->work);
360 kref_put(&old_notify->kref, old_notify->release); 362 kref_put(&old_notify->kref, old_notify->release);
363 }
361 364
362 return 0; 365 return 0;
363} 366}
@@ -778,7 +781,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
778 ret = 0; 781 ret = 0;
779 break; 782 break;
780 default: 783 default:
781 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", 784 pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n",
782 flags, irq_desc_get_irq(desc), chip->irq_set_type); 785 flags, irq_desc_get_irq(desc), chip->irq_set_type);
783 } 786 }
784 if (unmask) 787 if (unmask)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 6d2fa6914b30..2ed97a7c9b2a 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
212 */ 212 */
213 raw_spin_lock_irqsave(&desc->lock, flags); 213 raw_spin_lock_irqsave(&desc->lock, flags);
214 for_each_action_of_desc(desc, action) { 214 for_each_action_of_desc(desc, action) {
215 printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); 215 printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler);
216 if (action->thread_fn) 216 if (action->thread_fn)
217 printk(KERN_CONT " threaded [<%p>] %pf", 217 printk(KERN_CONT " threaded [<%p>] %ps",
218 action->thread_fn, action->thread_fn); 218 action->thread_fn, action->thread_fn);
219 printk(KERN_CONT "\n"); 219 printk(KERN_CONT "\n");
220 } 220 }
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 1e4cb63a5c82..90c735da15d0 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -9,6 +9,7 @@
9#include <linux/idr.h> 9#include <linux/idr.h>
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/math64.h> 11#include <linux/math64.h>
12#include <linux/log2.h>
12 13
13#include <trace/events/irq.h> 14#include <trace/events/irq.h>
14 15
@@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
18 19
19DEFINE_PER_CPU(struct irq_timings, irq_timings); 20DEFINE_PER_CPU(struct irq_timings, irq_timings);
20 21
21struct irqt_stat {
22 u64 next_evt;
23 u64 last_ts;
24 u64 variance;
25 u32 avg;
26 u32 nr_samples;
27 int anomalies;
28 int valid;
29};
30
31static DEFINE_IDR(irqt_stats); 22static DEFINE_IDR(irqt_stats);
32 23
33void irq_timings_enable(void) 24void irq_timings_enable(void)
@@ -40,75 +31,360 @@ void irq_timings_disable(void)
40 static_branch_disable(&irq_timing_enabled); 31 static_branch_disable(&irq_timing_enabled);
41} 32}
42 33
43/** 34/*
44 * irqs_update - update the irq timing statistics with a new timestamp 35 * The main goal of this algorithm is to predict the next interrupt
36 * occurrence on the current CPU.
37 *
38 * Currently, the interrupt timings are stored in a circular array
39 * buffer every time there is an interrupt, as a tuple: the interrupt
40 * number and the associated timestamp when the event occurred <irq,
41 * timestamp>.
42 *
43 * For every interrupt occurring in a short period of time, we can
44 * measure the elapsed time between the occurrences for the same
 45 * interrupt and we end up with a suite of intervals. Experience
 46 * has shown that interrupts often follow a periodic
 47 * pattern.
48 *
49 * The objective of the algorithm is to find out this periodic pattern
 50 * as fast as possible and use its period to predict the next irq event.
51 *
52 * When the next interrupt event is requested, we are in the situation
53 * where the interrupts are disabled and the circular buffer
54 * containing the timings is filled with the events which happened
55 * after the previous next-interrupt-event request.
56 *
57 * At this point, we read the circular buffer and we fill the irq
58 * related statistics structure. After this step, the circular array
59 * containing the timings is empty because all the values are
60 * dispatched in their corresponding buffers.
61 *
62 * Now for each interrupt, we can predict the next event by using the
 63 * suffix array, log interval and exponential moving average.
64 *
65 * 1. Suffix array
66 *
67 * Suffix array is an array of all the suffixes of a string. It is
68 * widely used as a data structure for compression, text search, ...
69 * For instance for the word 'banana', the suffixes will be: 'banana'
70 * 'anana' 'nana' 'ana' 'na' 'a'
71 *
72 * Usually, the suffix array is sorted but for our purpose it is
73 * not necessary and won't provide any improvement in the context of
74 * the solved problem where we clearly define the boundaries of the
75 * search by a max period and min period.
76 *
77 * The suffix array will build a suite of intervals of different
78 * length and will look for the repetition of each suite. If the suite
79 * is repeating then we have the period because it is the length of
80 * the suite whatever its position in the buffer.
81 *
82 * 2. Log interval
83 *
 84 * We saw the irq timings allow us to compute the interval of the
 85 * occurrences for a specific interrupt. We can reasonably assume the
 86 * longer the interval, the higher the error for the next event,
87 * and we can consider storing those interval values into an array
 88 * where each slot in the array corresponds to an interval at the power
89 * of 2 of the index. For example, index 12 will contain values
90 * between 2^11 and 2^12.
91 *
 92 * At the end we have an array of values where each index defines a
 93 * [2^(index - 1), 2^index] interval of values, allowing us to store a large
94 * number of values inside a small array.
95 *
96 * For example, if we have the value 1123, then we store it at
 97 * index ilog2(1123) = 10.
98 *
 99 * Storing those values at the specific index is done by computing an
 100 * exponential moving average for this specific slot. For instance,
 101 * the values 1800, 1123, 1453, ... all fall under the same slot (10) and
102 * the exponential moving average is computed every time a new value
103 * is stored at this slot.
104 *
105 * 3. Exponential Moving Average
106 *
107 * The EMA is largely used to track a signal for stocks or as a low
 108 * pass filter. The magic of the formula is that it is very simple and the
 109 * reactivity of the average can be tuned with a factor called
 110 * alpha.
111 *
 112 * The higher the alpha is, the faster the average responds to the
113 * signal change. In our case, if a slot in the array is a big
114 * interval, we can have numbers with a big difference between
115 * them. The impact of those differences in the average computation
116 * can be tuned by changing the alpha value.
117 *
118 *
119 * -- The algorithm --
120 *
121 * We saw the different processing above, now let's see how they are
122 * used together.
123 *
124 * For each interrupt:
125 * For each interval:
126 * Compute the index = ilog2(interval)
127 * Compute a new_ema(buffer[index], interval)
128 * Store the index in a circular buffer
129 *
130 * Compute the suffix array of the indexes
131 *
132 * For each suffix:
133 * If the suffix is reverse-found 3 times
134 * Return suffix
135 *
136 * Return Not found
137 *
 138 * However, we cannot build an endless suffix array; it would not
 139 * make sense and would add extra overhead, so we restrict
 140 * this to a maximum suffix length of 5 and a minimum suffix length of
 141 * 2. Experience has shown that 5 covers the maximum pattern
 142 * period found for most devices.
143 *
 144 * The result is that finding a pattern for an interrupt takes less than 1us.
45 * 145 *
46 * @irqs: an irqt_stat struct pointer 146 * Example based on real values:
47 * @ts: the new timestamp
48 * 147 *
49 * The statistics are computed online, in other words, the code is 148 * Example 1 : MMC write/read interrupt interval:
50 * designed to compute the statistics on a stream of values rather
51 * than doing multiple passes on the values to compute the average,
52 * then the variance. The integer division introduces a loss of
53 * precision but with an acceptable error margin regarding the results
54 * we would have with the double floating precision: we are dealing
55 * with nanosec, so big numbers, consequently the mantisse is
56 * negligeable, especially when converting the time in usec
57 * afterwards.
58 * 149 *
59 * The computation happens at idle time. When the CPU is not idle, the 150 * 223947, 1240, 1384, 1386, 1386,
60 * interrupts' timestamps are stored in the circular buffer, when the 151 * 217416, 1236, 1384, 1386, 1387,
61 * CPU goes idle and this routine is called, all the buffer's values 152 * 214719, 1241, 1386, 1387, 1384,
62 * are injected in the statistical model continuying to extend the 153 * 213696, 1234, 1384, 1386, 1388,
63 * statistics from the previous busy-idle cycle. 154 * 219904, 1240, 1385, 1389, 1385,
155 * 212240, 1240, 1386, 1386, 1386,
156 * 214415, 1236, 1384, 1386, 1387,
157 * 214276, 1234, 1384, 1388, ?
64 * 158 *
65 * The observations showed a device will trigger a burst of periodic 159 * For each element, apply ilog2(value)
66 * interrupts followed by one or two peaks of longer time, for
67 * instance when a SD card device flushes its cache, then the periodic
68 * intervals occur again. A one second inactivity period resets the
69 * stats, that gives us the certitude the statistical values won't
70 * exceed 1x10^9, thus the computation won't overflow.
71 * 160 *
72 * Basically, the purpose of the algorithm is to watch the periodic 161 * 15, 8, 8, 8, 8,
73 * interrupts and eliminate the peaks. 162 * 15, 8, 8, 8, 8,
163 * 15, 8, 8, 8, 8,
164 * 15, 8, 8, 8, 8,
165 * 15, 8, 8, 8, 8,
166 * 15, 8, 8, 8, 8,
167 * 15, 8, 8, 8, 8,
168 * 15, 8, 8, 8, ?
74 * 169 *
75 * An interrupt is considered periodically stable if the interval of 170 * Max period of 5, we take the last (max_period * 3) 15 elements as
76 * its occurences follow the normal distribution, thus the values 171 * we can be confident if the pattern repeats itself three times it is
77 * comply with: 172 * a repeating pattern.
78 * 173 *
79 * avg - 3 x stddev < value < avg + 3 x stddev 174 * 8,
175 * 15, 8, 8, 8, 8,
176 * 15, 8, 8, 8, 8,
177 * 15, 8, 8, 8, ?
80 * 178 *
81 * Which can be simplified to: 179 * Suffixes are:
82 * 180 *
83 * -3 x stddev < value - avg < 3 x stddev 181 * 1) 8, 15, 8, 8, 8 <- max period
182 * 2) 8, 15, 8, 8
183 * 3) 8, 15, 8
184 * 4) 8, 15 <- min period
84 * 185 *
85 * abs(value - avg) < 3 x stddev 186 * From there we search the repeating pattern for each suffix.
86 * 187 *
87 * In order to save a costly square root computation, we use the 188 * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
88 * variance. For the record, stddev = sqrt(variance). The equation 189 * | | | | | | | | | | | | | | |
89 * above becomes: 190 * 8, 15, 8, 8, 8 | | | | | | | | | |
191 * 8, 15, 8, 8, 8 | | | | |
192 * 8, 15, 8, 8, 8
90 * 193 *
91 * abs(value - avg) < 3 x sqrt(variance) 194 * When moving the suffix, we found exactly 3 matches.
92 * 195 *
93 * And finally we square it: 196 * The first suffix with period 5 is repeating.
94 * 197 *
95 * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 198 * The next event is (3 * max_period) % suffix_period
96 * 199 *
97 * (value - avg) x (value - avg) < 9 x variance 200 * In this example, the result 0, so the next event is suffix[0] => 8
98 * 201 *
99 * Statistically speaking, any values out of this interval is 202 * However, 8 is the index in the array of exponential moving average
100 * considered as an anomaly and is discarded. However, a normal 203 * which was calculated on the fly when storing the values, so the
101 * distribution appears when the number of samples is 30 (it is the 204 * interval is ema[8] = 1366
102 * rule of thumb in statistics, cf. "30 samples" on Internet). When
103 * there are three consecutive anomalies, the statistics are resetted.
104 * 205 *
206 *
207 * Example 2:
208 *
209 * 4, 3, 5, 100,
210 * 3, 3, 5, 117,
211 * 4, 4, 5, 112,
212 * 4, 3, 4, 110,
213 * 3, 5, 3, 117,
214 * 4, 4, 5, 112,
215 * 4, 3, 4, 110,
216 * 3, 4, 5, 112,
217 * 4, 3, 4, 110
218 *
219 * ilog2
220 *
221 * 0, 0, 0, 4,
222 * 0, 0, 0, 4,
223 * 0, 0, 0, 4,
224 * 0, 0, 0, 4,
225 * 0, 0, 0, 4,
226 * 0, 0, 0, 4,
227 * 0, 0, 0, 4,
228 * 0, 0, 0, 4,
229 * 0, 0, 0, 4
230 *
231 * Max period 5:
232 * 0, 0, 4,
233 * 0, 0, 0, 4,
234 * 0, 0, 0, 4,
235 * 0, 0, 0, 4
236 *
237 * Suffixes:
238 *
239 * 1) 0, 0, 4, 0, 0
240 * 2) 0, 0, 4, 0
241 * 3) 0, 0, 4
242 * 4) 0, 0
243 *
244 * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
245 * | | | | | | X
246 * 0, 0, 4, 0, 0, | X
247 * 0, 0
248 *
249 * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
250 * | | | | | | | | | | | | | | |
251 * 0, 0, 4, 0, | | | | | | | | | | |
252 * 0, 0, 4, 0, | | | | | | |
253 * 0, 0, 4, 0, | | |
254 * 0 0 4
255 *
 256 * The pattern is found 3 times, and the remainder is 1, which results
 257 * from (max_period * 3) % suffix_period. This value is the index in the
 258 * suffix array. The suffix array for a period of 4 has the value 4
259 * at index 1.
260 */
261#define EMA_ALPHA_VAL 64
262#define EMA_ALPHA_SHIFT 7
263
264#define PREDICTION_PERIOD_MIN 2
265#define PREDICTION_PERIOD_MAX 5
266#define PREDICTION_FACTOR 4
267#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
268#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
269
270struct irqt_stat {
271 u64 last_ts;
272 u64 ema_time[PREDICTION_BUFFER_SIZE];
273 int timings[IRQ_TIMINGS_SIZE];
274 int circ_timings[IRQ_TIMINGS_SIZE];
275 int count;
276};
277
278/*
279 * Exponential moving average computation
105 */ 280 */
106static void irqs_update(struct irqt_stat *irqs, u64 ts) 281static u64 irq_timings_ema_new(u64 value, u64 ema_old)
282{
283 s64 diff;
284
285 if (unlikely(!ema_old))
286 return value;
287
288 diff = (value - ema_old) * EMA_ALPHA_VAL;
289 /*
290 * We can use a s64 type variable to be added with the u64
291 * ema_old variable as this one will never have its topmost
292 * bit set, it will be always smaller than 2^63 nanosec
293 * interrupt interval (292 years).
294 */
295 return ema_old + (diff >> EMA_ALPHA_SHIFT);
296}
297
298static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
299{
300 int i;
301
302 /*
 303 * The buffer contains the suite of intervals on an ilog2
 304 * basis, and we are looking for a repetition. We point the
 305 * beginning of the search at three times the length of the
 306 * period, counting back from the end of the buffer. We do that
 307 * for each suffix.
308 */
309 for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
310
311 int *begin = &buffer[len - (i * 3)];
312 int *ptr = begin;
313
314 /*
 315 * We check whether the suite with period 'i' repeats
 316 * itself. If it is truncated at the end, since it
 317 * repeats we can use the period to find out the next
 318 * element.
319 */
320 while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
321 ptr += i;
322 if (ptr >= &buffer[len])
323 return begin[((i * 3) % i)];
324 }
325 }
326
327 return -1;
328}
329
330static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
331{
332 int index, i, period_max, count, start, min = INT_MAX;
333
334 if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
335 irqs->count = irqs->last_ts = 0;
336 return U64_MAX;
337 }
338
339 /*
 340 * As we want to find the repetition three times, we need a
 341 * number of intervals greater than or equal to three times the
 342 * maximum period, otherwise we truncate the max period.
343 */
344 period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
345 PREDICTION_PERIOD_MAX : irqs->count / 3;
346
347 /*
348 * If we don't have enough irq timings for this prediction,
349 * just bail out.
350 */
351 if (period_max <= PREDICTION_PERIOD_MIN)
352 return U64_MAX;
353
354 /*
 355 * 'count' depends on whether the circular buffer wrapped or not
356 */
357 count = irqs->count < IRQ_TIMINGS_SIZE ?
358 irqs->count : IRQ_TIMINGS_SIZE;
359
360 start = irqs->count < IRQ_TIMINGS_SIZE ?
361 0 : (irqs->count & IRQ_TIMINGS_MASK);
362
363 /*
364 * Copy the content of the circular buffer into another buffer
365 * in order to linearize the buffer instead of dealing with
 366 * wrapping indexes and a shifted array, which would be prone to
 367 * error and extremely difficult to debug.
368 */
369 for (i = 0; i < count; i++) {
370 int index = (start + i) & IRQ_TIMINGS_MASK;
371
372 irqs->timings[i] = irqs->circ_timings[index];
373 min = min_t(int, irqs->timings[i], min);
374 }
375
376 index = irq_timings_next_event_index(irqs->timings, count, period_max);
377 if (index < 0)
378 return irqs->last_ts + irqs->ema_time[min];
379
380 return irqs->last_ts + irqs->ema_time[index];
381}
382
383static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
107{ 384{
108 u64 old_ts = irqs->last_ts; 385 u64 old_ts = irqs->last_ts;
109 u64 variance = 0;
110 u64 interval; 386 u64 interval;
111 s64 diff; 387 int index;
112 388
113 /* 389 /*
114 * The timestamps are absolute time values, we need to compute 390 * The timestamps are absolute time values, we need to compute
@@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)
135 * want as we need another timestamp to compute an interval. 411 * want as we need another timestamp to compute an interval.
136 */ 412 */
137 if (interval >= NSEC_PER_SEC) { 413 if (interval >= NSEC_PER_SEC) {
138 memset(irqs, 0, sizeof(*irqs)); 414 irqs->count = 0;
139 irqs->last_ts = ts;
140 return; 415 return;
141 } 416 }
142 417
143 /* 418 /*
144 * Pre-compute the delta with the average as the result is 419 * Get the index in the ema table for this interrupt. The
145 * used several times in this function. 420 * PREDICTION_FACTOR increase the interval size for the array
146 */ 421 * of exponential average.
147 diff = interval - irqs->avg;
148
149 /*
150 * Increment the number of samples.
151 */
152 irqs->nr_samples++;
153
154 /*
155 * Online variance divided by the number of elements if there
156 * is more than one sample. Normally the formula is division
157 * by nr_samples - 1 but we assume the number of element will be
158 * more than 32 and dividing by 32 instead of 31 is enough
159 * precise.
160 */
161 if (likely(irqs->nr_samples > 1))
162 variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
163
164 /*
165 * The rule of thumb in statistics for the normal distribution
166 * is having at least 30 samples in order to have the model to
167 * apply. Values outside the interval are considered as an
168 * anomaly.
169 */
170 if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
171 /*
172 * After three consecutive anomalies, we reset the
173 * stats as it is no longer stable enough.
174 */
175 if (irqs->anomalies++ >= 3) {
176 memset(irqs, 0, sizeof(*irqs));
177 irqs->last_ts = ts;
178 return;
179 }
180 } else {
181 /*
182 * The anomalies must be consecutives, so at this
183 * point, we reset the anomalies counter.
184 */
185 irqs->anomalies = 0;
186 }
187
188 /*
189 * The interrupt is considered stable enough to try to predict
190 * the next event on it.
191 */ 422 */
192 irqs->valid = 1; 423 index = likely(interval) ?
424 ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
193 425
194 /* 426 /*
195 * Online average algorithm: 427 * Store the index as an element of the pattern in another
196 * 428 * circular array.
197 * new_average = average + ((value - average) / count)
198 *
199 * The variance computation depends on the new average
200 * to be computed here first.
201 *
202 */ 429 */
203 irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); 430 irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
204 431
205 /* 432 irqs->ema_time[index] = irq_timings_ema_new(interval,
206 * Online variance algorithm: 433 irqs->ema_time[index]);
207 *
208 * new_variance = variance + (value - average) x (value - new_average)
209 *
210 * Warning: irqs->avg is updated with the line above, hence
211 * 'interval - irqs->avg' is no longer equal to 'diff'
212 */
213 irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
214 434
215 /* 435 irqs->count++;
216 * Update the next event
217 */
218 irqs->next_evt = ts + irqs->avg;
219} 436}
220 437
221/** 438/**
@@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)
259 */ 476 */
260 lockdep_assert_irqs_disabled(); 477 lockdep_assert_irqs_disabled();
261 478
479 if (!irqts->count)
480 return next_evt;
481
262 /* 482 /*
263	 * Number of elements in the circular buffer: If it happens it 483	 * Number of elements in the circular buffer: If it happens it
264	 * was flushed before, then the number of elements could be 484	 * was flushed before, then the number of elements could be
@@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)
269 * type but with the cost of extra computation in the 489 * type but with the cost of extra computation in the
270 * interrupt handler hot path. We choose efficiency. 490 * interrupt handler hot path. We choose efficiency.
271 * 491 *
272 * Inject measured irq/timestamp to the statistical model 492 * Inject measured irq/timestamp to the pattern prediction
273 * while decrementing the counter because we consume the data 493 * model while decrementing the counter because we consume the
274 * from our circular buffer. 494 * data from our circular buffer.
275 */ 495 */
276 for (i = irqts->count & IRQ_TIMINGS_MASK,
277 irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
278 irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
279 496
280 irq = irq_timing_decode(irqts->values[i], &ts); 497 i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
498 irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
281 499
500 for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
501 irq = irq_timing_decode(irqts->values[i], &ts);
282 s = idr_find(&irqt_stats, irq); 502 s = idr_find(&irqt_stats, irq);
283 if (s) { 503 if (s)
284 irqs = this_cpu_ptr(s); 504 irq_timings_store(irq, this_cpu_ptr(s), ts);
285 irqs_update(irqs, ts);
286 }
287 } 505 }
288 506
289 /* 507 /*
@@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)
294 512
295 irqs = this_cpu_ptr(s); 513 irqs = this_cpu_ptr(s);
296 514
297 if (!irqs->valid) 515 ts = __irq_timings_next_event(irqs, i, now);
298 continue; 516 if (ts <= now)
517 return now;
299 518
300 if (irqs->next_evt <= now) { 519 if (ts < next_evt)
301 irq = i; 520 next_evt = ts;
302 next_evt = now;
303
304 /*
305 * This interrupt mustn't use in the future
306 * until new events occur and update the
307 * statistics.
308 */
309 irqs->valid = 0;
310 break;
311 }
312
313 if (irqs->next_evt < next_evt) {
314 irq = i;
315 next_evt = irqs->next_evt;
316 }
317 } 521 }
318 522
319 return next_evt; 523 return next_evt;
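
The two ideas this rework leans on - linearizing a power-of-two circular buffer before processing it, and keeping one integer exponential moving average per interval bucket - are easy to model outside the kernel. The sketch below is a minimal user-space illustration under simplified assumptions: the buffer size, the EMA weight and the ema_new() helper are stand-ins, not the kernel's own constants or functions.

/* Minimal user-space model of the circular-buffer linearization and a
 * per-bucket integer EMA; sizes, weights and names are simplified stand-ins. */
#include <stdio.h>
#include <stdint.h>

#define TIMINGS_SIZE	8			/* must stay a power of two */
#define TIMINGS_MASK	(TIMINGS_SIZE - 1)
#define EMA_SHIFT	3			/* weight of a new sample: 1/8 */

/* new = old + (value - old) / 2^EMA_SHIFT, integer arithmetic only */
static uint64_t ema_new(uint64_t value, uint64_t old)
{
	if (!old)
		return value;
	return old + ((int64_t)value - (int64_t)old) / (1 << EMA_SHIFT);
}

int main(void)
{
	uint64_t circ[TIMINGS_SIZE] = { 0 }, linear[TIMINGS_SIZE];
	uint64_t count = 13;	/* more writes than slots: the buffer wrapped */
	int i, n, start;

	for (i = 0; i < (int)count; i++)	/* producer: masked writes */
		circ[i & TIMINGS_MASK] = 100 + i;

	/* consumer: copy oldest..newest into a flat array, as the hunk does */
	n = count < TIMINGS_SIZE ? (int)count : TIMINGS_SIZE;
	start = count < TIMINGS_SIZE ? 0 : (int)(count & TIMINGS_MASK);
	for (i = 0; i < n; i++)
		linear[i] = circ[(start + i) & TIMINGS_MASK];

	for (i = 0; i < n; i++)
		printf("%llu ", (unsigned long long)linear[i]);
	printf("\nema(1000, 800) = %llu\n",
	       (unsigned long long)ema_new(1000, 800));
	return 0;
}

Compiled with a plain C compiler, this prints the last eight intervals in arrival order (105..112) followed by ema(1000, 800) = 825.
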
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 6b7cdf17ccf8..73288914ed5e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void)
56 */ 56 */
57} 57}
58 58
59/* 59/* Enqueue on current CPU, work must already be claimed and preempt disabled */
60 * Enqueue the irq_work @work on @cpu unless it's already pending 60static void __irq_work_queue_local(struct irq_work *work)
61 * somewhere.
62 *
63 * Can be re-enqueued while the callback is still in progress.
64 */
65bool irq_work_queue_on(struct irq_work *work, int cpu)
66{ 61{
67 /* All work should have been flushed before going offline */ 62 /* If the work is "lazy", handle it from next tick if any */
68 WARN_ON_ONCE(cpu_is_offline(cpu)); 63 if (work->flags & IRQ_WORK_LAZY) {
69 64 if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
70#ifdef CONFIG_SMP 65 tick_nohz_tick_stopped())
71 66 arch_irq_work_raise();
 72	 /* Arch remote IPI send/receive backends aren't NMI safe */ 67	 } else {
73 WARN_ON_ONCE(in_nmi()); 68 if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
69 arch_irq_work_raise();
70 }
71}
74 72
73/* Enqueue the irq work @work on the current CPU */
74bool irq_work_queue(struct irq_work *work)
75{
75 /* Only queue if not already pending */ 76 /* Only queue if not already pending */
76 if (!irq_work_claim(work)) 77 if (!irq_work_claim(work))
77 return false; 78 return false;
78 79
79 if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) 80 /* Queue the entry and raise the IPI if needed. */
80 arch_send_call_function_single_ipi(cpu); 81 preempt_disable();
81 82 __irq_work_queue_local(work);
82#else /* #ifdef CONFIG_SMP */ 83 preempt_enable();
83 irq_work_queue(work);
84#endif /* #else #ifdef CONFIG_SMP */
85 84
86 return true; 85 return true;
87} 86}
87EXPORT_SYMBOL_GPL(irq_work_queue);
88 88
89/* Enqueue the irq work @work on the current CPU */ 89/*
90bool irq_work_queue(struct irq_work *work) 90 * Enqueue the irq_work @work on @cpu unless it's already pending
91 * somewhere.
92 *
93 * Can be re-enqueued while the callback is still in progress.
94 */
95bool irq_work_queue_on(struct irq_work *work, int cpu)
91{ 96{
97#ifndef CONFIG_SMP
98 return irq_work_queue(work);
99
100#else /* CONFIG_SMP: */
101 /* All work should have been flushed before going offline */
102 WARN_ON_ONCE(cpu_is_offline(cpu));
103
92 /* Only queue if not already pending */ 104 /* Only queue if not already pending */
93 if (!irq_work_claim(work)) 105 if (!irq_work_claim(work))
94 return false; 106 return false;
95 107
96 /* Queue the entry and raise the IPI if needed. */
97 preempt_disable(); 108 preempt_disable();
98 109 if (cpu != smp_processor_id()) {
 99	 /* If the work is "lazy", handle it from next tick if any */ 110	 /* Arch remote IPI send/receive backends aren't NMI safe */
100 if (work->flags & IRQ_WORK_LAZY) { 111 WARN_ON_ONCE(in_nmi());
101 if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && 112 if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
102 tick_nohz_tick_stopped()) 113 arch_send_call_function_single_ipi(cpu);
103 arch_irq_work_raise();
104 } else { 114 } else {
105 if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) 115 __irq_work_queue_local(work);
106 arch_irq_work_raise();
107 } 116 }
108
109 preempt_enable(); 117 preempt_enable();
110 118
111 return true; 119 return true;
120#endif /* CONFIG_SMP */
112} 121}
113EXPORT_SYMBOL_GPL(irq_work_queue); 122
114 123
115bool irq_work_needs_cpu(void) 124bool irq_work_needs_cpu(void)
116{ 125{
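
Seen from a caller, the reshuffle above does not change the API: irq_work_queue() stays purely local, and irq_work_queue_on() now short-circuits to the same local helper when the target happens to be the current CPU. The fragment below is a hedged kernel-module style sketch (it cannot run outside a kernel build); the callback, the printed message and the module boilerplate are invented, only the irq_work calls themselves come from the existing API.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/irq_work.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

static void demo_irq_work_fn(struct irq_work *work)
{
	/* Runs from the IPI or tick path with interrupts disabled */
	pr_info("irq_work ran on CPU %d\n", smp_processor_id());
}

static struct irq_work demo_work;

static int __init demo_init(void)
{
	init_irq_work(&demo_work, demo_irq_work_fn);

	/* Local enqueue: ends up in __irq_work_queue_local() */
	irq_work_queue(&demo_work);

	/*
	 * Remote enqueue: raises an IPI, unless the chosen CPU is the
	 * current one, in which case the new code takes the local path.
	 */
	irq_work_queue_on(&demo_work, cpumask_first(cpu_online_mask));
	return 0;
}

static void __exit demo_exit(void)
{
	irq_work_sync(&demo_work);	/* wait out any pending callback */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
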
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index bad96b476eb6..de6efdecc70d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -202,11 +202,13 @@ void static_key_disable(struct static_key *key)
202} 202}
203EXPORT_SYMBOL_GPL(static_key_disable); 203EXPORT_SYMBOL_GPL(static_key_disable);
204 204
205static void __static_key_slow_dec_cpuslocked(struct static_key *key, 205static bool static_key_slow_try_dec(struct static_key *key)
206 unsigned long rate_limit,
207 struct delayed_work *work)
208{ 206{
209 lockdep_assert_cpus_held(); 207 int val;
208
209 val = atomic_fetch_add_unless(&key->enabled, -1, 1);
210 if (val == 1)
211 return false;
210 212
211 /* 213 /*
212 * The negative count check is valid even when a negative 214 * The negative count check is valid even when a negative
@@ -215,63 +217,70 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key,
215 * returns is unbalanced, because all other static_key_slow_inc() 217 * returns is unbalanced, because all other static_key_slow_inc()
216 * instances block while the update is in progress. 218 * instances block while the update is in progress.
217 */ 219 */
218 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 220 WARN(val < 0, "jump label: negative count!\n");
219 WARN(atomic_read(&key->enabled) < 0, 221 return true;
220 "jump label: negative count!\n"); 222}
223
224static void __static_key_slow_dec_cpuslocked(struct static_key *key)
225{
226 lockdep_assert_cpus_held();
227
228 if (static_key_slow_try_dec(key))
221 return; 229 return;
222 }
223 230
224 if (rate_limit) { 231 jump_label_lock();
225 atomic_inc(&key->enabled); 232 if (atomic_dec_and_test(&key->enabled))
226 schedule_delayed_work(work, rate_limit);
227 } else {
228 jump_label_update(key); 233 jump_label_update(key);
229 }
230 jump_label_unlock(); 234 jump_label_unlock();
231} 235}
232 236
233static void __static_key_slow_dec(struct static_key *key, 237static void __static_key_slow_dec(struct static_key *key)
234 unsigned long rate_limit,
235 struct delayed_work *work)
236{ 238{
237 cpus_read_lock(); 239 cpus_read_lock();
238 __static_key_slow_dec_cpuslocked(key, rate_limit, work); 240 __static_key_slow_dec_cpuslocked(key);
239 cpus_read_unlock(); 241 cpus_read_unlock();
240} 242}
241 243
242static void jump_label_update_timeout(struct work_struct *work) 244void jump_label_update_timeout(struct work_struct *work)
243{ 245{
244 struct static_key_deferred *key = 246 struct static_key_deferred *key =
245 container_of(work, struct static_key_deferred, work.work); 247 container_of(work, struct static_key_deferred, work.work);
246 __static_key_slow_dec(&key->key, 0, NULL); 248 __static_key_slow_dec(&key->key);
247} 249}
250EXPORT_SYMBOL_GPL(jump_label_update_timeout);
248 251
249void static_key_slow_dec(struct static_key *key) 252void static_key_slow_dec(struct static_key *key)
250{ 253{
251 STATIC_KEY_CHECK_USE(key); 254 STATIC_KEY_CHECK_USE(key);
252 __static_key_slow_dec(key, 0, NULL); 255 __static_key_slow_dec(key);
253} 256}
254EXPORT_SYMBOL_GPL(static_key_slow_dec); 257EXPORT_SYMBOL_GPL(static_key_slow_dec);
255 258
256void static_key_slow_dec_cpuslocked(struct static_key *key) 259void static_key_slow_dec_cpuslocked(struct static_key *key)
257{ 260{
258 STATIC_KEY_CHECK_USE(key); 261 STATIC_KEY_CHECK_USE(key);
259 __static_key_slow_dec_cpuslocked(key, 0, NULL); 262 __static_key_slow_dec_cpuslocked(key);
260} 263}
261 264
262void static_key_slow_dec_deferred(struct static_key_deferred *key) 265void __static_key_slow_dec_deferred(struct static_key *key,
266 struct delayed_work *work,
267 unsigned long timeout)
263{ 268{
264 STATIC_KEY_CHECK_USE(key); 269 STATIC_KEY_CHECK_USE(key);
265 __static_key_slow_dec(&key->key, key->timeout, &key->work); 270
271 if (static_key_slow_try_dec(key))
272 return;
273
274 schedule_delayed_work(work, timeout);
266} 275}
267EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); 276EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred);
268 277
269void static_key_deferred_flush(struct static_key_deferred *key) 278void __static_key_deferred_flush(void *key, struct delayed_work *work)
270{ 279{
271 STATIC_KEY_CHECK_USE(key); 280 STATIC_KEY_CHECK_USE(key);
272 flush_delayed_work(&key->work); 281 flush_delayed_work(work);
273} 282}
274EXPORT_SYMBOL_GPL(static_key_deferred_flush); 283EXPORT_SYMBOL_GPL(__static_key_deferred_flush);
275 284
276void jump_label_rate_limit(struct static_key_deferred *key, 285void jump_label_rate_limit(struct static_key_deferred *key,
277 unsigned long rl) 286 unsigned long rl)
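
The lockless fast path introduced above hinges on atomic_fetch_add_unless(&key->enabled, -1, 1): decrement the count unless that would take it to zero, and only fall back to the mutex-protected slow path for the 1 -> 0 transition. The stand-alone sketch below models just that primitive with C11 atomics; the variable names and the starting value 2 are purely illustrative.

#include <stdatomic.h>
#include <stdio.h>

/* Model of atomic_fetch_add_unless(v, a, u): add a unless the value is u,
 * returning the old value either way. */
static int fetch_add_unless(atomic_int *v, int a, int u)
{
	int c = atomic_load(v);

	while (c != u) {
		if (atomic_compare_exchange_weak(v, &c, c + a))
			break;
	}
	return c;
}

int main(void)
{
	atomic_int enabled = 2;
	int old;

	/* Fast path: 2 -> 1 succeeds without any lock */
	old = fetch_add_unless(&enabled, -1, 1);
	printf("old=%d now=%d\n", old, atomic_load(&enabled));

	/* Would be 1 -> 0: nothing happens, the locked slow path must run */
	old = fetch_add_unless(&enabled, -1, 1);
	printf("old=%d now=%d\n", old, atomic_load(&enabled));
	return 0;
}

The first call returns old=2 and leaves the count at 1; the second returns old=1 without touching it, which is exactly the case where static_key_slow_try_dec() reports false and the caller takes the slow path.
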
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d7140447be75..fd5c95ff9251 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1150,7 +1150,7 @@ int kernel_kexec(void)
1150 error = dpm_suspend_end(PMSG_FREEZE); 1150 error = dpm_suspend_end(PMSG_FREEZE);
1151 if (error) 1151 if (error)
1152 goto Resume_devices; 1152 goto Resume_devices;
1153 error = disable_nonboot_cpus(); 1153 error = suspend_disable_secondary_cpus();
1154 if (error) 1154 if (error)
1155 goto Enable_cpus; 1155 goto Enable_cpus;
1156 local_irq_disable(); 1156 local_irq_disable();
@@ -1183,7 +1183,7 @@ int kernel_kexec(void)
1183 Enable_irqs: 1183 Enable_irqs:
1184 local_irq_enable(); 1184 local_irq_enable();
1185 Enable_cpus: 1185 Enable_cpus:
1186 enable_nonboot_cpus(); 1186 suspend_enable_secondary_cpus();
1187 dpm_resume_start(PMSG_RESTORE); 1187 dpm_resume_start(PMSG_RESTORE);
1188 Resume_devices: 1188 Resume_devices:
1189 dpm_resume_end(PMSG_RESTORE); 1189 dpm_resume_end(PMSG_RESTORE);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f1d0e00a3971..f7fb8f6a688f 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -688,7 +688,6 @@ static int kexec_calculate_store_digests(struct kimage *image)
688 goto out_free_desc; 688 goto out_free_desc;
689 689
690 desc->tfm = tfm; 690 desc->tfm = tfm;
691 desc->flags = 0;
692 691
693 ret = crypto_shash_init(desc); 692 ret = crypto_shash_init(desc);
694 if (ret < 0) 693 if (ret < 0)
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
new file mode 100644
index 000000000000..70ae6052920d
--- /dev/null
+++ b/kernel/kheaders.c
@@ -0,0 +1,74 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Provide kernel headers useful to build tracing programs
4 * such as for running eBPF tracing tools.
5 *
6 * (Borrowed code from kernel/configs.c)
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/proc_fs.h>
12#include <linux/init.h>
13#include <linux/uaccess.h>
14
15/*
16 * Define kernel_headers_data and kernel_headers_data_end, within which the
17 * compressed kernel headers are stored. The file is first compressed with xz.
18 */
19
20asm (
21" .pushsection .rodata, \"a\" \n"
22" .global kernel_headers_data \n"
23"kernel_headers_data: \n"
24" .incbin \"kernel/kheaders_data.tar.xz\" \n"
25" .global kernel_headers_data_end \n"
26"kernel_headers_data_end: \n"
27" .popsection \n"
28);
29
30extern char kernel_headers_data;
31extern char kernel_headers_data_end;
32
33static ssize_t
34ikheaders_read_current(struct file *file, char __user *buf,
35 size_t len, loff_t *offset)
36{
37 return simple_read_from_buffer(buf, len, offset,
38 &kernel_headers_data,
39 &kernel_headers_data_end -
40 &kernel_headers_data);
41}
42
43static const struct file_operations ikheaders_file_ops = {
44 .read = ikheaders_read_current,
45 .llseek = default_llseek,
46};
47
48static int __init ikheaders_init(void)
49{
50 struct proc_dir_entry *entry;
51
52 /* create the current headers file */
53 entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL,
54 &ikheaders_file_ops);
55 if (!entry)
56 return -ENOMEM;
57
58 proc_set_size(entry,
59 &kernel_headers_data_end -
60 &kernel_headers_data);
61 return 0;
62}
63
64static void __exit ikheaders_cleanup(void)
65{
66 remove_proc_entry("kheaders.tar.xz", NULL);
67}
68
69module_init(ikheaders_init);
70module_exit(ikheaders_cleanup);
71
72MODULE_LICENSE("GPL v2");
73MODULE_AUTHOR("Joel Fernandes");
74MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel");
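
On a kernel built with this support enabled, the archive is consumed like any other procfs file. The user-space check below only opens the file and prints its first six bytes, which should match the xz stream magic; the path comes from the code above, everything else is an assumption about the running system.

#include <stdio.h>

int main(void)
{
	unsigned char buf[6];
	FILE *f = fopen("/proc/kheaders.tar.xz", "rb");

	if (!f) {
		perror("fopen");	/* feature not built in or module not loaded */
		return 1;
	}
	if (fread(buf, 1, sizeof(buf), f) == sizeof(buf))
		/* an xz stream starts with fd 37 7a 58 5a 00 */
		printf("magic: %02x %02x %02x %02x %02x %02x\n",
		       buf[0], buf[1], buf[2], buf[3], buf[4], buf[5]);
	fclose(f);
	return 0;
}
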
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c83e54727131..b1ea30a5540e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -709,7 +709,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
709static int reuse_unused_kprobe(struct kprobe *ap) 709static int reuse_unused_kprobe(struct kprobe *ap)
710{ 710{
711 struct optimized_kprobe *op; 711 struct optimized_kprobe *op;
712 int ret;
713 712
714 /* 713 /*
715 * Unused kprobe MUST be on the way of delayed unoptimizing (means 714 * Unused kprobe MUST be on the way of delayed unoptimizing (means
@@ -720,9 +719,8 @@ static int reuse_unused_kprobe(struct kprobe *ap)
720 /* Enable the probe again */ 719 /* Enable the probe again */
721 ap->flags &= ~KPROBE_FLAG_DISABLED; 720 ap->flags &= ~KPROBE_FLAG_DISABLED;
722 /* Optimize it again (remove from op->list) */ 721 /* Optimize it again (remove from op->list) */
723 ret = kprobe_optready(ap); 722 if (!kprobe_optready(ap))
724 if (ret) 723 return -EINVAL;
725 return ret;
726 724
727 optimize_kprobe(ap); 725 optimize_kprobe(ap);
728 return 0; 726 return 0;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 96b4179cee6a..99a5b5f46dc5 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -120,8 +120,8 @@ account_global_scheduler_latency(struct task_struct *tsk,
120 break; 120 break;
121 } 121 }
122 122
123 /* 0 and ULONG_MAX entries mean end of backtrace: */ 123 /* 0 entry marks end of backtrace: */
124 if (record == 0 || record == ULONG_MAX) 124 if (!record)
125 break; 125 break;
126 } 126 }
127 if (same) { 127 if (same) {
@@ -141,20 +141,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
141 memcpy(&latency_record[i], lat, sizeof(struct latency_record)); 141 memcpy(&latency_record[i], lat, sizeof(struct latency_record));
142} 142}
143 143
144/*
145 * Iterator to store a backtrace into a latency record entry
146 */
147static inline void store_stacktrace(struct task_struct *tsk,
148 struct latency_record *lat)
149{
150 struct stack_trace trace;
151
152 memset(&trace, 0, sizeof(trace));
153 trace.max_entries = LT_BACKTRACEDEPTH;
154 trace.entries = &lat->backtrace[0];
155 save_stack_trace_tsk(tsk, &trace);
156}
157
158/** 144/**
159 * __account_scheduler_latency - record an occurred latency 145 * __account_scheduler_latency - record an occurred latency
160 * @tsk - the task struct of the task hitting the latency 146 * @tsk - the task struct of the task hitting the latency
@@ -191,7 +177,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
191 lat.count = 1; 177 lat.count = 1;
192 lat.time = usecs; 178 lat.time = usecs;
193 lat.max = usecs; 179 lat.max = usecs;
194 store_stacktrace(tsk, &lat); 180
181 stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0);
195 182
196 raw_spin_lock_irqsave(&latency_lock, flags); 183 raw_spin_lock_irqsave(&latency_lock, flags);
197 184
@@ -210,8 +197,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
210 break; 197 break;
211 } 198 }
212 199
213 /* 0 and ULONG_MAX entries mean end of backtrace: */ 200 /* 0 entry is end of backtrace */
214 if (record == 0 || record == ULONG_MAX) 201 if (!record)
215 break; 202 break;
216 } 203 }
217 if (same) { 204 if (same) {
@@ -252,10 +239,10 @@ static int lstats_show(struct seq_file *m, void *v)
252 lr->count, lr->time, lr->max); 239 lr->count, lr->time, lr->max);
253 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 240 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
254 unsigned long bt = lr->backtrace[q]; 241 unsigned long bt = lr->backtrace[q];
242
255 if (!bt) 243 if (!bt)
256 break; 244 break;
257 if (bt == ULONG_MAX) 245
258 break;
259 seq_printf(m, " %ps", (void *)bt); 246 seq_printf(m, " %ps", (void *)bt);
260 } 247 }
261 seq_puts(m, "\n"); 248 seq_puts(m, "\n");
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index eb0ee10a1981..f6fbaff10e71 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -419,6 +419,7 @@ static struct attribute *klp_patch_attrs[] = {
419 &force_kobj_attr.attr, 419 &force_kobj_attr.attr,
420 NULL 420 NULL
421}; 421};
422ATTRIBUTE_GROUPS(klp_patch);
422 423
423static void klp_free_object_dynamic(struct klp_object *obj) 424static void klp_free_object_dynamic(struct klp_object *obj)
424{ 425{
@@ -426,7 +427,13 @@ static void klp_free_object_dynamic(struct klp_object *obj)
426 kfree(obj); 427 kfree(obj);
427} 428}
428 429
429static struct klp_object *klp_alloc_object_dynamic(const char *name) 430static void klp_init_func_early(struct klp_object *obj,
431 struct klp_func *func);
432static void klp_init_object_early(struct klp_patch *patch,
433 struct klp_object *obj);
434
435static struct klp_object *klp_alloc_object_dynamic(const char *name,
436 struct klp_patch *patch)
430{ 437{
431 struct klp_object *obj; 438 struct klp_object *obj;
432 439
@@ -442,7 +449,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name)
442 } 449 }
443 } 450 }
444 451
445 INIT_LIST_HEAD(&obj->func_list); 452 klp_init_object_early(patch, obj);
446 obj->dynamic = true; 453 obj->dynamic = true;
447 454
448 return obj; 455 return obj;
@@ -471,6 +478,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func,
471 } 478 }
472 } 479 }
473 480
481 klp_init_func_early(obj, func);
474 /* 482 /*
475 * func->new_func is same as func->old_func. These addresses are 483 * func->new_func is same as func->old_func. These addresses are
476 * set when the object is loaded, see klp_init_object_loaded(). 484 * set when the object is loaded, see klp_init_object_loaded().
@@ -490,11 +498,9 @@ static int klp_add_object_nops(struct klp_patch *patch,
490 obj = klp_find_object(patch, old_obj); 498 obj = klp_find_object(patch, old_obj);
491 499
492 if (!obj) { 500 if (!obj) {
493 obj = klp_alloc_object_dynamic(old_obj->name); 501 obj = klp_alloc_object_dynamic(old_obj->name, patch);
494 if (!obj) 502 if (!obj)
495 return -ENOMEM; 503 return -ENOMEM;
496
497 list_add_tail(&obj->node, &patch->obj_list);
498 } 504 }
499 505
500 klp_for_each_func(old_obj, old_func) { 506 klp_for_each_func(old_obj, old_func) {
@@ -505,8 +511,6 @@ static int klp_add_object_nops(struct klp_patch *patch,
505 func = klp_alloc_func_nop(old_func, obj); 511 func = klp_alloc_func_nop(old_func, obj);
506 if (!func) 512 if (!func)
507 return -ENOMEM; 513 return -ENOMEM;
508
509 list_add_tail(&func->node, &obj->func_list);
510 } 514 }
511 515
512 return 0; 516 return 0;
@@ -546,7 +550,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)
546static struct kobj_type klp_ktype_patch = { 550static struct kobj_type klp_ktype_patch = {
547 .release = klp_kobj_release_patch, 551 .release = klp_kobj_release_patch,
548 .sysfs_ops = &kobj_sysfs_ops, 552 .sysfs_ops = &kobj_sysfs_ops,
549 .default_attrs = klp_patch_attrs, 553 .default_groups = klp_patch_groups,
550}; 554};
551 555
552static void klp_kobj_release_object(struct kobject *kobj) 556static void klp_kobj_release_object(struct kobject *kobj)
@@ -588,13 +592,7 @@ static void __klp_free_funcs(struct klp_object *obj, bool nops_only)
588 continue; 592 continue;
589 593
590 list_del(&func->node); 594 list_del(&func->node);
591 595 kobject_put(&func->kobj);
592 /* Might be called from klp_init_patch() error path. */
593 if (func->kobj_added) {
594 kobject_put(&func->kobj);
595 } else if (func->nop) {
596 klp_free_func_nop(func);
597 }
598 } 596 }
599} 597}
600 598
@@ -624,13 +622,7 @@ static void __klp_free_objects(struct klp_patch *patch, bool nops_only)
624 continue; 622 continue;
625 623
626 list_del(&obj->node); 624 list_del(&obj->node);
627 625 kobject_put(&obj->kobj);
628 /* Might be called from klp_init_patch() error path. */
629 if (obj->kobj_added) {
630 kobject_put(&obj->kobj);
631 } else if (obj->dynamic) {
632 klp_free_object_dynamic(obj);
633 }
634 } 626 }
635} 627}
636 628
@@ -675,10 +667,8 @@ static void klp_free_patch_finish(struct klp_patch *patch)
675 * this is called when the patch gets disabled and it 667 * this is called when the patch gets disabled and it
676 * cannot get enabled again. 668 * cannot get enabled again.
677 */ 669 */
678 if (patch->kobj_added) { 670 kobject_put(&patch->kobj);
679 kobject_put(&patch->kobj); 671 wait_for_completion(&patch->finish);
680 wait_for_completion(&patch->finish);
681 }
682 672
683 /* Put the module after the last access to struct klp_patch. */ 673 /* Put the module after the last access to struct klp_patch. */
684 if (!patch->forced) 674 if (!patch->forced)
@@ -700,8 +690,6 @@ static void klp_free_patch_work_fn(struct work_struct *work)
700 690
701static int klp_init_func(struct klp_object *obj, struct klp_func *func) 691static int klp_init_func(struct klp_object *obj, struct klp_func *func)
702{ 692{
703 int ret;
704
705 if (!func->old_name) 693 if (!func->old_name)
706 return -EINVAL; 694 return -EINVAL;
707 695
@@ -724,13 +712,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
724 * object. If the user selects 0 for old_sympos, then 1 will be used 712 * object. If the user selects 0 for old_sympos, then 1 will be used
725 * since a unique symbol will be the first occurrence. 713 * since a unique symbol will be the first occurrence.
726 */ 714 */
727 ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, 715 return kobject_add(&func->kobj, &obj->kobj, "%s,%lu",
728 &obj->kobj, "%s,%lu", func->old_name, 716 func->old_name,
729 func->old_sympos ? func->old_sympos : 1); 717 func->old_sympos ? func->old_sympos : 1);
730 if (!ret)
731 func->kobj_added = true;
732
733 return ret;
734} 718}
735 719
736/* Arches may override this to finish any remaining arch-specific tasks */ 720/* Arches may override this to finish any remaining arch-specific tasks */
@@ -801,11 +785,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
801 klp_find_object_module(obj); 785 klp_find_object_module(obj);
802 786
803 name = klp_is_module(obj) ? obj->name : "vmlinux"; 787 name = klp_is_module(obj) ? obj->name : "vmlinux";
804 ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, 788 ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name);
805 &patch->kobj, "%s", name);
806 if (ret) 789 if (ret)
807 return ret; 790 return ret;
808 obj->kobj_added = true;
809 791
810 klp_for_each_func(obj, func) { 792 klp_for_each_func(obj, func) {
811 ret = klp_init_func(obj, func); 793 ret = klp_init_func(obj, func);
@@ -819,6 +801,21 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
819 return ret; 801 return ret;
820} 802}
821 803
804static void klp_init_func_early(struct klp_object *obj,
805 struct klp_func *func)
806{
807 kobject_init(&func->kobj, &klp_ktype_func);
808 list_add_tail(&func->node, &obj->func_list);
809}
810
811static void klp_init_object_early(struct klp_patch *patch,
812 struct klp_object *obj)
813{
814 INIT_LIST_HEAD(&obj->func_list);
815 kobject_init(&obj->kobj, &klp_ktype_object);
816 list_add_tail(&obj->node, &patch->obj_list);
817}
818
822static int klp_init_patch_early(struct klp_patch *patch) 819static int klp_init_patch_early(struct klp_patch *patch)
823{ 820{
824 struct klp_object *obj; 821 struct klp_object *obj;
@@ -829,7 +826,7 @@ static int klp_init_patch_early(struct klp_patch *patch)
829 826
830 INIT_LIST_HEAD(&patch->list); 827 INIT_LIST_HEAD(&patch->list);
831 INIT_LIST_HEAD(&patch->obj_list); 828 INIT_LIST_HEAD(&patch->obj_list);
832 patch->kobj_added = false; 829 kobject_init(&patch->kobj, &klp_ktype_patch);
833 patch->enabled = false; 830 patch->enabled = false;
834 patch->forced = false; 831 patch->forced = false;
835 INIT_WORK(&patch->free_work, klp_free_patch_work_fn); 832 INIT_WORK(&patch->free_work, klp_free_patch_work_fn);
@@ -839,13 +836,10 @@ static int klp_init_patch_early(struct klp_patch *patch)
839 if (!obj->funcs) 836 if (!obj->funcs)
840 return -EINVAL; 837 return -EINVAL;
841 838
842 INIT_LIST_HEAD(&obj->func_list); 839 klp_init_object_early(patch, obj);
843 obj->kobj_added = false;
844 list_add_tail(&obj->node, &patch->obj_list);
845 840
846 klp_for_each_func_static(obj, func) { 841 klp_for_each_func_static(obj, func) {
847 func->kobj_added = false; 842 klp_init_func_early(obj, func);
848 list_add_tail(&func->node, &obj->func_list);
849 } 843 }
850 } 844 }
851 845
@@ -860,11 +854,9 @@ static int klp_init_patch(struct klp_patch *patch)
860 struct klp_object *obj; 854 struct klp_object *obj;
861 int ret; 855 int ret;
862 856
863 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, 857 ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name);
864 klp_root_kobj, "%s", patch->mod->name);
865 if (ret) 858 if (ret)
866 return ret; 859 return ret;
867 patch->kobj_added = true;
868 860
869 if (patch->replace) { 861 if (patch->replace) {
870 ret = klp_add_nops(patch); 862 ret = klp_add_nops(patch);
@@ -926,9 +918,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
926 if (WARN_ON(patch->enabled)) 918 if (WARN_ON(patch->enabled))
927 return -EINVAL; 919 return -EINVAL;
928 920
929 if (!patch->kobj_added)
930 return -EINVAL;
931
932 pr_notice("enabling patch '%s'\n", patch->mod->name); 921 pr_notice("enabling patch '%s'\n", patch->mod->name);
933 922
934 klp_init_transition(patch, KLP_PATCHED); 923 klp_init_transition(patch, KLP_PATCHED);
@@ -1003,11 +992,10 @@ int klp_enable_patch(struct klp_patch *patch)
1003 return -ENODEV; 992 return -ENODEV;
1004 993
1005 if (!klp_have_reliable_stack()) { 994 if (!klp_have_reliable_stack()) {
1006 pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); 995 pr_warn("This architecture doesn't have support for the livepatch consistency model.\n");
1007 return -EOPNOTSUPP; 996 pr_warn("The livepatch transition may never complete.\n");
1008 } 997 }
1009 998
1010
1011 mutex_lock(&klp_mutex); 999 mutex_lock(&klp_mutex);
1012 1000
1013 ret = klp_init_patch_early(patch); 1001 ret = klp_init_patch_early(patch);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 9c89ae8b337a..c53370d596be 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -202,15 +202,15 @@ void klp_update_patch_state(struct task_struct *task)
202 * Determine whether the given stack trace includes any references to a 202 * Determine whether the given stack trace includes any references to a
203 * to-be-patched or to-be-unpatched function. 203 * to-be-patched or to-be-unpatched function.
204 */ 204 */
205static int klp_check_stack_func(struct klp_func *func, 205static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
206 struct stack_trace *trace) 206 unsigned int nr_entries)
207{ 207{
208 unsigned long func_addr, func_size, address; 208 unsigned long func_addr, func_size, address;
209 struct klp_ops *ops; 209 struct klp_ops *ops;
210 int i; 210 int i;
211 211
212 for (i = 0; i < trace->nr_entries; i++) { 212 for (i = 0; i < nr_entries; i++) {
213 address = trace->entries[i]; 213 address = entries[i];
214 214
215 if (klp_target_state == KLP_UNPATCHED) { 215 if (klp_target_state == KLP_UNPATCHED) {
216 /* 216 /*
@@ -254,29 +254,25 @@ static int klp_check_stack_func(struct klp_func *func,
254static int klp_check_stack(struct task_struct *task, char *err_buf) 254static int klp_check_stack(struct task_struct *task, char *err_buf)
255{ 255{
256 static unsigned long entries[MAX_STACK_ENTRIES]; 256 static unsigned long entries[MAX_STACK_ENTRIES];
257 struct stack_trace trace;
258 struct klp_object *obj; 257 struct klp_object *obj;
259 struct klp_func *func; 258 struct klp_func *func;
260 int ret; 259 int ret, nr_entries;
261 260
262 trace.skip = 0; 261 ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
263 trace.nr_entries = 0;
264 trace.max_entries = MAX_STACK_ENTRIES;
265 trace.entries = entries;
266 ret = save_stack_trace_tsk_reliable(task, &trace);
267 WARN_ON_ONCE(ret == -ENOSYS); 262 WARN_ON_ONCE(ret == -ENOSYS);
268 if (ret) { 263 if (ret < 0) {
269 snprintf(err_buf, STACK_ERR_BUF_SIZE, 264 snprintf(err_buf, STACK_ERR_BUF_SIZE,
270 "%s: %s:%d has an unreliable stack\n", 265 "%s: %s:%d has an unreliable stack\n",
271 __func__, task->comm, task->pid); 266 __func__, task->comm, task->pid);
272 return ret; 267 return ret;
273 } 268 }
269 nr_entries = ret;
274 270
275 klp_for_each_object(klp_transition_patch, obj) { 271 klp_for_each_object(klp_transition_patch, obj) {
276 if (!obj->patched) 272 if (!obj->patched)
277 continue; 273 continue;
278 klp_for_each_func(obj, func) { 274 klp_for_each_func(obj, func) {
279 ret = klp_check_stack_func(func, &trace); 275 ret = klp_check_stack_func(func, entries, nr_entries);
280 if (ret) { 276 if (ret) {
281 snprintf(err_buf, STACK_ERR_BUF_SIZE, 277 snprintf(err_buf, STACK_ERR_BUF_SIZE,
282 "%s: %s:%d is sleeping on function %s\n", 278 "%s: %s:%d is sleeping on function %s\n",
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 392c7f23af76..6fe2f333aecb 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
3# and is generally not a function of system call inputs. 3# and is generally not a function of system call inputs.
4KCOV_INSTRUMENT := n 4KCOV_INSTRUMENT := n
5 5
6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o 6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o
7 7
8ifdef CONFIG_FUNCTION_TRACER 8ifdef CONFIG_FUNCTION_TRACER
9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) 9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
26obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 26obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
27obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o 27obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
28obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
29obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
30obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o 28obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
31obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 29obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
32obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o 30obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
31obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
new file mode 100644
index 000000000000..fa2c2f951c6b
--- /dev/null
+++ b/kernel/locking/lock_events.c
@@ -0,0 +1,179 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <waiman.long@hpe.com>
14 */
15
16/*
17 * Collect locking event counts
18 */
19#include <linux/debugfs.h>
20#include <linux/sched.h>
21#include <linux/sched/clock.h>
22#include <linux/fs.h>
23
24#include "lock_events.h"
25
26#undef LOCK_EVENT
27#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
28
29#define LOCK_EVENTS_DIR "lock_event_counts"
30
31/*
32 * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different
33 * types of locks will be reported under the <debugfs>/lock_event_counts/
34 * directory. See lock_events_list.h for the list of available locking
35 * events.
36 *
37 * Writing to the special ".reset_counts" file will reset all the above
38 * locking event counts. This is a very slow operation and so should not
39 * be done frequently.
40 *
41 * These event counts are implemented as per-cpu variables which are
42 * summed and computed whenever the corresponding debugfs files are read. This
43 * minimizes added overhead making the counts usable even in a production
44 * environment.
45 */
46static const char * const lockevent_names[lockevent_num + 1] = {
47
48#include "lock_events_list.h"
49
50 [LOCKEVENT_reset_cnts] = ".reset_counts",
51};
52
53/*
54 * Per-cpu counts
55 */
56DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]);
57
58/*
59 * The lockevent_read() function can be overridden.
60 */
61ssize_t __weak lockevent_read(struct file *file, char __user *user_buf,
62 size_t count, loff_t *ppos)
63{
64 char buf[64];
65 int cpu, id, len;
66 u64 sum = 0;
67
68 /*
69 * Get the counter ID stored in file->f_inode->i_private
70 */
71 id = (long)file_inode(file)->i_private;
72
73 if (id >= lockevent_num)
74 return -EBADF;
75
76 for_each_possible_cpu(cpu)
77 sum += per_cpu(lockevents[id], cpu);
78 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
79
80 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
81}
82
83/*
84 * Function to handle write request
85 *
86 * When idx = reset_cnts, reset all the counts.
87 */
88static ssize_t lockevent_write(struct file *file, const char __user *user_buf,
89 size_t count, loff_t *ppos)
90{
91 int cpu;
92
93 /*
94 * Get the counter ID stored in file->f_inode->i_private
95 */
96 if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts)
97 return count;
98
99 for_each_possible_cpu(cpu) {
100 int i;
101 unsigned long *ptr = per_cpu_ptr(lockevents, cpu);
102
103 for (i = 0 ; i < lockevent_num; i++)
104 WRITE_ONCE(ptr[i], 0);
105 }
106 return count;
107}
108
109/*
110 * Debugfs data structures
111 */
112static const struct file_operations fops_lockevent = {
113 .read = lockevent_read,
114 .write = lockevent_write,
115 .llseek = default_llseek,
116};
117
118#ifdef CONFIG_PARAVIRT_SPINLOCKS
119#include <asm/paravirt.h>
120
121static bool __init skip_lockevent(const char *name)
122{
123 static int pv_on __initdata = -1;
124
125 if (pv_on < 0)
126 pv_on = !pv_is_native_spin_unlock();
127 /*
128 * Skip PV qspinlock events on bare metal.
129 */
130 if (!pv_on && !memcmp(name, "pv_", 3))
131 return true;
132 return false;
133}
134#else
135static inline bool skip_lockevent(const char *name)
136{
137 return false;
138}
139#endif
140
141/*
142 * Initialize debugfs for the locking event counts.
143 */
144static int __init init_lockevent_counts(void)
145{
146 struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
147 int i;
148
149 if (!d_counts)
150 goto out;
151
152 /*
153 * Create the debugfs files
154 *
155 * As reading from and writing to the stat files can be slow, only
156 * root is allowed to do the read/write to limit impact to system
157 * performance.
158 */
159 for (i = 0; i < lockevent_num; i++) {
160 if (skip_lockevent(lockevent_names[i]))
161 continue;
162 if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
163 (void *)(long)i, &fops_lockevent))
164 goto fail_undo;
165 }
166
167 if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
168 d_counts, (void *)(long)LOCKEVENT_reset_cnts,
169 &fops_lockevent))
170 goto fail_undo;
171
172 return 0;
173fail_undo:
174 debugfs_remove_recursive(d_counts);
175out:
176 pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR);
177 return -ENOMEM;
178}
179fs_initcall(init_lockevent_counts);
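
The comment block above describes the whole trick: on the hot path each CPU increments only its own slot, and the expensive cross-CPU sum happens only when the debugfs file is read. A stand-alone approximation, with threads standing in for CPUs and a plain 2-D array for the per-cpu storage (all names and sizes are invented for the demo):

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS   4
#define NR_EVENTS 2

static unsigned long counts[NR_CPUS][NR_EVENTS];

static void *worker(void *arg)
{
	long cpu = (long)arg;
	int i;

	/* hot path: touch only this "CPU"'s slot, no shared cache line */
	for (i = 0; i < 100000; i++)
		counts[cpu][0]++;
	return NULL;
}

int main(void)
{
	pthread_t t[NR_CPUS];
	unsigned long sum = 0;
	long cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_create(&t[cpu], NULL, worker, (void *)cpu);
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_join(&t[cpu], NULL);

	/* slow path: the "debugfs read" walks every CPU and sums the slots */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sum += counts[cpu][0];
	printf("event 0 total: %lu\n", sum);
	return 0;
}

Each worker owns its row, so the increments need no atomics; that is the property the per-cpu counters give the kernel version.
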
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
new file mode 100644
index 000000000000..feb1acc54611
--- /dev/null
+++ b/kernel/locking/lock_events.h
@@ -0,0 +1,59 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <longman@redhat.com>
14 */
15
16#ifndef __LOCKING_LOCK_EVENTS_H
17#define __LOCKING_LOCK_EVENTS_H
18
19enum lock_events {
20
21#include "lock_events_list.h"
22
23 lockevent_num, /* Total number of lock event counts */
24 LOCKEVENT_reset_cnts = lockevent_num,
25};
26
27#ifdef CONFIG_LOCK_EVENT_COUNTS
28/*
29 * Per-cpu counters
30 */
31DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
32
33/*
34 * Increment the PV qspinlock statistical counters
35 */
36static inline void __lockevent_inc(enum lock_events event, bool cond)
37{
38 if (cond)
39 __this_cpu_inc(lockevents[event]);
40}
41
42#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
43#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c)
44
45static inline void __lockevent_add(enum lock_events event, int inc)
46{
47 __this_cpu_add(lockevents[event], inc);
48}
49
50#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
51
52#else /* CONFIG_LOCK_EVENT_COUNTS */
53
54#define lockevent_inc(ev)
55#define lockevent_add(ev, c)
56#define lockevent_cond_inc(ev, c)
57
58#endif /* CONFIG_LOCK_EVENT_COUNTS */
59#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
new file mode 100644
index 000000000000..ad7668cfc9da
--- /dev/null
+++ b/kernel/locking/lock_events_list.h
@@ -0,0 +1,67 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <longman@redhat.com>
14 */
15
16#ifndef LOCK_EVENT
17#define LOCK_EVENT(name) LOCKEVENT_ ## name,
18#endif
19
20#ifdef CONFIG_QUEUED_SPINLOCKS
21#ifdef CONFIG_PARAVIRT_SPINLOCKS
22/*
23 * Locking events for PV qspinlock.
24 */
25LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */
26LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */
27LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */
28LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */
29LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */
30LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */
31LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */
32LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */
33LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */
34LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */
35LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */
36#endif /* CONFIG_PARAVIRT_SPINLOCKS */
37
38/*
39 * Locking events for qspinlock
40 *
41 * Subtracting lock_use_node[234] from lock_slowpath will give you
42 * lock_use_node1.
43 */
44LOCK_EVENT(lock_pending) /* # of locking ops via pending code */
45LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */
46LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */
47LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */
48LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */
49LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
50#endif /* CONFIG_QUEUED_SPINLOCKS */
51
52/*
53 * Locking events for rwsem
54 */
55LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
56LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
57LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
58LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
59LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */
60LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */
61LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
62LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
63LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
64LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */
65LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
66LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
67LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */
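
lock_events_list.h is the single source of truth and is expanded twice with different definitions of LOCK_EVENT(): once into enum values (lock_events.h) and once into a string table (lock_events.c). The self-contained sketch below shows the same X-macro technique in miniature, with an in-file macro list standing in for the shared header include; the event names are invented.

#include <stdio.h>

/* One list, expanded twice with different meanings of EVENT() */
#define EVENT_LIST \
	EVENT(foo_taken) \
	EVENT(foo_contended)

enum events {
#define EVENT(name) EV_ ## name,
	EVENT_LIST
#undef EVENT
	NR_EVENTS
};

static const char * const event_names[NR_EVENTS] = {
#define EVENT(name) [EV_ ## name] = #name,
	EVENT_LIST
#undef EVENT
};

int main(void)
{
	int i;

	for (i = 0; i < NR_EVENTS; i++)
		printf("%d: %s\n", i, event_names[i]);
	return 0;
}
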
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 34cdcbedda49..d06190fa5082 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -434,29 +434,14 @@ static void print_lockdep_off(const char *bug_msg)
434#endif 434#endif
435} 435}
436 436
437static int save_trace(struct stack_trace *trace) 437static int save_trace(struct lock_trace *trace)
438{ 438{
439 trace->nr_entries = 0; 439 unsigned long *entries = stack_trace + nr_stack_trace_entries;
440 trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; 440 unsigned int max_entries;
441 trace->entries = stack_trace + nr_stack_trace_entries;
442
443 trace->skip = 3;
444
445 save_stack_trace(trace);
446
447 /*
448 * Some daft arches put -1 at the end to indicate its a full trace.
449 *
450 * <rant> this is buggy anyway, since it takes a whole extra entry so a
451 * complete trace that maxes out the entries provided will be reported
452 * as incomplete, friggin useless </rant>
453 */
454 if (trace->nr_entries != 0 &&
455 trace->entries[trace->nr_entries-1] == ULONG_MAX)
456 trace->nr_entries--;
457
458 trace->max_entries = trace->nr_entries;
459 441
442 trace->offset = nr_stack_trace_entries;
443 max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
444 trace->nr_entries = stack_trace_save(entries, max_entries, 3);
460 nr_stack_trace_entries += trace->nr_entries; 445 nr_stack_trace_entries += trace->nr_entries;
461 446
462 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { 447 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
@@ -516,11 +501,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
516{ 501{
517 char c = '.'; 502 char c = '.';
518 503
519 if (class->usage_mask & lock_flag(bit + 2)) 504 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
520 c = '+'; 505 c = '+';
521 if (class->usage_mask & lock_flag(bit)) { 506 if (class->usage_mask & lock_flag(bit)) {
522 c = '-'; 507 c = '-';
523 if (class->usage_mask & lock_flag(bit + 2)) 508 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
524 c = '?'; 509 c = '?';
525 } 510 }
526 511
@@ -649,6 +634,9 @@ static int static_obj(const void *obj)
649 end = (unsigned long) &_end, 634 end = (unsigned long) &_end,
650 addr = (unsigned long) obj; 635 addr = (unsigned long) obj;
651 636
637 if (arch_is_kernel_initmem_freed(addr))
638 return 0;
639
652 /* 640 /*
653 * static variable? 641 * static variable?
654 */ 642 */
@@ -1207,7 +1195,7 @@ static struct lock_list *alloc_list_entry(void)
1207static int add_lock_to_list(struct lock_class *this, 1195static int add_lock_to_list(struct lock_class *this,
1208 struct lock_class *links_to, struct list_head *head, 1196 struct lock_class *links_to, struct list_head *head,
1209 unsigned long ip, int distance, 1197 unsigned long ip, int distance,
1210 struct stack_trace *trace) 1198 struct lock_trace *trace)
1211{ 1199{
1212 struct lock_list *entry; 1200 struct lock_list *entry;
1213 /* 1201 /*
@@ -1426,6 +1414,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
1426 * checking. 1414 * checking.
1427 */ 1415 */
1428 1416
1417static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
1418{
1419 unsigned long *entries = stack_trace + trace->offset;
1420
1421 stack_trace_print(entries, trace->nr_entries, spaces);
1422}
1423
1429/* 1424/*
1430 * Print a dependency chain entry (this is only done when a deadlock 1425 * Print a dependency chain entry (this is only done when a deadlock
1431 * has been detected): 1426 * has been detected):
@@ -1438,8 +1433,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1438 printk("\n-> #%u", depth); 1433 printk("\n-> #%u", depth);
1439 print_lock_name(target->class); 1434 print_lock_name(target->class);
1440 printk(KERN_CONT ":\n"); 1435 printk(KERN_CONT ":\n");
1441 print_stack_trace(&target->trace, 6); 1436 print_lock_trace(&target->trace, 6);
1442
1443 return 0; 1437 return 0;
1444} 1438}
1445 1439
@@ -1533,10 +1527,9 @@ static inline int class_equal(struct lock_list *entry, void *data)
1533} 1527}
1534 1528
1535static noinline int print_circular_bug(struct lock_list *this, 1529static noinline int print_circular_bug(struct lock_list *this,
1536 struct lock_list *target, 1530 struct lock_list *target,
1537 struct held_lock *check_src, 1531 struct held_lock *check_src,
1538 struct held_lock *check_tgt, 1532 struct held_lock *check_tgt)
1539 struct stack_trace *trace)
1540{ 1533{
1541 struct task_struct *curr = current; 1534 struct task_struct *curr = current;
1542 struct lock_list *parent; 1535 struct lock_list *parent;
@@ -1676,19 +1669,25 @@ check_redundant(struct lock_list *root, struct lock_class *target,
1676} 1669}
1677 1670
1678#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1671#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
1672
1673static inline int usage_accumulate(struct lock_list *entry, void *mask)
1674{
1675 *(unsigned long *)mask |= entry->class->usage_mask;
1676
1677 return 0;
1678}
1679
1679/* 1680/*
1680 * Forwards and backwards subgraph searching, for the purposes of 1681 * Forwards and backwards subgraph searching, for the purposes of
1681 * proving that two subgraphs can be connected by a new dependency 1682 * proving that two subgraphs can be connected by a new dependency
1682 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1683 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1683 */ 1684 */
1684 1685
1685static inline int usage_match(struct lock_list *entry, void *bit) 1686static inline int usage_match(struct lock_list *entry, void *mask)
1686{ 1687{
1687 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); 1688 return entry->class->usage_mask & *(unsigned long *)mask;
1688} 1689}
1689 1690
1690
1691
1692/* 1691/*
1693 * Find a node in the forwards-direction dependency sub-graph starting 1692 * Find a node in the forwards-direction dependency sub-graph starting
1694 * at @root->class that matches @bit. 1693 * at @root->class that matches @bit.
@@ -1700,14 +1699,14 @@ static inline int usage_match(struct lock_list *entry, void *bit)
1700 * Return <0 on error. 1699 * Return <0 on error.
1701 */ 1700 */
1702static int 1701static int
1703find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, 1702find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
1704 struct lock_list **target_entry) 1703 struct lock_list **target_entry)
1705{ 1704{
1706 int result; 1705 int result;
1707 1706
1708 debug_atomic_inc(nr_find_usage_forwards_checks); 1707 debug_atomic_inc(nr_find_usage_forwards_checks);
1709 1708
1710 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1709 result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
1711 1710
1712 return result; 1711 return result;
1713} 1712}
@@ -1723,14 +1722,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1723 * Return <0 on error. 1722 * Return <0 on error.
1724 */ 1723 */
1725static int 1724static int
1726find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, 1725find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
1727 struct lock_list **target_entry) 1726 struct lock_list **target_entry)
1728{ 1727{
1729 int result; 1728 int result;
1730 1729
1731 debug_atomic_inc(nr_find_usage_backwards_checks); 1730 debug_atomic_inc(nr_find_usage_backwards_checks);
1732 1731
1733 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1732 result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
1734 1733
1735 return result; 1734 return result;
1736} 1735}
@@ -1752,7 +1751,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
1752 1751
1753 len += printk("%*s %s", depth, "", usage_str[bit]); 1752 len += printk("%*s %s", depth, "", usage_str[bit]);
1754 len += printk(KERN_CONT " at:\n"); 1753 len += printk(KERN_CONT " at:\n");
1755 print_stack_trace(class->usage_traces + bit, len); 1754 print_lock_trace(class->usage_traces + bit, len);
1756 } 1755 }
1757 } 1756 }
1758 printk("%*s }\n", depth, ""); 1757 printk("%*s }\n", depth, "");
@@ -1777,7 +1776,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1777 do { 1776 do {
1778 print_lock_class_header(entry->class, depth); 1777 print_lock_class_header(entry->class, depth);
1779 printk("%*s ... acquired at:\n", depth, ""); 1778 printk("%*s ... acquired at:\n", depth, "");
1780 print_stack_trace(&entry->trace, 2); 1779 print_lock_trace(&entry->trace, 2);
1781 printk("\n"); 1780 printk("\n");
1782 1781
1783 if (depth == 0 && (entry != root)) { 1782 if (depth == 0 && (entry != root)) {
@@ -1890,14 +1889,14 @@ print_bad_irq_dependency(struct task_struct *curr,
1890 print_lock_name(backwards_entry->class); 1889 print_lock_name(backwards_entry->class);
1891 pr_warn("\n... which became %s-irq-safe at:\n", irqclass); 1890 pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
1892 1891
1893 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); 1892 print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);
1894 1893
1895 pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); 1894 pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
1896 print_lock_name(forwards_entry->class); 1895 print_lock_name(forwards_entry->class);
1897 pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); 1896 pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
1898 pr_warn("..."); 1897 pr_warn("...");
1899 1898
1900 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1899 print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);
1901 1900
1902 pr_warn("\nother info that might help us debug this:\n\n"); 1901 pr_warn("\nother info that might help us debug this:\n\n");
1903 print_irq_lock_scenario(backwards_entry, forwards_entry, 1902 print_irq_lock_scenario(backwards_entry, forwards_entry,
@@ -1922,39 +1921,6 @@ print_bad_irq_dependency(struct task_struct *curr,
1922 return 0; 1921 return 0;
1923} 1922}
1924 1923
1925static int
1926check_usage(struct task_struct *curr, struct held_lock *prev,
1927 struct held_lock *next, enum lock_usage_bit bit_backwards,
1928 enum lock_usage_bit bit_forwards, const char *irqclass)
1929{
1930 int ret;
1931 struct lock_list this, that;
1932 struct lock_list *uninitialized_var(target_entry);
1933 struct lock_list *uninitialized_var(target_entry1);
1934
1935 this.parent = NULL;
1936
1937 this.class = hlock_class(prev);
1938 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1939 if (ret < 0)
1940 return print_bfs_bug(ret);
1941 if (ret == 1)
1942 return ret;
1943
1944 that.parent = NULL;
1945 that.class = hlock_class(next);
1946 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1947 if (ret < 0)
1948 return print_bfs_bug(ret);
1949 if (ret == 1)
1950 return ret;
1951
1952 return print_bad_irq_dependency(curr, &this, &that,
1953 target_entry, target_entry1,
1954 prev, next,
1955 bit_backwards, bit_forwards, irqclass);
1956}
1957
1958static const char *state_names[] = { 1924static const char *state_names[] = {
1959#define LOCKDEP_STATE(__STATE) \ 1925#define LOCKDEP_STATE(__STATE) \
1960 __stringify(__STATE), 1926 __stringify(__STATE),
@@ -1971,9 +1937,19 @@ static const char *state_rnames[] = {
1971 1937
1972static inline const char *state_name(enum lock_usage_bit bit) 1938static inline const char *state_name(enum lock_usage_bit bit)
1973{ 1939{
1974 return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2]; 1940 if (bit & LOCK_USAGE_READ_MASK)
1941 return state_rnames[bit >> LOCK_USAGE_DIR_MASK];
1942 else
1943 return state_names[bit >> LOCK_USAGE_DIR_MASK];
1975} 1944}
1976 1945
1946/*
1947 * The bit number is encoded like:
1948 *
1949 * bit0: 0 exclusive, 1 read lock
1950 * bit1: 0 used in irq, 1 irq enabled
1951 * bit2-n: state
1952 */
1977static int exclusive_bit(int new_bit) 1953static int exclusive_bit(int new_bit)
1978{ 1954{
1979 int state = new_bit & LOCK_USAGE_STATE_MASK; 1955 int state = new_bit & LOCK_USAGE_STATE_MASK;
@@ -1985,45 +1961,160 @@ static int exclusive_bit(int new_bit)
1985 return state | (dir ^ LOCK_USAGE_DIR_MASK); 1961 return state | (dir ^ LOCK_USAGE_DIR_MASK);
1986} 1962}
1987 1963
1964/*
1965 * Observe that when given a bitmask where each bitnr is encoded as above, a
 1966 * right shift of the mask transforms each individual bitnr by -1 and,
 1967 * conversely, a left shift transforms each bitnr by +1.
1968 *
 1969 * So for all bits whose number has LOCK_ENABLED_* set (bitnr1 == 1), we can
 1970 * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0)
 1971 * instead by subtracting 2 from the bit number, or shifting the mask right by 2.
1972 *
1973 * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2.
1974 *
1975 * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is
1976 * all bits set) and recompose with bitnr1 flipped.
1977 */
1978static unsigned long invert_dir_mask(unsigned long mask)
1979{
1980 unsigned long excl = 0;
1981
1982 /* Invert dir */
1983 excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK;
1984 excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK;
1985
1986 return excl;
1987}
1988
1989/*
1990 * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all
1991 * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*).
1992 * And then mask out all bitnr0.
1993 */
1994static unsigned long exclusive_mask(unsigned long mask)
1995{
1996 unsigned long excl = invert_dir_mask(mask);
1997
1998 /* Strip read */
1999 excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
2000 excl &= ~LOCKF_IRQ_READ;
2001
2002 return excl;
2003}
2004
2005/*
2006 * Retrieve the _possible_ original mask to which @mask is
 2007 * exclusive. I.e., this is the opposite of exclusive_mask().
2008 * Note that 2 possible original bits can match an exclusive
2009 * bit: one has LOCK_USAGE_READ_MASK set, the other has it
2010 * cleared. So both are returned for each exclusive bit.
2011 */
2012static unsigned long original_mask(unsigned long mask)
2013{
2014 unsigned long excl = invert_dir_mask(mask);
2015
2016 /* Include read in existing usages */
2017 excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
2018
2019 return excl;
2020}
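The three mask transforms can likewise be checked in isolation. Below is a small sketch under the assumption of a two-state (hardirq/softirq) layout; the LOCKF_*-style masks are hand-built stand-ins for the kernel's generated constants. It verifies that exclusive_mask() turns USED_IN_HARDIRQ_READ into ENABLED_HARDIRQ, and that original_mask() of the result covers the bit we started from.

#include <stdio.h>
#include <assert.h>

/* Toy two-state model of the LOCKF_* masks (assumed values, not the kernel's
 * generated constants).  Bit number = state * 4 + dir * 2 + read. */
enum {
        USED_IN_HARDIRQ,        /* 0 */
        USED_IN_HARDIRQ_READ,   /* 1 */
        ENABLED_HARDIRQ,        /* 2 */
        ENABLED_HARDIRQ_READ,   /* 3 */
        USED_IN_SOFTIRQ,        /* 4 */
        USED_IN_SOFTIRQ_READ,   /* 5 */
        ENABLED_SOFTIRQ,        /* 6 */
        ENABLED_SOFTIRQ_READ,   /* 7 */
};
#define F(b) (1UL << (b))

#define ENABLED_IRQ_ALL (F(ENABLED_HARDIRQ) | F(ENABLED_HARDIRQ_READ) | \
                         F(ENABLED_SOFTIRQ) | F(ENABLED_SOFTIRQ_READ))
#define USED_IN_IRQ_ALL (F(USED_IN_HARDIRQ) | F(USED_IN_HARDIRQ_READ) | \
                         F(USED_IN_SOFTIRQ) | F(USED_IN_SOFTIRQ_READ))
#define IRQ             (F(USED_IN_HARDIRQ) | F(ENABLED_HARDIRQ) | \
                         F(USED_IN_SOFTIRQ) | F(ENABLED_SOFTIRQ))
#define IRQ_READ        (F(USED_IN_HARDIRQ_READ) | F(ENABLED_HARDIRQ_READ) | \
                         F(USED_IN_SOFTIRQ_READ) | F(ENABLED_SOFTIRQ_READ))

/* Flip bitnr1 (dir) for every bit in the mask: shift by 2 either way. */
static unsigned long invert_dir_mask(unsigned long mask)
{
        return ((mask & ENABLED_IRQ_ALL) >> 2) | ((mask & USED_IN_IRQ_ALL) << 2);
}

/* Flip dir, then fold the read bits onto their non-read counterparts. */
static unsigned long exclusive_mask(unsigned long mask)
{
        unsigned long excl = invert_dir_mask(mask);

        excl |= (excl & IRQ_READ) >> 1;
        excl &= ~IRQ_READ;
        return excl;
}

/* Flip dir, then re-add the read variant of every bit (two originals each). */
static unsigned long original_mask(unsigned long mask)
{
        unsigned long excl = invert_dir_mask(mask);

        excl |= (excl & IRQ) << 1;
        return excl;
}

int main(void)
{
        unsigned long m = F(USED_IN_HARDIRQ_READ);

        assert(exclusive_mask(m) == F(ENABLED_HARDIRQ));
        /* original_mask() of the exclusive mask covers the starting bit. */
        assert(original_mask(exclusive_mask(m)) & m);

        printf("mask transforms behave as described\n");
        return 0;
}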
2021
2022/*
 2023 * Find the first matching pair of bits between an original
 2024 * usage mask and an exclusive usage mask.
2025 */
2026static int find_exclusive_match(unsigned long mask,
2027 unsigned long excl_mask,
2028 enum lock_usage_bit *bitp,
2029 enum lock_usage_bit *excl_bitp)
2030{
2031 int bit, excl;
2032
2033 for_each_set_bit(bit, &mask, LOCK_USED) {
2034 excl = exclusive_bit(bit);
2035 if (excl_mask & lock_flag(excl)) {
2036 *bitp = bit;
2037 *excl_bitp = excl;
2038 return 0;
2039 }
2040 }
2041 return -1;
2042}
2043
2044/*
2045 * Prove that the new dependency does not connect a hardirq-safe(-read)
2046 * lock with a hardirq-unsafe lock - to achieve this we search
2047 * the backwards-subgraph starting at <prev>, and the
2048 * forwards-subgraph starting at <next>:
2049 */
1988static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, 2050static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
1989 struct held_lock *next, enum lock_usage_bit bit) 2051 struct held_lock *next)
1990{ 2052{
2053 unsigned long usage_mask = 0, forward_mask, backward_mask;
2054 enum lock_usage_bit forward_bit = 0, backward_bit = 0;
2055 struct lock_list *uninitialized_var(target_entry1);
2056 struct lock_list *uninitialized_var(target_entry);
2057 struct lock_list this, that;
2058 int ret;
2059
1991 /* 2060 /*
1992 * Prove that the new dependency does not connect a hardirq-safe 2061 * Step 1: gather all hard/soft IRQs usages backward in an
1993 * lock with a hardirq-unsafe lock - to achieve this we search 2062 * accumulated usage mask.
1994 * the backwards-subgraph starting at <prev>, and the
1995 * forwards-subgraph starting at <next>:
1996 */ 2063 */
1997 if (!check_usage(curr, prev, next, bit, 2064 this.parent = NULL;
1998 exclusive_bit(bit), state_name(bit))) 2065 this.class = hlock_class(prev);
1999 return 0; 2066
2067 ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
2068 if (ret < 0)
2069 return print_bfs_bug(ret);
2000 2070
2001 bit++; /* _READ */ 2071 usage_mask &= LOCKF_USED_IN_IRQ_ALL;
2072 if (!usage_mask)
2073 return 1;
2002 2074
2003 /* 2075 /*
2004 * Prove that the new dependency does not connect a hardirq-safe-read 2076 * Step 2: find exclusive uses forward that match the previous
2005 * lock with a hardirq-unsafe lock - to achieve this we search 2077 * backward accumulated mask.
2006 * the backwards-subgraph starting at <prev>, and the
2007 * forwards-subgraph starting at <next>:
2008 */ 2078 */
2009 if (!check_usage(curr, prev, next, bit, 2079 forward_mask = exclusive_mask(usage_mask);
2010 exclusive_bit(bit), state_name(bit)))
2011 return 0;
2012 2080
2013 return 1; 2081 that.parent = NULL;
2014} 2082 that.class = hlock_class(next);
2015 2083
2016static int 2084 ret = find_usage_forwards(&that, forward_mask, &target_entry1);
2017check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 2085 if (ret < 0)
2018 struct held_lock *next) 2086 return print_bfs_bug(ret);
2019{ 2087 if (ret == 1)
2020#define LOCKDEP_STATE(__STATE) \ 2088 return ret;
2021 if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
2022 return 0;
2023#include "lockdep_states.h"
2024#undef LOCKDEP_STATE
2025 2089
2026 return 1; 2090 /*
2091 * Step 3: we found a bad match! Now retrieve a lock from the backward
2092 * list whose usage mask matches the exclusive usage mask from the
2093 * lock found on the forward list.
2094 */
2095 backward_mask = original_mask(target_entry1->class->usage_mask);
2096
2097 ret = find_usage_backwards(&this, backward_mask, &target_entry);
2098 if (ret < 0)
2099 return print_bfs_bug(ret);
2100 if (DEBUG_LOCKS_WARN_ON(ret == 1))
2101 return 1;
2102
2103 /*
2104 * Step 4: narrow down to a pair of incompatible usage bits
2105 * and report it.
2106 */
2107 ret = find_exclusive_match(target_entry->class->usage_mask,
2108 target_entry1->class->usage_mask,
2109 &backward_bit, &forward_bit);
2110 if (DEBUG_LOCKS_WARN_ON(ret == -1))
2111 return 1;
2112
2113 return print_bad_irq_dependency(curr, &this, &that,
2114 target_entry, target_entry1,
2115 prev, next,
2116 backward_bit, forward_bit,
2117 state_name(backward_bit));
2027} 2118}
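Leaving the two graph walks aside, the four steps of check_irq_usage() reduce to mask arithmetic. A hedged sketch with a single state (hardirq only) and hypothetical usage masks standing in for the BFS results:

#include <stdio.h>

/* One-state toy encoding (hardirq only): bit = dir * 2 + read.  These are
 * illustrative values, not the kernel's LOCKF_* constants. */
#define USED_IN_HARDIRQ       (1UL << 0)
#define USED_IN_HARDIRQ_READ  (1UL << 1)
#define ENABLED_HARDIRQ       (1UL << 2)
#define ENABLED_HARDIRQ_READ  (1UL << 3)

#define USED_IN_ALL (USED_IN_HARDIRQ | USED_IN_HARDIRQ_READ)
#define ENABLED_ALL (ENABLED_HARDIRQ | ENABLED_HARDIRQ_READ)
#define READ_BITS   (USED_IN_HARDIRQ_READ | ENABLED_HARDIRQ_READ)

/* Flip the direction of every bit, then fold read bits onto non-read ones. */
static unsigned long exclusive_mask(unsigned long m)
{
        unsigned long excl = ((m & ENABLED_ALL) >> 2) | ((m & USED_IN_ALL) << 2);

        excl |= (excl & READ_BITS) >> 1;
        return excl & ~READ_BITS;
}

int main(void)
{
        /* Hypothetical results standing in for the two graph walks. */
        unsigned long backward_usage = USED_IN_HARDIRQ_READ;          /* step 1 */
        unsigned long forward_mask = exclusive_mask(backward_usage);  /* step 2 */
        unsigned long forward_class_usage = ENABLED_HARDIRQ;  /* found in step 3 */

        if (forward_class_usage & forward_mask)               /* steps 3 and 4 */
                printf("bad irq dependency: USED_IN_HARDIRQ_READ vs ENABLED_HARDIRQ\n");
        else
                printf("no conflicting usage\n");
        return 0;
}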
2028 2119
2029static void inc_chains(void) 2120static void inc_chains(void)
@@ -2040,9 +2131,8 @@ static void inc_chains(void)
2040 2131
2041#else 2132#else
2042 2133
2043static inline int 2134static inline int check_irq_usage(struct task_struct *curr,
2044check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 2135 struct held_lock *prev, struct held_lock *next)
2045 struct held_lock *next)
2046{ 2136{
2047 return 1; 2137 return 1;
2048} 2138}
@@ -2170,8 +2260,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
2170 */ 2260 */
2171static int 2261static int
2172check_prev_add(struct task_struct *curr, struct held_lock *prev, 2262check_prev_add(struct task_struct *curr, struct held_lock *prev,
2173 struct held_lock *next, int distance, struct stack_trace *trace, 2263 struct held_lock *next, int distance, struct lock_trace *trace)
2174 int (*save)(struct stack_trace *trace))
2175{ 2264{
2176 struct lock_list *uninitialized_var(target_entry); 2265 struct lock_list *uninitialized_var(target_entry);
2177 struct lock_list *entry; 2266 struct lock_list *entry;
@@ -2209,20 +2298,20 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2209 this.parent = NULL; 2298 this.parent = NULL;
2210 ret = check_noncircular(&this, hlock_class(prev), &target_entry); 2299 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
2211 if (unlikely(!ret)) { 2300 if (unlikely(!ret)) {
2212 if (!trace->entries) { 2301 if (!trace->nr_entries) {
2213 /* 2302 /*
2214 * If @save fails here, the printing might trigger 2303 * If save_trace fails here, the printing might
2215 * a WARN but because of the !nr_entries it should 2304 * trigger a WARN but because of the !nr_entries it
2216 * not do bad things. 2305 * should not do bad things.
2217 */ 2306 */
2218 save(trace); 2307 save_trace(trace);
2219 } 2308 }
2220 return print_circular_bug(&this, target_entry, next, prev, trace); 2309 return print_circular_bug(&this, target_entry, next, prev);
2221 } 2310 }
2222 else if (unlikely(ret < 0)) 2311 else if (unlikely(ret < 0))
2223 return print_bfs_bug(ret); 2312 return print_bfs_bug(ret);
2224 2313
2225 if (!check_prev_add_irq(curr, prev, next)) 2314 if (!check_irq_usage(curr, prev, next))
2226 return 0; 2315 return 0;
2227 2316
2228 /* 2317 /*
@@ -2265,7 +2354,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2265 return print_bfs_bug(ret); 2354 return print_bfs_bug(ret);
2266 2355
2267 2356
2268 if (!trace->entries && !save(trace)) 2357 if (!trace->nr_entries && !save_trace(trace))
2269 return 0; 2358 return 0;
2270 2359
2271 /* 2360 /*
@@ -2297,14 +2386,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2297static int 2386static int
2298check_prevs_add(struct task_struct *curr, struct held_lock *next) 2387check_prevs_add(struct task_struct *curr, struct held_lock *next)
2299{ 2388{
2389 struct lock_trace trace = { .nr_entries = 0 };
2300 int depth = curr->lockdep_depth; 2390 int depth = curr->lockdep_depth;
2301 struct held_lock *hlock; 2391 struct held_lock *hlock;
2302 struct stack_trace trace = {
2303 .nr_entries = 0,
2304 .max_entries = 0,
2305 .entries = NULL,
2306 .skip = 0,
2307 };
2308 2392
2309 /* 2393 /*
2310 * Debugging checks. 2394 * Debugging checks.
@@ -2330,7 +2414,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
2330 * added: 2414 * added:
2331 */ 2415 */
2332 if (hlock->read != 2 && hlock->check) { 2416 if (hlock->read != 2 && hlock->check) {
2333 int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); 2417 int ret = check_prev_add(curr, hlock, next, distance,
2418 &trace);
2334 if (!ret) 2419 if (!ret)
2335 return 0; 2420 return 0;
2336 2421
@@ -2731,6 +2816,10 @@ static inline int validate_chain(struct task_struct *curr,
2731{ 2816{
2732 return 1; 2817 return 1;
2733} 2818}
2819
2820static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
2821{
2822}
2734#endif 2823#endif
2735 2824
2736/* 2825/*
@@ -2784,6 +2873,12 @@ static void check_chain_key(struct task_struct *curr)
2784#endif 2873#endif
2785} 2874}
2786 2875
2876static int mark_lock(struct task_struct *curr, struct held_lock *this,
2877 enum lock_usage_bit new_bit);
2878
2879#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2880
2881
2787static void 2882static void
2788print_usage_bug_scenario(struct held_lock *lock) 2883print_usage_bug_scenario(struct held_lock *lock)
2789{ 2884{
@@ -2827,7 +2922,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2827 print_lock(this); 2922 print_lock(this);
2828 2923
2829 pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); 2924 pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
2830 print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); 2925 print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);
2831 2926
2832 print_irqtrace_events(curr); 2927 print_irqtrace_events(curr);
2833 pr_warn("\nother info that might help us debug this:\n"); 2928 pr_warn("\nother info that might help us debug this:\n");
@@ -2853,10 +2948,6 @@ valid_state(struct task_struct *curr, struct held_lock *this,
2853 return 1; 2948 return 1;
2854} 2949}
2855 2950
2856static int mark_lock(struct task_struct *curr, struct held_lock *this,
2857 enum lock_usage_bit new_bit);
2858
2859#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2860 2951
2861/* 2952/*
2862 * print irq inversion bug: 2953 * print irq inversion bug:
@@ -2936,7 +3027,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
2936 3027
2937 root.parent = NULL; 3028 root.parent = NULL;
2938 root.class = hlock_class(this); 3029 root.class = hlock_class(this);
2939 ret = find_usage_forwards(&root, bit, &target_entry); 3030 ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
2940 if (ret < 0) 3031 if (ret < 0)
2941 return print_bfs_bug(ret); 3032 return print_bfs_bug(ret);
2942 if (ret == 1) 3033 if (ret == 1)
@@ -2960,7 +3051,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2960 3051
2961 root.parent = NULL; 3052 root.parent = NULL;
2962 root.class = hlock_class(this); 3053 root.class = hlock_class(this);
2963 ret = find_usage_backwards(&root, bit, &target_entry); 3054 ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
2964 if (ret < 0) 3055 if (ret < 0)
2965 return print_bfs_bug(ret); 3056 return print_bfs_bug(ret);
2966 if (ret == 1) 3057 if (ret == 1)
@@ -3015,7 +3106,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {
3015static inline int state_verbose(enum lock_usage_bit bit, 3106static inline int state_verbose(enum lock_usage_bit bit,
3016 struct lock_class *class) 3107 struct lock_class *class)
3017{ 3108{
3018 return state_verbose_f[bit >> 2](class); 3109 return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);
3019} 3110}
3020 3111
3021typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, 3112typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
@@ -3157,7 +3248,7 @@ void lockdep_hardirqs_on(unsigned long ip)
3157 /* 3248 /*
3158 * See the fine text that goes along with this variable definition. 3249 * See the fine text that goes along with this variable definition.
3159 */ 3250 */
3160 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 3251 if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))
3161 return; 3252 return;
3162 3253
3163 /* 3254 /*
@@ -4689,8 +4780,8 @@ static void free_zapped_rcu(struct rcu_head *ch)
4689 return; 4780 return;
4690 4781
4691 raw_local_irq_save(flags); 4782 raw_local_irq_save(flags);
4692 if (!graph_lock()) 4783 arch_spin_lock(&lockdep_lock);
4693 goto out_irq; 4784 current->lockdep_recursion = 1;
4694 4785
4695 /* closed head */ 4786 /* closed head */
4696 pf = delayed_free.pf + (delayed_free.index ^ 1); 4787 pf = delayed_free.pf + (delayed_free.index ^ 1);
@@ -4702,8 +4793,8 @@ static void free_zapped_rcu(struct rcu_head *ch)
4702 */ 4793 */
4703 call_rcu_zapped(delayed_free.pf + delayed_free.index); 4794 call_rcu_zapped(delayed_free.pf + delayed_free.index);
4704 4795
4705 graph_unlock(); 4796 current->lockdep_recursion = 0;
4706out_irq: 4797 arch_spin_unlock(&lockdep_lock);
4707 raw_local_irq_restore(flags); 4798 raw_local_irq_restore(flags);
4708} 4799}
4709 4800
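Replacing graph_lock() with a raw arch_spin_lock() plus current->lockdep_recursion follows a common recursion-guard pattern: raise a per-task flag around the critical section so that any lockdep hook fired from inside it bails out instead of re-entering the machinery. A minimal user-space sketch of that pattern, with a C11 thread-local flag standing in for current->lockdep_recursion:

#include <stdio.h>

/* Thread-local stand-in for current->lockdep_recursion. */
static _Thread_local int lockdep_recursion;

/* Instrumentation hook: must not re-enter the core while it runs. */
static void lockdep_hook(const char *what)
{
        if (lockdep_recursion)
                return;                 /* called from inside the core: bail out */
        printf("tracking: %s\n", what);
}

/* Core operation whose internals may themselves trigger hooks. */
static void zap_classes(void)
{
        lockdep_recursion = 1;          /* nothing below is tracked */
        lockdep_hook("internal call (ignored)");
        /* ... tear down graph data structures ... */
        lockdep_recursion = 0;
}

int main(void)
{
        lockdep_hook("normal acquire"); /* tracked */
        zap_classes();                  /* hooks fired from inside are ignored */
        return 0;
}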
@@ -4744,21 +4835,17 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
4744{ 4835{
4745 struct pending_free *pf; 4836 struct pending_free *pf;
4746 unsigned long flags; 4837 unsigned long flags;
4747 int locked;
4748 4838
4749 init_data_structures_once(); 4839 init_data_structures_once();
4750 4840
4751 raw_local_irq_save(flags); 4841 raw_local_irq_save(flags);
4752 locked = graph_lock(); 4842 arch_spin_lock(&lockdep_lock);
4753 if (!locked) 4843 current->lockdep_recursion = 1;
4754 goto out_irq;
4755
4756 pf = get_pending_free(); 4844 pf = get_pending_free();
4757 __lockdep_free_key_range(pf, start, size); 4845 __lockdep_free_key_range(pf, start, size);
4758 call_rcu_zapped(pf); 4846 call_rcu_zapped(pf);
4759 4847 current->lockdep_recursion = 0;
4760 graph_unlock(); 4848 arch_spin_unlock(&lockdep_lock);
4761out_irq:
4762 raw_local_irq_restore(flags); 4849 raw_local_irq_restore(flags);
4763 4850
4764 /* 4851 /*
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index d4c197425f68..150ec3f0c5b5 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -42,13 +42,35 @@ enum {
42 __LOCKF(USED) 42 __LOCKF(USED)
43}; 43};
44 44
45#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) 45#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
46#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) 46static const unsigned long LOCKF_ENABLED_IRQ =
47#include "lockdep_states.h"
48 0;
49#undef LOCKDEP_STATE
50
51#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
52static const unsigned long LOCKF_USED_IN_IRQ =
53#include "lockdep_states.h"
54 0;
55#undef LOCKDEP_STATE
56
57#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
58static const unsigned long LOCKF_ENABLED_IRQ_READ =
59#include "lockdep_states.h"
60 0;
61#undef LOCKDEP_STATE
62
63#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
64static const unsigned long LOCKF_USED_IN_IRQ_READ =
65#include "lockdep_states.h"
66 0;
67#undef LOCKDEP_STATE
68
69#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
70#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
47 71
48#define LOCKF_ENABLED_IRQ_READ \ 72#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ)
49 (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) 73#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)
50#define LOCKF_USED_IN_IRQ_READ \
51 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
52 74
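The new constants rely on the usual x-macro construction: each re-expansion of LOCKDEP_STATE() pastes one OR-term in front of the terminating 0. A standalone sketch of the same trick follows; since lockdep_states.h is not shown here, a local LOCK_STATES() list plays its role and the LOCKF_* values are made up for illustration.

#include <stdio.h>

/* Stand-in for lockdep_states.h: one X() per state.  The kernel #includes
 * the real header repeatedly; the expansion idea is the same. */
#define LOCK_STATES(X) X(HARDIRQ) X(SOFTIRQ)

/* Made-up flag values for illustration. */
enum { LOCKF_ENABLED_HARDIRQ = 1, LOCKF_ENABLED_SOFTIRQ = 2,
       LOCKF_USED_IN_HARDIRQ = 4, LOCKF_USED_IN_SOFTIRQ = 8 };

/* Each expansion pastes "LOCKF_ENABLED_<state> |" in front of the final 0. */
#define LOCKDEP_STATE(s) LOCKF_ENABLED_##s |
static const unsigned long LOCKF_ENABLED_IRQ = LOCK_STATES(LOCKDEP_STATE) 0;
#undef LOCKDEP_STATE

#define LOCKDEP_STATE(s) LOCKF_USED_IN_##s |
static const unsigned long LOCKF_USED_IN_IRQ = LOCK_STATES(LOCKDEP_STATE) 0;
#undef LOCKDEP_STATE

int main(void)
{
        printf("LOCKF_ENABLED_IRQ = 0x%lx, LOCKF_USED_IN_IRQ = 0x%lx\n",
               LOCKF_ENABLED_IRQ, LOCKF_USED_IN_IRQ);
        return 0;
}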
53/* 75/*
54 * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, 76 * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ad40a2617063..80a463d31a8d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -829,7 +829,9 @@ static void lock_torture_cleanup(void)
829 "End of test: SUCCESS"); 829 "End of test: SUCCESS");
830 830
831 kfree(cxt.lwsa); 831 kfree(cxt.lwsa);
832 cxt.lwsa = NULL;
832 kfree(cxt.lrsa); 833 kfree(cxt.lrsa);
834 cxt.lrsa = NULL;
833 835
834end: 836end:
835 torture_cleanup_end(); 837 torture_cleanup_end();
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 883cf1b92d90..f17dad99eec8 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,6 +7,8 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9 9
10#include "rwsem.h"
11
10int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, 12int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
11 const char *name, struct lock_class_key *rwsem_key) 13 const char *name, struct lock_class_key *rwsem_key)
12{ 14{
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 5e9247dc2515..e14b32c69639 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -395,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
395 * 0,1,0 -> 0,0,1 395 * 0,1,0 -> 0,0,1
396 */ 396 */
397 clear_pending_set_locked(lock); 397 clear_pending_set_locked(lock);
398 qstat_inc(qstat_lock_pending, true); 398 lockevent_inc(lock_pending);
399 return; 399 return;
400 400
401 /* 401 /*
@@ -403,7 +403,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
403 * queuing. 403 * queuing.
404 */ 404 */
405queue: 405queue:
406 qstat_inc(qstat_lock_slowpath, true); 406 lockevent_inc(lock_slowpath);
407pv_queue: 407pv_queue:
408 node = this_cpu_ptr(&qnodes[0].mcs); 408 node = this_cpu_ptr(&qnodes[0].mcs);
409 idx = node->count++; 409 idx = node->count++;
@@ -419,7 +419,7 @@ pv_queue:
419 * simple enough. 419 * simple enough.
420 */ 420 */
421 if (unlikely(idx >= MAX_NODES)) { 421 if (unlikely(idx >= MAX_NODES)) {
422 qstat_inc(qstat_lock_no_node, true); 422 lockevent_inc(lock_no_node);
423 while (!queued_spin_trylock(lock)) 423 while (!queued_spin_trylock(lock))
424 cpu_relax(); 424 cpu_relax();
425 goto release; 425 goto release;
@@ -430,7 +430,7 @@ pv_queue:
430 /* 430 /*
431 * Keep counts of non-zero index values: 431 * Keep counts of non-zero index values:
432 */ 432 */
433 qstat_inc(qstat_lock_use_node2 + idx - 1, idx); 433 lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
434 434
435 /* 435 /*
436 * Ensure that we increment the head node->count before initialising 436 * Ensure that we increment the head node->count before initialising
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 8f36c27c1794..89bab079e7a4 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
89 89
90 if (!(val & _Q_LOCKED_PENDING_MASK) && 90 if (!(val & _Q_LOCKED_PENDING_MASK) &&
91 (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { 91 (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
92 qstat_inc(qstat_pv_lock_stealing, true); 92 lockevent_inc(pv_lock_stealing);
93 return true; 93 return true;
94 } 94 }
95 if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) 95 if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
@@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
219 hopcnt++; 219 hopcnt++;
220 if (!cmpxchg(&he->lock, NULL, lock)) { 220 if (!cmpxchg(&he->lock, NULL, lock)) {
221 WRITE_ONCE(he->node, node); 221 WRITE_ONCE(he->node, node);
222 qstat_hop(hopcnt); 222 lockevent_pv_hop(hopcnt);
223 return &he->lock; 223 return &he->lock;
224 } 224 }
225 } 225 }
@@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
320 smp_store_mb(pn->state, vcpu_halted); 320 smp_store_mb(pn->state, vcpu_halted);
321 321
322 if (!READ_ONCE(node->locked)) { 322 if (!READ_ONCE(node->locked)) {
323 qstat_inc(qstat_pv_wait_node, true); 323 lockevent_inc(pv_wait_node);
324 qstat_inc(qstat_pv_wait_early, wait_early); 324 lockevent_cond_inc(pv_wait_early, wait_early);
325 pv_wait(&pn->state, vcpu_halted); 325 pv_wait(&pn->state, vcpu_halted);
326 } 326 }
327 327
@@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
339 * So it is better to spin for a while in the hope that the 339 * So it is better to spin for a while in the hope that the
340 * MCS lock will be released soon. 340 * MCS lock will be released soon.
341 */ 341 */
342 qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); 342 lockevent_cond_inc(pv_spurious_wakeup,
343 !READ_ONCE(node->locked));
343 } 344 }
344 345
345 /* 346 /*
@@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
416 /* 417 /*
417 * Tracking # of slowpath locking operations 418 * Tracking # of slowpath locking operations
418 */ 419 */
419 qstat_inc(qstat_lock_slowpath, true); 420 lockevent_inc(lock_slowpath);
420 421
421 for (;; waitcnt++) { 422 for (;; waitcnt++) {
422 /* 423 /*
@@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
464 } 465 }
465 } 466 }
466 WRITE_ONCE(pn->state, vcpu_hashed); 467 WRITE_ONCE(pn->state, vcpu_hashed);
467 qstat_inc(qstat_pv_wait_head, true); 468 lockevent_inc(pv_wait_head);
468 qstat_inc(qstat_pv_wait_again, waitcnt); 469 lockevent_cond_inc(pv_wait_again, waitcnt);
469 pv_wait(&lock->locked, _Q_SLOW_VAL); 470 pv_wait(&lock->locked, _Q_SLOW_VAL);
470 471
471 /* 472 /*
@@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
528 * vCPU is harmless other than the additional latency in completing 529 * vCPU is harmless other than the additional latency in completing
529 * the unlock. 530 * the unlock.
530 */ 531 */
531 qstat_inc(qstat_pv_kick_unlock, true); 532 lockevent_inc(pv_kick_unlock);
532 pv_kick(node->cpu); 533 pv_kick(node->cpu);
533} 534}
534 535
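The conversions in this file follow a mechanical rule: qstat_inc(ev, true) becomes lockevent_inc(ev) and qstat_inc(ev, cond) becomes lockevent_cond_inc(ev, cond). A rough user-space model of what those helpers are assumed to boil down to, a per-CPU counter increment flattened here to a plain array (the real macros compile away entirely when CONFIG_LOCK_EVENT_COUNTS is off):

#include <stdio.h>

/* Hypothetical event ids and a flat array standing in for per-CPU counters. */
enum lockevent { LOCKEVENT_lock_pending, LOCKEVENT_pv_wait_early, LOCKEVENT_num };
static unsigned long lockevents[LOCKEVENT_num];

/* Unconditional increment: replaces qstat_inc(ev, true). */
#define lockevent_inc(ev)            (lockevents[LOCKEVENT_##ev]++)
/* Conditional increment: replaces qstat_inc(ev, cond). */
#define lockevent_cond_inc(ev, cond) do { if (cond) lockevent_inc(ev); } while (0)

int main(void)
{
        int wait_early = 0;

        lockevent_inc(lock_pending);                   /* always counted      */
        lockevent_cond_inc(pv_wait_early, wait_early); /* counted only if set */

        printf("lock_pending=%lu pv_wait_early=%lu\n",
               lockevents[LOCKEVENT_lock_pending],
               lockevents[LOCKEVENT_pv_wait_early]);
        return 0;
}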
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index d73f85388d5c..54152670ff24 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -9,262 +9,105 @@
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details. 10 * GNU General Public License for more details.
11 * 11 *
12 * Authors: Waiman Long <waiman.long@hpe.com> 12 * Authors: Waiman Long <longman@redhat.com>
13 */ 13 */
14 14
15/* 15#include "lock_events.h"
16 * When queued spinlock statistical counters are enabled, the following
17 * debugfs files will be created for reporting the counter values:
18 *
19 * <debugfs>/qlockstat/
20 * pv_hash_hops - average # of hops per hashing operation
21 * pv_kick_unlock - # of vCPU kicks issued at unlock time
22 * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
23 * pv_latency_kick - average latency (ns) of vCPU kick operation
24 * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
25 * pv_lock_stealing - # of lock stealing operations
26 * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
27 * pv_wait_again - # of wait's after a queue head vCPU kick
28 * pv_wait_early - # of early vCPU wait's
29 * pv_wait_head - # of vCPU wait's at the queue head
30 * pv_wait_node - # of vCPU wait's at a non-head queue node
31 * lock_pending - # of locking operations via pending code
32 * lock_slowpath - # of locking operations via MCS lock queue
33 * lock_use_node2 - # of locking operations that use 2nd per-CPU node
34 * lock_use_node3 - # of locking operations that use 3rd per-CPU node
35 * lock_use_node4 - # of locking operations that use 4th per-CPU node
36 * lock_no_node - # of locking operations without using per-CPU node
37 *
38 * Subtracting lock_use_node[234] from lock_slowpath will give you
39 * lock_use_node1.
40 *
41 * Writing to the "reset_counters" file will reset all the above counter
42 * values.
43 *
44 * These statistical counters are implemented as per-cpu variables which are
45 * summed and computed whenever the corresponding debugfs files are read. This
46 * minimizes added overhead making the counters usable even in a production
47 * environment.
48 *
49 * There may be slight difference between pv_kick_wake and pv_kick_unlock.
50 */
51enum qlock_stats {
52 qstat_pv_hash_hops,
53 qstat_pv_kick_unlock,
54 qstat_pv_kick_wake,
55 qstat_pv_latency_kick,
56 qstat_pv_latency_wake,
57 qstat_pv_lock_stealing,
58 qstat_pv_spurious_wakeup,
59 qstat_pv_wait_again,
60 qstat_pv_wait_early,
61 qstat_pv_wait_head,
62 qstat_pv_wait_node,
63 qstat_lock_pending,
64 qstat_lock_slowpath,
65 qstat_lock_use_node2,
66 qstat_lock_use_node3,
67 qstat_lock_use_node4,
68 qstat_lock_no_node,
69 qstat_num, /* Total number of statistical counters */
70 qstat_reset_cnts = qstat_num,
71};
72 16
73#ifdef CONFIG_QUEUED_LOCK_STAT 17#ifdef CONFIG_LOCK_EVENT_COUNTS
18#ifdef CONFIG_PARAVIRT_SPINLOCKS
74/* 19/*
75 * Collect pvqspinlock statistics 20 * Collect pvqspinlock locking event counts
76 */ 21 */
77#include <linux/debugfs.h>
78#include <linux/sched.h> 22#include <linux/sched.h>
79#include <linux/sched/clock.h> 23#include <linux/sched/clock.h>
80#include <linux/fs.h> 24#include <linux/fs.h>
81 25
82static const char * const qstat_names[qstat_num + 1] = { 26#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev]
83 [qstat_pv_hash_hops] = "pv_hash_hops",
84 [qstat_pv_kick_unlock] = "pv_kick_unlock",
85 [qstat_pv_kick_wake] = "pv_kick_wake",
86 [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
87 [qstat_pv_latency_kick] = "pv_latency_kick",
88 [qstat_pv_latency_wake] = "pv_latency_wake",
89 [qstat_pv_lock_stealing] = "pv_lock_stealing",
90 [qstat_pv_wait_again] = "pv_wait_again",
91 [qstat_pv_wait_early] = "pv_wait_early",
92 [qstat_pv_wait_head] = "pv_wait_head",
93 [qstat_pv_wait_node] = "pv_wait_node",
94 [qstat_lock_pending] = "lock_pending",
95 [qstat_lock_slowpath] = "lock_slowpath",
96 [qstat_lock_use_node2] = "lock_use_node2",
97 [qstat_lock_use_node3] = "lock_use_node3",
98 [qstat_lock_use_node4] = "lock_use_node4",
99 [qstat_lock_no_node] = "lock_no_node",
100 [qstat_reset_cnts] = "reset_counters",
101};
102 27
103/* 28/*
104 * Per-cpu counters 29 * PV specific per-cpu counter
105 */ 30 */
106static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
107static DEFINE_PER_CPU(u64, pv_kick_time); 31static DEFINE_PER_CPU(u64, pv_kick_time);
108 32
109/* 33/*
110 * Function to read and return the qlock statistical counter values 34 * Function to read and return the PV qspinlock counts.
111 * 35 *
112 * The following counters are handled specially: 36 * The following counters are handled specially:
113 * 1. qstat_pv_latency_kick 37 * 1. pv_latency_kick
114 * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock 38 * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
115 * 2. qstat_pv_latency_wake 39 * 2. pv_latency_wake
116 * Average wake latency (ns) = pv_latency_wake/pv_kick_wake 40 * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
117 * 3. qstat_pv_hash_hops 41 * 3. pv_hash_hops
118 * Average hops/hash = pv_hash_hops/pv_kick_unlock 42 * Average hops/hash = pv_hash_hops/pv_kick_unlock
119 */ 43 */
120static ssize_t qstat_read(struct file *file, char __user *user_buf, 44ssize_t lockevent_read(struct file *file, char __user *user_buf,
121 size_t count, loff_t *ppos) 45 size_t count, loff_t *ppos)
122{ 46{
123 char buf[64]; 47 char buf[64];
124 int cpu, counter, len; 48 int cpu, id, len;
125 u64 stat = 0, kicks = 0; 49 u64 sum = 0, kicks = 0;
126 50
127 /* 51 /*
128 * Get the counter ID stored in file->f_inode->i_private 52 * Get the counter ID stored in file->f_inode->i_private
129 */ 53 */
130 counter = (long)file_inode(file)->i_private; 54 id = (long)file_inode(file)->i_private;
131 55
132 if (counter >= qstat_num) 56 if (id >= lockevent_num)
133 return -EBADF; 57 return -EBADF;
134 58
135 for_each_possible_cpu(cpu) { 59 for_each_possible_cpu(cpu) {
136 stat += per_cpu(qstats[counter], cpu); 60 sum += per_cpu(lockevents[id], cpu);
137 /* 61 /*
138 * Need to sum additional counter for some of them 62 * Need to sum additional counters for some of them
139 */ 63 */
140 switch (counter) { 64 switch (id) {
141 65
142 case qstat_pv_latency_kick: 66 case LOCKEVENT_pv_latency_kick:
143 case qstat_pv_hash_hops: 67 case LOCKEVENT_pv_hash_hops:
144 kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); 68 kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);
145 break; 69 break;
146 70
147 case qstat_pv_latency_wake: 71 case LOCKEVENT_pv_latency_wake:
148 kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); 72 kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);
149 break; 73 break;
150 } 74 }
151 } 75 }
152 76
153 if (counter == qstat_pv_hash_hops) { 77 if (id == LOCKEVENT_pv_hash_hops) {
154 u64 frac = 0; 78 u64 frac = 0;
155 79
156 if (kicks) { 80 if (kicks) {
157 frac = 100ULL * do_div(stat, kicks); 81 frac = 100ULL * do_div(sum, kicks);
158 frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); 82 frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
159 } 83 }
160 84
161 /* 85 /*
162 * Return a X.XX decimal number 86 * Return a X.XX decimal number
163 */ 87 */
164 len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); 88 len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n",
89 sum, frac);
165 } else { 90 } else {
166 /* 91 /*
167 * Round to the nearest ns 92 * Round to the nearest ns
168 */ 93 */
169 if ((counter == qstat_pv_latency_kick) || 94 if ((id == LOCKEVENT_pv_latency_kick) ||
170 (counter == qstat_pv_latency_wake)) { 95 (id == LOCKEVENT_pv_latency_wake)) {
171 if (kicks) 96 if (kicks)
172 stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); 97 sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);
173 } 98 }
174 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); 99 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
175 } 100 }
176 101
177 return simple_read_from_buffer(user_buf, count, ppos, buf, len); 102 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
178} 103}
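The pv_hash_hops case prints an average with two decimals using integer arithmetic only: do_div() leaves the quotient in sum and returns the remainder, which is then scaled by 100 and rounded. A user-space equivalent, with the kernel helpers do_div() and DIV_ROUND_CLOSEST_ULL() spelled out by hand:

#include <stdio.h>
#include <inttypes.h>

/* Print sum/kicks as "X.XX" with plain 64-bit arithmetic, mirroring the
 * do_div() + DIV_ROUND_CLOSEST_ULL() sequence in lockevent_read(). */
static void print_avg(uint64_t sum, uint64_t kicks)
{
        uint64_t frac = 0;

        if (kicks) {
                frac = 100 * (sum % kicks);         /* remainder, scaled */
                frac = (frac + kicks / 2) / kicks;  /* round to nearest  */
                sum  = sum / kicks;                 /* integer quotient  */
        }
        printf("%" PRIu64 ".%02" PRIu64 "\n", sum, frac);
}

int main(void)
{
        print_avg(257, 100);    /* prints 2.57 */
        print_avg(10, 3);       /* prints 3.33 */
        print_avg(5, 0);        /* prints 5.00: no kicks recorded yet */
        return 0;
}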
179 104
180/* 105/*
181 * Function to handle write request
182 *
183 * When counter = reset_cnts, reset all the counter values.
184 * Since the counter updates aren't atomic, the resetting is done twice
185 * to make sure that the counters are very likely to be all cleared.
186 */
187static ssize_t qstat_write(struct file *file, const char __user *user_buf,
188 size_t count, loff_t *ppos)
189{
190 int cpu;
191
192 /*
193 * Get the counter ID stored in file->f_inode->i_private
194 */
195 if ((long)file_inode(file)->i_private != qstat_reset_cnts)
196 return count;
197
198 for_each_possible_cpu(cpu) {
199 int i;
200 unsigned long *ptr = per_cpu_ptr(qstats, cpu);
201
202 for (i = 0 ; i < qstat_num; i++)
203 WRITE_ONCE(ptr[i], 0);
204 }
205 return count;
206}
207
208/*
209 * Debugfs data structures
210 */
211static const struct file_operations fops_qstat = {
212 .read = qstat_read,
213 .write = qstat_write,
214 .llseek = default_llseek,
215};
216
217/*
218 * Initialize debugfs for the qspinlock statistical counters
219 */
220static int __init init_qspinlock_stat(void)
221{
222 struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
223 int i;
224
225 if (!d_qstat)
226 goto out;
227
228 /*
229 * Create the debugfs files
230 *
231 * As reading from and writing to the stat files can be slow, only
232 * root is allowed to do the read/write to limit impact to system
233 * performance.
234 */
235 for (i = 0; i < qstat_num; i++)
236 if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
237 (void *)(long)i, &fops_qstat))
238 goto fail_undo;
239
240 if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
241 (void *)(long)qstat_reset_cnts, &fops_qstat))
242 goto fail_undo;
243
244 return 0;
245fail_undo:
246 debugfs_remove_recursive(d_qstat);
247out:
248 pr_warn("Could not create 'qlockstat' debugfs entries\n");
249 return -ENOMEM;
250}
251fs_initcall(init_qspinlock_stat);
252
253/*
254 * Increment the PV qspinlock statistical counters
255 */
256static inline void qstat_inc(enum qlock_stats stat, bool cond)
257{
258 if (cond)
259 this_cpu_inc(qstats[stat]);
260}
261
262/*
263 * PV hash hop count 106 * PV hash hop count
264 */ 107 */
265static inline void qstat_hop(int hopcnt) 108static inline void lockevent_pv_hop(int hopcnt)
266{ 109{
267 this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); 110 this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);
268} 111}
269 112
270/* 113/*
@@ -276,7 +119,7 @@ static inline void __pv_kick(int cpu)
276 119
277 per_cpu(pv_kick_time, cpu) = start; 120 per_cpu(pv_kick_time, cpu) = start;
278 pv_kick(cpu); 121 pv_kick(cpu);
279 this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); 122 this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);
280} 123}
281 124
282/* 125/*
@@ -289,18 +132,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
289 *pkick_time = 0; 132 *pkick_time = 0;
290 pv_wait(ptr, val); 133 pv_wait(ptr, val);
291 if (*pkick_time) { 134 if (*pkick_time) {
292 this_cpu_add(qstats[qstat_pv_latency_wake], 135 this_cpu_add(EVENT_COUNT(pv_latency_wake),
293 sched_clock() - *pkick_time); 136 sched_clock() - *pkick_time);
294 qstat_inc(qstat_pv_kick_wake, true); 137 lockevent_inc(pv_kick_wake);
295 } 138 }
296} 139}
297 140
298#define pv_kick(c) __pv_kick(c) 141#define pv_kick(c) __pv_kick(c)
299#define pv_wait(p, v) __pv_wait(p, v) 142#define pv_wait(p, v) __pv_wait(p, v)
300 143
301#else /* CONFIG_QUEUED_LOCK_STAT */ 144#endif /* CONFIG_PARAVIRT_SPINLOCKS */
145
146#else /* CONFIG_LOCK_EVENT_COUNTS */
302 147
303static inline void qstat_inc(enum qlock_stats stat, bool cond) { } 148static inline void lockevent_pv_hop(int hopcnt) { }
304static inline void qstat_hop(int hopcnt) { }
305 149
306#endif /* CONFIG_QUEUED_LOCK_STAT */ 150#endif /* CONFIG_LOCK_EVENT_COUNTS */
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
deleted file mode 100644
index a7ffb2a96ede..000000000000
--- a/kernel/locking/rwsem-spinlock.c
+++ /dev/null
@@ -1,339 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
3 * generic spinlock implementation
4 *
5 * Copyright (c) 2001 David Howells (dhowells@redhat.com).
6 * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
7 * - Derived also from comments by Linus
8 */
9#include <linux/rwsem.h>
10#include <linux/sched/signal.h>
11#include <linux/sched/debug.h>
12#include <linux/export.h>
13
14enum rwsem_waiter_type {
15 RWSEM_WAITING_FOR_WRITE,
16 RWSEM_WAITING_FOR_READ
17};
18
19struct rwsem_waiter {
20 struct list_head list;
21 struct task_struct *task;
22 enum rwsem_waiter_type type;
23};
24
25int rwsem_is_locked(struct rw_semaphore *sem)
26{
27 int ret = 1;
28 unsigned long flags;
29
30 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
31 ret = (sem->count != 0);
32 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
33 }
34 return ret;
35}
36EXPORT_SYMBOL(rwsem_is_locked);
37
38/*
39 * initialise the semaphore
40 */
41void __init_rwsem(struct rw_semaphore *sem, const char *name,
42 struct lock_class_key *key)
43{
44#ifdef CONFIG_DEBUG_LOCK_ALLOC
45 /*
46 * Make sure we are not reinitializing a held semaphore:
47 */
48 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
49 lockdep_init_map(&sem->dep_map, name, key, 0);
50#endif
51 sem->count = 0;
52 raw_spin_lock_init(&sem->wait_lock);
53 INIT_LIST_HEAD(&sem->wait_list);
54}
55EXPORT_SYMBOL(__init_rwsem);
56
57/*
58 * handle the lock release when processes blocked on it that can now run
59 * - if we come here, then:
60 * - the 'active count' _reached_ zero
61 * - the 'waiting count' is non-zero
62 * - the spinlock must be held by the caller
63 * - woken process blocks are discarded from the list after having task zeroed
64 * - writers are only woken if wakewrite is non-zero
65 */
66static inline struct rw_semaphore *
67__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
68{
69 struct rwsem_waiter *waiter;
70 struct task_struct *tsk;
71 int woken;
72
73 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
74
75 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
76 if (wakewrite)
77 /* Wake up a writer. Note that we do not grant it the
78 * lock - it will have to acquire it when it runs. */
79 wake_up_process(waiter->task);
80 goto out;
81 }
82
83 /* grant an infinite number of read locks to the front of the queue */
84 woken = 0;
85 do {
86 struct list_head *next = waiter->list.next;
87
88 list_del(&waiter->list);
89 tsk = waiter->task;
90 /*
91 * Make sure we do not wakeup the next reader before
92 * setting the nil condition to grant the next reader;
93 * otherwise we could miss the wakeup on the other
94 * side and end up sleeping again. See the pairing
95 * in rwsem_down_read_failed().
96 */
97 smp_mb();
98 waiter->task = NULL;
99 wake_up_process(tsk);
100 put_task_struct(tsk);
101 woken++;
102 if (next == &sem->wait_list)
103 break;
104 waiter = list_entry(next, struct rwsem_waiter, list);
105 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
106
107 sem->count += woken;
108
109 out:
110 return sem;
111}
112
113/*
114 * wake a single writer
115 */
116static inline struct rw_semaphore *
117__rwsem_wake_one_writer(struct rw_semaphore *sem)
118{
119 struct rwsem_waiter *waiter;
120
121 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
122 wake_up_process(waiter->task);
123
124 return sem;
125}
126
127/*
128 * get a read lock on the semaphore
129 */
130int __sched __down_read_common(struct rw_semaphore *sem, int state)
131{
132 struct rwsem_waiter waiter;
133 unsigned long flags;
134
135 raw_spin_lock_irqsave(&sem->wait_lock, flags);
136
137 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
138 /* granted */
139 sem->count++;
140 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
141 goto out;
142 }
143
144 /* set up my own style of waitqueue */
145 waiter.task = current;
146 waiter.type = RWSEM_WAITING_FOR_READ;
147 get_task_struct(current);
148
149 list_add_tail(&waiter.list, &sem->wait_list);
150
151 /* wait to be given the lock */
152 for (;;) {
153 if (!waiter.task)
154 break;
155 if (signal_pending_state(state, current))
156 goto out_nolock;
157 set_current_state(state);
158 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
159 schedule();
160 raw_spin_lock_irqsave(&sem->wait_lock, flags);
161 }
162
163 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
164 out:
165 return 0;
166
167out_nolock:
168 /*
169 * We didn't take the lock, so that there is a writer, which
170 * is owner or the first waiter of the sem. If it's a waiter,
171 * it will be woken by current owner. Not need to wake anybody.
172 */
173 list_del(&waiter.list);
174 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
175 return -EINTR;
176}
177
178void __sched __down_read(struct rw_semaphore *sem)
179{
180 __down_read_common(sem, TASK_UNINTERRUPTIBLE);
181}
182
183int __sched __down_read_killable(struct rw_semaphore *sem)
184{
185 return __down_read_common(sem, TASK_KILLABLE);
186}
187
188/*
189 * trylock for reading -- returns 1 if successful, 0 if contention
190 */
191int __down_read_trylock(struct rw_semaphore *sem)
192{
193 unsigned long flags;
194 int ret = 0;
195
196
197 raw_spin_lock_irqsave(&sem->wait_lock, flags);
198
199 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
200 /* granted */
201 sem->count++;
202 ret = 1;
203 }
204
205 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
206
207 return ret;
208}
209
210/*
211 * get a write lock on the semaphore
212 */
213int __sched __down_write_common(struct rw_semaphore *sem, int state)
214{
215 struct rwsem_waiter waiter;
216 unsigned long flags;
217 int ret = 0;
218
219 raw_spin_lock_irqsave(&sem->wait_lock, flags);
220
221 /* set up my own style of waitqueue */
222 waiter.task = current;
223 waiter.type = RWSEM_WAITING_FOR_WRITE;
224 list_add_tail(&waiter.list, &sem->wait_list);
225
226 /* wait for someone to release the lock */
227 for (;;) {
228 /*
229 * That is the key to support write lock stealing: allows the
230 * task already on CPU to get the lock soon rather than put
231 * itself into sleep and waiting for system woke it or someone
232 * else in the head of the wait list up.
233 */
234 if (sem->count == 0)
235 break;
236 if (signal_pending_state(state, current))
237 goto out_nolock;
238
239 set_current_state(state);
240 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
241 schedule();
242 raw_spin_lock_irqsave(&sem->wait_lock, flags);
243 }
244 /* got the lock */
245 sem->count = -1;
246 list_del(&waiter.list);
247
248 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
249
250 return ret;
251
252out_nolock:
253 list_del(&waiter.list);
254 if (!list_empty(&sem->wait_list) && sem->count >= 0)
255 __rwsem_do_wake(sem, 0);
256 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
257
258 return -EINTR;
259}
260
261void __sched __down_write(struct rw_semaphore *sem)
262{
263 __down_write_common(sem, TASK_UNINTERRUPTIBLE);
264}
265
266int __sched __down_write_killable(struct rw_semaphore *sem)
267{
268 return __down_write_common(sem, TASK_KILLABLE);
269}
270
271/*
272 * trylock for writing -- returns 1 if successful, 0 if contention
273 */
274int __down_write_trylock(struct rw_semaphore *sem)
275{
276 unsigned long flags;
277 int ret = 0;
278
279 raw_spin_lock_irqsave(&sem->wait_lock, flags);
280
281 if (sem->count == 0) {
282 /* got the lock */
283 sem->count = -1;
284 ret = 1;
285 }
286
287 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
288
289 return ret;
290}
291
292/*
293 * release a read lock on the semaphore
294 */
295void __up_read(struct rw_semaphore *sem)
296{
297 unsigned long flags;
298
299 raw_spin_lock_irqsave(&sem->wait_lock, flags);
300
301 if (--sem->count == 0 && !list_empty(&sem->wait_list))
302 sem = __rwsem_wake_one_writer(sem);
303
304 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
305}
306
307/*
308 * release a write lock on the semaphore
309 */
310void __up_write(struct rw_semaphore *sem)
311{
312 unsigned long flags;
313
314 raw_spin_lock_irqsave(&sem->wait_lock, flags);
315
316 sem->count = 0;
317 if (!list_empty(&sem->wait_list))
318 sem = __rwsem_do_wake(sem, 1);
319
320 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
321}
322
323/*
324 * downgrade a write lock into a read lock
325 * - just wake up any readers at the front of the queue
326 */
327void __downgrade_write(struct rw_semaphore *sem)
328{
329 unsigned long flags;
330
331 raw_spin_lock_irqsave(&sem->wait_lock, flags);
332
333 sem->count = 1;
334 if (!list_empty(&sem->wait_list))
335 sem = __rwsem_do_wake(sem, 0);
336
337 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
338}
339
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index fbe96341beee..6b3ee9948bf1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
147 * will notice the queued writer. 147 * will notice the queued writer.
148 */ 148 */
149 wake_q_add(wake_q, waiter->task); 149 wake_q_add(wake_q, waiter->task);
150 lockevent_inc(rwsem_wake_writer);
150 } 151 }
151 152
152 return; 153 return;
@@ -176,9 +177,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
176 goto try_reader_grant; 177 goto try_reader_grant;
177 } 178 }
178 /* 179 /*
179 * It is not really necessary to set it to reader-owned here, 180 * Set it to reader-owned to give spinners an early
180 * but it gives the spinners an early indication that the 181 * indication that readers now have the lock.
181 * readers now have the lock.
182 */ 182 */
183 __rwsem_set_reader_owned(sem, waiter->task); 183 __rwsem_set_reader_owned(sem, waiter->task);
184 } 184 }
@@ -215,6 +215,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
215 } 215 }
216 216
217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; 217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
218 lockevent_cond_inc(rwsem_wake_reader, woken);
218 if (list_empty(&sem->wait_list)) { 219 if (list_empty(&sem->wait_list)) {
219 /* hit end of list above */ 220 /* hit end of list above */
220 adjustment -= RWSEM_WAITING_BIAS; 221 adjustment -= RWSEM_WAITING_BIAS;
@@ -225,92 +226,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
225} 226}
226 227
227/* 228/*
228 * Wait for the read lock to be granted
229 */
230static inline struct rw_semaphore __sched *
231__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
232{
233 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
234 struct rwsem_waiter waiter;
235 DEFINE_WAKE_Q(wake_q);
236
237 waiter.task = current;
238 waiter.type = RWSEM_WAITING_FOR_READ;
239
240 raw_spin_lock_irq(&sem->wait_lock);
241 if (list_empty(&sem->wait_list)) {
242 /*
243 * In case the wait queue is empty and the lock isn't owned
244 * by a writer, this reader can exit the slowpath and return
245 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
246 * been set in the count.
247 */
248 if (atomic_long_read(&sem->count) >= 0) {
249 raw_spin_unlock_irq(&sem->wait_lock);
250 return sem;
251 }
252 adjustment += RWSEM_WAITING_BIAS;
253 }
254 list_add_tail(&waiter.list, &sem->wait_list);
255
256 /* we're now waiting on the lock, but no longer actively locking */
257 count = atomic_long_add_return(adjustment, &sem->count);
258
259 /*
260 * If there are no active locks, wake the front queued process(es).
261 *
262 * If there are no writers and we are first in the queue,
263 * wake our own waiter to join the existing active readers !
264 */
265 if (count == RWSEM_WAITING_BIAS ||
266 (count > RWSEM_WAITING_BIAS &&
267 adjustment != -RWSEM_ACTIVE_READ_BIAS))
268 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
269
270 raw_spin_unlock_irq(&sem->wait_lock);
271 wake_up_q(&wake_q);
272
273 /* wait to be given the lock */
274 while (true) {
275 set_current_state(state);
276 if (!waiter.task)
277 break;
278 if (signal_pending_state(state, current)) {
279 raw_spin_lock_irq(&sem->wait_lock);
280 if (waiter.task)
281 goto out_nolock;
282 raw_spin_unlock_irq(&sem->wait_lock);
283 break;
284 }
285 schedule();
286 }
287
288 __set_current_state(TASK_RUNNING);
289 return sem;
290out_nolock:
291 list_del(&waiter.list);
292 if (list_empty(&sem->wait_list))
293 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
294 raw_spin_unlock_irq(&sem->wait_lock);
295 __set_current_state(TASK_RUNNING);
296 return ERR_PTR(-EINTR);
297}
298
299__visible struct rw_semaphore * __sched
300rwsem_down_read_failed(struct rw_semaphore *sem)
301{
302 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
303}
304EXPORT_SYMBOL(rwsem_down_read_failed);
305
306__visible struct rw_semaphore * __sched
307rwsem_down_read_failed_killable(struct rw_semaphore *sem)
308{
309 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
310}
311EXPORT_SYMBOL(rwsem_down_read_failed_killable);
312
313/*
314 * This function must be called with the sem->wait_lock held to prevent 229 * This function must be called with the sem->wait_lock held to prevent
315 * race conditions between checking the rwsem wait list and setting the 230 * race conditions between checking the rwsem wait list and setting the
316 * sem->count accordingly. 231 * sem->count accordingly.
@@ -346,21 +261,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
346 */ 261 */
347static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) 262static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
348{ 263{
349 long old, count = atomic_long_read(&sem->count); 264 long count = atomic_long_read(&sem->count);
350
351 while (true) {
352 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
353 return false;
354 265
355 old = atomic_long_cmpxchg_acquire(&sem->count, count, 266 while (!count || count == RWSEM_WAITING_BIAS) {
356 count + RWSEM_ACTIVE_WRITE_BIAS); 267 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
357 if (old == count) { 268 count + RWSEM_ACTIVE_WRITE_BIAS)) {
358 rwsem_set_owner(sem); 269 rwsem_set_owner(sem);
270 lockevent_inc(rwsem_opt_wlock);
359 return true; 271 return true;
360 } 272 }
361
362 count = old;
363 } 273 }
274 return false;
364} 275}
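The rewritten loop uses the try_cmpxchg idiom: on failure the primitive writes the current value back into the expected variable, so the explicit old/count bookkeeping disappears. A standalone sketch of the same loop shape with C11 atomics (atomic_compare_exchange_strong updates the expected value on failure just like atomic_long_try_cmpxchg_acquire; seq_cst ordering and placeholder bias values are used for brevity):

#include <stdatomic.h>
#include <stdio.h>
#include <stdbool.h>

#define WAITING_BIAS (-65536L)  /* placeholder bias values         */
#define WRITE_BIAS   (-65535L)  /* waiting bias plus one active    */

static atomic_long count;

/* Take the "write lock" only while the count reads as free (0) or as
 * having only waiters (WAITING_BIAS). */
static bool try_write_lock(void)
{
        long c = atomic_load(&count);

        while (!c || c == WAITING_BIAS) {
                /* On failure, c is refreshed with the current value. */
                if (atomic_compare_exchange_strong(&count, &c, c + WRITE_BIAS))
                        return true;
        }
        return false;
}

int main(void)
{
        atomic_init(&count, 0);
        printf("first try:  %s\n", try_write_lock() ? "locked" : "failed");
        printf("second try: %s\n", try_write_lock() ? "locked" : "failed");
        return 0;
}

Built with any C11 compiler, the first call succeeds and the second fails because the count no longer reads as free or waiters-only.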
365 276
366static inline bool owner_on_cpu(struct task_struct *owner) 277static inline bool owner_on_cpu(struct task_struct *owner)
@@ -481,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
481 osq_unlock(&sem->osq); 392 osq_unlock(&sem->osq);
482done: 393done:
483 preempt_enable(); 394 preempt_enable();
395 lockevent_cond_inc(rwsem_opt_fail, !taken);
484 return taken; 396 return taken;
485} 397}
486 398
@@ -505,6 +417,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
505#endif 417#endif
506 418
507/* 419/*
420 * Wait for the read lock to be granted
421 */
422static inline struct rw_semaphore __sched *
423__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
424{
425 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
426 struct rwsem_waiter waiter;
427 DEFINE_WAKE_Q(wake_q);
428
429 waiter.task = current;
430 waiter.type = RWSEM_WAITING_FOR_READ;
431
432 raw_spin_lock_irq(&sem->wait_lock);
433 if (list_empty(&sem->wait_list)) {
434 /*
435 * In case the wait queue is empty and the lock isn't owned
436 * by a writer, this reader can exit the slowpath and return
437 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
438 * been set in the count.
439 */
440 if (atomic_long_read(&sem->count) >= 0) {
441 raw_spin_unlock_irq(&sem->wait_lock);
442 rwsem_set_reader_owned(sem);
443 lockevent_inc(rwsem_rlock_fast);
444 return sem;
445 }
446 adjustment += RWSEM_WAITING_BIAS;
447 }
448 list_add_tail(&waiter.list, &sem->wait_list);
449
450 /* we're now waiting on the lock, but no longer actively locking */
451 count = atomic_long_add_return(adjustment, &sem->count);
452
453 /*
454 * If there are no active locks, wake the front queued process(es).
455 *
456 * If there are no writers and we are first in the queue,
457 * wake our own waiter to join the existing active readers !
458 */
459 if (count == RWSEM_WAITING_BIAS ||
460 (count > RWSEM_WAITING_BIAS &&
461 adjustment != -RWSEM_ACTIVE_READ_BIAS))
462 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
463
464 raw_spin_unlock_irq(&sem->wait_lock);
465 wake_up_q(&wake_q);
466
467 /* wait to be given the lock */
468 while (true) {
469 set_current_state(state);
470 if (!waiter.task)
471 break;
472 if (signal_pending_state(state, current)) {
473 raw_spin_lock_irq(&sem->wait_lock);
474 if (waiter.task)
475 goto out_nolock;
476 raw_spin_unlock_irq(&sem->wait_lock);
477 break;
478 }
479 schedule();
480 lockevent_inc(rwsem_sleep_reader);
481 }
482
483 __set_current_state(TASK_RUNNING);
484 lockevent_inc(rwsem_rlock);
485 return sem;
486out_nolock:
487 list_del(&waiter.list);
488 if (list_empty(&sem->wait_list))
489 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
490 raw_spin_unlock_irq(&sem->wait_lock);
491 __set_current_state(TASK_RUNNING);
492 lockevent_inc(rwsem_rlock_fail);
493 return ERR_PTR(-EINTR);
494}
495
496__visible struct rw_semaphore * __sched
497rwsem_down_read_failed(struct rw_semaphore *sem)
498{
499 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
500}
501EXPORT_SYMBOL(rwsem_down_read_failed);
502
503__visible struct rw_semaphore * __sched
504rwsem_down_read_failed_killable(struct rw_semaphore *sem)
505{
506 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
507}
508EXPORT_SYMBOL(rwsem_down_read_failed_killable);
509
510/*
508 * Wait until we successfully acquire the write lock 511 * Wait until we successfully acquire the write lock
509 */ 512 */
510static inline struct rw_semaphore * 513static inline struct rw_semaphore *
@@ -580,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
580 goto out_nolock; 583 goto out_nolock;
581 584
582 schedule(); 585 schedule();
586 lockevent_inc(rwsem_sleep_writer);
583 set_current_state(state); 587 set_current_state(state);
584 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); 588 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
585 589
@@ -588,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
588 __set_current_state(TASK_RUNNING); 592 __set_current_state(TASK_RUNNING);
589 list_del(&waiter.list); 593 list_del(&waiter.list);
590 raw_spin_unlock_irq(&sem->wait_lock); 594 raw_spin_unlock_irq(&sem->wait_lock);
595 lockevent_inc(rwsem_wlock);
591 596
592 return ret; 597 return ret;
593 598
@@ -601,6 +606,7 @@ out_nolock:
601 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); 606 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
602 raw_spin_unlock_irq(&sem->wait_lock); 607 raw_spin_unlock_irq(&sem->wait_lock);
603 wake_up_q(&wake_q); 608 wake_up_q(&wake_q);
609 lockevent_inc(rwsem_wlock_fail);
604 610
605 return ERR_PTR(-EINTR); 611 return ERR_PTR(-EINTR);
606} 612}
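The rwsem-xadd slowpaths above are all bias arithmetic on sem->count: each active reader adds an active bias, a non-empty wait list adds the waiting bias once, and a writer holds the combined write bias. A short worked example with the 32-bit constants; RWSEM_ACTIVE_READ_BIAS and RWSEM_ACTIVE_WRITE_BIAS are not defined in this hunk, so the conventional definitions are assumed here.

#include <stdio.h>

/* 32-bit rwsem bias values; the *_READ/*_WRITE variants are assumed. */
#define ACTIVE_MASK       0x0000ffffL
#define ACTIVE_BIAS       0x00000001L
#define WAITING_BIAS      (-ACTIVE_MASK - 1)            /* -0x10000 */
#define ACTIVE_READ_BIAS  ACTIVE_BIAS
#define ACTIVE_WRITE_BIAS (WAITING_BIAS + ACTIVE_BIAS)  /* -0xffff  */

static void show(const char *what, long count)
{
        printf("%-35s count=%ld, active=%ld%s\n", what, count,
               count & ACTIVE_MASK,
               count < 0 ? ", waiters or writer present" : "");
}

int main(void)
{
        long count = 0;

        count += 2 * ACTIVE_READ_BIAS;  show("two readers", count);
        count += WAITING_BIAS;          show("two readers + queued writer", count);
        count -= 2 * ACTIVE_READ_BIAS;  show("readers gone, writer still queued", count);
        /* count == WAITING_BIAS is exactly what the wakeup check looks for. */
        return 0;
}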
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e586f0d03ad3..ccbf18f560ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem)
24 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 24 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
25 25
26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
27 rwsem_set_reader_owned(sem);
28} 27}
29 28
30EXPORT_SYMBOL(down_read); 29EXPORT_SYMBOL(down_read);
@@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)
39 return -EINTR; 38 return -EINTR;
40 } 39 }
41 40
42 rwsem_set_reader_owned(sem);
43 return 0; 41 return 0;
44} 42}
45 43
@@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem)
52{ 50{
53 int ret = __down_read_trylock(sem); 51 int ret = __down_read_trylock(sem);
54 52
55 if (ret == 1) { 53 if (ret == 1)
56 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); 54 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
57 rwsem_set_reader_owned(sem);
58 }
59 return ret; 55 return ret;
60} 56}
61 57
@@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem)
70 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 66 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
71 67
72 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 68 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
73 rwsem_set_owner(sem);
74} 69}
75 70
76EXPORT_SYMBOL(down_write); 71EXPORT_SYMBOL(down_write);
@@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem)
88 return -EINTR; 83 return -EINTR;
89 } 84 }
90 85
91 rwsem_set_owner(sem);
92 return 0; 86 return 0;
93} 87}
94 88
@@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem)
101{ 95{
102 int ret = __down_write_trylock(sem); 96 int ret = __down_write_trylock(sem);
103 97
104 if (ret == 1) { 98 if (ret == 1)
105 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); 99 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
106 rwsem_set_owner(sem);
107 }
108 100
109 return ret; 101 return ret;
110} 102}
@@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock);
117void up_read(struct rw_semaphore *sem) 109void up_read(struct rw_semaphore *sem)
118{ 110{
119 rwsem_release(&sem->dep_map, 1, _RET_IP_); 111 rwsem_release(&sem->dep_map, 1, _RET_IP_);
120 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));
121 112
122 rwsem_clear_reader_owned(sem);
123 __up_read(sem); 113 __up_read(sem);
124} 114}
125 115
@@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read);
131void up_write(struct rw_semaphore *sem) 121void up_write(struct rw_semaphore *sem)
132{ 122{
133 rwsem_release(&sem->dep_map, 1, _RET_IP_); 123 rwsem_release(&sem->dep_map, 1, _RET_IP_);
134 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
135 124
136 rwsem_clear_owner(sem);
137 __up_write(sem); 125 __up_write(sem);
138} 126}
139 127
@@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write);
145void downgrade_write(struct rw_semaphore *sem) 133void downgrade_write(struct rw_semaphore *sem)
146{ 134{
147 lock_downgrade(&sem->dep_map, _RET_IP_); 135 lock_downgrade(&sem->dep_map, _RET_IP_);
148 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
149 136
150 rwsem_set_reader_owned(sem);
151 __downgrade_write(sem); 137 __downgrade_write(sem);
152} 138}
153 139
@@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
161 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 147 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
162 148
163 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 149 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
164 rwsem_set_reader_owned(sem);
165} 150}
166 151
167EXPORT_SYMBOL(down_read_nested); 152EXPORT_SYMBOL(down_read_nested);
@@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
172 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 157 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
173 158
174 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 159 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
175 rwsem_set_owner(sem);
176} 160}
177 161
178EXPORT_SYMBOL(_down_write_nest_lock); 162EXPORT_SYMBOL(_down_write_nest_lock);
@@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
193 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 177 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
194 178
195 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 179 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
196 rwsem_set_owner(sem);
197} 180}
198 181
199EXPORT_SYMBOL(down_write_nested); 182EXPORT_SYMBOL(down_write_nested);
@@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
208 return -EINTR; 191 return -EINTR;
209 } 192 }
210 193
211 rwsem_set_owner(sem);
212 return 0; 194 return 0;
213} 195}
214 196
@@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested);
216 198
217void up_read_non_owner(struct rw_semaphore *sem) 199void up_read_non_owner(struct rw_semaphore *sem)
218{ 200{
219 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); 201 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
202 sem);
220 __up_read(sem); 203 __up_read(sem);
221} 204}
222 205
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index bad2bca0268b..64877f5294e3 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -23,15 +23,44 @@
23 * is involved. Ideally we would like to track all the readers that own 23 * is involved. Ideally we would like to track all the readers that own
24 * a rwsem, but the overhead is simply too big. 24 * a rwsem, but the overhead is simply too big.
25 */ 25 */
26#include "lock_events.h"
27
26#define RWSEM_READER_OWNED (1UL << 0) 28#define RWSEM_READER_OWNED (1UL << 0)
27#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) 29#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
28 30
29#ifdef CONFIG_DEBUG_RWSEMS 31#ifdef CONFIG_DEBUG_RWSEMS
30# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) 32# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
33 if (!debug_locks_silent && \
34 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
35 #c, atomic_long_read(&(sem)->count), \
36 (long)((sem)->owner), (long)current, \
37 list_empty(&(sem)->wait_list) ? "" : "not ")) \
38 debug_locks_off(); \
39 } while (0)
40#else
41# define DEBUG_RWSEMS_WARN_ON(c, sem)
42#endif
43
44/*
45 * R/W semaphores originally for PPC using the stuff in lib/rwsem.c.
46 * Adapted largely from include/asm-i386/rwsem.h
47 * by Paul Mackerras <paulus@samba.org>.
48 */
49
50/*
51 * the semaphore definition
52 */
53#ifdef CONFIG_64BIT
54# define RWSEM_ACTIVE_MASK 0xffffffffL
31#else 55#else
32# define DEBUG_RWSEMS_WARN_ON(c) 56# define RWSEM_ACTIVE_MASK 0x0000ffffL
33#endif 57#endif
34 58
59#define RWSEM_ACTIVE_BIAS 0x00000001L
60#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
61#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
62#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
63
35#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 64#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
36/* 65/*
37 * All writes to owner are protected by WRITE_ONCE() to make sure that 66 * All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
132{ 161{
133} 162}
134#endif 163#endif
164
165extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
166extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
167extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
168extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
169extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
170extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
171
172/*
173 * lock for reading
174 */
175static inline void __down_read(struct rw_semaphore *sem)
176{
177 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
178 rwsem_down_read_failed(sem);
179 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
180 RWSEM_READER_OWNED), sem);
181 } else {
182 rwsem_set_reader_owned(sem);
183 }
184}
185
186static inline int __down_read_killable(struct rw_semaphore *sem)
187{
188 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
189 if (IS_ERR(rwsem_down_read_failed_killable(sem)))
190 return -EINTR;
191 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
192 RWSEM_READER_OWNED), sem);
193 } else {
194 rwsem_set_reader_owned(sem);
195 }
196 return 0;
197}
198
199static inline int __down_read_trylock(struct rw_semaphore *sem)
200{
201 /*
202 * Optimize for the case when the rwsem is not locked at all.
203 */
204 long tmp = RWSEM_UNLOCKED_VALUE;
205
206 lockevent_inc(rwsem_rtrylock);
207 do {
208 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
209 tmp + RWSEM_ACTIVE_READ_BIAS)) {
210 rwsem_set_reader_owned(sem);
211 return 1;
212 }
213 } while (tmp >= 0);
214 return 0;
215}
216
217/*
218 * lock for writing
219 */
220static inline void __down_write(struct rw_semaphore *sem)
221{
222 long tmp;
223
224 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
225 &sem->count);
226 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
227 rwsem_down_write_failed(sem);
228 rwsem_set_owner(sem);
229}
230
231static inline int __down_write_killable(struct rw_semaphore *sem)
232{
233 long tmp;
234
235 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
236 &sem->count);
237 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
238 if (IS_ERR(rwsem_down_write_failed_killable(sem)))
239 return -EINTR;
240 rwsem_set_owner(sem);
241 return 0;
242}
243
244static inline int __down_write_trylock(struct rw_semaphore *sem)
245{
246 long tmp;
247
248 lockevent_inc(rwsem_wtrylock);
249 tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
250 RWSEM_ACTIVE_WRITE_BIAS);
251 if (tmp == RWSEM_UNLOCKED_VALUE) {
252 rwsem_set_owner(sem);
253 return true;
254 }
255 return false;
256}
257
258/*
259 * unlock after reading
260 */
261static inline void __up_read(struct rw_semaphore *sem)
262{
263 long tmp;
264
265 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
266 sem);
267 rwsem_clear_reader_owned(sem);
268 tmp = atomic_long_dec_return_release(&sem->count);
269 if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
270 rwsem_wake(sem);
271}
272
273/*
274 * unlock after writing
275 */
276static inline void __up_write(struct rw_semaphore *sem)
277{
278 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
279 rwsem_clear_owner(sem);
280 if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
281 &sem->count) < 0))
282 rwsem_wake(sem);
283}
284
285/*
286 * downgrade write lock to read lock
287 */
288static inline void __downgrade_write(struct rw_semaphore *sem)
289{
290 long tmp;
291
292 /*
293 * When downgrading from exclusive to shared ownership,
294 * anything inside the write-locked region cannot leak
295 * into the read side. In contrast, anything in the
296 * read-locked region is ok to be re-ordered into the
297 * write side. As such, rely on RELEASE semantics.
298 */
299 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
300 tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
301 rwsem_set_reader_owned(sem);
302 if (tmp < 0)
303 rwsem_downgrade_wake(sem);
304}
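
The inline fast paths moved into rwsem.h above all work by adding and subtracting the bias constants defined near the top of the header. Below is a minimal stand-alone sketch of that arithmetic (constant names copied from the header, values for the 64-bit case); it is illustrative user-space C, not the kernel's atomic implementation.

/* Illustrative sketch of the rwsem count arithmetic; not kernel code. */
#include <stdio.h>

#define RWSEM_ACTIVE_MASK	0xffffffffL
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK-1)
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	long count = 0L;	/* RWSEM_UNLOCKED_VALUE */

	/* __down_read(): one reader leaves the count positive. */
	count += RWSEM_ACTIVE_READ_BIAS;
	printf("one reader:  count = %ld\n", count);
	count -= RWSEM_ACTIVE_READ_BIAS;

	/* __down_write(): writer bias = waiting bias + one active count. */
	count += RWSEM_ACTIVE_WRITE_BIAS;
	printf("one writer:  count = %ld, active part = %ld\n",
	       count, count & RWSEM_ACTIVE_MASK);

	/* __up_write(): a negative result here would mean waiters exist. */
	count -= RWSEM_ACTIVE_WRITE_BIAS;
	printf("unlocked:    count = %ld\n", count);
	return 0;
}

This is also why __down_read_trylock() above only retries while tmp >= 0: once the count goes negative, a writer is active or waiting and the fast path must give up.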
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 936f3d14dd6b..0ff08380f531 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -22,6 +22,13 @@
22#include <linux/debug_locks.h> 22#include <linux/debug_locks.h>
23#include <linux/export.h> 23#include <linux/export.h>
24 24
25#ifdef CONFIG_MMIOWB
26#ifndef arch_mmiowb_state
27DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state);
28EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
29#endif
30#endif
31
25/* 32/*
26 * If lockdep is enabled then we use the non-preemption spin-ops 33 * If lockdep is enabled then we use the non-preemption spin-ops
27 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 34 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 9aa0fccd5d43..399669f7eba8 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock)
111{ 111{
112 debug_spin_lock_before(lock); 112 debug_spin_lock_before(lock);
113 arch_spin_lock(&lock->raw_lock); 113 arch_spin_lock(&lock->raw_lock);
114 mmiowb_spin_lock();
114 debug_spin_lock_after(lock); 115 debug_spin_lock_after(lock);
115} 116}
116 117
@@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
118{ 119{
119 int ret = arch_spin_trylock(&lock->raw_lock); 120 int ret = arch_spin_trylock(&lock->raw_lock);
120 121
121 if (ret) 122 if (ret) {
123 mmiowb_spin_lock();
122 debug_spin_lock_after(lock); 124 debug_spin_lock_after(lock);
125 }
123#ifndef CONFIG_SMP 126#ifndef CONFIG_SMP
124 /* 127 /*
125 * Must not happen on UP: 128 * Must not happen on UP:
@@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
131 134
132void do_raw_spin_unlock(raw_spinlock_t *lock) 135void do_raw_spin_unlock(raw_spinlock_t *lock)
133{ 136{
137 mmiowb_spin_unlock();
134 debug_spin_unlock(lock); 138 debug_spin_unlock(lock);
135 arch_spin_unlock(&lock->raw_lock); 139 arch_spin_unlock(&lock->raw_lock);
136} 140}
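
The mmiowb_spin_lock()/mmiowb_spin_unlock() hooks added to the debug spinlock paths above belong to the generic mmiowb tracking introduced elsewhere in this series. A rough toy model of the bookkeeping they imply is sketched below; every name in the sketch is invented for illustration and is not the kernel's actual mmiowb API.

/* Toy model only: invented names, single-threaded, no per-CPU state. */
struct toy_mmiowb_state {
	unsigned int nesting_count;	/* how many spinlocks are held */
	unsigned int mmiowb_pending;	/* was MMIO written under a lock? */
};

static struct toy_mmiowb_state toy_state;

static void toy_spin_lock_hook(void)
{
	toy_state.nesting_count++;
}

static void toy_io_write_hook(void)
{
	/* An I/O accessor would note that ordering work may be needed. */
	if (toy_state.nesting_count)
		toy_state.mmiowb_pending = 1;
}

static void toy_spin_unlock_hook(void)
{
	if (toy_state.mmiowb_pending) {
		toy_state.mmiowb_pending = 0;
		/* an arch ordering barrier (mmiowb) would be issued here */
	}
	toy_state.nesting_count--;
}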
diff --git a/kernel/module.c b/kernel/module.c
index 0b9aa8ab89f0..a9020bdd4cf6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
98EXPORT_SYMBOL_GPL(module_mutex); 98EXPORT_SYMBOL_GPL(module_mutex);
99static LIST_HEAD(modules); 99static LIST_HEAD(modules);
100 100
101/* Work queue for freeing init sections in success case */
102static struct work_struct init_free_wq;
103static struct llist_head init_free_list;
104
101#ifdef CONFIG_MODULES_TREE_LOOKUP 105#ifdef CONFIG_MODULES_TREE_LOOKUP
102 106
103/* 107/*
@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
1949 if (!rodata_enabled) 1953 if (!rodata_enabled)
1950 return; 1954 return;
1951 1955
1956 set_vm_flush_reset_perms(mod->core_layout.base);
1957 set_vm_flush_reset_perms(mod->init_layout.base);
1952 frob_text(&mod->core_layout, set_memory_ro); 1958 frob_text(&mod->core_layout, set_memory_ro);
1959 frob_text(&mod->core_layout, set_memory_x);
1960
1953 frob_rodata(&mod->core_layout, set_memory_ro); 1961 frob_rodata(&mod->core_layout, set_memory_ro);
1962
1954 frob_text(&mod->init_layout, set_memory_ro); 1963 frob_text(&mod->init_layout, set_memory_ro);
1964 frob_text(&mod->init_layout, set_memory_x);
1965
1955 frob_rodata(&mod->init_layout, set_memory_ro); 1966 frob_rodata(&mod->init_layout, set_memory_ro);
1956 1967
1957 if (after_init) 1968 if (after_init)
@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
1967 frob_writable_data(&mod->init_layout, set_memory_nx); 1978 frob_writable_data(&mod->init_layout, set_memory_nx);
1968} 1979}
1969 1980
1970static void module_disable_nx(const struct module *mod)
1971{
1972 frob_rodata(&mod->core_layout, set_memory_x);
1973 frob_ro_after_init(&mod->core_layout, set_memory_x);
1974 frob_writable_data(&mod->core_layout, set_memory_x);
1975 frob_rodata(&mod->init_layout, set_memory_x);
1976 frob_writable_data(&mod->init_layout, set_memory_x);
1977}
1978
1979/* Iterate through all modules and set each module's text as RW */ 1981/* Iterate through all modules and set each module's text as RW */
1980void set_all_modules_text_rw(void) 1982void set_all_modules_text_rw(void)
1981{ 1983{
@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
2019 } 2021 }
2020 mutex_unlock(&module_mutex); 2022 mutex_unlock(&module_mutex);
2021} 2023}
2022
2023static void disable_ro_nx(const struct module_layout *layout)
2024{
2025 if (rodata_enabled) {
2026 frob_text(layout, set_memory_rw);
2027 frob_rodata(layout, set_memory_rw);
2028 frob_ro_after_init(layout, set_memory_rw);
2029 }
2030 frob_rodata(layout, set_memory_x);
2031 frob_ro_after_init(layout, set_memory_x);
2032 frob_writable_data(layout, set_memory_x);
2033}
2034
2035#else 2024#else
2036static void disable_ro_nx(const struct module_layout *layout) { }
2037static void module_enable_nx(const struct module *mod) { } 2025static void module_enable_nx(const struct module *mod) { }
2038static void module_disable_nx(const struct module *mod) { }
2039#endif 2026#endif
2040 2027
2041#ifdef CONFIG_LIVEPATCH 2028#ifdef CONFIG_LIVEPATCH
@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
2115 2102
2116void __weak module_memfree(void *module_region) 2103void __weak module_memfree(void *module_region)
2117{ 2104{
2105 /*
2106 * This memory may be RO, and freeing RO memory in an interrupt is not
2107 * supported by vmalloc.
2108 */
2109 WARN_ON(in_interrupt());
2118 vfree(module_region); 2110 vfree(module_region);
2119} 2111}
2120 2112
@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
2166 mutex_unlock(&module_mutex); 2158 mutex_unlock(&module_mutex);
2167 2159
2168 /* This may be empty, but that's OK */ 2160 /* This may be empty, but that's OK */
2169 disable_ro_nx(&mod->init_layout);
2170 module_arch_freeing_init(mod); 2161 module_arch_freeing_init(mod);
2171 module_memfree(mod->init_layout.base); 2162 module_memfree(mod->init_layout.base);
2172 kfree(mod->args); 2163 kfree(mod->args);
@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
2176 lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); 2167 lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
2177 2168
2178 /* Finally, free the core (containing the module structure) */ 2169 /* Finally, free the core (containing the module structure) */
2179 disable_ro_nx(&mod->core_layout);
2180 module_memfree(mod->core_layout.base); 2170 module_memfree(mod->core_layout.base);
2181} 2171}
2182 2172
@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
3415 3405
3416/* For freeing module_init on success, in case kallsyms traversing */ 3406/* For freeing module_init on success, in case kallsyms traversing */
3417struct mod_initfree { 3407struct mod_initfree {
3418 struct rcu_head rcu; 3408 struct llist_node node;
3419 void *module_init; 3409 void *module_init;
3420}; 3410};
3421 3411
3422static void do_free_init(struct rcu_head *head) 3412static void do_free_init(struct work_struct *w)
3423{ 3413{
3424 struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); 3414 struct llist_node *pos, *n, *list;
3425 module_memfree(m->module_init); 3415 struct mod_initfree *initfree;
3426 kfree(m); 3416
3417 list = llist_del_all(&init_free_list);
3418
3419 synchronize_rcu();
3420
3421 llist_for_each_safe(pos, n, list) {
3422 initfree = container_of(pos, struct mod_initfree, node);
3423 module_memfree(initfree->module_init);
3424 kfree(initfree);
3425 }
3427} 3426}
3428 3427
3428static int __init modules_wq_init(void)
3429{
3430 INIT_WORK(&init_free_wq, do_free_init);
3431 init_llist_head(&init_free_list);
3432 return 0;
3433}
3434module_init(modules_wq_init);
3435
3429/* 3436/*
3430 * This is where the real work happens. 3437 * This is where the real work happens.
3431 * 3438 *
@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
3502#endif 3509#endif
3503 module_enable_ro(mod, true); 3510 module_enable_ro(mod, true);
3504 mod_tree_remove_init(mod); 3511 mod_tree_remove_init(mod);
3505 disable_ro_nx(&mod->init_layout);
3506 module_arch_freeing_init(mod); 3512 module_arch_freeing_init(mod);
3507 mod->init_layout.base = NULL; 3513 mod->init_layout.base = NULL;
3508 mod->init_layout.size = 0; 3514 mod->init_layout.size = 0;
@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
3513 * We want to free module_init, but be aware that kallsyms may be 3519 * We want to free module_init, but be aware that kallsyms may be
3514 * walking this with preempt disabled. In all the failure paths, we 3520 * walking this with preempt disabled. In all the failure paths, we
3515 * call synchronize_rcu(), but we don't want to slow down the success 3521 * call synchronize_rcu(), but we don't want to slow down the success
3516 * path, so use actual RCU here. 3522 * path. module_memfree() cannot be called in an interrupt, so do the
3523 * work and call synchronize_rcu() in a work queue.
3524 *
3517 * Note that module_alloc() on most architectures creates W+X page 3525 * Note that module_alloc() on most architectures creates W+X page
3518 * mappings which won't be cleaned up until do_free_init() runs. Any 3526 * mappings which won't be cleaned up until do_free_init() runs. Any
3519 * code such as mark_rodata_ro() which depends on those mappings to 3527 * code such as mark_rodata_ro() which depends on those mappings to
3520 * be cleaned up needs to sync with the queued work - ie 3528 * be cleaned up needs to sync with the queued work - ie
3521 * rcu_barrier() 3529 * rcu_barrier()
3522 */ 3530 */
3523 call_rcu(&freeinit->rcu, do_free_init); 3531 if (llist_add(&freeinit->node, &init_free_list))
3532 schedule_work(&init_free_wq);
3533
3524 mutex_unlock(&module_mutex); 3534 mutex_unlock(&module_mutex);
3525 wake_up_all(&module_wq); 3535 wake_up_all(&module_wq);
3526 3536
@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
3817 module_bug_cleanup(mod); 3827 module_bug_cleanup(mod);
3818 mutex_unlock(&module_mutex); 3828 mutex_unlock(&module_mutex);
3819 3829
3820 /* we can't deallocate the module until we clear memory protection */
3821 module_disable_ro(mod);
3822 module_disable_nx(mod);
3823
3824 ddebug_cleanup: 3830 ddebug_cleanup:
3825 ftrace_release_mod(mod); 3831 ftrace_release_mod(mod);
3826 dynamic_debug_remove(mod, info->debug); 3832 dynamic_debug_remove(mod, info->debug);
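
The module.c changes above replace call_rcu() with a lock-free llist feeding a work item, because module_memfree() may sleep and must not run from RCU callback context. A condensed sketch of that pattern follows; the type and function names are hypothetical, only the llist/workqueue/slab APIs are real.

/* Condensed sketch of the deferred-free pattern used by do_free_init(). */
#include <linux/llist.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct deferred_item {
	struct llist_node node;
	void *payload;			/* a kmalloc()'ed object to release late */
};

static struct llist_head pending_list;
static struct work_struct free_work;

static void free_work_fn(struct work_struct *w)
{
	struct llist_node *pos, *n, *list = llist_del_all(&pending_list);
	struct deferred_item *item;

	/* Process context: sleeping (e.g. synchronize_rcu()) is allowed here. */
	llist_for_each_safe(pos, n, list) {
		item = container_of(pos, struct deferred_item, node);
		kfree(item->payload);
		kfree(item);
	}
}

/* Callable from almost any context: llist_add() is lock-free. */
static void queue_deferred_free(struct deferred_item *item)
{
	if (llist_add(&item->node, &pending_list))
		schedule_work(&free_work);	/* first item arms the work */
}

static int deferred_free_setup(void)
{
	init_llist_head(&pending_list);
	INIT_WORK(&free_work, free_work_fn);
	return 0;
}

Note that llist_add() returns true only when the list was previously empty, which is why the hunk above schedules init_free_wq exactly once per batch.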
diff --git a/kernel/padata.c b/kernel/padata.c
index 3e2633ae3bca..2d2fddbb7a4c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = {
957 &parallel_cpumask_attr.attr, 957 &parallel_cpumask_attr.attr,
958 NULL, 958 NULL,
959}; 959};
960ATTRIBUTE_GROUPS(padata_default);
960 961
961static ssize_t padata_sysfs_show(struct kobject *kobj, 962static ssize_t padata_sysfs_show(struct kobject *kobj,
962 struct attribute *attr, char *buf) 963 struct attribute *attr, char *buf)
@@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = {
995 996
996static struct kobj_type padata_attr_type = { 997static struct kobj_type padata_attr_type = {
997 .sysfs_ops = &padata_sysfs_ops, 998 .sysfs_ops = &padata_sysfs_ops,
998 .default_attrs = padata_default_attrs, 999 .default_groups = padata_default_groups,
999 .release = padata_sysfs_release, 1000 .release = padata_sysfs_release,
1000}; 1001};
1001 1002
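
The padata hunk above is the standard .default_attrs to .default_groups conversion: ATTRIBUTE_GROUPS(padata_default) generates padata_default_groups[] from the existing padata_default_attrs[] array. A minimal sketch of the same pattern, using a hypothetical attribute:

/* Sketch of the ATTRIBUTE_GROUPS() conversion; names are illustrative. */
#include <linux/sysfs.h>
#include <linux/kobject.h>

static struct kobj_attribute example_attr =
	__ATTR(example, 0444, NULL, NULL);

static struct attribute *example_default_attrs[] = {
	&example_attr.attr,
	NULL,				/* array must be NULL-terminated */
};
/* Generates example_default_groups[] from example_default_attrs[]. */
ATTRIBUTE_GROUPS(example_default);

static struct kobj_type example_ktype = {
	.sysfs_ops	= &kobj_sysfs_ops,
	.default_groups	= example_default_groups,	/* was .default_attrs */
};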
diff --git a/kernel/panic.c b/kernel/panic.c
index 0ae0d7332f12..c1fcaad337b7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -318,12 +318,7 @@ void panic(const char *fmt, ...)
318 } 318 }
319#endif 319#endif
320#if defined(CONFIG_S390) 320#if defined(CONFIG_S390)
321 { 321 disabled_wait();
322 unsigned long caller;
323
324 caller = (unsigned long)__builtin_return_address(0);
325 disabled_wait(caller);
326 }
327#endif 322#endif
328 pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); 323 pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
329 local_irq_enable(); 324 local_irq_enable();
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index f8fe57d1022e..9bbaaab14b36 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -114,6 +114,15 @@ config PM_SLEEP_SMP
114 depends on PM_SLEEP 114 depends on PM_SLEEP
115 select HOTPLUG_CPU 115 select HOTPLUG_CPU
116 116
117config PM_SLEEP_SMP_NONZERO_CPU
118 def_bool y
119 depends on PM_SLEEP_SMP
120 depends on ARCH_SUSPEND_NONZERO_CPU
121 ---help---
122 If an arch can suspend (for suspend, hibernate, kexec, etc.) on a
123 non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This
124 will allow the nohz_full mask to include CPU0.
125
117config PM_AUTOSLEEP 126config PM_AUTOSLEEP
118 bool "Opportunistic sleep" 127 bool "Opportunistic sleep"
119 depends on PM_SLEEP 128 depends on PM_SLEEP
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index abef759de7c8..c8c272df7154 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -14,7 +14,6 @@
14 14
15#include <linux/export.h> 15#include <linux/export.h>
16#include <linux/suspend.h> 16#include <linux/suspend.h>
17#include <linux/syscalls.h>
18#include <linux/reboot.h> 17#include <linux/reboot.h>
19#include <linux/string.h> 18#include <linux/string.h>
20#include <linux/device.h> 19#include <linux/device.h>
@@ -281,7 +280,7 @@ static int create_image(int platform_mode)
281 if (error || hibernation_test(TEST_PLATFORM)) 280 if (error || hibernation_test(TEST_PLATFORM))
282 goto Platform_finish; 281 goto Platform_finish;
283 282
284 error = disable_nonboot_cpus(); 283 error = suspend_disable_secondary_cpus();
285 if (error || hibernation_test(TEST_CPUS)) 284 if (error || hibernation_test(TEST_CPUS))
286 goto Enable_cpus; 285 goto Enable_cpus;
287 286
@@ -323,7 +322,7 @@ static int create_image(int platform_mode)
323 local_irq_enable(); 322 local_irq_enable();
324 323
325 Enable_cpus: 324 Enable_cpus:
326 enable_nonboot_cpus(); 325 suspend_enable_secondary_cpus();
327 326
328 Platform_finish: 327 Platform_finish:
329 platform_finish(platform_mode); 328 platform_finish(platform_mode);
@@ -417,7 +416,7 @@ int hibernation_snapshot(int platform_mode)
417 416
418int __weak hibernate_resume_nonboot_cpu_disable(void) 417int __weak hibernate_resume_nonboot_cpu_disable(void)
419{ 418{
420 return disable_nonboot_cpus(); 419 return suspend_disable_secondary_cpus();
421} 420}
422 421
423/** 422/**
@@ -486,7 +485,7 @@ static int resume_target_kernel(bool platform_mode)
486 local_irq_enable(); 485 local_irq_enable();
487 486
488 Enable_cpus: 487 Enable_cpus:
489 enable_nonboot_cpus(); 488 suspend_enable_secondary_cpus();
490 489
491 Cleanup: 490 Cleanup:
492 platform_restore_cleanup(platform_mode); 491 platform_restore_cleanup(platform_mode);
@@ -564,7 +563,7 @@ int hibernation_platform_enter(void)
564 if (error) 563 if (error)
565 goto Platform_finish; 564 goto Platform_finish;
566 565
567 error = disable_nonboot_cpus(); 566 error = suspend_disable_secondary_cpus();
568 if (error) 567 if (error)
569 goto Enable_cpus; 568 goto Enable_cpus;
570 569
@@ -586,7 +585,7 @@ int hibernation_platform_enter(void)
586 local_irq_enable(); 585 local_irq_enable();
587 586
588 Enable_cpus: 587 Enable_cpus:
589 enable_nonboot_cpus(); 588 suspend_enable_secondary_cpus();
590 589
591 Platform_finish: 590 Platform_finish:
592 hibernation_ops->finish(); 591 hibernation_ops->finish();
@@ -709,9 +708,7 @@ int hibernate(void)
709 goto Exit; 708 goto Exit;
710 } 709 }
711 710
712 pr_info("Syncing filesystems ... \n"); 711 ksys_sync_helper();
713 ksys_sync();
714 pr_info("done.\n");
715 712
716 error = freeze_processes(); 713 error = freeze_processes();
717 if (error) 714 if (error)
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 98e76cad128b..4f43e724f6eb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,7 @@
16#include <linux/debugfs.h> 16#include <linux/debugfs.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/suspend.h> 18#include <linux/suspend.h>
19#include <linux/syscalls.h>
19 20
20#include "power.h" 21#include "power.h"
21 22
@@ -51,6 +52,19 @@ void unlock_system_sleep(void)
51} 52}
52EXPORT_SYMBOL_GPL(unlock_system_sleep); 53EXPORT_SYMBOL_GPL(unlock_system_sleep);
53 54
55void ksys_sync_helper(void)
56{
57 ktime_t start;
58 long elapsed_msecs;
59
60 start = ktime_get();
61 ksys_sync();
62 elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start));
63 pr_info("Filesystems sync: %ld.%03ld seconds\n",
64 elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC);
65}
66EXPORT_SYMBOL_GPL(ksys_sync_helper);
67
54/* Routines for PM-transition notifications */ 68/* Routines for PM-transition notifications */
55 69
56static BLOCKING_NOTIFIER_HEAD(pm_chain_head); 70static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
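
ksys_sync_helper() above folds the old "Syncing filesystems ..." messages into a single timed call. The ktime-based measurement it uses generalizes to any slow operation; a small sketch with a hypothetical wrapper:

/* Hypothetical helper illustrating the ktime elapsed-time pattern. */
#include <linux/ktime.h>
#include <linux/printk.h>

static void timed_call(void (*fn)(void), const char *what)
{
	ktime_t start = ktime_get();
	long elapsed_msecs;

	fn();
	elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start));
	pr_info("%s: %ld.%03ld seconds\n", what,
		elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC);
}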
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f08a1e4ee1d4..bc9558ab1e5b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
1342 * safe_copy_page - Copy a page in a safe way. 1342 * safe_copy_page - Copy a page in a safe way.
1343 * 1343 *
1344 * Check if the page we are going to copy is marked as present in the kernel 1344 * Check if the page we are going to copy is marked as present in the kernel
1345 * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set 1345 * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
1346 * and in that case kernel_page_present() always returns 'true'). 1346 * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
1347 * always returns 'true'.
1347 */ 1348 */
1348static void safe_copy_page(void *dst, struct page *s_page) 1349static void safe_copy_page(void *dst, struct page *s_page)
1349{ 1350{
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 0bd595a0b610..ef908c134b34 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -17,7 +17,6 @@
17#include <linux/console.h> 17#include <linux/console.h>
18#include <linux/cpu.h> 18#include <linux/cpu.h>
19#include <linux/cpuidle.h> 19#include <linux/cpuidle.h>
20#include <linux/syscalls.h>
21#include <linux/gfp.h> 20#include <linux/gfp.h>
22#include <linux/io.h> 21#include <linux/io.h>
23#include <linux/kernel.h> 22#include <linux/kernel.h>
@@ -428,7 +427,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
428 if (suspend_test(TEST_PLATFORM)) 427 if (suspend_test(TEST_PLATFORM))
429 goto Platform_wake; 428 goto Platform_wake;
430 429
431 error = disable_nonboot_cpus(); 430 error = suspend_disable_secondary_cpus();
432 if (error || suspend_test(TEST_CPUS)) 431 if (error || suspend_test(TEST_CPUS))
433 goto Enable_cpus; 432 goto Enable_cpus;
434 433
@@ -458,7 +457,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
458 BUG_ON(irqs_disabled()); 457 BUG_ON(irqs_disabled());
459 458
460 Enable_cpus: 459 Enable_cpus:
461 enable_nonboot_cpus(); 460 suspend_enable_secondary_cpus();
462 461
463 Platform_wake: 462 Platform_wake:
464 platform_resume_noirq(state); 463 platform_resume_noirq(state);
@@ -568,13 +567,11 @@ static int enter_state(suspend_state_t state)
568 if (state == PM_SUSPEND_TO_IDLE) 567 if (state == PM_SUSPEND_TO_IDLE)
569 s2idle_begin(); 568 s2idle_begin();
570 569
571#ifndef CONFIG_SUSPEND_SKIP_SYNC 570 if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) {
572 trace_suspend_resume(TPS("sync_filesystems"), 0, true); 571 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
573 pr_info("Syncing filesystems ... "); 572 ksys_sync_helper();
574 ksys_sync(); 573 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
575 pr_cont("done.\n"); 574 }
576 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
577#endif
578 575
579 pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); 576 pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
580 pm_suspend_clear_flags(); 577 pm_suspend_clear_flags();
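
The suspend.c hunk above swaps an #ifndef block for if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)), so both branches are always compiled and type-checked while the dead one is discarded by the optimizer. A tiny illustrative sketch of the same idiom (the function itself is hypothetical):

/* Hypothetical sketch of the #ifdef-to-IS_ENABLED() idiom. */
#include <linux/kconfig.h>
#include <linux/printk.h>

static void sync_if_enabled(void)
{
	/* Compiled either way; the branch folds away when the option is set. */
	if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC))
		pr_info("would sync filesystems here\n");
}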
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 2d8b60a3c86b..cb24e840a3e6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h>
14#include <linux/reboot.h> 13#include <linux/reboot.h>
15#include <linux/string.h> 14#include <linux/string.h>
16#include <linux/device.h> 15#include <linux/device.h>
@@ -228,9 +227,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
228 if (data->frozen) 227 if (data->frozen)
229 break; 228 break;
230 229
231 printk("Syncing filesystems ... "); 230 ksys_sync_helper();
232 ksys_sync();
233 printk("done.\n");
234 231
235 error = freeze_processes(); 232 error = freeze_processes();
236 if (error) 233 if (error)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 771e93f9c43f..6f357f4fc859 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
29#include <linux/hw_breakpoint.h> 29#include <linux/hw_breakpoint.h>
30#include <linux/cn_proc.h> 30#include <linux/cn_proc.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/sched/signal.h>
32 33
33/* 34/*
34 * Access another process' address space via ptrace. 35 * Access another process' address space via ptrace.
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request,
924 ret = ptrace_setsiginfo(child, &siginfo); 925 ret = ptrace_setsiginfo(child, &siginfo);
925 break; 926 break;
926 927
927 case PTRACE_GETSIGMASK: 928 case PTRACE_GETSIGMASK: {
929 sigset_t *mask;
930
928 if (addr != sizeof(sigset_t)) { 931 if (addr != sizeof(sigset_t)) {
929 ret = -EINVAL; 932 ret = -EINVAL;
930 break; 933 break;
931 } 934 }
932 935
933 if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) 936 if (test_tsk_restore_sigmask(child))
937 mask = &child->saved_sigmask;
938 else
939 mask = &child->blocked;
940
941 if (copy_to_user(datavp, mask, sizeof(sigset_t)))
934 ret = -EFAULT; 942 ret = -EFAULT;
935 else 943 else
936 ret = 0; 944 ret = 0;
937 945
938 break; 946 break;
947 }
939 948
940 case PTRACE_SETSIGMASK: { 949 case PTRACE_SETSIGMASK: {
941 sigset_t new_set; 950 sigset_t new_set;
@@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request,
961 child->blocked = new_set; 970 child->blocked = new_set;
962 spin_unlock_irq(&child->sighand->siglock); 971 spin_unlock_irq(&child->sighand->siglock);
963 972
973 clear_tsk_restore_sigmask(child);
974
964 ret = 0; 975 ret = 0;
965 break; 976 break;
966 } 977 }
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index acee72c0b24b..4b58c907b4b7 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -233,6 +233,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
233#ifdef CONFIG_RCU_STALL_COMMON 233#ifdef CONFIG_RCU_STALL_COMMON
234 234
235extern int rcu_cpu_stall_suppress; 235extern int rcu_cpu_stall_suppress;
236extern int rcu_cpu_stall_timeout;
236int rcu_jiffies_till_stall_check(void); 237int rcu_jiffies_till_stall_check(void);
237 238
238#define rcu_ftrace_dump_stall_suppress() \ 239#define rcu_ftrace_dump_stall_suppress() \
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index c29761152874..7a6890b23c5f 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -494,6 +494,10 @@ rcu_perf_cleanup(void)
494 494
495 if (torture_cleanup_begin()) 495 if (torture_cleanup_begin())
496 return; 496 return;
497 if (!cur_ops) {
498 torture_cleanup_end();
499 return;
500 }
497 501
498 if (reader_tasks) { 502 if (reader_tasks) {
499 for (i = 0; i < nrealreaders; i++) 503 for (i = 0; i < nrealreaders; i++)
@@ -614,6 +618,7 @@ rcu_perf_init(void)
614 pr_cont("\n"); 618 pr_cont("\n");
615 WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); 619 WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST));
616 firsterr = -EINVAL; 620 firsterr = -EINVAL;
621 cur_ops = NULL;
617 goto unwind; 622 goto unwind;
618 } 623 }
619 if (cur_ops->init) 624 if (cur_ops->init)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f14d1b18a74f..efaa5b3f4d3f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -299,7 +299,6 @@ struct rcu_torture_ops {
299 int irq_capable; 299 int irq_capable;
300 int can_boost; 300 int can_boost;
301 int extendables; 301 int extendables;
302 int ext_irq_conflict;
303 const char *name; 302 const char *name;
304}; 303};
305 304
@@ -592,12 +591,7 @@ static void srcu_torture_init(void)
592 591
593static void srcu_torture_cleanup(void) 592static void srcu_torture_cleanup(void)
594{ 593{
595 static DEFINE_TORTURE_RANDOM(rand); 594 cleanup_srcu_struct(&srcu_ctld);
596
597 if (torture_random(&rand) & 0x800)
598 cleanup_srcu_struct(&srcu_ctld);
599 else
600 cleanup_srcu_struct_quiesced(&srcu_ctld);
601 srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ 595 srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
602} 596}
603 597
@@ -1160,7 +1154,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
1160 unsigned long randmask2 = randmask1 >> 3; 1154 unsigned long randmask2 = randmask1 >> 3;
1161 1155
1162 WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); 1156 WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
1163 /* Most of the time lots of bits, half the time only one bit. */ 1157 /* Mostly only one bit (need preemption!), sometimes lots of bits. */
1164 if (!(randmask1 & 0x7)) 1158 if (!(randmask1 & 0x7))
1165 mask = mask & randmask2; 1159 mask = mask & randmask2;
1166 else 1160 else
@@ -1170,10 +1164,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
1170 ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || 1164 ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
1171 (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) 1165 (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
1172 mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; 1166 mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
1173 if ((mask & RCUTORTURE_RDR_IRQ) &&
1174 !(mask & cur_ops->ext_irq_conflict) &&
1175 (oldmask & cur_ops->ext_irq_conflict))
1176 mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */
1177 return mask ?: RCUTORTURE_RDR_RCU; 1167 return mask ?: RCUTORTURE_RDR_RCU;
1178} 1168}
1179 1169
@@ -1848,7 +1838,7 @@ static int rcutorture_oom_notify(struct notifier_block *self,
1848 WARN(1, "%s invoked upon OOM during forward-progress testing.\n", 1838 WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
1849 __func__); 1839 __func__);
1850 rcu_torture_fwd_cb_hist(); 1840 rcu_torture_fwd_cb_hist();
1851 rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); 1841 rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);
1852 WRITE_ONCE(rcu_fwd_emergency_stop, true); 1842 WRITE_ONCE(rcu_fwd_emergency_stop, true);
1853 smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ 1843 smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
1854 pr_info("%s: Freed %lu RCU callbacks.\n", 1844 pr_info("%s: Freed %lu RCU callbacks.\n",
@@ -2094,6 +2084,10 @@ rcu_torture_cleanup(void)
2094 cur_ops->cb_barrier(); 2084 cur_ops->cb_barrier();
2095 return; 2085 return;
2096 } 2086 }
2087 if (!cur_ops) {
2088 torture_cleanup_end();
2089 return;
2090 }
2097 2091
2098 rcu_torture_barrier_cleanup(); 2092 rcu_torture_barrier_cleanup();
2099 torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); 2093 torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
@@ -2267,6 +2261,7 @@ rcu_torture_init(void)
2267 pr_cont("\n"); 2261 pr_cont("\n");
2268 WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); 2262 WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
2269 firsterr = -EINVAL; 2263 firsterr = -EINVAL;
2264 cur_ops = NULL;
2270 goto unwind; 2265 goto unwind;
2271 } 2266 }
2272 if (cur_ops->fqs == NULL && fqs_duration != 0) { 2267 if (cur_ops->fqs == NULL && fqs_duration != 0) {
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 5d4a39a6505a..44d6606b8325 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -76,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
76 * Must invoke this after you are finished using a given srcu_struct that 76 * Must invoke this after you are finished using a given srcu_struct that
77 * was initialized via init_srcu_struct(), else you leak memory. 77 * was initialized via init_srcu_struct(), else you leak memory.
78 */ 78 */
79void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) 79void cleanup_srcu_struct(struct srcu_struct *ssp)
80{ 80{
81 WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); 81 WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
82 if (quiesced) 82 flush_work(&ssp->srcu_work);
83 WARN_ON(work_pending(&ssp->srcu_work));
84 else
85 flush_work(&ssp->srcu_work);
86 WARN_ON(ssp->srcu_gp_running); 83 WARN_ON(ssp->srcu_gp_running);
87 WARN_ON(ssp->srcu_gp_waiting); 84 WARN_ON(ssp->srcu_gp_waiting);
88 WARN_ON(ssp->srcu_cb_head); 85 WARN_ON(ssp->srcu_cb_head);
89 WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); 86 WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
90} 87}
91EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); 88EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
92 89
93/* 90/*
94 * Removes the count for the old reader from the appropriate element of 91 * Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index a60b8ba9e1ac..9b761e546de8 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -360,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
360 return SRCU_INTERVAL; 360 return SRCU_INTERVAL;
361} 361}
362 362
363/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ 363/**
364void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) 364 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
365 * @ssp: structure to clean up.
366 *
367 * Must invoke this after you are finished using a given srcu_struct that
368 * was initialized via init_srcu_struct(), else you leak memory.
369 */
370void cleanup_srcu_struct(struct srcu_struct *ssp)
365{ 371{
366 int cpu; 372 int cpu;
367 373
@@ -369,24 +375,14 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
369 return; /* Just leak it! */ 375 return; /* Just leak it! */
370 if (WARN_ON(srcu_readers_active(ssp))) 376 if (WARN_ON(srcu_readers_active(ssp)))
371 return; /* Just leak it! */ 377 return; /* Just leak it! */
372 if (quiesced) { 378 flush_delayed_work(&ssp->work);
373 if (WARN_ON(delayed_work_pending(&ssp->work)))
374 return; /* Just leak it! */
375 } else {
376 flush_delayed_work(&ssp->work);
377 }
378 for_each_possible_cpu(cpu) { 379 for_each_possible_cpu(cpu) {
379 struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); 380 struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
380 381
381 if (quiesced) { 382 del_timer_sync(&sdp->delay_work);
382 if (WARN_ON(timer_pending(&sdp->delay_work))) 383 flush_work(&sdp->work);
383 return; /* Just leak it! */ 384 if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
384 if (WARN_ON(work_pending(&sdp->work))) 385 return; /* Forgot srcu_barrier(), so just leak it! */
385 return; /* Just leak it! */
386 } else {
387 del_timer_sync(&sdp->delay_work);
388 flush_work(&sdp->work);
389 }
390 } 386 }
391 if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || 387 if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
392 WARN_ON(srcu_readers_active(ssp))) { 388 WARN_ON(srcu_readers_active(ssp))) {
@@ -397,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
397 free_percpu(ssp->sda); 393 free_percpu(ssp->sda);
398 ssp->sda = NULL; 394 ssp->sda = NULL;
399} 395}
400EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); 396EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
401 397
402/* 398/*
403 * Counts the new reader in the appropriate per-CPU element of the 399 * Counts the new reader in the appropriate per-CPU element of the
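
With the quiesced variant gone, cleanup_srcu_struct() above now always flushes pending work itself and only warns if callbacks remain, i.e. if the caller forgot srcu_barrier(). A sketch of the teardown ordering a dynamically initialized srcu_struct user is expected to follow (the surrounding names are hypothetical):

#include <linux/srcu.h>

static struct srcu_struct example_srcu;		/* hypothetical user */

static int example_setup(void)
{
	return init_srcu_struct(&example_srcu);
}

static void example_teardown(void)
{
	/* Stop queueing new call_srcu() callbacks before this point. */
	srcu_barrier(&example_srcu);		/* wait for queued callbacks */
	cleanup_srcu_struct(&example_srcu);	/* flush work, free per-CPU data */
}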
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 911bd9076d43..477b4eb44af5 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -52,7 +52,7 @@ void rcu_qs(void)
52 local_irq_save(flags); 52 local_irq_save(flags);
53 if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { 53 if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
54 rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; 54 rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
55 raise_softirq(RCU_SOFTIRQ); 55 raise_softirq_irqoff(RCU_SOFTIRQ);
56 } 56 }
57 local_irq_restore(flags); 57 local_irq_restore(flags);
58} 58}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index acd6ccf56faf..b4d88a594785 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -102,11 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
102/* Number of rcu_nodes at specified level. */ 102/* Number of rcu_nodes at specified level. */
103int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 103int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
104int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 104int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
105/* panic() on RCU Stall sysctl. */
106int sysctl_panic_on_rcu_stall __read_mostly;
107/* Commandeer a sysrq key to dump RCU's tree. */
108static bool sysrq_rcu;
109module_param(sysrq_rcu, bool, 0444);
110 105
111/* 106/*
112 * The rcu_scheduler_active variable is initialized to the value 107 * The rcu_scheduler_active variable is initialized to the value
@@ -149,7 +144,7 @@ static void sync_sched_exp_online_cleanup(int cpu);
149 144
150/* rcuc/rcub kthread realtime priority */ 145/* rcuc/rcub kthread realtime priority */
151static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; 146static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
152module_param(kthread_prio, int, 0644); 147module_param(kthread_prio, int, 0444);
153 148
154/* Delay in jiffies for grace-period initialization delays, debug only. */ 149/* Delay in jiffies for grace-period initialization delays, debug only. */
155 150
@@ -406,7 +401,7 @@ static bool rcu_kick_kthreads;
406 */ 401 */
407static ulong jiffies_till_sched_qs = ULONG_MAX; 402static ulong jiffies_till_sched_qs = ULONG_MAX;
408module_param(jiffies_till_sched_qs, ulong, 0444); 403module_param(jiffies_till_sched_qs, ulong, 0444);
409static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ 404static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
410module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ 405module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
411 406
412/* 407/*
@@ -424,6 +419,7 @@ static void adjust_jiffies_till_sched_qs(void)
424 WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); 419 WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
425 return; 420 return;
426 } 421 }
422 /* Otherwise, set to third fqs scan, but bound below on large systems. */
427 j = READ_ONCE(jiffies_till_first_fqs) + 423 j = READ_ONCE(jiffies_till_first_fqs) +
428 2 * READ_ONCE(jiffies_till_next_fqs); 424 2 * READ_ONCE(jiffies_till_next_fqs);
429 if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) 425 if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
@@ -513,74 +509,6 @@ static const char *gp_state_getname(short gs)
513} 509}
514 510
515/* 511/*
516 * Show the state of the grace-period kthreads.
517 */
518void show_rcu_gp_kthreads(void)
519{
520 int cpu;
521 unsigned long j;
522 unsigned long ja;
523 unsigned long jr;
524 unsigned long jw;
525 struct rcu_data *rdp;
526 struct rcu_node *rnp;
527
528 j = jiffies;
529 ja = j - READ_ONCE(rcu_state.gp_activity);
530 jr = j - READ_ONCE(rcu_state.gp_req_activity);
531 jw = j - READ_ONCE(rcu_state.gp_wake_time);
532 pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
533 rcu_state.name, gp_state_getname(rcu_state.gp_state),
534 rcu_state.gp_state,
535 rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
536 ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
537 (long)READ_ONCE(rcu_state.gp_seq),
538 (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
539 READ_ONCE(rcu_state.gp_flags));
540 rcu_for_each_node_breadth_first(rnp) {
541 if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
542 continue;
543 pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
544 rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
545 (long)rnp->gp_seq_needed);
546 if (!rcu_is_leaf_node(rnp))
547 continue;
548 for_each_leaf_node_possible_cpu(rnp, cpu) {
549 rdp = per_cpu_ptr(&rcu_data, cpu);
550 if (rdp->gpwrap ||
551 ULONG_CMP_GE(rcu_state.gp_seq,
552 rdp->gp_seq_needed))
553 continue;
554 pr_info("\tcpu %d ->gp_seq_needed %ld\n",
555 cpu, (long)rdp->gp_seq_needed);
556 }
557 }
558 /* sched_show_task(rcu_state.gp_kthread); */
559}
560EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
561
562/* Dump grace-period-request information due to commandeered sysrq. */
563static void sysrq_show_rcu(int key)
564{
565 show_rcu_gp_kthreads();
566}
567
568static struct sysrq_key_op sysrq_rcudump_op = {
569 .handler = sysrq_show_rcu,
570 .help_msg = "show-rcu(y)",
571 .action_msg = "Show RCU tree",
572 .enable_mask = SYSRQ_ENABLE_DUMP,
573};
574
575static int __init rcu_sysrq_init(void)
576{
577 if (sysrq_rcu)
578 return register_sysrq_key('y', &sysrq_rcudump_op);
579 return 0;
580}
581early_initcall(rcu_sysrq_init);
582
583/*
584 * Send along grace-period-related data for rcutorture diagnostics. 512 * Send along grace-period-related data for rcutorture diagnostics.
585 */ 513 */
586void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, 514void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
@@ -1034,27 +962,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
1034} 962}
1035 963
1036/* 964/*
1037 * Handler for the irq_work request posted when a grace period has
1038 * gone on for too long, but not yet long enough for an RCU CPU
1039 * stall warning. Set state appropriately, but just complain if
1040 * there is unexpected state on entry.
1041 */
1042static void rcu_iw_handler(struct irq_work *iwp)
1043{
1044 struct rcu_data *rdp;
1045 struct rcu_node *rnp;
1046
1047 rdp = container_of(iwp, struct rcu_data, rcu_iw);
1048 rnp = rdp->mynode;
1049 raw_spin_lock_rcu_node(rnp);
1050 if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
1051 rdp->rcu_iw_gp_seq = rnp->gp_seq;
1052 rdp->rcu_iw_pending = false;
1053 }
1054 raw_spin_unlock_rcu_node(rnp);
1055}
1056
1057/*
1058 * Return true if the specified CPU has passed through a quiescent 965 * Return true if the specified CPU has passed through a quiescent
1059 * state by virtue of being in or having passed through an dynticks 966 * state by virtue of being in or having passed through an dynticks
1060 * idle state since the last call to dyntick_save_progress_counter() 967 * idle state since the last call to dyntick_save_progress_counter()
@@ -1167,295 +1074,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
1167 return 0; 1074 return 0;
1168} 1075}
1169 1076
1170static void record_gp_stall_check_time(void)
1171{
1172 unsigned long j = jiffies;
1173 unsigned long j1;
1174
1175 rcu_state.gp_start = j;
1176 j1 = rcu_jiffies_till_stall_check();
1177 /* Record ->gp_start before ->jiffies_stall. */
1178 smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
1179 rcu_state.jiffies_resched = j + j1 / 2;
1180 rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
1181}
1182
1183/*
1184 * Complain about starvation of grace-period kthread.
1185 */
1186static void rcu_check_gp_kthread_starvation(void)
1187{
1188 struct task_struct *gpk = rcu_state.gp_kthread;
1189 unsigned long j;
1190
1191 j = jiffies - READ_ONCE(rcu_state.gp_activity);
1192 if (j > 2 * HZ) {
1193 pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
1194 rcu_state.name, j,
1195 (long)rcu_seq_current(&rcu_state.gp_seq),
1196 READ_ONCE(rcu_state.gp_flags),
1197 gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
1198 gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
1199 if (gpk) {
1200 pr_err("RCU grace-period kthread stack dump:\n");
1201 sched_show_task(gpk);
1202 wake_up_process(gpk);
1203 }
1204 }
1205}
1206
1207/*
1208 * Dump stacks of all tasks running on stalled CPUs. First try using
1209 * NMIs, but fall back to manual remote stack tracing on architectures
1210 * that don't support NMI-based stack dumps. The NMI-triggered stack
1211 * traces are more accurate because they are printed by the target CPU.
1212 */
1213static void rcu_dump_cpu_stacks(void)
1214{
1215 int cpu;
1216 unsigned long flags;
1217 struct rcu_node *rnp;
1218
1219 rcu_for_each_leaf_node(rnp) {
1220 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1221 for_each_leaf_node_possible_cpu(rnp, cpu)
1222 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
1223 if (!trigger_single_cpu_backtrace(cpu))
1224 dump_cpu_task(cpu);
1225 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1226 }
1227}
1228
1229/*
1230 * If too much time has passed in the current grace period, and if
1231 * so configured, go kick the relevant kthreads.
1232 */
1233static void rcu_stall_kick_kthreads(void)
1234{
1235 unsigned long j;
1236
1237 if (!rcu_kick_kthreads)
1238 return;
1239 j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
1240 if (time_after(jiffies, j) && rcu_state.gp_kthread &&
1241 (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
1242 WARN_ONCE(1, "Kicking %s grace-period kthread\n",
1243 rcu_state.name);
1244 rcu_ftrace_dump(DUMP_ALL);
1245 wake_up_process(rcu_state.gp_kthread);
1246 WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
1247 }
1248}
1249
1250static void panic_on_rcu_stall(void)
1251{
1252 if (sysctl_panic_on_rcu_stall)
1253 panic("RCU Stall\n");
1254}
1255
1256static void print_other_cpu_stall(unsigned long gp_seq)
1257{
1258 int cpu;
1259 unsigned long flags;
1260 unsigned long gpa;
1261 unsigned long j;
1262 int ndetected = 0;
1263 struct rcu_node *rnp = rcu_get_root();
1264 long totqlen = 0;
1265
1266 /* Kick and suppress, if so configured. */
1267 rcu_stall_kick_kthreads();
1268 if (rcu_cpu_stall_suppress)
1269 return;
1270
1271 /*
1272 * OK, time to rat on our buddy...
1273 * See Documentation/RCU/stallwarn.txt for info on how to debug
1274 * RCU CPU stall warnings.
1275 */
1276 pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name);
1277 print_cpu_stall_info_begin();
1278 rcu_for_each_leaf_node(rnp) {
1279 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1280 ndetected += rcu_print_task_stall(rnp);
1281 if (rnp->qsmask != 0) {
1282 for_each_leaf_node_possible_cpu(rnp, cpu)
1283 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
1284 print_cpu_stall_info(cpu);
1285 ndetected++;
1286 }
1287 }
1288 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1289 }
1290
1291 print_cpu_stall_info_end();
1292 for_each_possible_cpu(cpu)
1293 totqlen += rcu_get_n_cbs_cpu(cpu);
1294 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
1295 smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
1296 (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
1297 if (ndetected) {
1298 rcu_dump_cpu_stacks();
1299
1300 /* Complain about tasks blocking the grace period. */
1301 rcu_print_detail_task_stall();
1302 } else {
1303 if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
1304 pr_err("INFO: Stall ended before state dump start\n");
1305 } else {
1306 j = jiffies;
1307 gpa = READ_ONCE(rcu_state.gp_activity);
1308 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
1309 rcu_state.name, j - gpa, j, gpa,
1310 READ_ONCE(jiffies_till_next_fqs),
1311 rcu_get_root()->qsmask);
1312 /* In this case, the current CPU might be at fault. */
1313 sched_show_task(current);
1314 }
1315 }
1316 /* Rewrite if needed in case of slow consoles. */
1317 if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
1318 WRITE_ONCE(rcu_state.jiffies_stall,
1319 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1320
1321 rcu_check_gp_kthread_starvation();
1322
1323 panic_on_rcu_stall();
1324
1325 rcu_force_quiescent_state(); /* Kick them all. */
1326}
1327
1328static void print_cpu_stall(void)
1329{
1330 int cpu;
1331 unsigned long flags;
1332 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1333 struct rcu_node *rnp = rcu_get_root();
1334 long totqlen = 0;
1335
1336 /* Kick and suppress, if so configured. */
1337 rcu_stall_kick_kthreads();
1338 if (rcu_cpu_stall_suppress)
1339 return;
1340
1341 /*
1342 * OK, time to rat on ourselves...
1343 * See Documentation/RCU/stallwarn.txt for info on how to debug
1344 * RCU CPU stall warnings.
1345 */
1346 pr_err("INFO: %s self-detected stall on CPU", rcu_state.name);
1347 print_cpu_stall_info_begin();
1348 raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
1349 print_cpu_stall_info(smp_processor_id());
1350 raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
1351 print_cpu_stall_info_end();
1352 for_each_possible_cpu(cpu)
1353 totqlen += rcu_get_n_cbs_cpu(cpu);
1354 pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n",
1355 jiffies - rcu_state.gp_start,
1356 (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
1357
1358 rcu_check_gp_kthread_starvation();
1359
1360 rcu_dump_cpu_stacks();
1361
1362 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1363 /* Rewrite if needed in case of slow consoles. */
1364 if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
1365 WRITE_ONCE(rcu_state.jiffies_stall,
1366 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1367 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1368
1369 panic_on_rcu_stall();
1370
1371 /*
1372 * Attempt to revive the RCU machinery by forcing a context switch.
1373 *
1374 * A context switch would normally allow the RCU state machine to make
1375 * progress and it could be we're stuck in kernel space without context
1376 * switches for an entirely unreasonable amount of time.
1377 */
1378 set_tsk_need_resched(current);
1379 set_preempt_need_resched();
1380}
1381
1382static void check_cpu_stall(struct rcu_data *rdp)
1383{
1384 unsigned long gs1;
1385 unsigned long gs2;
1386 unsigned long gps;
1387 unsigned long j;
1388 unsigned long jn;
1389 unsigned long js;
1390 struct rcu_node *rnp;
1391
1392 if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
1393 !rcu_gp_in_progress())
1394 return;
1395 rcu_stall_kick_kthreads();
1396 j = jiffies;
1397
1398 /*
1399 * Lots of memory barriers to reject false positives.
1400 *
1401 * The idea is to pick up rcu_state.gp_seq, then
1402 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
1403 * another copy of rcu_state.gp_seq. These values are updated in
1404 * the opposite order with memory barriers (or equivalent) during
1405 * grace-period initialization and cleanup. Now, a false positive
1406 * can occur if we get a new value of rcu_state.gp_start and an old
1407 * value of rcu_state.jiffies_stall. But given the memory barriers,
1408 * the only way that this can happen is if one grace period ends
1409 * and another starts between these two fetches. This is detected
1410 * by comparing the second fetch of rcu_state.gp_seq with the
1411 * previous fetch from rcu_state.gp_seq.
1412 *
1413 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
1414 * and rcu_state.gp_start suffice to forestall false positives.
1415 */
1416 gs1 = READ_ONCE(rcu_state.gp_seq);
1417 smp_rmb(); /* Pick up ->gp_seq first... */
1418 js = READ_ONCE(rcu_state.jiffies_stall);
1419 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
1420 gps = READ_ONCE(rcu_state.gp_start);
1421 smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
1422 gs2 = READ_ONCE(rcu_state.gp_seq);
1423 if (gs1 != gs2 ||
1424 ULONG_CMP_LT(j, js) ||
1425 ULONG_CMP_GE(gps, js))
1426 return; /* No stall or GP completed since entering function. */
1427 rnp = rdp->mynode;
1428 jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
1429 if (rcu_gp_in_progress() &&
1430 (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
1431 cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
1432
1433 /* We haven't checked in, so go dump stack. */
1434 print_cpu_stall();
1435
1436 } else if (rcu_gp_in_progress() &&
1437 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
1438 cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
1439
1440 /* They had a few time units to dump stack, so complain. */
1441 print_other_cpu_stall(gs2);
1442 }
1443}
1444
1445/**
1446 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
1447 *
1448 * Set the stall-warning timeout way off into the future, thus preventing
1449 * any RCU CPU stall-warning messages from appearing in the current set of
1450 * RCU grace periods.
1451 *
1452 * The caller must disable hard irqs.
1453 */
1454void rcu_cpu_stall_reset(void)
1455{
1456 WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
1457}
1458
1459/* Trace-event wrapper function for trace_rcu_future_grace_period. */ 1077/* Trace-event wrapper function for trace_rcu_future_grace_period. */
1460static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1078static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1461 unsigned long gp_seq_req, const char *s) 1079 unsigned long gp_seq_req, const char *s)
@@ -1585,7 +1203,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
1585static void rcu_gp_kthread_wake(void) 1203static void rcu_gp_kthread_wake(void)
1586{ 1204{
1587 if ((current == rcu_state.gp_kthread && 1205 if ((current == rcu_state.gp_kthread &&
1588 !in_interrupt() && !in_serving_softirq()) || 1206 !in_irq() && !in_serving_softirq()) ||
1589 !READ_ONCE(rcu_state.gp_flags) || 1207 !READ_ONCE(rcu_state.gp_flags) ||
1590 !rcu_state.gp_kthread) 1208 !rcu_state.gp_kthread)
1591 return; 1209 return;
@@ -2295,11 +1913,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
2295 return; 1913 return;
2296 } 1914 }
2297 mask = rdp->grpmask; 1915 mask = rdp->grpmask;
1916 rdp->core_needs_qs = false;
2298 if ((rnp->qsmask & mask) == 0) { 1917 if ((rnp->qsmask & mask) == 0) {
2299 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1918 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2300 } else { 1919 } else {
2301 rdp->core_needs_qs = false;
2302
2303 /* 1920 /*
2304 * This GP can't end until cpu checks in, so all of our 1921 * This GP can't end until cpu checks in, so all of our
2305 * callbacks can be processed during the next GP. 1922 * callbacks can be processed during the next GP.
@@ -2548,11 +2165,11 @@ void rcu_sched_clock_irq(int user)
2548} 2165}
2549 2166
2550/* 2167/*
2551 * Scan the leaf rcu_node structures, processing dyntick state for any that 2168 * Scan the leaf rcu_node structures. For each structure on which all
2552 * have not yet encountered a quiescent state, using the function specified. 2169 * CPUs have reported a quiescent state and on which there are tasks
2553 * Also initiate boosting for any threads blocked on the root rcu_node. 2170 * blocking the current grace period, initiate RCU priority boosting.
2554 * 2171 * Otherwise, invoke the specified function to check dyntick state for
2555 * The caller must have suppressed start of new grace periods. 2172 * each CPU that has not yet reported a quiescent state.
2556 */ 2173 */
2557static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) 2174static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
2558{ 2175{
@@ -2635,101 +2252,6 @@ void rcu_force_quiescent_state(void)
2635} 2252}
2636EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 2253EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
2637 2254
2638/*
2639 * This function checks for grace-period requests that fail to motivate
2640 * RCU to come out of its idle mode.
2641 */
2642void
2643rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
2644 const unsigned long gpssdelay)
2645{
2646 unsigned long flags;
2647 unsigned long j;
2648 struct rcu_node *rnp_root = rcu_get_root();
2649 static atomic_t warned = ATOMIC_INIT(0);
2650
2651 if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
2652 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
2653 return;
2654 j = jiffies; /* Expensive access, and in common case don't get here. */
2655 if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
2656 time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
2657 atomic_read(&warned))
2658 return;
2659
2660 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2661 j = jiffies;
2662 if (rcu_gp_in_progress() ||
2663 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
2664 time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
2665 time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
2666 atomic_read(&warned)) {
2667 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2668 return;
2669 }
2670 /* Hold onto the leaf lock to make others see warned==1. */
2671
2672 if (rnp_root != rnp)
2673 raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
2674 j = jiffies;
2675 if (rcu_gp_in_progress() ||
2676 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
2677 time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
2678 time_before(j, rcu_state.gp_activity + gpssdelay) ||
2679 atomic_xchg(&warned, 1)) {
2680 raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
2681 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2682 return;
2683 }
2684 WARN_ON(1);
2685 if (rnp_root != rnp)
2686 raw_spin_unlock_rcu_node(rnp_root);
2687 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2688 show_rcu_gp_kthreads();
2689}
2690
2691/*
2692 * Do a forward-progress check for rcutorture. This is normally invoked
2693 * due to an OOM event. The argument "j" gives the time period during
2694 * which rcutorture would like progress to have been made.
2695 */
2696void rcu_fwd_progress_check(unsigned long j)
2697{
2698 unsigned long cbs;
2699 int cpu;
2700 unsigned long max_cbs = 0;
2701 int max_cpu = -1;
2702 struct rcu_data *rdp;
2703
2704 if (rcu_gp_in_progress()) {
2705 pr_info("%s: GP age %lu jiffies\n",
2706 __func__, jiffies - rcu_state.gp_start);
2707 show_rcu_gp_kthreads();
2708 } else {
2709 pr_info("%s: Last GP end %lu jiffies ago\n",
2710 __func__, jiffies - rcu_state.gp_end);
2711 preempt_disable();
2712 rdp = this_cpu_ptr(&rcu_data);
2713 rcu_check_gp_start_stall(rdp->mynode, rdp, j);
2714 preempt_enable();
2715 }
2716 for_each_possible_cpu(cpu) {
2717 cbs = rcu_get_n_cbs_cpu(cpu);
2718 if (!cbs)
2719 continue;
2720 if (max_cpu < 0)
2721 pr_info("%s: callbacks", __func__);
2722 pr_cont(" %d: %lu", cpu, cbs);
2723 if (cbs <= max_cbs)
2724 continue;
2725 max_cbs = cbs;
2726 max_cpu = cpu;
2727 }
2728 if (max_cpu >= 0)
2729 pr_cont("\n");
2730}
2731EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
2732
2733/* Perform RCU core processing work for the current CPU. */ 2255/* Perform RCU core processing work for the current CPU. */
2734static __latent_entropy void rcu_core(struct softirq_action *unused) 2256static __latent_entropy void rcu_core(struct softirq_action *unused)
2735{ 2257{
@@ -2870,7 +2392,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
2870 * Use rcu:rcu_callback trace event to find the previous 2392 * Use rcu:rcu_callback trace event to find the previous
2871 * time callback was passed to __call_rcu(). 2393 * time callback was passed to __call_rcu().
2872 */ 2394 */
2873 WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", 2395 WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
2874 head, head->func); 2396 head, head->func);
2875 WRITE_ONCE(head->func, rcu_leak_callback); 2397 WRITE_ONCE(head->func, rcu_leak_callback);
2876 return; 2398 return;
@@ -3559,13 +3081,11 @@ static int rcu_pm_notify(struct notifier_block *self,
3559 switch (action) { 3081 switch (action) {
3560 case PM_HIBERNATION_PREPARE: 3082 case PM_HIBERNATION_PREPARE:
3561 case PM_SUSPEND_PREPARE: 3083 case PM_SUSPEND_PREPARE:
3562 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 3084 rcu_expedite_gp();
3563 rcu_expedite_gp();
3564 break; 3085 break;
3565 case PM_POST_HIBERNATION: 3086 case PM_POST_HIBERNATION:
3566 case PM_POST_SUSPEND: 3087 case PM_POST_SUSPEND:
3567 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 3088 rcu_unexpedite_gp();
3568 rcu_unexpedite_gp();
3569 break; 3089 break;
3570 default: 3090 default:
3571 break; 3091 break;
@@ -3742,8 +3262,7 @@ static void __init rcu_init_geometry(void)
3742 jiffies_till_first_fqs = d; 3262 jiffies_till_first_fqs = d;
3743 if (jiffies_till_next_fqs == ULONG_MAX) 3263 if (jiffies_till_next_fqs == ULONG_MAX)
3744 jiffies_till_next_fqs = d; 3264 jiffies_till_next_fqs = d;
3745 if (jiffies_till_sched_qs == ULONG_MAX) 3265 adjust_jiffies_till_sched_qs();
3746 adjust_jiffies_till_sched_qs();
3747 3266
3748 /* If the compile-time values are accurate, just leave. */ 3267 /* If the compile-time values are accurate, just leave. */
3749 if (rcu_fanout_leaf == RCU_FANOUT_LEAF && 3268 if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
@@ -3858,5 +3377,6 @@ void __init rcu_init(void)
3858 srcu_init(); 3377 srcu_init();
3859} 3378}
3860 3379
3380#include "tree_stall.h"
3861#include "tree_exp.h" 3381#include "tree_exp.h"
3862#include "tree_plugin.h" 3382#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bb4f995f2d3f..e253d11af3c4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -393,15 +393,13 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
393 393
394int rcu_dynticks_snap(struct rcu_data *rdp); 394int rcu_dynticks_snap(struct rcu_data *rdp);
395 395
396/* Forward declarations for rcutree_plugin.h */ 396/* Forward declarations for tree_plugin.h */
397static void rcu_bootup_announce(void); 397static void rcu_bootup_announce(void);
398static void rcu_qs(void); 398static void rcu_qs(void);
399static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 399static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
400#ifdef CONFIG_HOTPLUG_CPU 400#ifdef CONFIG_HOTPLUG_CPU
401static bool rcu_preempt_has_tasks(struct rcu_node *rnp); 401static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
402#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 402#endif /* #ifdef CONFIG_HOTPLUG_CPU */
403static void rcu_print_detail_task_stall(void);
404static int rcu_print_task_stall(struct rcu_node *rnp);
405static int rcu_print_task_exp_stall(struct rcu_node *rnp); 403static int rcu_print_task_exp_stall(struct rcu_node *rnp);
406static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 404static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
407static void rcu_flavor_sched_clock_irq(int user); 405static void rcu_flavor_sched_clock_irq(int user);
@@ -418,9 +416,6 @@ static void rcu_prepare_for_idle(void);
418static bool rcu_preempt_has_tasks(struct rcu_node *rnp); 416static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
419static bool rcu_preempt_need_deferred_qs(struct task_struct *t); 417static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
420static void rcu_preempt_deferred_qs(struct task_struct *t); 418static void rcu_preempt_deferred_qs(struct task_struct *t);
421static void print_cpu_stall_info_begin(void);
422static void print_cpu_stall_info(int cpu);
423static void print_cpu_stall_info_end(void);
424static void zero_cpu_stall_ticks(struct rcu_data *rdp); 419static void zero_cpu_stall_ticks(struct rcu_data *rdp);
425static bool rcu_nocb_cpu_needs_barrier(int cpu); 420static bool rcu_nocb_cpu_needs_barrier(int cpu);
426static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); 421static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
@@ -445,3 +440,10 @@ static void rcu_bind_gp_kthread(void);
445static bool rcu_nohz_full_cpu(void); 440static bool rcu_nohz_full_cpu(void);
446static void rcu_dynticks_task_enter(void); 441static void rcu_dynticks_task_enter(void);
447static void rcu_dynticks_task_exit(void); 442static void rcu_dynticks_task_exit(void);
443
444/* Forward declarations for tree_stall.h */
445static void record_gp_stall_check_time(void);
446static void rcu_iw_handler(struct irq_work *iwp);
447static void check_cpu_stall(struct rcu_data *rdp);
448static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
449 const unsigned long gpssdelay);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 4c2a0189e748..9c990df880d1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -10,6 +10,7 @@
10#include <linux/lockdep.h> 10#include <linux/lockdep.h>
11 11
12static void rcu_exp_handler(void *unused); 12static void rcu_exp_handler(void *unused);
13static int rcu_print_task_exp_stall(struct rcu_node *rnp);
13 14
14/* 15/*
15 * Record the start of an expedited grace period. 16 * Record the start of an expedited grace period.
@@ -633,7 +634,7 @@ static void rcu_exp_handler(void *unused)
633 raw_spin_lock_irqsave_rcu_node(rnp, flags); 634 raw_spin_lock_irqsave_rcu_node(rnp, flags);
634 if (rnp->expmask & rdp->grpmask) { 635 if (rnp->expmask & rdp->grpmask) {
635 rdp->deferred_qs = true; 636 rdp->deferred_qs = true;
636 WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); 637 t->rcu_read_unlock_special.b.exp_hint = true;
637 } 638 }
638 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 639 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
639 return; 640 return;
@@ -648,7 +649,7 @@ static void rcu_exp_handler(void *unused)
648 * 649 *
649 * If the CPU is fully enabled (or if some buggy RCU-preempt 650 * If the CPU is fully enabled (or if some buggy RCU-preempt
650 * read-side critical section is being used from idle), just 651 * read-side critical section is being used from idle), just
651 * invoke rcu_preempt_defer_qs() to immediately report the 652 * invoke rcu_preempt_deferred_qs() to immediately report the
652 * quiescent state. We cannot use rcu_read_unlock_special() 653 * quiescent state. We cannot use rcu_read_unlock_special()
653 * because we are in an interrupt handler, which will cause that 654 * because we are in an interrupt handler, which will cause that
654 * function to take an early exit without doing anything. 655 * function to take an early exit without doing anything.
@@ -670,6 +671,27 @@ static void sync_sched_exp_online_cleanup(int cpu)
670{ 671{
671} 672}
672 673
674/*
675 * Scan the current list of tasks blocked within RCU read-side critical
676 * sections, printing out the tid of each that is blocking the current
677 * expedited grace period.
678 */
679static int rcu_print_task_exp_stall(struct rcu_node *rnp)
680{
681 struct task_struct *t;
682 int ndetected = 0;
683
684 if (!rnp->exp_tasks)
685 return 0;
686 t = list_entry(rnp->exp_tasks->prev,
687 struct task_struct, rcu_node_entry);
688 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
689 pr_cont(" P%d", t->pid);
690 ndetected++;
691 }
692 return ndetected;
693}
694
673#else /* #ifdef CONFIG_PREEMPT_RCU */ 695#else /* #ifdef CONFIG_PREEMPT_RCU */
674 696
675/* Invoked on each online non-idle CPU for expedited quiescent state. */ 697/* Invoked on each online non-idle CPU for expedited quiescent state. */
@@ -709,6 +731,16 @@ static void sync_sched_exp_online_cleanup(int cpu)
709 WARN_ON_ONCE(ret); 731 WARN_ON_ONCE(ret);
710} 732}
711 733
734/*
735 * Because preemptible RCU does not exist, we never have to check for
736 * tasks blocked within RCU read-side critical sections that are
737 * blocking the current expedited grace period.
738 */
739static int rcu_print_task_exp_stall(struct rcu_node *rnp)
740{
741 return 0;
742}
743
712#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 744#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
713 745
714/** 746/**
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 97dba50f6fb2..1102765f91fd 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -285,7 +285,7 @@ static void rcu_qs(void)
285 TPS("cpuqs")); 285 TPS("cpuqs"));
286 __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); 286 __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
287 barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ 287 barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
288 current->rcu_read_unlock_special.b.need_qs = false; 288 WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
289 } 289 }
290} 290}
291 291
@@ -643,100 +643,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
643} 643}
644 644
645/* 645/*
646 * Dump detailed information for all tasks blocking the current RCU
647 * grace period on the specified rcu_node structure.
648 */
649static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
650{
651 unsigned long flags;
652 struct task_struct *t;
653
654 raw_spin_lock_irqsave_rcu_node(rnp, flags);
655 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
656 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
657 return;
658 }
659 t = list_entry(rnp->gp_tasks->prev,
660 struct task_struct, rcu_node_entry);
661 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
662 /*
663 * We could be printing a lot while holding a spinlock.
664 * Avoid triggering hard lockup.
665 */
666 touch_nmi_watchdog();
667 sched_show_task(t);
668 }
669 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
670}
671
672/*
673 * Dump detailed information for all tasks blocking the current RCU
674 * grace period.
675 */
676static void rcu_print_detail_task_stall(void)
677{
678 struct rcu_node *rnp = rcu_get_root();
679
680 rcu_print_detail_task_stall_rnp(rnp);
681 rcu_for_each_leaf_node(rnp)
682 rcu_print_detail_task_stall_rnp(rnp);
683}
684
685static void rcu_print_task_stall_begin(struct rcu_node *rnp)
686{
687 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
688 rnp->level, rnp->grplo, rnp->grphi);
689}
690
691static void rcu_print_task_stall_end(void)
692{
693 pr_cont("\n");
694}
695
696/*
697 * Scan the current list of tasks blocked within RCU read-side critical
698 * sections, printing out the tid of each.
699 */
700static int rcu_print_task_stall(struct rcu_node *rnp)
701{
702 struct task_struct *t;
703 int ndetected = 0;
704
705 if (!rcu_preempt_blocked_readers_cgp(rnp))
706 return 0;
707 rcu_print_task_stall_begin(rnp);
708 t = list_entry(rnp->gp_tasks->prev,
709 struct task_struct, rcu_node_entry);
710 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
711 pr_cont(" P%d", t->pid);
712 ndetected++;
713 }
714 rcu_print_task_stall_end();
715 return ndetected;
716}
717
718/*
719 * Scan the current list of tasks blocked within RCU read-side critical
720 * sections, printing out the tid of each that is blocking the current
721 * expedited grace period.
722 */
723static int rcu_print_task_exp_stall(struct rcu_node *rnp)
724{
725 struct task_struct *t;
726 int ndetected = 0;
727
728 if (!rnp->exp_tasks)
729 return 0;
730 t = list_entry(rnp->exp_tasks->prev,
731 struct task_struct, rcu_node_entry);
732 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
733 pr_cont(" P%d", t->pid);
734 ndetected++;
735 }
736 return ndetected;
737}
738
739/*
740 * Check that the list of blocked tasks for the newly completed grace 646 * Check that the list of blocked tasks for the newly completed grace
741 * period is in fact empty. It is a serious bug to complete a grace 647 * period is in fact empty. It is a serious bug to complete a grace
742 * period that still has RCU readers blocked! This function must be 648 * period that still has RCU readers blocked! This function must be
@@ -804,19 +710,25 @@ static void rcu_flavor_sched_clock_irq(int user)
804 710
805/* 711/*
806 * Check for a task exiting while in a preemptible-RCU read-side 712 * Check for a task exiting while in a preemptible-RCU read-side
807 * critical section, clean up if so. No need to issue warnings, 713 * critical section, clean up if so. No need to issue warnings, as
808 * as debug_check_no_locks_held() already does this if lockdep 714 * debug_check_no_locks_held() already does this if lockdep is enabled.
809 * is enabled. 715 * Besides, if this function does anything other than just immediately
716 * return, there was a bug of some sort. Spewing warnings from this
717 * function is like as not to simply obscure important prior warnings.
810 */ 718 */
811void exit_rcu(void) 719void exit_rcu(void)
812{ 720{
813 struct task_struct *t = current; 721 struct task_struct *t = current;
814 722
815 if (likely(list_empty(&current->rcu_node_entry))) 723 if (unlikely(!list_empty(&current->rcu_node_entry))) {
724 t->rcu_read_lock_nesting = 1;
725 barrier();
726 WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
727 } else if (unlikely(t->rcu_read_lock_nesting)) {
728 t->rcu_read_lock_nesting = 1;
729 } else {
816 return; 730 return;
817 t->rcu_read_lock_nesting = 1; 731 }
818 barrier();
819 t->rcu_read_unlock_special.b.blocked = true;
820 __rcu_read_unlock(); 732 __rcu_read_unlock();
821 rcu_preempt_deferred_qs(current); 733 rcu_preempt_deferred_qs(current);
822} 734}
@@ -980,33 +892,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
980static void rcu_preempt_deferred_qs(struct task_struct *t) { } 892static void rcu_preempt_deferred_qs(struct task_struct *t) { }
981 893
982/* 894/*
983 * Because preemptible RCU does not exist, we never have to check for
984 * tasks blocked within RCU read-side critical sections.
985 */
986static void rcu_print_detail_task_stall(void)
987{
988}
989
990/*
991 * Because preemptible RCU does not exist, we never have to check for
992 * tasks blocked within RCU read-side critical sections.
993 */
994static int rcu_print_task_stall(struct rcu_node *rnp)
995{
996 return 0;
997}
998
999/*
1000 * Because preemptible RCU does not exist, we never have to check for
1001 * tasks blocked within RCU read-side critical sections that are
1002 * blocking the current expedited grace period.
1003 */
1004static int rcu_print_task_exp_stall(struct rcu_node *rnp)
1005{
1006 return 0;
1007}
1008
1009/*
1010 * Because there is no preemptible RCU, there can be no readers blocked, 895 * Because there is no preemptible RCU, there can be no readers blocked,
1011 * so there is no need to check for blocked tasks. So check only for 896 * so there is no need to check for blocked tasks. So check only for
1012 * bogus qsmask values. 897 * bogus qsmask values.
@@ -1185,8 +1070,6 @@ static int rcu_boost_kthread(void *arg)
1185static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1070static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1186 __releases(rnp->lock) 1071 __releases(rnp->lock)
1187{ 1072{
1188 struct task_struct *t;
1189
1190 raw_lockdep_assert_held_rcu_node(rnp); 1073 raw_lockdep_assert_held_rcu_node(rnp);
1191 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { 1074 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1192 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1075 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1200,9 +1083,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1200 if (rnp->exp_tasks == NULL) 1083 if (rnp->exp_tasks == NULL)
1201 rnp->boost_tasks = rnp->gp_tasks; 1084 rnp->boost_tasks = rnp->gp_tasks;
1202 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1085 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1203 t = rnp->boost_kthread_task; 1086 rcu_wake_cond(rnp->boost_kthread_task,
1204 if (t) 1087 rnp->boost_kthread_status);
1205 rcu_wake_cond(t, rnp->boost_kthread_status);
1206 } else { 1088 } else {
1207 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1089 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1208 } 1090 }
@@ -1649,98 +1531,6 @@ static void rcu_cleanup_after_idle(void)
1649 1531
1650#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1532#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1651 1533
1652#ifdef CONFIG_RCU_FAST_NO_HZ
1653
1654static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1655{
1656 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
1657
1658 sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
1659 rdp->last_accelerate & 0xffff, jiffies & 0xffff,
1660 ".l"[rdp->all_lazy],
1661 ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
1662 ".D"[!rdp->tick_nohz_enabled_snap]);
1663}
1664
1665#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
1666
1667static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1668{
1669 *cp = '\0';
1670}
1671
1672#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
1673
1674/* Initiate the stall-info list. */
1675static void print_cpu_stall_info_begin(void)
1676{
1677 pr_cont("\n");
1678}
1679
1680/*
1681 * Print out diagnostic information for the specified stalled CPU.
1682 *
1683 * If the specified CPU is aware of the current RCU grace period, then
1684 * print the number of scheduling clock interrupts the CPU has taken
1685 * during the time that it has been aware. Otherwise, print the number
1686 * of RCU grace periods that this CPU is ignorant of, for example, "1"
1687 * if the CPU was aware of the previous grace period.
1688 *
1689 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
1690 */
1691static void print_cpu_stall_info(int cpu)
1692{
1693 unsigned long delta;
1694 char fast_no_hz[72];
1695 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
1696 char *ticks_title;
1697 unsigned long ticks_value;
1698
1699 /*
1700 * We could be printing a lot while holding a spinlock. Avoid
1701 * triggering hard lockup.
1702 */
1703 touch_nmi_watchdog();
1704
1705 ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
1706 if (ticks_value) {
1707 ticks_title = "GPs behind";
1708 } else {
1709 ticks_title = "ticks this GP";
1710 ticks_value = rdp->ticks_this_gp;
1711 }
1712 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1713 delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
1714 pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
1715 cpu,
1716 "O."[!!cpu_online(cpu)],
1717 "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
1718 "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
1719 !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
1720 rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
1721 "!."[!delta],
1722 ticks_value, ticks_title,
1723 rcu_dynticks_snap(rdp) & 0xfff,
1724 rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
1725 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1726 READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
1727 fast_no_hz);
1728}
1729
1730/* Terminate the stall-info list. */
1731static void print_cpu_stall_info_end(void)
1732{
1733 pr_err("\t");
1734}
1735
1736/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
1737static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1738{
1739 rdp->ticks_this_gp = 0;
1740 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
1741 WRITE_ONCE(rdp->last_fqs_resched, jiffies);
1742}
1743
1744#ifdef CONFIG_RCU_NOCB_CPU 1534#ifdef CONFIG_RCU_NOCB_CPU
1745 1535
1746/* 1536/*
@@ -1766,11 +1556,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1766 */ 1556 */
1767 1557
1768 1558
1769/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ 1559/*
1560 * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
1561 * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
1562 * comma-separated list of CPUs and/or CPU ranges. If an invalid list is
1563 * given, a warning is emitted and all CPUs are offloaded.
1564 */
1770static int __init rcu_nocb_setup(char *str) 1565static int __init rcu_nocb_setup(char *str)
1771{ 1566{
1772 alloc_bootmem_cpumask_var(&rcu_nocb_mask); 1567 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
1773 cpulist_parse(str, rcu_nocb_mask); 1568 if (!strcasecmp(str, "all"))
1569 cpumask_setall(rcu_nocb_mask);
1570 else
1571 if (cpulist_parse(str, rcu_nocb_mask)) {
1572 pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
1573 cpumask_setall(rcu_nocb_mask);
1574 }
1774 return 1; 1575 return 1;
1775} 1576}
1776__setup("rcu_nocbs=", rcu_nocb_setup); 1577__setup("rcu_nocbs=", rcu_nocb_setup);
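For reference, the parameter now accepts either the literal string "all" or a standard cpulist, and an unparsable list triggers a warning and offloads every CPU instead of being silently ignored. Illustrative command-line fragments (cpulist syntax as documented in kernel-parameters.txt; the CPU numbers are arbitrary examples):

	rcu_nocbs=all		offload callbacks from every CPU
	rcu_nocbs=1-3,5		offload CPUs 1, 2, 3 and 5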
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
new file mode 100644
index 000000000000..f65a73a97323
--- /dev/null
+++ b/kernel/rcu/tree_stall.h
@@ -0,0 +1,709 @@
1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * RCU CPU stall warnings for normal RCU grace periods
4 *
5 * Copyright IBM Corporation, 2019
6 *
7 * Author: Paul E. McKenney <paulmck@linux.ibm.com>
8 */
9
10//////////////////////////////////////////////////////////////////////////////
11//
12// Controlling CPU stall warnings, including delay calculation.
13
14/* panic() on RCU Stall sysctl. */
15int sysctl_panic_on_rcu_stall __read_mostly;
16
17#ifdef CONFIG_PROVE_RCU
18#define RCU_STALL_DELAY_DELTA (5 * HZ)
19#else
20#define RCU_STALL_DELAY_DELTA 0
21#endif
22
23/* Limit-check stall timeouts specified at boottime and runtime. */
24int rcu_jiffies_till_stall_check(void)
25{
26 int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
27
28 /*
29 * Limit check must be consistent with the Kconfig limits
30 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
31 */
32 if (till_stall_check < 3) {
33 WRITE_ONCE(rcu_cpu_stall_timeout, 3);
34 till_stall_check = 3;
35 } else if (till_stall_check > 300) {
36 WRITE_ONCE(rcu_cpu_stall_timeout, 300);
37 till_stall_check = 300;
38 }
39 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
40}
41EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
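The clamp keeps the timeout within the Kconfig range of 3 to 300 seconds before converting to jiffies. As a worked example (assuming the common default CONFIG_RCU_CPU_STALL_TIMEOUT=21, HZ=250, and CONFIG_PROVE_RCU=n, so RCU_STALL_DELAY_DELTA is 0):

	till_stall_check = 21;	/* already within [3, 300], left unchanged */
	return 21 * 250 + 0;	/* 5250 jiffies, i.e. a 21-second stall timeout */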
42
43/* Don't do RCU CPU stall warnings during long sysrq printouts. */
44void rcu_sysrq_start(void)
45{
46 if (!rcu_cpu_stall_suppress)
47 rcu_cpu_stall_suppress = 2;
48}
49
50void rcu_sysrq_end(void)
51{
52 if (rcu_cpu_stall_suppress == 2)
53 rcu_cpu_stall_suppress = 0;
54}
55
56/* Don't print RCU CPU stall warnings during a kernel panic. */
57static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
58{
59 rcu_cpu_stall_suppress = 1;
60 return NOTIFY_DONE;
61}
62
63static struct notifier_block rcu_panic_block = {
64 .notifier_call = rcu_panic,
65};
66
67static int __init check_cpu_stall_init(void)
68{
69 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
70 return 0;
71}
72early_initcall(check_cpu_stall_init);
73
74/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
75static void panic_on_rcu_stall(void)
76{
77 if (sysctl_panic_on_rcu_stall)
78 panic("RCU Stall\n");
79}
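The decision is driven by the kernel.panic_on_rcu_stall sysctl (declared here as sysctl_panic_on_rcu_stall and wired up in kernel/sysctl.c). A typical way to enable it for debugging, assuming the standard procfs sysctl interface:

	sysctl -w kernel.panic_on_rcu_stall=1
	# or, equivalently:
	echo 1 > /proc/sys/kernel/panic_on_rcu_stall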
80
81/**
82 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
83 *
84 * Set the stall-warning timeout way off into the future, thus preventing
85 * any RCU CPU stall-warning messages from appearing in the current set of
86 * RCU grace periods.
87 *
88 * The caller must disable hard irqs.
89 */
90void rcu_cpu_stall_reset(void)
91{
92 WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
93}
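A caller that is about to keep a CPU busy with interrupts off for a long time (a kernel debugger, for instance) would use it roughly as in this sketch; the surrounding operation is a placeholder, only the irq discipline matters:

	unsigned long flags;

	local_irq_save(flags);
	rcu_cpu_stall_reset();	/* push ->jiffies_stall far into the future */
	/* ... long-running work with hard irqs disabled ... */
	local_irq_restore(flags);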
94
95//////////////////////////////////////////////////////////////////////////////
96//
97// Interaction with RCU grace periods
98
99/* Start of new grace period, so record stall time (and forcing times). */
100static void record_gp_stall_check_time(void)
101{
102 unsigned long j = jiffies;
103 unsigned long j1;
104
105 rcu_state.gp_start = j;
106 j1 = rcu_jiffies_till_stall_check();
107 /* Record ->gp_start before ->jiffies_stall. */
108 smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
109 rcu_state.jiffies_resched = j + j1 / 2;
110 rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
111}
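The smp_store_release() above is the writer half of the ordering that check_cpu_stall() later depends on: ->gp_start is published before ->jiffies_stall, while the reader fetches ->jiffies_stall before ->gp_start with smp_rmb() in between. A simplified view of the pairing (the surrounding ->gp_seq updates and re-checks, which catch a full grace-period transition between the fetches, are omitted):

	/*
	 * writer (grace-period start)              reader (check_cpu_stall())
	 * ---------------------------              --------------------------
	 * rcu_state.gp_start = j;                  js  = READ_ONCE(rcu_state.jiffies_stall);
	 * smp_store_release(                       smp_rmb();
	 *     &rcu_state.jiffies_stall, j + j1);   gps = READ_ONCE(rcu_state.gp_start);
	 *
	 * A reader that sees the newly released ->jiffies_stall is therefore
	 * guaranteed to also see the matching ->gp_start, which is what lets
	 * check_cpu_stall() compare the two without holding any lock.
	 */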
112
113/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
114static void zero_cpu_stall_ticks(struct rcu_data *rdp)
115{
116 rdp->ticks_this_gp = 0;
117 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
118 WRITE_ONCE(rdp->last_fqs_resched, jiffies);
119}
120
121/*
122 * If too much time has passed in the current grace period, and if
123 * so configured, go kick the relevant kthreads.
124 */
125static void rcu_stall_kick_kthreads(void)
126{
127 unsigned long j;
128
129 if (!rcu_kick_kthreads)
130 return;
131 j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
132 if (time_after(jiffies, j) && rcu_state.gp_kthread &&
133 (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
134 WARN_ONCE(1, "Kicking %s grace-period kthread\n",
135 rcu_state.name);
136 rcu_ftrace_dump(DUMP_ALL);
137 wake_up_process(rcu_state.gp_kthread);
138 WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
139 }
140}
141
142/*
143 * Handler for the irq_work request posted about halfway into the RCU CPU
144 * stall timeout, and used to detect excessive irq disabling. Set state
145 * appropriately, but just complain if there is unexpected state on entry.
146 */
147static void rcu_iw_handler(struct irq_work *iwp)
148{
149 struct rcu_data *rdp;
150 struct rcu_node *rnp;
151
152 rdp = container_of(iwp, struct rcu_data, rcu_iw);
153 rnp = rdp->mynode;
154 raw_spin_lock_rcu_node(rnp);
155 if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
156 rdp->rcu_iw_gp_seq = rnp->gp_seq;
157 rdp->rcu_iw_pending = false;
158 }
159 raw_spin_unlock_rcu_node(rnp);
160}
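This handler is the completion side of an irq_work that tree.c posts about halfway into the stall timeout when a CPU appears to be running with irqs disabled; if the target CPU ever re-enables interrupts, the handler runs, records the grace-period number, and clears the pending flag, and print_cpu_stall_info() below folds that state into its per-CPU summary line. The posting side is not part of this file; it follows the usual irq_work pattern, approximately (field names match this file, the condition is simplified):

	/* Posting side, sketch only -- the real check lives in tree.c: */
	if (!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq) {
		rdp->rcu_iw_pending = true;
		init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
		irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
	}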
161
162//////////////////////////////////////////////////////////////////////////////
163//
164// Printing RCU CPU stall warnings
165
166#ifdef CONFIG_PREEMPT
167
168/*
169 * Dump detailed information for all tasks blocking the current RCU
170 * grace period on the specified rcu_node structure.
171 */
172static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
173{
174 unsigned long flags;
175 struct task_struct *t;
176
177 raw_spin_lock_irqsave_rcu_node(rnp, flags);
178 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
179 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
180 return;
181 }
182 t = list_entry(rnp->gp_tasks->prev,
183 struct task_struct, rcu_node_entry);
184 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
185 /*
186 * We could be printing a lot while holding a spinlock.
187 * Avoid triggering hard lockup.
188 */
189 touch_nmi_watchdog();
190 sched_show_task(t);
191 }
192 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
193}
194
195/*
196 * Scan the current list of tasks blocked within RCU read-side critical
197 * sections, printing out the tid of each.
198 */
199static int rcu_print_task_stall(struct rcu_node *rnp)
200{
201 struct task_struct *t;
202 int ndetected = 0;
203
204 if (!rcu_preempt_blocked_readers_cgp(rnp))
205 return 0;
206 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
207 rnp->level, rnp->grplo, rnp->grphi);
208 t = list_entry(rnp->gp_tasks->prev,
209 struct task_struct, rcu_node_entry);
210 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
211 pr_cont(" P%d", t->pid);
212 ndetected++;
213 }
214 pr_cont("\n");
215 return ndetected;
216}
217
218#else /* #ifdef CONFIG_PREEMPT */
219
220/*
221 * Because preemptible RCU does not exist, we never have to check for
222 * tasks blocked within RCU read-side critical sections.
223 */
224static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
225{
226}
227
228/*
229 * Because preemptible RCU does not exist, we never have to check for
230 * tasks blocked within RCU read-side critical sections.
231 */
232static int rcu_print_task_stall(struct rcu_node *rnp)
233{
234 return 0;
235}
236#endif /* #else #ifdef CONFIG_PREEMPT */
237
238/*
239 * Dump stacks of all tasks running on stalled CPUs. First try using
240 * NMIs, but fall back to manual remote stack tracing on architectures
241 * that don't support NMI-based stack dumps. The NMI-triggered stack
242 * traces are more accurate because they are printed by the target CPU.
243 */
244static void rcu_dump_cpu_stacks(void)
245{
246 int cpu;
247 unsigned long flags;
248 struct rcu_node *rnp;
249
250 rcu_for_each_leaf_node(rnp) {
251 raw_spin_lock_irqsave_rcu_node(rnp, flags);
252 for_each_leaf_node_possible_cpu(rnp, cpu)
253 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
254 if (!trigger_single_cpu_backtrace(cpu))
255 dump_cpu_task(cpu);
256 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
257 }
258}
259
260#ifdef CONFIG_RCU_FAST_NO_HZ
261
262static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
263{
264 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
265
266 sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
267 rdp->last_accelerate & 0xffff, jiffies & 0xffff,
268 ".l"[rdp->all_lazy],
269 ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
270 ".D"[!!rdp->tick_nohz_enabled_snap]);
271}
272
273#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
274
275static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
276{
277 *cp = '\0';
278}
279
280#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
281
282/*
283 * Print out diagnostic information for the specified stalled CPU.
284 *
285 * If the specified CPU is aware of the current RCU grace period, then
286 * print the number of scheduling clock interrupts the CPU has taken
287 * during the time that it has been aware. Otherwise, print the number
288 * of RCU grace periods that this CPU is ignorant of, for example, "1"
289 * if the CPU was aware of the previous grace period.
290 *
291 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
292 */
293static void print_cpu_stall_info(int cpu)
294{
295 unsigned long delta;
296 char fast_no_hz[72];
297 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
298 char *ticks_title;
299 unsigned long ticks_value;
300
301 /*
302 * We could be printing a lot while holding a spinlock. Avoid
303 * triggering hard lockup.
304 */
305 touch_nmi_watchdog();
306
307 ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
308 if (ticks_value) {
309 ticks_title = "GPs behind";
310 } else {
311 ticks_title = "ticks this GP";
312 ticks_value = rdp->ticks_this_gp;
313 }
314 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
315 delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
316 pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
317 cpu,
318 "O."[!!cpu_online(cpu)],
319 "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
320 "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
321 !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
322 rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
323 "!."[!delta],
324 ticks_value, ticks_title,
325 rcu_dynticks_snap(rdp) & 0xfff,
326 rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
327 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
328 READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
329 fast_no_hz);
330}
331
332/* Complain about starvation of grace-period kthread. */
333static void rcu_check_gp_kthread_starvation(void)
334{
335 struct task_struct *gpk = rcu_state.gp_kthread;
336 unsigned long j;
337
338 j = jiffies - READ_ONCE(rcu_state.gp_activity);
339 if (j > 2 * HZ) {
340 pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
341 rcu_state.name, j,
342 (long)rcu_seq_current(&rcu_state.gp_seq),
343 READ_ONCE(rcu_state.gp_flags),
344 gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
345 gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
346 if (gpk) {
347 pr_err("RCU grace-period kthread stack dump:\n");
348 sched_show_task(gpk);
349 wake_up_process(gpk);
350 }
351 }
352}
353
354static void print_other_cpu_stall(unsigned long gp_seq)
355{
356 int cpu;
357 unsigned long flags;
358 unsigned long gpa;
359 unsigned long j;
360 int ndetected = 0;
361 struct rcu_node *rnp;
362 long totqlen = 0;
363
364 /* Kick and suppress, if so configured. */
365 rcu_stall_kick_kthreads();
366 if (rcu_cpu_stall_suppress)
367 return;
368
369 /*
370 * OK, time to rat on our buddy...
371 * See Documentation/RCU/stallwarn.txt for info on how to debug
372 * RCU CPU stall warnings.
373 */
374 pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
375 rcu_for_each_leaf_node(rnp) {
376 raw_spin_lock_irqsave_rcu_node(rnp, flags);
377 ndetected += rcu_print_task_stall(rnp);
378 if (rnp->qsmask != 0) {
379 for_each_leaf_node_possible_cpu(rnp, cpu)
380 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
381 print_cpu_stall_info(cpu);
382 ndetected++;
383 }
384 }
385 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
386 }
387
388 for_each_possible_cpu(cpu)
389 totqlen += rcu_get_n_cbs_cpu(cpu);
390 pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
391 smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
392 (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
393 if (ndetected) {
394 rcu_dump_cpu_stacks();
395
396 /* Complain about tasks blocking the grace period. */
397 rcu_for_each_leaf_node(rnp)
398 rcu_print_detail_task_stall_rnp(rnp);
399 } else {
400 if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
401 pr_err("INFO: Stall ended before state dump start\n");
402 } else {
403 j = jiffies;
404 gpa = READ_ONCE(rcu_state.gp_activity);
405 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
406 rcu_state.name, j - gpa, j, gpa,
407 READ_ONCE(jiffies_till_next_fqs),
408 rcu_get_root()->qsmask);
409 /* In this case, the current CPU might be at fault. */
410 sched_show_task(current);
411 }
412 }
413 /* Rewrite if needed in case of slow consoles. */
414 if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
415 WRITE_ONCE(rcu_state.jiffies_stall,
416 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
417
418 rcu_check_gp_kthread_starvation();
419
420 panic_on_rcu_stall();
421
422 rcu_force_quiescent_state(); /* Kick them all. */
423}
424
425static void print_cpu_stall(void)
426{
427 int cpu;
428 unsigned long flags;
429 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
430 struct rcu_node *rnp = rcu_get_root();
431 long totqlen = 0;
432
433 /* Kick and suppress, if so configured. */
434 rcu_stall_kick_kthreads();
435 if (rcu_cpu_stall_suppress)
436 return;
437
438 /*
439 * OK, time to rat on ourselves...
440 * See Documentation/RCU/stallwarn.txt for info on how to debug
441 * RCU CPU stall warnings.
442 */
443 pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
444 raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
445 print_cpu_stall_info(smp_processor_id());
446 raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
447 for_each_possible_cpu(cpu)
448 totqlen += rcu_get_n_cbs_cpu(cpu);
449 pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
450 jiffies - rcu_state.gp_start,
451 (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
452
453 rcu_check_gp_kthread_starvation();
454
455 rcu_dump_cpu_stacks();
456
457 raw_spin_lock_irqsave_rcu_node(rnp, flags);
458 /* Rewrite if needed in case of slow consoles. */
459 if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
460 WRITE_ONCE(rcu_state.jiffies_stall,
461 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
462 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
463
464 panic_on_rcu_stall();
465
466 /*
467 * Attempt to revive the RCU machinery by forcing a context switch.
468 *
469 * A context switch would normally allow the RCU state machine to make
470 * progress and it could be we're stuck in kernel space without context
471 * switches for an entirely unreasonable amount of time.
472 */
473 set_tsk_need_resched(current);
474 set_preempt_need_resched();
475}
476
477static void check_cpu_stall(struct rcu_data *rdp)
478{
479 unsigned long gs1;
480 unsigned long gs2;
481 unsigned long gps;
482 unsigned long j;
483 unsigned long jn;
484 unsigned long js;
485 struct rcu_node *rnp;
486
487 if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
488 !rcu_gp_in_progress())
489 return;
490 rcu_stall_kick_kthreads();
491 j = jiffies;
492
493 /*
494 * Lots of memory barriers to reject false positives.
495 *
496 * The idea is to pick up rcu_state.gp_seq, then
497 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
498 * another copy of rcu_state.gp_seq. These values are updated in
499 * the opposite order with memory barriers (or equivalent) during
500 * grace-period initialization and cleanup. Now, a false positive
501 * can occur if we get a new value of rcu_state.gp_start and an old
502 * value of rcu_state.jiffies_stall. But given the memory barriers,
503 * the only way that this can happen is if one grace period ends
504 * and another starts between these two fetches. This is detected
505 * by comparing the second fetch of rcu_state.gp_seq with the
506 * previous fetch from rcu_state.gp_seq.
507 *
508 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
509 * and rcu_state.gp_start suffice to forestall false positives.
510 */
511 gs1 = READ_ONCE(rcu_state.gp_seq);
512 smp_rmb(); /* Pick up ->gp_seq first... */
513 js = READ_ONCE(rcu_state.jiffies_stall);
514 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
515 gps = READ_ONCE(rcu_state.gp_start);
516 smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
517 gs2 = READ_ONCE(rcu_state.gp_seq);
518 if (gs1 != gs2 ||
519 ULONG_CMP_LT(j, js) ||
520 ULONG_CMP_GE(gps, js))
521 return; /* No stall or GP completed since entering function. */
522 rnp = rdp->mynode;
523 jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
524 if (rcu_gp_in_progress() &&
525 (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
526 cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
527
528 /* We haven't checked in, so go dump stack. */
529 print_cpu_stall();
530
531 } else if (rcu_gp_in_progress() &&
532 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
533 cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
534
535 /* They had a few time units to dump stack, so complain. */
536 print_other_cpu_stall(gs2);
537 }
538}
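The cmpxchg() on ->jiffies_stall is what keeps each stall report single-shot: every CPU that notices the overdue grace period races to advance the timestamp from js to jn, but only the CPU whose cmpxchg() returns the old value js wins and gets to print. Reduced to a hypothetical helper (not code from this patch, same idea):

	/* Returns true for exactly one caller per stall interval. */
	static bool rcu_stall_report_ticket(unsigned long js, unsigned long jn)
	{
		return cmpxchg(&rcu_state.jiffies_stall, js, jn) == js;
	}

Every loser observes jn (or an even newer value) in ->jiffies_stall and quietly backs off until the next interval expires.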
539
540//////////////////////////////////////////////////////////////////////////////
541//
542// RCU forward-progress mechanisms, including that of callback invocation.
543
544
545/*
546 * Show the state of the grace-period kthreads.
547 */
548void show_rcu_gp_kthreads(void)
549{
550 int cpu;
551 unsigned long j;
552 unsigned long ja;
553 unsigned long jr;
554 unsigned long jw;
555 struct rcu_data *rdp;
556 struct rcu_node *rnp;
557
558 j = jiffies;
559 ja = j - READ_ONCE(rcu_state.gp_activity);
560 jr = j - READ_ONCE(rcu_state.gp_req_activity);
561 jw = j - READ_ONCE(rcu_state.gp_wake_time);
562 pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
563 rcu_state.name, gp_state_getname(rcu_state.gp_state),
564 rcu_state.gp_state,
565 rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
566 ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
567 (long)READ_ONCE(rcu_state.gp_seq),
568 (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
569 READ_ONCE(rcu_state.gp_flags));
570 rcu_for_each_node_breadth_first(rnp) {
571 if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
572 continue;
573 pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
574 rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
575 (long)rnp->gp_seq_needed);
576 if (!rcu_is_leaf_node(rnp))
577 continue;
578 for_each_leaf_node_possible_cpu(rnp, cpu) {
579 rdp = per_cpu_ptr(&rcu_data, cpu);
580 if (rdp->gpwrap ||
581 ULONG_CMP_GE(rcu_state.gp_seq,
582 rdp->gp_seq_needed))
583 continue;
584 pr_info("\tcpu %d ->gp_seq_needed %ld\n",
585 cpu, (long)rdp->gp_seq_needed);
586 }
587 }
588 /* sched_show_task(rcu_state.gp_kthread); */
589}
590EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
591
592/*
593 * This function checks for grace-period requests that fail to motivate
594 * RCU to come out of its idle mode.
595 */
596static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
597 const unsigned long gpssdelay)
598{
599 unsigned long flags;
600 unsigned long j;
601 struct rcu_node *rnp_root = rcu_get_root();
602 static atomic_t warned = ATOMIC_INIT(0);
603
604 if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
605 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
606 return;
607 j = jiffies; /* Expensive access, and in common case don't get here. */
608 if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
609 time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
610 atomic_read(&warned))
611 return;
612
613 raw_spin_lock_irqsave_rcu_node(rnp, flags);
614 j = jiffies;
615 if (rcu_gp_in_progress() ||
616 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
617 time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
618 time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
619 atomic_read(&warned)) {
620 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
621 return;
622 }
623 /* Hold onto the leaf lock to make others see warned==1. */
624
625 if (rnp_root != rnp)
626 raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
627 j = jiffies;
628 if (rcu_gp_in_progress() ||
629 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
630 time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
631 time_before(j, rcu_state.gp_activity + gpssdelay) ||
632 atomic_xchg(&warned, 1)) {
633 raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
634 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
635 return;
636 }
637 WARN_ON(1);
638 if (rnp_root != rnp)
639 raw_spin_unlock_rcu_node(rnp_root);
640 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
641 show_rcu_gp_kthreads();
642}
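The structure above is a warn-at-most-once pattern: the lockless checks filter the common case cheaply, the leaf and root rcu_node locks serialize the slow path, and atomic_xchg() ensures that even if several CPUs reach the final check concurrently, only the first one trips the WARN_ON(). Its essence, as a stand-alone sketch with a hypothetical wrapper name:

	static void warn_once_racy(void)
	{
		static atomic_t warned = ATOMIC_INIT(0);

		if (atomic_read(&warned))	/* cheap, unordered fast path */
			return;
		if (atomic_xchg(&warned, 1))	/* exactly one caller sees 0 here */
			return;
		WARN_ON(1);			/* the single winner complains */
	}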
643
644/*
645 * Do a forward-progress check for rcutorture. This is normally invoked
646 * due to an OOM event. The argument "j" gives the time period during
647 * which rcutorture would like progress to have been made.
648 */
649void rcu_fwd_progress_check(unsigned long j)
650{
651 unsigned long cbs;
652 int cpu;
653 unsigned long max_cbs = 0;
654 int max_cpu = -1;
655 struct rcu_data *rdp;
656
657 if (rcu_gp_in_progress()) {
658 pr_info("%s: GP age %lu jiffies\n",
659 __func__, jiffies - rcu_state.gp_start);
660 show_rcu_gp_kthreads();
661 } else {
662 pr_info("%s: Last GP end %lu jiffies ago\n",
663 __func__, jiffies - rcu_state.gp_end);
664 preempt_disable();
665 rdp = this_cpu_ptr(&rcu_data);
666 rcu_check_gp_start_stall(rdp->mynode, rdp, j);
667 preempt_enable();
668 }
669 for_each_possible_cpu(cpu) {
670 cbs = rcu_get_n_cbs_cpu(cpu);
671 if (!cbs)
672 continue;
673 if (max_cpu < 0)
674 pr_info("%s: callbacks", __func__);
675 pr_cont(" %d: %lu", cpu, cbs);
676 if (cbs <= max_cbs)
677 continue;
678 max_cbs = cbs;
679 max_cpu = cpu;
680 }
681 if (max_cpu >= 0)
682 pr_cont("\n");
683}
684EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
685
686/* Commandeer a sysrq key to dump RCU's tree. */
687static bool sysrq_rcu;
688module_param(sysrq_rcu, bool, 0444);
689
690/* Dump grace-period-request information due to commandeered sysrq. */
691static void sysrq_show_rcu(int key)
692{
693 show_rcu_gp_kthreads();
694}
695
696static struct sysrq_key_op sysrq_rcudump_op = {
697 .handler = sysrq_show_rcu,
698 .help_msg = "show-rcu(y)",
699 .action_msg = "Show RCU tree",
700 .enable_mask = SYSRQ_ENABLE_DUMP,
701};
702
703static int __init rcu_sysrq_init(void)
704{
705 if (sysrq_rcu)
706 return register_sysrq_key('y', &sysrq_rcudump_op);
707 return 0;
708}
709early_initcall(rcu_sysrq_init);
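Because this file is #included into tree.c, the boot parameter presumably takes the same rcutree. prefix as the other parameters defined there (an inference, not something this diff spells out). With it enabled, the dump is requested through the standard sysrq mechanism, for example:

	rcutree.sysrq_rcu=1		(kernel command line, registers the 'y' key)
	echo y > /proc/sysrq-trigger	(at run time, prints the RCU tree via show_rcu_gp_kthreads())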
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index cbaa976c5945..c3bf44ba42e5 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -424,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
424#endif 424#endif
425 425
426#ifdef CONFIG_RCU_STALL_COMMON 426#ifdef CONFIG_RCU_STALL_COMMON
427
428#ifdef CONFIG_PROVE_RCU
429#define RCU_STALL_DELAY_DELTA (5 * HZ)
430#else
431#define RCU_STALL_DELAY_DELTA 0
432#endif
433
434int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 427int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
435EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); 428EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
436static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
437
438module_param(rcu_cpu_stall_suppress, int, 0644); 429module_param(rcu_cpu_stall_suppress, int, 0644);
430int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
439module_param(rcu_cpu_stall_timeout, int, 0644); 431module_param(rcu_cpu_stall_timeout, int, 0644);
440
441int rcu_jiffies_till_stall_check(void)
442{
443 int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
444
445 /*
446 * Limit check must be consistent with the Kconfig limits
447 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
448 */
449 if (till_stall_check < 3) {
450 WRITE_ONCE(rcu_cpu_stall_timeout, 3);
451 till_stall_check = 3;
452 } else if (till_stall_check > 300) {
453 WRITE_ONCE(rcu_cpu_stall_timeout, 300);
454 till_stall_check = 300;
455 }
456 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
457}
458EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
459
460void rcu_sysrq_start(void)
461{
462 if (!rcu_cpu_stall_suppress)
463 rcu_cpu_stall_suppress = 2;
464}
465
466void rcu_sysrq_end(void)
467{
468 if (rcu_cpu_stall_suppress == 2)
469 rcu_cpu_stall_suppress = 0;
470}
471
472static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
473{
474 rcu_cpu_stall_suppress = 1;
475 return NOTIFY_DONE;
476}
477
478static struct notifier_block rcu_panic_block = {
479 .notifier_call = rcu_panic,
480};
481
482static int __init check_cpu_stall_init(void)
483{
484 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
485 return 0;
486}
487early_initcall(check_cpu_stall_init);
488
489#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 432#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
490 433
491#ifdef CONFIG_TASKS_RCU 434#ifdef CONFIG_TASKS_RCU
diff --git a/kernel/resource.c b/kernel/resource.c
index 92190f62ebc5..8c15f846e8ef 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -520,21 +520,20 @@ EXPORT_SYMBOL_GPL(page_is_ram);
520int region_intersects(resource_size_t start, size_t size, unsigned long flags, 520int region_intersects(resource_size_t start, size_t size, unsigned long flags,
521 unsigned long desc) 521 unsigned long desc)
522{ 522{
523 resource_size_t end = start + size - 1; 523 struct resource res;
524 int type = 0; int other = 0; 524 int type = 0; int other = 0;
525 struct resource *p; 525 struct resource *p;
526 526
527 res.start = start;
528 res.end = start + size - 1;
529
527 read_lock(&resource_lock); 530 read_lock(&resource_lock);
528 for (p = iomem_resource.child; p ; p = p->sibling) { 531 for (p = iomem_resource.child; p ; p = p->sibling) {
529 bool is_type = (((p->flags & flags) == flags) && 532 bool is_type = (((p->flags & flags) == flags) &&
530 ((desc == IORES_DESC_NONE) || 533 ((desc == IORES_DESC_NONE) ||
531 (desc == p->desc))); 534 (desc == p->desc)));
532 535
533 if (start >= p->start && start <= p->end) 536 if (resource_overlaps(p, &res))
534 is_type ? type++ : other++;
535 if (end >= p->start && end <= p->end)
536 is_type ? type++ : other++;
537 if (p->start >= start && p->end <= end)
538 is_type ? type++ : other++; 537 is_type ? type++ : other++;
539 } 538 }
540 read_unlock(&resource_lock); 539 read_unlock(&resource_lock);
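resource_overlaps() replaces the three open-coded range tests with the standard closed-interval overlap predicate from include/linux/ioport.h, which is essentially:

	/* Two closed ranges [r1->start, r1->end] and [r2->start, r2->end] overlap iff: */
	static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
	{
		return r1->start <= r2->end && r1->end >= r2->start;
	}

so each child resource is now counted at most once per walk instead of once per matching sub-condition.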
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 25e9a7b60eba..9424ee90589e 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs)
254 * - signal delivery, 254 * - signal delivery,
255 * and return to user-space. 255 * and return to user-space.
256 * 256 *
257 * This is how we can ensure that the entire rseq critical section, 257 * This is how we can ensure that the entire rseq critical section
258 * consisting of both the C part and the assembly instruction sequence,
259 * will issue the commit instruction only if executed atomically with 258 * will issue the commit instruction only if executed atomically with
260 * respect to other threads scheduled on the same CPU, and with respect 259 * respect to other threads scheduled on the same CPU, and with respect
261 * to signal handlers. 260 * to signal handlers.
@@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
314 /* Unregister rseq for current thread. */ 313 /* Unregister rseq for current thread. */
315 if (current->rseq != rseq || !current->rseq) 314 if (current->rseq != rseq || !current->rseq)
316 return -EINVAL; 315 return -EINVAL;
317 if (current->rseq_len != rseq_len) 316 if (rseq_len != sizeof(*rseq))
318 return -EINVAL; 317 return -EINVAL;
319 if (current->rseq_sig != sig) 318 if (current->rseq_sig != sig)
320 return -EPERM; 319 return -EPERM;
@@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
322 if (ret) 321 if (ret)
323 return ret; 322 return ret;
324 current->rseq = NULL; 323 current->rseq = NULL;
325 current->rseq_len = 0;
326 current->rseq_sig = 0; 324 current->rseq_sig = 0;
327 return 0; 325 return 0;
328 } 326 }
@@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
336 * the provided address differs from the prior 334 * the provided address differs from the prior
337 * one. 335 * one.
338 */ 336 */
339 if (current->rseq != rseq || current->rseq_len != rseq_len) 337 if (current->rseq != rseq || rseq_len != sizeof(*rseq))
340 return -EINVAL; 338 return -EINVAL;
341 if (current->rseq_sig != sig) 339 if (current->rseq_sig != sig)
342 return -EPERM; 340 return -EPERM;
@@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
354 if (!access_ok(rseq, rseq_len)) 352 if (!access_ok(rseq, rseq_len))
355 return -EFAULT; 353 return -EFAULT;
356 current->rseq = rseq; 354 current->rseq = rseq;
357 current->rseq_len = rseq_len;
358 current->rseq_sig = sig; 355 current->rseq_sig = sig;
359 /* 356 /*
360 * If rseq was previously inactive, and has just been 357 * If rseq was previously inactive, and has just been
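With rseq_len now required to equal sizeof(*rseq), user space must pass exactly the size of the shared struct rseq at both registration and unregistration. A minimal user-space sketch of registration (the signature constant and thread-local name are placeholders; a libc that registers rseq on its own would make this return -1 with errno set to EBUSY):

	#include <linux/rseq.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static __thread struct rseq my_rseq;	/* uapi struct is already 32-byte aligned */
	#define MY_RSEQ_SIG	0x53053053	/* placeholder signature */

	static int my_rseq_register(void)
	{
		/* Any length other than sizeof(struct rseq) now gets -EINVAL. */
		return syscall(__NR_rseq, &my_rseq, sizeof(my_rseq), 0, MY_RSEQ_SIG);
	}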
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ead464a0f2e5..102dfcf0a29a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
792 rq->nr_uninterruptible--; 792 rq->nr_uninterruptible--;
793 793
794 enqueue_task(rq, p, flags); 794 enqueue_task(rq, p, flags);
795
796 p->on_rq = TASK_ON_RQ_QUEUED;
795} 797}
796 798
797void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 799void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
798{ 800{
801 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
802
799 if (task_contributes_to_load(p)) 803 if (task_contributes_to_load(p))
800 rq->nr_uninterruptible++; 804 rq->nr_uninterruptible++;
801 805
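Editorial aside on the hunk above: the TASK_ON_RQ_QUEUED / TASK_ON_RQ_MIGRATING bookkeeping moves into activate_task() and deactivate_task() themselves, which is why the later hunks in this file and in fair.c drop the open-coded p->on_rq assignments at every call site. A simplified standalone model of the refactor, with values chosen to match the kernel's definitions but otherwise self-contained:

#include <stdio.h>

#define TASK_ON_RQ_QUEUED     1
#define TASK_ON_RQ_MIGRATING  2
#define DEQUEUE_SLEEP         0x01

struct task { int on_rq; };

static void activate_task(struct task *p)
{
        /* ...enqueue into the runqueue... */
        p->on_rq = TASK_ON_RQ_QUEUED;            /* previously done by every caller */
}

static void deactivate_task(struct task *p, int flags)
{
        /* A sleeping task is simply off the rq; otherwise it is mid-migration. */
        p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
        /* ...dequeue from the runqueue... */
}

int main(void)
{
        struct task p = { 0 };

        activate_task(&p);
        printf("after activate:    on_rq=%d\n", p.on_rq);   /* 1: queued    */
        deactivate_task(&p, 0);
        printf("after migrate-off: on_rq=%d\n", p.on_rq);   /* 2: migrating */
        deactivate_task(&p, DEQUEUE_SLEEP);
        printf("after sleep:       on_rq=%d\n", p.on_rq);   /* 0: off rq    */
        return 0;
}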
@@ -920,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
920} 924}
921 925
922/* 926/*
923 * Per-CPU kthreads are allowed to run on !actie && online CPUs, see 927 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
924 * __set_cpus_allowed_ptr() and select_fallback_rq(). 928 * __set_cpus_allowed_ptr() and select_fallback_rq().
925 */ 929 */
926static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 930static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
@@ -1151,7 +1155,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
1151 /* Need help from migration thread: drop lock and wait. */ 1155 /* Need help from migration thread: drop lock and wait. */
1152 task_rq_unlock(rq, p, &rf); 1156 task_rq_unlock(rq, p, &rf);
1153 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 1157 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1154 tlb_migrate_finish(p->mm);
1155 return 0; 1158 return 0;
1156 } else if (task_on_rq_queued(p)) { 1159 } else if (task_on_rq_queued(p)) {
1157 /* 1160 /*
@@ -1237,11 +1240,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
1237 rq_pin_lock(src_rq, &srf); 1240 rq_pin_lock(src_rq, &srf);
1238 rq_pin_lock(dst_rq, &drf); 1241 rq_pin_lock(dst_rq, &drf);
1239 1242
1240 p->on_rq = TASK_ON_RQ_MIGRATING;
1241 deactivate_task(src_rq, p, 0); 1243 deactivate_task(src_rq, p, 0);
1242 set_task_cpu(p, cpu); 1244 set_task_cpu(p, cpu);
1243 activate_task(dst_rq, p, 0); 1245 activate_task(dst_rq, p, 0);
1244 p->on_rq = TASK_ON_RQ_QUEUED;
1245 check_preempt_curr(dst_rq, p, 0); 1246 check_preempt_curr(dst_rq, p, 0);
1246 1247
1247 rq_unpin_lock(dst_rq, &drf); 1248 rq_unpin_lock(dst_rq, &drf);
@@ -1681,16 +1682,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1681 __schedstat_inc(p->se.statistics.nr_wakeups_sync); 1682 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
1682} 1683}
1683 1684
1684static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1685{
1686 activate_task(rq, p, en_flags);
1687 p->on_rq = TASK_ON_RQ_QUEUED;
1688
1689 /* If a worker is waking up, notify the workqueue: */
1690 if (p->flags & PF_WQ_WORKER)
1691 wq_worker_waking_up(p, cpu_of(rq));
1692}
1693
1694/* 1685/*
1695 * Mark the task runnable and perform wakeup-preemption. 1686 * Mark the task runnable and perform wakeup-preemption.
1696 */ 1687 */
@@ -1742,7 +1733,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1742 en_flags |= ENQUEUE_MIGRATED; 1733 en_flags |= ENQUEUE_MIGRATED;
1743#endif 1734#endif
1744 1735
1745 ttwu_activate(rq, p, en_flags); 1736 activate_task(rq, p, en_flags);
1746 ttwu_do_wakeup(rq, p, wake_flags, rf); 1737 ttwu_do_wakeup(rq, p, wake_flags, rf);
1747} 1738}
1748 1739
@@ -2107,56 +2098,6 @@ out:
2107} 2098}
2108 2099
2109/** 2100/**
2110 * try_to_wake_up_local - try to wake up a local task with rq lock held
2111 * @p: the thread to be awakened
2112 * @rf: request-queue flags for pinning
2113 *
2114 * Put @p on the run-queue if it's not already there. The caller must
2115 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2116 * the current task.
2117 */
2118static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2119{
2120 struct rq *rq = task_rq(p);
2121
2122 if (WARN_ON_ONCE(rq != this_rq()) ||
2123 WARN_ON_ONCE(p == current))
2124 return;
2125
2126 lockdep_assert_held(&rq->lock);
2127
2128 if (!raw_spin_trylock(&p->pi_lock)) {
2129 /*
2130 * This is OK, because current is on_cpu, which avoids it being
2131 * picked for load-balance and preemption/IRQs are still
2132 * disabled avoiding further scheduler activity on it and we've
2133 * not yet picked a replacement task.
2134 */
2135 rq_unlock(rq, rf);
2136 raw_spin_lock(&p->pi_lock);
2137 rq_relock(rq, rf);
2138 }
2139
2140 if (!(p->state & TASK_NORMAL))
2141 goto out;
2142
2143 trace_sched_waking(p);
2144
2145 if (!task_on_rq_queued(p)) {
2146 if (p->in_iowait) {
2147 delayacct_blkio_end(p);
2148 atomic_dec(&rq->nr_iowait);
2149 }
2150 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
2151 }
2152
2153 ttwu_do_wakeup(rq, p, 0, rf);
2154 ttwu_stat(p, smp_processor_id(), 0);
2155out:
2156 raw_spin_unlock(&p->pi_lock);
2157}
2158
2159/**
2160 * wake_up_process - Wake up a specific process 2101 * wake_up_process - Wake up a specific process
2161 * @p: The process to be woken up. 2102 * @p: The process to be woken up.
2162 * 2103 *
@@ -2467,7 +2408,6 @@ void wake_up_new_task(struct task_struct *p)
2467 post_init_entity_util_avg(p); 2408 post_init_entity_util_avg(p);
2468 2409
2469 activate_task(rq, p, ENQUEUE_NOCLOCK); 2410 activate_task(rq, p, ENQUEUE_NOCLOCK);
2470 p->on_rq = TASK_ON_RQ_QUEUED;
2471 trace_sched_wakeup_new(p); 2411 trace_sched_wakeup_new(p);
2472 check_preempt_curr(rq, p, WF_FORK); 2412 check_preempt_curr(rq, p, WF_FORK);
2473#ifdef CONFIG_SMP 2413#ifdef CONFIG_SMP
@@ -3466,25 +3406,11 @@ static void __sched notrace __schedule(bool preempt)
3466 prev->state = TASK_RUNNING; 3406 prev->state = TASK_RUNNING;
3467 } else { 3407 } else {
3468 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); 3408 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3469 prev->on_rq = 0;
3470 3409
3471 if (prev->in_iowait) { 3410 if (prev->in_iowait) {
3472 atomic_inc(&rq->nr_iowait); 3411 atomic_inc(&rq->nr_iowait);
3473 delayacct_blkio_start(); 3412 delayacct_blkio_start();
3474 } 3413 }
3475
3476 /*
3477 * If a worker went to sleep, notify and ask workqueue
3478 * whether it wants to wake up a task to maintain
3479 * concurrency.
3480 */
3481 if (prev->flags & PF_WQ_WORKER) {
3482 struct task_struct *to_wakeup;
3483
3484 to_wakeup = wq_worker_sleeping(prev);
3485 if (to_wakeup)
3486 try_to_wake_up_local(to_wakeup, &rf);
3487 }
3488 } 3414 }
3489 switch_count = &prev->nvcsw; 3415 switch_count = &prev->nvcsw;
3490 } 3416 }
@@ -3544,6 +3470,20 @@ static inline void sched_submit_work(struct task_struct *tsk)
3544{ 3470{
3545 if (!tsk->state || tsk_is_pi_blocked(tsk)) 3471 if (!tsk->state || tsk_is_pi_blocked(tsk))
3546 return; 3472 return;
3473
3474 /*
3475 * If a worker went to sleep, notify and ask workqueue whether
3476 * it wants to wake up a task to maintain concurrency.
3477 * As this function is called inside the schedule() context,
3478 * we disable preemption to avoid it calling schedule() again
3479 * in the possible wakeup of a kworker.
3480 */
3481 if (tsk->flags & PF_WQ_WORKER) {
3482 preempt_disable();
3483 wq_worker_sleeping(tsk);
3484 preempt_enable_no_resched();
3485 }
3486
3547 /* 3487 /*
3548 * If we are going to sleep and we have plugged IO queued, 3488 * If we are going to sleep and we have plugged IO queued,
3549 * make sure to submit it to avoid deadlocks. 3489 * make sure to submit it to avoid deadlocks.
@@ -3552,6 +3492,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
3552 blk_schedule_flush_plug(tsk); 3492 blk_schedule_flush_plug(tsk);
3553} 3493}
3554 3494
3495static void sched_update_worker(struct task_struct *tsk)
3496{
3497 if (tsk->flags & PF_WQ_WORKER)
3498 wq_worker_running(tsk);
3499}
3500
3555asmlinkage __visible void __sched schedule(void) 3501asmlinkage __visible void __sched schedule(void)
3556{ 3502{
3557 struct task_struct *tsk = current; 3503 struct task_struct *tsk = current;
@@ -3562,6 +3508,7 @@ asmlinkage __visible void __sched schedule(void)
3562 __schedule(false); 3508 __schedule(false);
3563 sched_preempt_enable_no_resched(); 3509 sched_preempt_enable_no_resched();
3564 } while (need_resched()); 3510 } while (need_resched());
3511 sched_update_worker(tsk);
3565} 3512}
3566EXPORT_SYMBOL(schedule); 3513EXPORT_SYMBOL(schedule);
3567 3514
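Editorial aside on the schedule() hunks above: the workqueue notification is pulled out of __schedule(). sched_submit_work() now calls wq_worker_sleeping() before the task blocks (with preemption disabled so a possible kworker wakeup cannot recurse into schedule()), and the new sched_update_worker() calls wq_worker_running() once the task runs again. A simplified standalone model of that bracket; PF_WQ_WORKER and the wq_worker_*() hooks are stubbed stand-ins for the kernel symbols of the same names, and the preemption detail is omitted.

#include <stdio.h>

#define PF_WQ_WORKER 0x20   /* stand-in for the kernel flag of the same name */

struct task { unsigned int flags; };

static void wq_worker_sleeping(struct task *t) { puts("workqueue: worker blocked");  }
static void wq_worker_running(struct task *t)  { puts("workqueue: worker runnable"); }
static void __schedule_stub(void)              { puts("schedule: switched away and back"); }

static void schedule(struct task *tsk)
{
        if (tsk->flags & PF_WQ_WORKER)
                wq_worker_sleeping(tsk);   /* may wake another worker for concurrency */

        __schedule_stub();

        if (tsk->flags & PF_WQ_WORKER)
                wq_worker_running(tsk);    /* re-account this worker as running */
}

int main(void)
{
        struct task kworker = { .flags = PF_WQ_WORKER };

        schedule(&kworker);
        return 0;
}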
@@ -5918,7 +5865,7 @@ void __init sched_init_smp(void)
5918 5865
5919static int __init migration_init(void) 5866static int __init migration_init(void)
5920{ 5867{
5921 sched_rq_cpu_starting(smp_processor_id()); 5868 sched_cpu_starting(smp_processor_id());
5922 return 0; 5869 return 0;
5923} 5870}
5924early_initcall(migration_init); 5871early_initcall(migration_init);
@@ -6559,6 +6506,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
6559static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 6506static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
6560 struct cftype *cftype, u64 shareval) 6507 struct cftype *cftype, u64 shareval)
6561{ 6508{
6509 if (shareval > scale_load_down(ULONG_MAX))
6510 shareval = MAX_SHARES;
6562 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 6511 return sched_group_set_shares(css_tg(css), scale_load(shareval));
6563} 6512}
6564 6513
@@ -6574,7 +6523,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
6574static DEFINE_MUTEX(cfs_constraints_mutex); 6523static DEFINE_MUTEX(cfs_constraints_mutex);
6575 6524
6576const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 6525const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
6577const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 6526static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
6578 6527
6579static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 6528static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
6580 6529
@@ -6654,20 +6603,22 @@ out_unlock:
6654 return ret; 6603 return ret;
6655} 6604}
6656 6605
6657int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 6606static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
6658{ 6607{
6659 u64 quota, period; 6608 u64 quota, period;
6660 6609
6661 period = ktime_to_ns(tg->cfs_bandwidth.period); 6610 period = ktime_to_ns(tg->cfs_bandwidth.period);
6662 if (cfs_quota_us < 0) 6611 if (cfs_quota_us < 0)
6663 quota = RUNTIME_INF; 6612 quota = RUNTIME_INF;
6664 else 6613 else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
6665 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 6614 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
6615 else
6616 return -EINVAL;
6666 6617
6667 return tg_set_cfs_bandwidth(tg, period, quota); 6618 return tg_set_cfs_bandwidth(tg, period, quota);
6668} 6619}
6669 6620
6670long tg_get_cfs_quota(struct task_group *tg) 6621static long tg_get_cfs_quota(struct task_group *tg)
6671{ 6622{
6672 u64 quota_us; 6623 u64 quota_us;
6673 6624
@@ -6680,17 +6631,20 @@ long tg_get_cfs_quota(struct task_group *tg)
6680 return quota_us; 6631 return quota_us;
6681} 6632}
6682 6633
6683int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 6634static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
6684{ 6635{
6685 u64 quota, period; 6636 u64 quota, period;
6686 6637
6638 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
6639 return -EINVAL;
6640
6687 period = (u64)cfs_period_us * NSEC_PER_USEC; 6641 period = (u64)cfs_period_us * NSEC_PER_USEC;
6688 quota = tg->cfs_bandwidth.quota; 6642 quota = tg->cfs_bandwidth.quota;
6689 6643
6690 return tg_set_cfs_bandwidth(tg, period, quota); 6644 return tg_set_cfs_bandwidth(tg, period, quota);
6691} 6645}
6692 6646
6693long tg_get_cfs_period(struct task_group *tg) 6647static long tg_get_cfs_period(struct task_group *tg)
6694{ 6648{
6695 u64 cfs_period_us; 6649 u64 cfs_period_us;
6696 6650
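Editorial aside on the two hunks above: tg_set_cfs_quota() and tg_set_cfs_period() (and the rt.c setters later in this section) now reject microsecond values whose conversion to nanoseconds would wrap a u64, checking by dividing U64_MAX rather than by multiplying. A standalone sketch of that guard:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

/* Returns 0 and stores the product, or -1 if us * NSEC_PER_USEC would overflow. */
static int us_to_ns_checked(uint64_t us, uint64_t *ns)
{
        if (us > UINT64_MAX / NSEC_PER_USEC)
                return -1;                      /* the -EINVAL case above */
        *ns = us * NSEC_PER_USEC;
        return 0;
}

int main(void)
{
        uint64_t ns;

        if (us_to_ns_checked(100000, &ns) == 0)           /* 100 ms */
                printf("100000 us = %llu ns\n", (unsigned long long)ns);

        if (us_to_ns_checked(UINT64_MAX / 2, &ns) != 0)   /* clearly too large */
                puts("rejected: conversion would overflow");
        return 0;
}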
@@ -6998,7 +6952,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
6998{ 6952{
6999 char tok[21]; /* U64_MAX */ 6953 char tok[21]; /* U64_MAX */
7000 6954
7001 if (!sscanf(buf, "%s %llu", tok, periodp)) 6955 if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
7002 return -EINVAL; 6956 return -EINVAL;
7003 6957
7004 *periodp *= NSEC_PER_USEC; 6958 *periodp *= NSEC_PER_USEC;
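Editorial aside on the cpu_period_quota_parse() fix above: the %s conversion is bounded to the 20 characters that the tok[21] buffer can hold, and a return value below 1 (no conversions, including EOF on empty input) is treated as failure. A standalone sketch of the same parsing pattern; the function name and error convention here are illustrative only.

#include <stdio.h>

static int parse_period_quota(const char *buf, unsigned long long *periodp,
                              char *tok, unsigned long toklen)
{
        /* tok must hold at least 21 bytes, matching "%20s" below. */
        if (toklen < 21)
                return -1;
        if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
                return -1;      /* neither field could be converted */
        return 0;
}

int main(void)
{
        char tok[21];
        unsigned long long period = 0;

        if (parse_period_quota("max 100000", &period, tok, sizeof(tok)) == 0)
                printf("tok=\"%s\" period=%llu\n", tok, period);

        if (parse_period_quota("", &period, tok, sizeof(tok)) != 0)
                puts("empty input rejected");
        return 0;
}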
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 835671f0f917..b5dcd1d83c7f 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -7,7 +7,7 @@
7 */ 7 */
8#include "sched.h" 8#include "sched.h"
9 9
10DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); 10DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
11 11
12/** 12/**
13 * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. 13 * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2efe629425be..962cf343f798 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -13,6 +13,8 @@
13#include <linux/sched/cpufreq.h> 13#include <linux/sched/cpufreq.h>
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
17
16struct sugov_tunables { 18struct sugov_tunables {
17 struct gov_attr_set attr_set; 19 struct gov_attr_set attr_set;
18 unsigned int rate_limit_us; 20 unsigned int rate_limit_us;
@@ -48,7 +50,6 @@ struct sugov_cpu {
48 50
49 bool iowait_boost_pending; 51 bool iowait_boost_pending;
50 unsigned int iowait_boost; 52 unsigned int iowait_boost;
51 unsigned int iowait_boost_max;
52 u64 last_update; 53 u64 last_update;
53 54
54 unsigned long bw_dl; 55 unsigned long bw_dl;
@@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
291 * 292 *
292 * The IO wait boost of a task is disabled after a tick since the last update 293 * The IO wait boost of a task is disabled after a tick since the last update
293 * of a CPU. If a new IO wait boost is requested after more than a tick, then 294 * of a CPU. If a new IO wait boost is requested after more than a tick, then
294 * we enable the boost starting from the minimum frequency, which improves 295 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
295 * energy efficiency by ignoring sporadic wakeups from IO. 296 * efficiency by ignoring sporadic wakeups from IO.
296 */ 297 */
297static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, 298static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
298 bool set_iowait_boost) 299 bool set_iowait_boost)
@@ -303,8 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
303 if (delta_ns <= TICK_NSEC) 304 if (delta_ns <= TICK_NSEC)
304 return false; 305 return false;
305 306
306 sg_cpu->iowait_boost = set_iowait_boost 307 sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
307 ? sg_cpu->sg_policy->policy->min : 0;
308 sg_cpu->iowait_boost_pending = set_iowait_boost; 308 sg_cpu->iowait_boost_pending = set_iowait_boost;
309 309
310 return true; 310 return true;
@@ -318,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
318 * 318 *
319 * Each time a task wakes up after an IO operation, the CPU utilization can be 319 * Each time a task wakes up after an IO operation, the CPU utilization can be
320 * boosted to a certain utilization which doubles at each "frequent and 320 * boosted to a certain utilization which doubles at each "frequent and
321 * successive" wakeup from IO, ranging from the utilization of the minimum 321 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
322 * OPP to the utilization of the maximum OPP. 322 * of the maximum OPP.
323 *
323 * To keep doubling, an IO boost has to be requested at least once per tick, 324 * To keep doubling, an IO boost has to be requested at least once per tick,
324 * otherwise we restart from the utilization of the minimum OPP. 325 * otherwise we restart from the utilization of the minimum OPP.
325 */ 326 */
@@ -344,14 +345,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
344 345
345 /* Double the boost at each request */ 346 /* Double the boost at each request */
346 if (sg_cpu->iowait_boost) { 347 if (sg_cpu->iowait_boost) {
347 sg_cpu->iowait_boost <<= 1; 348 sg_cpu->iowait_boost =
348 if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) 349 min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
349 sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
350 return; 350 return;
351 } 351 }
352 352
353 /* First wakeup after IO: start with minimum boost */ 353 /* First wakeup after IO: start with minimum boost */
354 sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; 354 sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
355} 355}
356 356
357/** 357/**
@@ -373,47 +373,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
373 * This mechanism is designed to boost tasks that frequently wait on IO, while 373 * This mechanism is designed to boost tasks that frequently wait on IO, while
374 * being more conservative on tasks which do sporadic IO operations. 374 * being more conservative on tasks which do sporadic IO operations.
375 */ 375 */
376static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, 376static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
377 unsigned long *util, unsigned long *max) 377 unsigned long util, unsigned long max)
378{ 378{
379 unsigned int boost_util, boost_max; 379 unsigned long boost;
380 380
381 /* No boost currently required */ 381 /* No boost currently required */
382 if (!sg_cpu->iowait_boost) 382 if (!sg_cpu->iowait_boost)
383 return; 383 return util;
384 384
385 /* Reset boost if the CPU appears to have been idle enough */ 385 /* Reset boost if the CPU appears to have been idle enough */
386 if (sugov_iowait_reset(sg_cpu, time, false)) 386 if (sugov_iowait_reset(sg_cpu, time, false))
387 return; 387 return util;
388 388
389 /* 389 if (!sg_cpu->iowait_boost_pending) {
390 * An IO waiting task has just woken up:
391 * allow to further double the boost value
392 */
393 if (sg_cpu->iowait_boost_pending) {
394 sg_cpu->iowait_boost_pending = false;
395 } else {
396 /* 390 /*
397 * Otherwise: reduce the boost value and disable it when we 391 * No boost pending; reduce the boost value.
398 * reach the minimum.
399 */ 392 */
400 sg_cpu->iowait_boost >>= 1; 393 sg_cpu->iowait_boost >>= 1;
401 if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { 394 if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
402 sg_cpu->iowait_boost = 0; 395 sg_cpu->iowait_boost = 0;
403 return; 396 return util;
404 } 397 }
405 } 398 }
406 399
400 sg_cpu->iowait_boost_pending = false;
401
407 /* 402 /*
408 * Apply the current boost value: a CPU is boosted only if its current 403 * @util is already in capacity scale; convert iowait_boost
409 * utilization is smaller then the current IO boost level. 404 * into the same scale so we can compare.
410 */ 405 */
411 boost_util = sg_cpu->iowait_boost; 406 boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
412 boost_max = sg_cpu->iowait_boost_max; 407 return max(boost, util);
413 if (*util * boost_max < *max * boost_util) {
414 *util = boost_util;
415 *max = boost_max;
416 }
417} 408}
418 409
419#ifdef CONFIG_NO_HZ_COMMON 410#ifdef CONFIG_NO_HZ_COMMON
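Editorial aside on the schedutil hunks above: the iowait boost is decoupled from policy->min and iowait_boost_max and now lives in capacity units, doubling from IOWAIT_BOOST_MIN up to SCHED_CAPACITY_SCALE and decaying by halves, then being applied as a floor under the utilization after scaling by the CPU's max capacity. A simplified standalone model follows; the pending-wakeup bookkeeping and locking are omitted, and the helper names are illustrative.

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT  10
#define SCHED_CAPACITY_SCALE  (1UL << SCHED_CAPACITY_SHIFT)   /* 1024 */
#define IOWAIT_BOOST_MIN      (SCHED_CAPACITY_SCALE / 8)      /* 128  */

static unsigned long boost;

static void io_wakeup(void)
{
        /* Double on each "frequent and successive" IO wakeup, capped at full scale. */
        if (boost)
                boost = boost * 2 > SCHED_CAPACITY_SCALE ? SCHED_CAPACITY_SCALE : boost * 2;
        else
                boost = IOWAIT_BOOST_MIN;
}

static unsigned long apply_boost(unsigned long util, unsigned long max_cap)
{
        unsigned long scaled;

        if (!boost)
                return util;

        /* No fresh wakeup pending in this model: decay towards zero. */
        boost >>= 1;
        if (boost < IOWAIT_BOOST_MIN) {
                boost = 0;
                return util;
        }

        /* Convert the boost into the same capacity scale as util, use as a floor. */
        scaled = (boost * max_cap) >> SCHED_CAPACITY_SHIFT;
        return scaled > util ? scaled : util;
}

int main(void)
{
        unsigned long max_cap = 1024;                    /* a full-capacity CPU */

        io_wakeup(); io_wakeup(); io_wakeup();           /* 128 -> 256 -> 512   */
        printf("boosted: %lu\n", apply_boost(100, max_cap));   /* 256 */
        printf("decayed: %lu\n", apply_boost(100, max_cap));   /* 128 */
        printf("gone:    %lu\n", apply_boost(100, max_cap));   /* 100 */
        return 0;
}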
@@ -460,7 +451,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
460 451
461 util = sugov_get_util(sg_cpu); 452 util = sugov_get_util(sg_cpu);
462 max = sg_cpu->max; 453 max = sg_cpu->max;
463 sugov_iowait_apply(sg_cpu, time, &util, &max); 454 util = sugov_iowait_apply(sg_cpu, time, util, max);
464 next_f = get_next_freq(sg_policy, util, max); 455 next_f = get_next_freq(sg_policy, util, max);
465 /* 456 /*
466 * Do not reduce the frequency if the CPU has not been idle 457 * Do not reduce the frequency if the CPU has not been idle
@@ -500,7 +491,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
500 491
501 j_util = sugov_get_util(j_sg_cpu); 492 j_util = sugov_get_util(j_sg_cpu);
502 j_max = j_sg_cpu->max; 493 j_max = j_sg_cpu->max;
503 sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); 494 j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
504 495
505 if (j_util * max > j_max * util) { 496 if (j_util * max > j_max * util) {
506 util = j_util; 497 util = j_util;
@@ -609,13 +600,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count
609 600
610static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); 601static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
611 602
612static struct attribute *sugov_attributes[] = { 603static struct attribute *sugov_attrs[] = {
613 &rate_limit_us.attr, 604 &rate_limit_us.attr,
614 NULL 605 NULL
615}; 606};
607ATTRIBUTE_GROUPS(sugov);
616 608
617static struct kobj_type sugov_tunables_ktype = { 609static struct kobj_type sugov_tunables_ktype = {
618 .default_attrs = sugov_attributes, 610 .default_groups = sugov_groups,
619 .sysfs_ops = &governor_sysfs_ops, 611 .sysfs_ops = &governor_sysfs_ops,
620}; 612};
621 613
@@ -782,6 +774,7 @@ out:
782 return 0; 774 return 0;
783 775
784fail: 776fail:
777 kobject_put(&tunables->attr_set.kobj);
785 policy->governor_data = NULL; 778 policy->governor_data = NULL;
786 sugov_tunables_free(tunables); 779 sugov_tunables_free(tunables);
787 780
@@ -837,7 +830,6 @@ static int sugov_start(struct cpufreq_policy *policy)
837 memset(sg_cpu, 0, sizeof(*sg_cpu)); 830 memset(sg_cpu, 0, sizeof(*sg_cpu));
838 sg_cpu->cpu = cpu; 831 sg_cpu->cpu = cpu;
839 sg_cpu->sg_policy = sg_policy; 832 sg_cpu->sg_policy = sg_policy;
840 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
841 } 833 }
842 834
843 for_each_cpu(cpu, policy->cpus) { 835 for_each_cpu(cpu, policy->cpus) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..43901fa3f269 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
252 if (dl_entity_is_special(dl_se)) 252 if (dl_entity_is_special(dl_se))
253 return; 253 return;
254 254
255 WARN_ON(hrtimer_active(&dl_se->inactive_timer));
256 WARN_ON(dl_se->dl_non_contending); 255 WARN_ON(dl_se->dl_non_contending);
257 256
258 zerolag_time = dl_se->deadline - 257 zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)
269 * If the "0-lag time" already passed, decrease the active 268 * If the "0-lag time" already passed, decrease the active
270 * utilization now, instead of starting a timer 269 * utilization now, instead of starting a timer
271 */ 270 */
272 if (zerolag_time < 0) { 271 if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
273 if (dl_task(p)) 272 if (dl_task(p))
274 sub_running_bw(dl_se, dl_rq); 273 sub_running_bw(dl_se, dl_rq);
275 if (!dl_task(p) || p->state == TASK_DEAD) { 274 if (!dl_task(p) || p->state == TASK_DEAD) {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8039d62ae36e..678bfb9bd87f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -702,7 +702,7 @@ do { \
702 702
703static const char *sched_tunable_scaling_names[] = { 703static const char *sched_tunable_scaling_names[] = {
704 "none", 704 "none",
705 "logaritmic", 705 "logarithmic",
706 "linear" 706 "linear"
707}; 707};
708 708
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea74d43924b2..f35930f5e528 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2007 if (p->last_task_numa_placement) { 2007 if (p->last_task_numa_placement) {
2008 delta = runtime - p->last_sum_exec_runtime; 2008 delta = runtime - p->last_sum_exec_runtime;
2009 *period = now - p->last_task_numa_placement; 2009 *period = now - p->last_task_numa_placement;
2010
2011 /* Avoid time going backwards, prevent potential divide error: */
2012 if (unlikely((s64)*period < 0))
2013 *period = 0;
2010 } else { 2014 } else {
2011 delta = p->se.avg.load_sum; 2015 delta = p->se.avg.load_sum;
2012 *period = LOAD_AVG_MAX; 2016 *period = LOAD_AVG_MAX;
@@ -2593,7 +2597,7 @@ out:
2593/* 2597/*
2594 * Drive the periodic memory faults.. 2598 * Drive the periodic memory faults..
2595 */ 2599 */
2596void task_tick_numa(struct rq *rq, struct task_struct *curr) 2600static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2597{ 2601{
2598 struct callback_head *work = &curr->numa_work; 2602 struct callback_head *work = &curr->numa_work;
2599 u64 period, now; 2603 u64 period, now;
@@ -3567,7 +3571,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3567 * Synchronize entity load avg of dequeued entity without locking 3571 * Synchronize entity load avg of dequeued entity without locking
3568 * the previous rq. 3572 * the previous rq.
3569 */ 3573 */
3570void sync_entity_load_avg(struct sched_entity *se) 3574static void sync_entity_load_avg(struct sched_entity *se)
3571{ 3575{
3572 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3576 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3573 u64 last_update_time; 3577 u64 last_update_time;
@@ -3580,7 +3584,7 @@ void sync_entity_load_avg(struct sched_entity *se)
3580 * Task first catches up with cfs_rq, and then subtract 3584 * Task first catches up with cfs_rq, and then subtract
3581 * itself from the cfs_rq (task must be off the queue now). 3585 * itself from the cfs_rq (task must be off the queue now).
3582 */ 3586 */
3583void remove_entity_load_avg(struct sched_entity *se) 3587static void remove_entity_load_avg(struct sched_entity *se)
3584{ 3588{
3585 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3589 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3586 unsigned long flags; 3590 unsigned long flags;
@@ -4885,6 +4889,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4885 return HRTIMER_NORESTART; 4889 return HRTIMER_NORESTART;
4886} 4890}
4887 4891
4892extern const u64 max_cfs_quota_period;
4893
4888static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) 4894static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4889{ 4895{
4890 struct cfs_bandwidth *cfs_b = 4896 struct cfs_bandwidth *cfs_b =
@@ -4892,6 +4898,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4892 unsigned long flags; 4898 unsigned long flags;
4893 int overrun; 4899 int overrun;
4894 int idle = 0; 4900 int idle = 0;
4901 int count = 0;
4895 4902
4896 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4903 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4897 for (;;) { 4904 for (;;) {
@@ -4899,6 +4906,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4899 if (!overrun) 4906 if (!overrun)
4900 break; 4907 break;
4901 4908
4909 if (++count > 3) {
4910 u64 new, old = ktime_to_ns(cfs_b->period);
4911
4912 new = (old * 147) / 128; /* ~115% */
4913 new = min(new, max_cfs_quota_period);
4914
4915 cfs_b->period = ns_to_ktime(new);
4916
4917 /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
4918 cfs_b->quota *= new;
4919 cfs_b->quota = div64_u64(cfs_b->quota, old);
4920
4921 pr_warn_ratelimited(
4922 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
4923 smp_processor_id(),
4924 div_u64(new, NSEC_PER_USEC),
4925 div_u64(cfs_b->quota, NSEC_PER_USEC));
4926
4927 /* reset count so we don't come right back in here */
4928 count = 0;
4929 }
4930
4902 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); 4931 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
4903 } 4932 }
4904 if (idle) 4933 if (idle)
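Editorial aside on the cfs period timer hunk above: after more than three consecutive overruns, the period is scaled up by 147/128 (about +15%) and the quota is rescaled by the same ratio, so the effective quota/period bandwidth is unchanged while the timer fires less often; the period is capped at max_cfs_quota_period (1 s). A worked standalone example of that arithmetic:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC         1000ULL
#define MAX_CFS_QUOTA_PERIOD  1000000000ULL   /* 1 s, mirroring max_cfs_quota_period */

int main(void)
{
        uint64_t period = 100ULL * NSEC_PER_USEC;   /* an absurdly short 100 us period */
        uint64_t quota  = 50ULL * NSEC_PER_USEC;    /* 50% of one CPU */

        for (int step = 0; step < 5; step++) {
                uint64_t new_period = (period * 147) / 128;

                if (new_period > MAX_CFS_QUOTA_PERIOD)
                        new_period = MAX_CFS_QUOTA_PERIOD;

                quota  = quota * new_period / period;   /* keep quota/period constant */
                period = new_period;

                printf("step %d: period=%llu us quota=%llu us (ratio %.3f)\n",
                       step,
                       (unsigned long long)(period / NSEC_PER_USEC),
                       (unsigned long long)(quota / NSEC_PER_USEC),
                       (double)quota / period);
        }
        return 0;
}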
@@ -5116,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq)
5116 5145
5117#ifdef CONFIG_SMP 5146#ifdef CONFIG_SMP
5118static inline unsigned long cpu_util(int cpu); 5147static inline unsigned long cpu_util(int cpu);
5119static unsigned long capacity_of(int cpu);
5120 5148
5121static inline bool cpu_overutilized(int cpu) 5149static inline bool cpu_overutilized(int cpu)
5122{ 5150{
@@ -7492,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
7492{ 7520{
7493 lockdep_assert_held(&env->src_rq->lock); 7521 lockdep_assert_held(&env->src_rq->lock);
7494 7522
7495 p->on_rq = TASK_ON_RQ_MIGRATING;
7496 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); 7523 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7497 set_task_cpu(p, env->dst_cpu); 7524 set_task_cpu(p, env->dst_cpu);
7498} 7525}
@@ -7628,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p)
7628 7655
7629 BUG_ON(task_rq(p) != rq); 7656 BUG_ON(task_rq(p) != rq);
7630 activate_task(rq, p, ENQUEUE_NOCLOCK); 7657 activate_task(rq, p, ENQUEUE_NOCLOCK);
7631 p->on_rq = TASK_ON_RQ_QUEUED;
7632 check_preempt_curr(rq, p, 0); 7658 check_preempt_curr(rq, p, 0);
7633} 7659}
7634 7660
@@ -7784,10 +7810,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7784 if (cfs_rq->last_h_load_update == now) 7810 if (cfs_rq->last_h_load_update == now)
7785 return; 7811 return;
7786 7812
7787 cfs_rq->h_load_next = NULL; 7813 WRITE_ONCE(cfs_rq->h_load_next, NULL);
7788 for_each_sched_entity(se) { 7814 for_each_sched_entity(se) {
7789 cfs_rq = cfs_rq_of(se); 7815 cfs_rq = cfs_rq_of(se);
7790 cfs_rq->h_load_next = se; 7816 WRITE_ONCE(cfs_rq->h_load_next, se);
7791 if (cfs_rq->last_h_load_update == now) 7817 if (cfs_rq->last_h_load_update == now)
7792 break; 7818 break;
7793 } 7819 }
@@ -7797,7 +7823,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7797 cfs_rq->last_h_load_update = now; 7823 cfs_rq->last_h_load_update = now;
7798 } 7824 }
7799 7825
7800 while ((se = cfs_rq->h_load_next) != NULL) { 7826 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
7801 load = cfs_rq->h_load; 7827 load = cfs_rq->h_load;
7802 load = div64_ul(load * se->avg.load_avg, 7828 load = div64_ul(load * se->avg.load_avg,
7803 cfs_rq_load_avg(cfs_rq) + 1); 7829 cfs_rq_load_avg(cfs_rq) + 1);
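Editorial aside on the h_load_next hunks above: the pointer is written on one CPU while another may be walking the chain, so the plain accesses become WRITE_ONCE()/READ_ONCE() to force single, non-refetched loads and stores. A minimal userspace sketch of that annotation pattern; these macros are stand-ins for the kernel's, relying on volatile accesses in the same way.

#include <stdio.h>

#define WRITE_ONCE(x, val)  (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)        (*(volatile __typeof__(x) *)&(x))

struct node { struct node *next; int id; };

int main(void)
{
        struct node b = { .next = NULL, .id = 2 };
        struct node a = { .next = NULL, .id = 1 };
        struct node *head = NULL;

        /* Publisher side: each pointer is stored exactly once. */
        WRITE_ONCE(a.next, &b);
        WRITE_ONCE(head, &a);

        /* Reader side: each pointer is loaded exactly once per iteration. */
        for (struct node *n = READ_ONCE(head); n; n = READ_ONCE(n->next))
                printf("node %d\n", n->id);
        return 0;
}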
@@ -8060,6 +8086,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
8060} 8086}
8061 8087
8062/* 8088/*
8089 * Check whether a rq has a misfit task and if it looks like we can actually
8090 * help that task: we can migrate the task to a CPU of higher capacity, or
8091 * the task's current CPU is heavily pressured.
8092 */
8093static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8094{
8095 return rq->misfit_task_load &&
8096 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8097 check_cpu_capacity(rq, sd));
8098}
8099
8100/*
8063 * Group imbalance indicates (and tries to solve) the problem where balancing 8101 * Group imbalance indicates (and tries to solve) the problem where balancing
8064 * groups is inadequate due to ->cpus_allowed constraints. 8102 * groups is inadequate due to ->cpus_allowed constraints.
8065 * 8103 *
@@ -9510,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq)
9510 * - When one of the busy CPUs notice that there may be an idle rebalancing 9548 * - When one of the busy CPUs notice that there may be an idle rebalancing
9511 * needed, they will kick the idle load balancer, which then does idle 9549 * needed, they will kick the idle load balancer, which then does idle
9512 * load balancing for all the idle CPUs. 9550 * load balancing for all the idle CPUs.
9551 * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
9552 * anywhere yet.
9513 */ 9553 */
9514 9554
9515static inline int find_new_ilb(void) 9555static inline int find_new_ilb(void)
9516{ 9556{
9517 int ilb = cpumask_first(nohz.idle_cpus_mask); 9557 int ilb;
9518 9558
9519 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 9559 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
9520 return ilb; 9560 housekeeping_cpumask(HK_FLAG_MISC)) {
9561 if (idle_cpu(ilb))
9562 return ilb;
9563 }
9521 9564
9522 return nr_cpu_ids; 9565 return nr_cpu_ids;
9523} 9566}
9524 9567
9525/* 9568/*
9526 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 9569 * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
9527 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle 9570 * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
9528 * CPU (if there is one).
9529 */ 9571 */
9530static void kick_ilb(unsigned int flags) 9572static void kick_ilb(unsigned int flags)
9531{ 9573{
@@ -9586,35 +9628,21 @@ static void nohz_balancer_kick(struct rq *rq)
9586 if (time_before(now, nohz.next_balance)) 9628 if (time_before(now, nohz.next_balance))
9587 goto out; 9629 goto out;
9588 9630
9589 if (rq->nr_running >= 2 || rq->misfit_task_load) { 9631 if (rq->nr_running >= 2) {
9590 flags = NOHZ_KICK_MASK; 9632 flags = NOHZ_KICK_MASK;
9591 goto out; 9633 goto out;
9592 } 9634 }
9593 9635
9594 rcu_read_lock(); 9636 rcu_read_lock();
9595 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9596 if (sds) {
9597 /*
9598 * If there is an imbalance between LLC domains (IOW we could
9599 * increase the overall cache use), we need some less-loaded LLC
9600 * domain to pull some load. Likewise, we may need to spread
9601 * load within the current LLC domain (e.g. packed SMT cores but
9602 * other CPUs are idle). We can't really know from here how busy
9603 * the others are - so just get a nohz balance going if it looks
9604 * like this LLC domain has tasks we could move.
9605 */
9606 nr_busy = atomic_read(&sds->nr_busy_cpus);
9607 if (nr_busy > 1) {
9608 flags = NOHZ_KICK_MASK;
9609 goto unlock;
9610 }
9611
9612 }
9613 9637
9614 sd = rcu_dereference(rq->sd); 9638 sd = rcu_dereference(rq->sd);
9615 if (sd) { 9639 if (sd) {
9616 if ((rq->cfs.h_nr_running >= 1) && 9640 /*
9617 check_cpu_capacity(rq, sd)) { 9641 * If there's a CFS task and the current CPU has reduced
9642 * capacity; kick the ILB to see if there's a better CPU to run
9643 * on.
9644 */
9645 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
9618 flags = NOHZ_KICK_MASK; 9646 flags = NOHZ_KICK_MASK;
9619 goto unlock; 9647 goto unlock;
9620 } 9648 }
@@ -9622,6 +9650,11 @@ static void nohz_balancer_kick(struct rq *rq)
9622 9650
9623 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); 9651 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
9624 if (sd) { 9652 if (sd) {
9653 /*
9654 * When ASYM_PACKING; see if there's a more preferred CPU
9655 * currently idle; in which case, kick the ILB to move tasks
9656 * around.
9657 */
9625 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { 9658 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
9626 if (sched_asym_prefer(i, cpu)) { 9659 if (sched_asym_prefer(i, cpu)) {
9627 flags = NOHZ_KICK_MASK; 9660 flags = NOHZ_KICK_MASK;
@@ -9629,6 +9662,45 @@ static void nohz_balancer_kick(struct rq *rq)
9629 } 9662 }
9630 } 9663 }
9631 } 9664 }
9665
9666 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
9667 if (sd) {
9668 /*
9669 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
9670 * to run the misfit task on.
9671 */
9672 if (check_misfit_status(rq, sd)) {
9673 flags = NOHZ_KICK_MASK;
9674 goto unlock;
9675 }
9676
9677 /*
9678 * For asymmetric systems, we do not want to nicely balance
9679 * cache use, instead we want to embrace asymmetry and only
9680 * ensure tasks have enough CPU capacity.
9681 *
9682 * Skip the LLC logic because it's not relevant in that case.
9683 */
9684 goto unlock;
9685 }
9686
9687 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9688 if (sds) {
9689 /*
9690 * If there is an imbalance between LLC domains (IOW we could
9691 * increase the overall cache use), we need some less-loaded LLC
9692 * domain to pull some load. Likewise, we may need to spread
9693 * load within the current LLC domain (e.g. packed SMT cores but
9694 * other CPUs are idle). We can't really know from here how busy
9695 * the others are - so just get a nohz balance going if it looks
9696 * like this LLC domain has tasks we could move.
9697 */
9698 nr_busy = atomic_read(&sds->nr_busy_cpus);
9699 if (nr_busy > 1) {
9700 flags = NOHZ_KICK_MASK;
9701 goto unlock;
9702 }
9703 }
9632unlock: 9704unlock:
9633 rcu_read_unlock(); 9705 rcu_read_unlock();
9634out: 9706out:
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b02d148e7672..687302051a27 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -65,6 +65,7 @@ void __init housekeeping_init(void)
65static int __init housekeeping_setup(char *str, enum hk_flags flags) 65static int __init housekeeping_setup(char *str, enum hk_flags flags)
66{ 66{
67 cpumask_var_t non_housekeeping_mask; 67 cpumask_var_t non_housekeeping_mask;
68 cpumask_var_t tmp;
68 int err; 69 int err;
69 70
70 alloc_bootmem_cpumask_var(&non_housekeeping_mask); 71 alloc_bootmem_cpumask_var(&non_housekeeping_mask);
@@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
75 return 0; 76 return 0;
76 } 77 }
77 78
79 alloc_bootmem_cpumask_var(&tmp);
78 if (!housekeeping_flags) { 80 if (!housekeeping_flags) {
79 alloc_bootmem_cpumask_var(&housekeeping_mask); 81 alloc_bootmem_cpumask_var(&housekeeping_mask);
80 cpumask_andnot(housekeeping_mask, 82 cpumask_andnot(housekeeping_mask,
81 cpu_possible_mask, non_housekeeping_mask); 83 cpu_possible_mask, non_housekeeping_mask);
82 if (cpumask_empty(housekeeping_mask)) 84
85 cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
86 if (cpumask_empty(tmp)) {
87 pr_warn("Housekeeping: must include one present CPU, "
88 "using boot CPU:%d\n", smp_processor_id());
83 __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); 89 __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
90 __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
91 }
84 } else { 92 } else {
85 cpumask_var_t tmp; 93 cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
86 94 if (cpumask_empty(tmp))
87 alloc_bootmem_cpumask_var(&tmp); 95 __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
88 cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); 96 cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
89 if (!cpumask_equal(tmp, housekeeping_mask)) { 97 if (!cpumask_equal(tmp, housekeeping_mask)) {
90 pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); 98 pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
@@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
92 free_bootmem_cpumask_var(non_housekeeping_mask); 100 free_bootmem_cpumask_var(non_housekeeping_mask);
93 return 0; 101 return 0;
94 } 102 }
95 free_bootmem_cpumask_var(tmp);
96 } 103 }
104 free_bootmem_cpumask_var(tmp);
97 105
98 if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { 106 if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
99 if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { 107 if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 90fa23d36565..1e6b909dca36 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2555 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 2555 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2556 if (rt_runtime_us < 0) 2556 if (rt_runtime_us < 0)
2557 rt_runtime = RUNTIME_INF; 2557 rt_runtime = RUNTIME_INF;
2558 else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2559 return -EINVAL;
2558 2560
2559 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 2561 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2560} 2562}
@@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2575{ 2577{
2576 u64 rt_runtime, rt_period; 2578 u64 rt_runtime, rt_period;
2577 2579
2580 if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2581 return -EINVAL;
2582
2578 rt_period = rt_period_us * NSEC_PER_USEC; 2583 rt_period = rt_period_us * NSEC_PER_USEC;
2579 rt_runtime = tg->rt_bandwidth.rt_runtime; 2584 rt_runtime = tg->rt_bandwidth.rt_runtime;
2580 2585
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index efa686eeff26..b52ed1ada0be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -780,7 +780,7 @@ struct root_domain {
780 * NULL-terminated list of performance domains intersecting with the 780 * NULL-terminated list of performance domains intersecting with the
781 * CPUs of the rd. Protected by RCU. 781 * CPUs of the rd. Protected by RCU.
782 */ 782 */
783 struct perf_domain *pd; 783 struct perf_domain __rcu *pd;
784}; 784};
785 785
786extern struct root_domain def_root_domain; 786extern struct root_domain def_root_domain;
@@ -869,8 +869,8 @@ struct rq {
869 atomic_t nr_iowait; 869 atomic_t nr_iowait;
870 870
871#ifdef CONFIG_SMP 871#ifdef CONFIG_SMP
872 struct root_domain *rd; 872 struct root_domain *rd;
873 struct sched_domain *sd; 873 struct sched_domain __rcu *sd;
874 874
875 unsigned long cpu_capacity; 875 unsigned long cpu_capacity;
876 unsigned long cpu_capacity_orig; 876 unsigned long cpu_capacity_orig;
@@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
1324 return sd; 1324 return sd;
1325} 1325}
1326 1326
1327DECLARE_PER_CPU(struct sched_domain *, sd_llc); 1327DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
1328DECLARE_PER_CPU(int, sd_llc_size); 1328DECLARE_PER_CPU(int, sd_llc_size);
1329DECLARE_PER_CPU(int, sd_llc_id); 1329DECLARE_PER_CPU(int, sd_llc_id);
1330DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); 1330DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
1331DECLARE_PER_CPU(struct sched_domain *, sd_numa); 1331DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
1332DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); 1332DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
1333DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); 1333DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
1334extern struct static_key_false sched_asym_cpucapacity; 1334extern struct static_key_false sched_asym_cpucapacity;
1335 1335
1336struct sched_group_capacity { 1336struct sched_group_capacity {
@@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu)
2185#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 2185#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2186 2186
2187#ifdef CONFIG_CPU_FREQ 2187#ifdef CONFIG_CPU_FREQ
2188DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); 2188DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
2189 2189
2190/** 2190/**
2191 * cpufreq_update_util - Take a note about CPU utilization changes. 2191 * cpufreq_update_util - Take a note about CPU utilization changes.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ab7f371a3a17..f53f89df837d 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
615 * the cpumask of the domain), this allows us to quickly tell if 615 * the cpumask of the domain), this allows us to quickly tell if
616 * two CPUs are in the same cache domain, see cpus_share_cache(). 616 * two CPUs are in the same cache domain, see cpus_share_cache().
617 */ 617 */
618DEFINE_PER_CPU(struct sched_domain *, sd_llc); 618DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
619DEFINE_PER_CPU(int, sd_llc_size); 619DEFINE_PER_CPU(int, sd_llc_size);
620DEFINE_PER_CPU(int, sd_llc_id); 620DEFINE_PER_CPU(int, sd_llc_id);
621DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); 621DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
622DEFINE_PER_CPU(struct sched_domain *, sd_numa); 622DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
623DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); 623DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
624DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); 624DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
625DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); 625DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
626 626
627static void update_top_cache_domain(int cpu) 627static void update_top_cache_domain(int cpu)
@@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1059 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 1059 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1060 struct sched_domain *child = sd->child; 1060 struct sched_domain *child = sd->child;
1061 struct sched_group *sg; 1061 struct sched_group *sg;
1062 bool already_visited;
1062 1063
1063 if (child) 1064 if (child)
1064 cpu = cpumask_first(sched_domain_span(child)); 1065 cpu = cpumask_first(sched_domain_span(child));
@@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1066 sg = *per_cpu_ptr(sdd->sg, cpu); 1067 sg = *per_cpu_ptr(sdd->sg, cpu);
1067 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); 1068 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
1068 1069
1069 /* For claim_allocations: */ 1070 /* Increase refcounts for claim_allocations: */
1070 atomic_inc(&sg->ref); 1071 already_visited = atomic_inc_return(&sg->ref) > 1;
1071 atomic_inc(&sg->sgc->ref); 1072 /* sgc visits should follow a similar trend as sg */
1073 WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
1074
1075 /* If we have already visited that group, it's already initialized. */
1076 if (already_visited)
1077 return sg;
1072 1078
1073 if (child) { 1079 if (child) {
1074 cpumask_copy(sched_group_span(sg), sched_domain_span(child)); 1080 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
@@ -1087,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1087 1093
1088/* 1094/*
1089 * build_sched_groups will build a circular linked list of the groups 1095 * build_sched_groups will build a circular linked list of the groups
1090 * covered by the given span, and will set each group's ->cpumask correctly, 1096 * covered by the given span, will set each group's ->cpumask correctly,
1091 * and ->cpu_capacity to 0. 1097 * and will initialize their ->sgc.
1092 * 1098 *
1093 * Assumes the sched_domain tree is fully constructed 1099 * Assumes the sched_domain tree is fully constructed
1094 */ 1100 */
@@ -2075,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
2075} 2081}
2076 2082
2077/* 2083/*
2078 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 2084 * Set up scheduler domains and groups. For now this just excludes isolated
2079 * For now this just excludes isolated CPUs, but could be used to 2085 * CPUs, but could be used to exclude other special cases in the future.
2080 * exclude other special cases in the future.
2081 */ 2086 */
2082int sched_init_domains(const struct cpumask *cpu_map) 2087int sched_init_domains(const struct cpumask *cpu_map)
2083{ 2088{
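Editorial aside on the get_group() hunk above: the return value of atomic_inc_return() now doubles as an "already visited" test, so a group that has been seen before only takes references and skips re-initialization. A standalone sketch of that first-visitor-initializes idiom using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

struct group {
        atomic_int ref;
        int initialized;
};

static void visit(struct group *g, int cpu)
{
        /* fetch_add returns the old value; old > 0 means someone got here first. */
        int already_visited = atomic_fetch_add(&g->ref, 1) > 0;

        if (already_visited) {
                printf("cpu %d: group already set up, just took a reference\n", cpu);
                return;
        }
        g->initialized = 1;
        printf("cpu %d: first visitor, initialized the group\n", cpu);
}

int main(void)
{
        struct group g = { .ref = 0, .initialized = 0 };

        visit(&g, 0);
        visit(&g, 1);
        printf("refcount=%d initialized=%d\n", atomic_load(&g.ref), g.initialized);
        return 0;
}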
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 54a0347ca812..a635ecba6fe2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -149,7 +149,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
149 149
150 sd->nr = syscall_get_nr(task, regs); 150 sd->nr = syscall_get_nr(task, regs);
151 sd->arch = syscall_get_arch(); 151 sd->arch = syscall_get_arch();
152 syscall_get_arguments(task, regs, 0, 6, args); 152 syscall_get_arguments(task, regs, args);
153 sd->args[0] = args[0]; 153 sd->args[0] = args[0];
154 sd->args[1] = args[1]; 154 sd->args[1] = args[1];
155 sd->args[2] = args[2]; 155 sd->args[2] = args[2];
@@ -331,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent,
331 * Expects sighand and cred_guard_mutex locks to be held. 331 * Expects sighand and cred_guard_mutex locks to be held.
332 * 332 *
333 * Returns 0 on success, -ve on error, or the pid of a thread which was 333 * Returns 0 on success, -ve on error, or the pid of a thread which was
334 * either not in the correct seccomp mode or it did not have an ancestral 334 * either not in the correct seccomp mode or did not have an ancestral
335 * seccomp filter. 335 * seccomp filter.
336 */ 336 */
337static inline pid_t seccomp_can_sync_threads(void) 337static inline pid_t seccomp_can_sync_threads(void)
@@ -502,7 +502,10 @@ out:
502 * 502 *
503 * Caller must be holding current->sighand->siglock lock. 503 * Caller must be holding current->sighand->siglock lock.
504 * 504 *
505 * Returns 0 on success, -ve on error. 505 * Returns 0 on success, -ve on error, or
506 * - in TSYNC mode: the pid of a thread which was either not in the correct
507 * seccomp mode or did not have an ancestral seccomp filter
508 * - in NEW_LISTENER mode: the fd of the new listener
506 */ 509 */
507static long seccomp_attach_filter(unsigned int flags, 510static long seccomp_attach_filter(unsigned int flags,
508 struct seccomp_filter *filter) 511 struct seccomp_filter *filter)
@@ -1258,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags,
1258 if (flags & ~SECCOMP_FILTER_FLAG_MASK) 1261 if (flags & ~SECCOMP_FILTER_FLAG_MASK)
1259 return -EINVAL; 1262 return -EINVAL;
1260 1263
1264 /*
1265 * In the successful case, NEW_LISTENER returns the new listener fd.
1266 * But in the failure case, TSYNC returns the thread that died. If you
1267 * combine these two flags, there's no way to tell whether something
1268 * succeeded or failed. So, let's disallow this combination.
1269 */
1270 if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
1271 (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER))
1272 return -EINVAL;
1273
1261 /* Prepare the new filter before holding any locks. */ 1274 /* Prepare the new filter before holding any locks. */
1262 prepared = seccomp_prepare_user_filter(filter); 1275 prepared = seccomp_prepare_user_filter(filter);
1263 if (IS_ERR(prepared)) 1276 if (IS_ERR(prepared))
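Editorial aside on the seccomp hunk above: TSYNC and NEW_LISTENER are now mutually exclusive because a positive return value would be ambiguous (listener fd on success vs. the thread id that blocked TSYNC on failure). A hedged userspace sketch that exercises the new check; it assumes your <linux/seccomp.h> is new enough to define both flags and that SYS_seccomp is available.

#define _GNU_SOURCE
#include <errno.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct sock_filter insn = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW);
        struct sock_fprog prog = { .len = 1, .filter = &insn };
        long ret;

        /* Required before installing a filter without CAP_SYS_ADMIN. */
        prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

        ret = syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER,
                      SECCOMP_FILTER_FLAG_TSYNC | SECCOMP_FILTER_FLAG_NEW_LISTENER,
                      &prog);
        if (ret == -1 && errno == EINVAL)
                puts("TSYNC|NEW_LISTENER rejected, as expected");
        else
                printf("unexpected result: ret=%ld errno=%d\n", ret, errno);
        return 0;
}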
@@ -1304,7 +1317,7 @@ out:
1304 mutex_unlock(&current->signal->cred_guard_mutex); 1317 mutex_unlock(&current->signal->cred_guard_mutex);
1305out_put_fd: 1318out_put_fd:
1306 if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { 1319 if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
1307 if (ret < 0) { 1320 if (ret) {
1308 listener_f->private_data = NULL; 1321 listener_f->private_data = NULL;
1309 fput(listener_f); 1322 fput(listener_f);
1310 put_unused_fd(listener); 1323 put_unused_fd(listener);
diff --git a/kernel/signal.c b/kernel/signal.c
index b7953934aa99..cd83cc376767 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3513,7 +3513,6 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
3513 return kill_something_info(sig, &info, pid); 3513 return kill_something_info(sig, &info, pid);
3514} 3514}
3515 3515
3516#ifdef CONFIG_PROC_FS
3517/* 3516/*
3518 * Verify that the signaler and signalee either are in the same pid namespace 3517 * Verify that the signaler and signalee either are in the same pid namespace
3519 * or that the signaler's pid namespace is an ancestor of the signalee's pid 3518 * or that the signaler's pid namespace is an ancestor of the signalee's pid
@@ -3550,6 +3549,14 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
3550 return copy_siginfo_from_user(kinfo, info); 3549 return copy_siginfo_from_user(kinfo, info);
3551} 3550}
3552 3551
3552static struct pid *pidfd_to_pid(const struct file *file)
3553{
3554 if (file->f_op == &pidfd_fops)
3555 return file->private_data;
3556
3557 return tgid_pidfd_to_pid(file);
3558}
3559
3553/** 3560/**
3554 * sys_pidfd_send_signal - send a signal to a process through a task file 3561 * sys_pidfd_send_signal - send a signal to a process through a task file
3555 * descriptor 3562 * descriptor
@@ -3581,12 +3588,12 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
3581 if (flags) 3588 if (flags)
3582 return -EINVAL; 3589 return -EINVAL;
3583 3590
3584 f = fdget_raw(pidfd); 3591 f = fdget(pidfd);
3585 if (!f.file) 3592 if (!f.file)
3586 return -EBADF; 3593 return -EBADF;
3587 3594
3588 /* Is this a pidfd? */ 3595 /* Is this a pidfd? */
3589 pid = tgid_pidfd_to_pid(f.file); 3596 pid = pidfd_to_pid(f.file);
3590 if (IS_ERR(pid)) { 3597 if (IS_ERR(pid)) {
3591 ret = PTR_ERR(pid); 3598 ret = PTR_ERR(pid);
3592 goto err; 3599 goto err;
@@ -3605,16 +3612,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
3605 if (unlikely(sig != kinfo.si_signo)) 3612 if (unlikely(sig != kinfo.si_signo))
3606 goto err; 3613 goto err;
3607 3614
3615 /* Only allow sending arbitrary signals to yourself. */
3616 ret = -EPERM;
3608 if ((task_pid(current) != pid) && 3617 if ((task_pid(current) != pid) &&
3609 (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) { 3618 (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
3610 /* Only allow sending arbitrary signals to yourself. */ 3619 goto err;
3611 ret = -EPERM;
3612 if (kinfo.si_code != SI_USER)
3613 goto err;
3614
3615 /* Turn this into a regular kill signal. */
3616 prepare_kill_siginfo(sig, &kinfo);
3617 }
3618 } else { 3620 } else {
3619 prepare_kill_siginfo(sig, &kinfo); 3621 prepare_kill_siginfo(sig, &kinfo);
3620 } 3622 }
@@ -3625,7 +3627,6 @@ err:
3625 fdput(f); 3627 fdput(f);
3626 return ret; 3628 return ret;
3627} 3629}
3628#endif /* CONFIG_PROC_FS */
3629 3630
3630static int 3631static int
3631do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) 3632do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
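Editorial aside on the signal.c hunks above: the CONFIG_PROC_FS guard is gone and pidfd_to_pid() accepts both the new pidfd_fops descriptors and, via tgid_pidfd_to_pid(), /proc/<pid> directory fds. A hedged userspace sketch of calling the syscall with signal 0 (no signal is delivered, only existence/permission checks run); it assumes __NR_pidfd_send_signal is defined by your headers, and note that kernels of this vintage accepted a /proc/<pid> directory fd as the pidfd while newer kernels expect an fd from clone(CLONE_PIDFD) or pidfd_open(2).

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        char path[64];
        int pidfd;
        long ret;

        snprintf(path, sizeof(path), "/proc/%d", getpid());
        pidfd = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
        if (pidfd < 0) {
                perror("open");
                return 1;
        }

        /* flags must be 0; a NULL siginfo means "behave like kill(2)". */
        ret = syscall(__NR_pidfd_send_signal, pidfd, 0, NULL, 0);
        printf("pidfd_send_signal(sig=0) -> %ld\n", ret);

        close(pidfd);
        return 0;
}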
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 10277429ed84..2c3382378d94 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -573,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t)
573} 573}
574EXPORT_SYMBOL(tasklet_kill); 574EXPORT_SYMBOL(tasklet_kill);
575 575
576/*
577 * tasklet_hrtimer
578 */
579
580/*
581 * The trampoline is called when the hrtimer expires. It schedules a tasklet
582 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
583 * hrtimer callback, but from softirq context.
584 */
585static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
586{
587 struct tasklet_hrtimer *ttimer =
588 container_of(timer, struct tasklet_hrtimer, timer);
589
590 tasklet_hi_schedule(&ttimer->tasklet);
591 return HRTIMER_NORESTART;
592}
593
594/*
595 * Helper function which calls the hrtimer callback from
596 * tasklet/softirq context
597 */
598static void __tasklet_hrtimer_trampoline(unsigned long data)
599{
600 struct tasklet_hrtimer *ttimer = (void *)data;
601 enum hrtimer_restart restart;
602
603 restart = ttimer->function(&ttimer->timer);
604 if (restart != HRTIMER_NORESTART)
605 hrtimer_restart(&ttimer->timer);
606}
607
608/**
609 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
610 * @ttimer: tasklet_hrtimer which is initialized
611 * @function: hrtimer callback function which gets called from softirq context
612 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
613 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
614 */
615void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
616 enum hrtimer_restart (*function)(struct hrtimer *),
617 clockid_t which_clock, enum hrtimer_mode mode)
618{
619 hrtimer_init(&ttimer->timer, which_clock, mode);
620 ttimer->timer.function = __hrtimer_tasklet_trampoline;
621 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
622 (unsigned long)ttimer);
623 ttimer->function = function;
624}
625EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
626
627void __init softirq_init(void) 576void __init softirq_init(void)
628{ 577{
629 int cpu; 578 int cpu;
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index f8edee9c792d..27bafc1e271e 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -5,41 +5,56 @@
5 * 5 *
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched/task_stack.h>
9#include <linux/sched/debug.h>
8#include <linux/sched.h> 10#include <linux/sched.h>
9#include <linux/kernel.h> 11#include <linux/kernel.h>
10#include <linux/export.h> 12#include <linux/export.h>
11#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
12#include <linux/stacktrace.h> 14#include <linux/stacktrace.h>
13 15
14void print_stack_trace(struct stack_trace *trace, int spaces) 16/**
17 * stack_trace_print - Print the entries in the stack trace
18 * @entries: Pointer to storage array
19 * @nr_entries: Number of entries in the storage array
20 * @spaces: Number of leading spaces to print
21 */
22void stack_trace_print(unsigned long *entries, unsigned int nr_entries,
23 int spaces)
15{ 24{
16 int i; 25 unsigned int i;
17 26
18 if (WARN_ON(!trace->entries)) 27 if (WARN_ON(!entries))
19 return; 28 return;
20 29
21 for (i = 0; i < trace->nr_entries; i++) 30 for (i = 0; i < nr_entries; i++)
22 printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); 31 printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);
23} 32}
24EXPORT_SYMBOL_GPL(print_stack_trace); 33EXPORT_SYMBOL_GPL(stack_trace_print);
25 34
26int snprint_stack_trace(char *buf, size_t size, 35/**
27 struct stack_trace *trace, int spaces) 36 * stack_trace_snprint - Print the entries in the stack trace into a buffer
37 * @buf: Pointer to the print buffer
38 * @size: Size of the print buffer
39 * @entries: Pointer to storage array
40 * @nr_entries: Number of entries in the storage array
41 * @spaces: Number of leading spaces to print
42 *
43 * Return: Number of bytes printed.
44 */
45int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
46 unsigned int nr_entries, int spaces)
28{ 47{
29 int i; 48 unsigned int generated, i, total = 0;
30 int generated;
31 int total = 0;
32 49
33 if (WARN_ON(!trace->entries)) 50 if (WARN_ON(!entries))
34 return 0; 51 return 0;
35 52
36 for (i = 0; i < trace->nr_entries; i++) { 53 for (i = 0; i < nr_entries && size; i++) {
37 generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', 54 generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ',
38 (void *)trace->entries[i]); 55 (void *)entries[i]);
39 56
40 total += generated; 57 total += generated;
41
42 /* Assume that generated isn't a negative number */
43 if (generated >= size) { 58 if (generated >= size) {
44 buf += size; 59 buf += size;
45 size = 0; 60 size = 0;
@@ -51,7 +66,176 @@ int snprint_stack_trace(char *buf, size_t size,
51 66
52 return total; 67 return total;
53} 68}
54EXPORT_SYMBOL_GPL(snprint_stack_trace); 69EXPORT_SYMBOL_GPL(stack_trace_snprint);
70
71#ifdef CONFIG_ARCH_STACKWALK
72
73struct stacktrace_cookie {
74 unsigned long *store;
75 unsigned int size;
76 unsigned int skip;
77 unsigned int len;
78};
79
80static bool stack_trace_consume_entry(void *cookie, unsigned long addr,
81 bool reliable)
82{
83 struct stacktrace_cookie *c = cookie;
84
85 if (c->len >= c->size)
86 return false;
87
88 if (c->skip > 0) {
89 c->skip--;
90 return true;
91 }
92 c->store[c->len++] = addr;
93 return c->len < c->size;
94}
95
96static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr,
97 bool reliable)
98{
99 if (in_sched_functions(addr))
100 return true;
101 return stack_trace_consume_entry(cookie, addr, reliable);
102}
103
104/**
105 * stack_trace_save - Save a stack trace into a storage array
106 * @store: Pointer to storage array
107 * @size: Size of the storage array
108 * @skipnr: Number of entries to skip at the start of the stack trace
109 *
110 * Return: Number of trace entries stored.
111 */
112unsigned int stack_trace_save(unsigned long *store, unsigned int size,
113 unsigned int skipnr)
114{
115 stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
116 struct stacktrace_cookie c = {
117 .store = store,
118 .size = size,
119 .skip = skipnr + 1,
120 };
121
122 arch_stack_walk(consume_entry, &c, current, NULL);
123 return c.len;
124}
125EXPORT_SYMBOL_GPL(stack_trace_save);
126
127/**
128 * stack_trace_save_tsk - Save a task stack trace into a storage array
129 * @task: The task to examine
130 * @store: Pointer to storage array
131 * @size: Size of the storage array
132 * @skipnr: Number of entries to skip at the start of the stack trace
133 *
134 * Return: Number of trace entries stored.
135 */
136unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
137 unsigned int size, unsigned int skipnr)
138{
139 stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched;
140 struct stacktrace_cookie c = {
141 .store = store,
142 .size = size,
143 .skip = skipnr + 1,
144 };
145
146 if (!try_get_task_stack(tsk))
147 return 0;
148
149 arch_stack_walk(consume_entry, &c, tsk, NULL);
150 put_task_stack(tsk);
151 return c.len;
152}
153
154/**
155 * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
156 * @regs: Pointer to pt_regs to examine
157 * @store: Pointer to storage array
158 * @size: Size of the storage array
159 * @skipnr: Number of entries to skip at the start of the stack trace
160 *
161 * Return: Number of trace entries stored.
162 */
163unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
164 unsigned int size, unsigned int skipnr)
165{
166 stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
167 struct stacktrace_cookie c = {
168 .store = store,
169 .size = size,
170 .skip = skipnr,
171 };
172
173 arch_stack_walk(consume_entry, &c, current, regs);
174 return c.len;
175}
176
177#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
178/**
179 * stack_trace_save_tsk_reliable - Save task stack with verification
180 * @tsk: Pointer to the task to examine
181 * @store: Pointer to storage array
182 * @size: Size of the storage array
183 *
184 * Return: An error if it detects any unreliable features of the
185 * stack. Otherwise it guarantees that the stack trace is
186 * reliable and returns the number of entries stored.
187 *
188 * If the task is not 'current', the caller *must* ensure the task is inactive.
189 */
190int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
191 unsigned int size)
192{
193 stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
194 struct stacktrace_cookie c = {
195 .store = store,
196 .size = size,
197 };
198 int ret;
199
200 /*
201 * If the task doesn't have a stack (e.g., a zombie), the stack is
202 * "reliably" empty.
203 */
204 if (!try_get_task_stack(tsk))
205 return 0;
206
207 ret = arch_stack_walk_reliable(consume_entry, &c, tsk);
208 put_task_stack(tsk);
209 return ret;
210}
211#endif
212
213#ifdef CONFIG_USER_STACKTRACE_SUPPORT
214/**
215 * stack_trace_save_user - Save a user space stack trace into a storage array
216 * @store: Pointer to storage array
217 * @size: Size of the storage array
218 *
219 * Return: Number of trace entries stored.
220 */
221unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
222{
223 stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
224 struct stacktrace_cookie c = {
225 .store = store,
226 .size = size,
227 };
228
229 /* Trace user stack if not a kernel thread */
230 if (!current->mm)
231 return 0;
232
233 arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
234 return c.len;
235}
236#endif
237
238#else /* CONFIG_ARCH_STACKWALK */
55 239
56/* 240/*
57 * Architectures that do not implement save_stack_trace_*() 241 * Architectures that do not implement save_stack_trace_*()
@@ -77,3 +261,118 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,
77 WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); 261 WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
78 return -ENOSYS; 262 return -ENOSYS;
79} 263}
264
265/**
266 * stack_trace_save - Save a stack trace into a storage array
267 * @store: Pointer to storage array
268 * @size: Size of the storage array
269 * @skipnr: Number of entries to skip at the start of the stack trace
270 *
271 * Return: Number of trace entries stored
272 */
273unsigned int stack_trace_save(unsigned long *store, unsigned int size,
274 unsigned int skipnr)
275{
276 struct stack_trace trace = {
277 .entries = store,
278 .max_entries = size,
279 .skip = skipnr + 1,
280 };
281
282 save_stack_trace(&trace);
283 return trace.nr_entries;
284}
285EXPORT_SYMBOL_GPL(stack_trace_save);
286
287/**
288 * stack_trace_save_tsk - Save a task stack trace into a storage array
289 * @task: The task to examine
290 * @store: Pointer to storage array
291 * @size: Size of the storage array
292 * @skipnr: Number of entries to skip at the start of the stack trace
293 *
294 * Return: Number of trace entries stored
295 */
296unsigned int stack_trace_save_tsk(struct task_struct *task,
297 unsigned long *store, unsigned int size,
298 unsigned int skipnr)
299{
300 struct stack_trace trace = {
301 .entries = store,
302 .max_entries = size,
303 .skip = skipnr + 1,
304 };
305
306 save_stack_trace_tsk(task, &trace);
307 return trace.nr_entries;
308}
309
310/**
311 * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
312 * @regs: Pointer to pt_regs to examine
313 * @store: Pointer to storage array
314 * @size: Size of the storage array
315 * @skipnr: Number of entries to skip at the start of the stack trace
316 *
317 * Return: Number of trace entries stored
318 */
319unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
320 unsigned int size, unsigned int skipnr)
321{
322 struct stack_trace trace = {
323 .entries = store,
324 .max_entries = size,
325 .skip = skipnr,
326 };
327
328 save_stack_trace_regs(regs, &trace);
329 return trace.nr_entries;
330}
331
332#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
333/**
334 * stack_trace_save_tsk_reliable - Save task stack with verification
335 * @tsk: Pointer to the task to examine
336 * @store: Pointer to storage array
337 * @size: Size of the storage array
338 *
339 * Return: An error if it detects any unreliable features of the
340 * stack. Otherwise it guarantees that the stack trace is
341 * reliable and returns the number of entries stored.
342 *
343 * If the task is not 'current', the caller *must* ensure the task is inactive.
344 */
345int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
346 unsigned int size)
347{
348 struct stack_trace trace = {
349 .entries = store,
350 .max_entries = size,
351 };
352 int ret = save_stack_trace_tsk_reliable(tsk, &trace);
353
354 return ret ? ret : trace.nr_entries;
355}
356#endif
357
358#ifdef CONFIG_USER_STACKTRACE_SUPPORT
359/**
360 * stack_trace_save_user - Save a user space stack trace into a storage array
361 * @store: Pointer to storage array
362 * @size: Size of the storage array
363 *
364 * Return: Number of trace entries stored
365 */
366unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
367{
368 struct stack_trace trace = {
369 .entries = store,
370 .max_entries = size,
371 };
372
373 save_stack_trace_user(&trace);
374 return trace.nr_entries;
375}
376#endif /* CONFIG_USER_STACKTRACE_SUPPORT */
377
378#endif /* !CONFIG_ARCH_STACKWALK */
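
Both halves of the #ifdef now export the same simplified interface: callers hand over a bare array and get back an entry count instead of filling in a struct stack_trace. A hypothetical caller of the new API (the buffer size and skip count are illustrative):

#include <linux/stacktrace.h>
#include <linux/kernel.h>

static void example_dump_current_stack(void)
{
        unsigned long entries[16];
        unsigned int nr;

        /* skipnr == 0: keep every entry the architecture walker reports */
        nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);

        /* print each entry with two leading spaces */
        stack_trace_print(entries, nr, 2);
}
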
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 067cb83f37ea..7231fb5953fc 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -513,7 +513,7 @@ repeat:
513 } 513 }
514 preempt_count_dec(); 514 preempt_count_dec();
515 WARN_ONCE(preempt_count(), 515 WARN_ONCE(preempt_count(),
516 "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); 516 "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
517 goto repeat; 517 goto repeat;
518 } 518 }
519} 519}
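
The one-character change above is part of a tree-wide switch from the %pf/%pF printk specifiers to %ps/%pS for printing function pointers (the same conversion appears below in sched_clock.c and timer.c). The lower-case form prints the bare symbol name, the upper-case form adds the offset within the symbol. A small illustrative sketch (the handler and message are hypothetical):

#include <linux/printk.h>

static void example_report_handler(int (*fn)(void *), void *arg)
{
        /* %ps -> "my_handler", %pS -> "my_handler+0x0/0x40" style output */
        pr_warn("handler %ps (%pS) invoked with %p\n", fn, fn, arg);
}
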
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d21f4befaea4..4d9ae5ea6caf 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -167,9 +167,6 @@ COND_SYSCALL(syslog);
167 167
168/* kernel/sched/core.c */ 168/* kernel/sched/core.c */
169 169
170/* kernel/signal.c */
171COND_SYSCALL(pidfd_send_signal);
172
173/* kernel/sys.c */ 170/* kernel/sys.c */
174COND_SYSCALL(setregid); 171COND_SYSCALL(setregid);
175COND_SYSCALL(setgid); 172COND_SYSCALL(setgid);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e5da394d1ca3..c9ec050bcf46 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -128,6 +128,7 @@ static int zero;
128static int __maybe_unused one = 1; 128static int __maybe_unused one = 1;
129static int __maybe_unused two = 2; 129static int __maybe_unused two = 2;
130static int __maybe_unused four = 4; 130static int __maybe_unused four = 4;
131static unsigned long zero_ul;
131static unsigned long one_ul = 1; 132static unsigned long one_ul = 1;
132static unsigned long long_max = LONG_MAX; 133static unsigned long long_max = LONG_MAX;
133static int one_hundred = 100; 134static int one_hundred = 100;
@@ -1750,7 +1751,7 @@ static struct ctl_table fs_table[] = {
1750 .maxlen = sizeof(files_stat.max_files), 1751 .maxlen = sizeof(files_stat.max_files),
1751 .mode = 0644, 1752 .mode = 0644,
1752 .proc_handler = proc_doulongvec_minmax, 1753 .proc_handler = proc_doulongvec_minmax,
1753 .extra1 = &zero, 1754 .extra1 = &zero_ul,
1754 .extra2 = &long_max, 1755 .extra2 = &long_max,
1755 }, 1756 },
1756 { 1757 {
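
The new zero_ul exists because proc_doulongvec_minmax() dereferences .extra1/.extra2 as unsigned long, so pointing .extra1 at the int-sized zero hands the handler the wrong kind of object (and on 64-bit it may read past it). A sketch of the patched bounds, reassembled from the hunk; this is the fs.file-max entry, whose procname is not itself visible here:

static unsigned long zero_ul;                   /* unsigned long-sized zero */
static unsigned long long_max = LONG_MAX;

static struct ctl_table example_file_max_entry = {
        .data           = &files_stat.max_files,
        .maxlen         = sizeof(files_stat.max_files),
        .mode           = 0644,
        .proc_handler   = proc_doulongvec_minmax,
        .extra1         = &zero_ul,             /* dereferenced as unsigned long */
        .extra2         = &long_max,
};
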
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2c97e8c2d29f..0519a8805aab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -594,7 +594,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
594{ 594{
595 struct alarm *alarm = &timr->it.alarm.alarmtimer; 595 struct alarm *alarm = &timr->it.alarm.alarmtimer;
596 596
597 return ktime_sub(now, alarm->node.expires); 597 return ktime_sub(alarm->node.expires, now);
598} 598}
599 599
600/** 600/**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5e77662dd2d9..f5490222e134 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -611,6 +611,22 @@ void clockevents_resume(void)
611} 611}
612 612
613#ifdef CONFIG_HOTPLUG_CPU 613#ifdef CONFIG_HOTPLUG_CPU
614
615# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
616/**
617 * tick_offline_cpu - Take CPU out of the broadcast mechanism
618 * @cpu: The outgoing CPU
619 *
620 * Called on the outgoing CPU after it took itself offline.
621 */
622void tick_offline_cpu(unsigned int cpu)
623{
624 raw_spin_lock(&clockevents_lock);
625 tick_broadcast_offline(cpu);
626 raw_spin_unlock(&clockevents_lock);
627}
628# endif
629
614/** 630/**
615 * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu 631 * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
616 */ 632 */
@@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu)
621 637
622 raw_spin_lock_irqsave(&clockevents_lock, flags); 638 raw_spin_lock_irqsave(&clockevents_lock, flags);
623 639
624 tick_shutdown_broadcast_oneshot(cpu);
625 tick_shutdown_broadcast(cpu);
626 tick_shutdown(cpu); 640 tick_shutdown(cpu);
627 /* 641 /*
628 * Unregister the clock event devices which were 642 * Unregister the clock event devices which were
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index dc1b6f1929f9..d23b434c2ca7 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
63#if (BITS_PER_LONG < 64) 63#if (BITS_PER_LONG < 64)
64u64 get_jiffies_64(void) 64u64 get_jiffies_64(void)
65{ 65{
66 unsigned long seq; 66 unsigned int seq;
67 u64 ret; 67 u64 ret;
68 68
69 do { 69 do {
@@ -89,7 +89,7 @@ struct clocksource * __init __weak clocksource_default_clock(void)
89 return &clocksource_jiffies; 89 return &clocksource_jiffies;
90} 90}
91 91
92struct clocksource refined_jiffies; 92static struct clocksource refined_jiffies;
93 93
94int register_refined_jiffies(long cycles_per_second) 94int register_refined_jiffies(long cycles_per_second)
95{ 95{
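
Several hunks in this series (here and in sched_clock.c, tick-common.c, tick-sched.c and timekeeping.c below) shrink the local sequence variable from unsigned long to unsigned int; the underlying seqlock/seqcount sequence is an unsigned int, so the wider local bought nothing. For context, the read loop this touches looks like this (body reconstructed around the lines shown in the hunk):

u64 get_jiffies_64(void)
{
        unsigned int seq;       /* matches the width of the seqlock sequence */
        u64 ret;

        do {
                seq = read_seqbegin(&jiffies_lock);
                ret = jiffies_64;
        } while (read_seqretry(&jiffies_lock, seq));

        return ret;
}
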
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 094b82ca95e5..142b07619918 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
94unsigned long long notrace sched_clock(void) 94unsigned long long notrace sched_clock(void)
95{ 95{
96 u64 cyc, res; 96 u64 cyc, res;
97 unsigned long seq; 97 unsigned int seq;
98 struct clock_read_data *rd; 98 struct clock_read_data *rd;
99 99
100 do { 100 do {
@@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
231 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) 231 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
232 enable_sched_clock_irqtime(); 232 enable_sched_clock_irqtime();
233 233
234 pr_debug("Registered %pF as sched_clock source\n", read); 234 pr_debug("Registered %pS as sched_clock source\n", read);
235} 235}
236 236
237void __init generic_sched_clock_init(void) 237void __init generic_sched_clock_init(void)
@@ -267,12 +267,12 @@ void __init generic_sched_clock_init(void)
267 */ 267 */
268static u64 notrace suspended_sched_clock_read(void) 268static u64 notrace suspended_sched_clock_read(void)
269{ 269{
270 unsigned long seq = raw_read_seqcount(&cd.seq); 270 unsigned int seq = raw_read_seqcount(&cd.seq);
271 271
272 return cd.read_data[seq & 1].epoch_cyc; 272 return cd.read_data[seq & 1].epoch_cyc;
273} 273}
274 274
275static int sched_clock_suspend(void) 275int sched_clock_suspend(void)
276{ 276{
277 struct clock_read_data *rd = &cd.read_data[0]; 277 struct clock_read_data *rd = &cd.read_data[0];
278 278
@@ -283,7 +283,7 @@ static int sched_clock_suspend(void)
283 return 0; 283 return 0;
284} 284}
285 285
286static void sched_clock_resume(void) 286void sched_clock_resume(void)
287{ 287{
288 struct clock_read_data *rd = &cd.read_data[0]; 288 struct clock_read_data *rd = &cd.read_data[0];
289 289
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index ee834d4fb814..e51778c312f1 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -36,10 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
36static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); 36static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
37static void tick_broadcast_clear_oneshot(int cpu); 37static void tick_broadcast_clear_oneshot(int cpu);
38static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); 38static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
39# ifdef CONFIG_HOTPLUG_CPU
40static void tick_broadcast_oneshot_offline(unsigned int cpu);
41# endif
39#else 42#else
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } 43static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
41static inline void tick_broadcast_clear_oneshot(int cpu) { } 44static inline void tick_broadcast_clear_oneshot(int cpu) { }
42static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } 45static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
46# ifdef CONFIG_HOTPLUG_CPU
47static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { }
48# endif
43#endif 49#endif
44 50
45/* 51/*
@@ -433,27 +439,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
433} 439}
434 440
435#ifdef CONFIG_HOTPLUG_CPU 441#ifdef CONFIG_HOTPLUG_CPU
436/* 442static void tick_shutdown_broadcast(void)
437 * Remove a CPU from broadcasting
438 */
439void tick_shutdown_broadcast(unsigned int cpu)
440{ 443{
441 struct clock_event_device *bc; 444 struct clock_event_device *bc = tick_broadcast_device.evtdev;
442 unsigned long flags;
443
444 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
445
446 bc = tick_broadcast_device.evtdev;
447 cpumask_clear_cpu(cpu, tick_broadcast_mask);
448 cpumask_clear_cpu(cpu, tick_broadcast_on);
449 445
450 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 446 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
451 if (bc && cpumask_empty(tick_broadcast_mask)) 447 if (bc && cpumask_empty(tick_broadcast_mask))
452 clockevents_shutdown(bc); 448 clockevents_shutdown(bc);
453 } 449 }
450}
454 451
455 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 452/*
453 * Remove a CPU from broadcasting
454 */
455void tick_broadcast_offline(unsigned int cpu)
456{
457 raw_spin_lock(&tick_broadcast_lock);
458 cpumask_clear_cpu(cpu, tick_broadcast_mask);
459 cpumask_clear_cpu(cpu, tick_broadcast_on);
460 tick_broadcast_oneshot_offline(cpu);
461 tick_shutdown_broadcast();
462 raw_spin_unlock(&tick_broadcast_lock);
456} 463}
464
457#endif 465#endif
458 466
459void tick_suspend_broadcast(void) 467void tick_suspend_broadcast(void)
@@ -801,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
801 * either the CPU handling the broadcast 809 * either the CPU handling the broadcast
802 * interrupt or we got woken by something else. 810 * interrupt or we got woken by something else.
803 * 811 *
804 * We are not longer in the broadcast mask, so 812 * We are no longer in the broadcast mask, so
805 * if the cpu local expiry time is already 813 * if the cpu local expiry time is already
806 * reached, we would reprogram the cpu local 814 * reached, we would reprogram the cpu local
807 * timer with an already expired event. 815 * timer with an already expired event.
808 * 816 *
809 * This can lead to a ping-pong when we return 817 * This can lead to a ping-pong when we return
810 * to idle and therefor rearm the broadcast 818 * to idle and therefore rearm the broadcast
811 * timer before the cpu local timer was able 819 * timer before the cpu local timer was able
812 * to fire. This happens because the forced 820 * to fire. This happens because the forced
813 * reprogramming makes sure that the event 821 * reprogramming makes sure that the event
@@ -950,14 +958,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
950} 958}
951 959
952/* 960/*
953 * Remove a dead CPU from broadcasting 961 * Remove a dying CPU from broadcasting
954 */ 962 */
955void tick_shutdown_broadcast_oneshot(unsigned int cpu) 963static void tick_broadcast_oneshot_offline(unsigned int cpu)
956{ 964{
957 unsigned long flags;
958
959 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
960
961 /* 965 /*
962 * Clear the broadcast masks for the dead cpu, but do not stop 966 * Clear the broadcast masks for the dead cpu, but do not stop
963 * the broadcast device! 967 * the broadcast device!
@@ -965,8 +969,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu)
965 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); 969 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
966 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); 970 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
967 cpumask_clear_cpu(cpu, tick_broadcast_force_mask); 971 cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
968
969 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
970} 972}
971#endif 973#endif
972 974
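
Taken together with the clockevents.c hunk above, CPU hotplug teardown now funnels through a single broadcast-offline entry point that runs on the outgoing CPU with interrupts already disabled, which is why the plain raw_spin_lock() variants suffice and the irqsave/irqrestore pairs could be dropped. Reassembled from the hunks, the resulting path is:

/* Outgoing CPU, interrupts disabled */
void tick_offline_cpu(unsigned int cpu)
{
        raw_spin_lock(&clockevents_lock);
        tick_broadcast_offline(cpu);
        raw_spin_unlock(&clockevents_lock);
}

void tick_broadcast_offline(unsigned int cpu)
{
        raw_spin_lock(&tick_broadcast_lock);
        cpumask_clear_cpu(cpu, tick_broadcast_mask);
        cpumask_clear_cpu(cpu, tick_broadcast_on);
        tick_broadcast_oneshot_offline(cpu);    /* clear the oneshot masks */
        tick_shutdown_broadcast();              /* stop the device if the periodic mask is empty */
        raw_spin_unlock(&tick_broadcast_lock);
}
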
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 529143b4c8d2..59225b484e4e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -46,6 +46,14 @@ ktime_t tick_period;
46 * procedure also covers cpu hotplug. 46 * procedure also covers cpu hotplug.
47 */ 47 */
48int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 48int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
49#ifdef CONFIG_NO_HZ_FULL
50/*
51 * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
52 * tick_do_timer_cpu and it should be taken over by an eligible secondary
53 * when one comes online.
54 */
55static int tick_do_timer_boot_cpu __read_mostly = -1;
56#endif
49 57
50/* 58/*
51 * Debugging: see timer_list.c 59 * Debugging: see timer_list.c
@@ -149,7 +157,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
149 !tick_broadcast_oneshot_active()) { 157 !tick_broadcast_oneshot_active()) {
150 clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); 158 clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
151 } else { 159 } else {
152 unsigned long seq; 160 unsigned int seq;
153 ktime_t next; 161 ktime_t next;
154 162
155 do { 163 do {
@@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
167 } 175 }
168} 176}
169 177
178#ifdef CONFIG_NO_HZ_FULL
179static void giveup_do_timer(void *info)
180{
181 int cpu = *(unsigned int *)info;
182
183 WARN_ON(tick_do_timer_cpu != smp_processor_id());
184
185 tick_do_timer_cpu = cpu;
186}
187
188static void tick_take_do_timer_from_boot(void)
189{
190 int cpu = smp_processor_id();
191 int from = tick_do_timer_boot_cpu;
192
193 if (from >= 0 && from != cpu)
194 smp_call_function_single(from, giveup_do_timer, &cpu, 1);
195}
196#endif
197
170/* 198/*
171 * Setup the tick device 199 * Setup the tick device
172 */ 200 */
@@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td,
186 * this cpu: 214 * this cpu:
187 */ 215 */
188 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { 216 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
189 if (!tick_nohz_full_cpu(cpu)) 217 tick_do_timer_cpu = cpu;
190 tick_do_timer_cpu = cpu; 218
191 else
192 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
193 tick_next_period = ktime_get(); 219 tick_next_period = ktime_get();
194 tick_period = NSEC_PER_SEC / HZ; 220 tick_period = NSEC_PER_SEC / HZ;
221#ifdef CONFIG_NO_HZ_FULL
222 /*
223 * The boot CPU may be nohz_full, in which case set
224 * tick_do_timer_boot_cpu so the first housekeeping
225 * secondary that comes up will take do_timer from
226 * us.
227 */
228 if (tick_nohz_full_cpu(cpu))
229 tick_do_timer_boot_cpu = cpu;
230
231 } else if (tick_do_timer_boot_cpu != -1 &&
232 !tick_nohz_full_cpu(cpu)) {
233 tick_take_do_timer_from_boot();
234 tick_do_timer_boot_cpu = -1;
235 WARN_ON(tick_do_timer_cpu != cpu);
236#endif
195 } 237 }
196 238
197 /* 239 /*
@@ -487,6 +529,7 @@ void tick_freeze(void)
487 trace_suspend_resume(TPS("timekeeping_freeze"), 529 trace_suspend_resume(TPS("timekeeping_freeze"),
488 smp_processor_id(), true); 530 smp_processor_id(), true);
489 system_state = SYSTEM_SUSPEND; 531 system_state = SYSTEM_SUSPEND;
532 sched_clock_suspend();
490 timekeeping_suspend(); 533 timekeeping_suspend();
491 } else { 534 } else {
492 tick_suspend_local(); 535 tick_suspend_local();
@@ -510,6 +553,7 @@ void tick_unfreeze(void)
510 553
511 if (tick_freeze_depth == num_online_cpus()) { 554 if (tick_freeze_depth == num_online_cpus()) {
512 timekeeping_resume(); 555 timekeeping_resume();
556 sched_clock_resume();
513 system_state = SYSTEM_RUNNING; 557 system_state = SYSTEM_RUNNING;
514 trace_suspend_resume(TPS("timekeeping_freeze"), 558 trace_suspend_resume(TPS("timekeeping_freeze"),
515 smp_processor_id(), false); 559 smp_processor_id(), false);
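
The interesting part of the tick-common.c changes is the do_timer handoff for a nohz_full boot CPU: instead of parking tick_do_timer_cpu at TICK_DO_TIMER_NONE, the boot CPU keeps the duty and the first housekeeping secondary that comes online pulls it over with a synchronous IPI. Condensed from the hunk:

static void giveup_do_timer(void *info)
{
        int cpu = *(unsigned int *)info;

        /* runs on the boot CPU via the IPI below */
        WARN_ON(tick_do_timer_cpu != smp_processor_id());
        tick_do_timer_cpu = cpu;
}

static void tick_take_do_timer_from_boot(void)
{
        int cpu  = smp_processor_id();
        int from = tick_do_timer_boot_cpu;

        /* wait == 1: the duty has moved by the time this returns */
        if (from >= 0 && from != cpu)
                smp_call_function_single(from, giveup_do_timer, &cpu, 1);
}
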
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index e277284c2831..7b2496136729 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
64extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 64extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
65extern void tick_install_broadcast_device(struct clock_event_device *dev); 65extern void tick_install_broadcast_device(struct clock_event_device *dev);
66extern int tick_is_broadcast_device(struct clock_event_device *dev); 66extern int tick_is_broadcast_device(struct clock_event_device *dev);
67extern void tick_shutdown_broadcast(unsigned int cpu);
68extern void tick_suspend_broadcast(void); 67extern void tick_suspend_broadcast(void);
69extern void tick_resume_broadcast(void); 68extern void tick_resume_broadcast(void);
70extern bool tick_resume_check_broadcast(void); 69extern bool tick_resume_check_broadcast(void);
@@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev)
78static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } 77static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
79static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } 78static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
80static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } 79static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
81static inline void tick_shutdown_broadcast(unsigned int cpu) { }
82static inline void tick_suspend_broadcast(void) { } 80static inline void tick_suspend_broadcast(void) { }
83static inline void tick_resume_broadcast(void) { } 81static inline void tick_resume_broadcast(void) { }
84static inline bool tick_resume_check_broadcast(void) { return false; } 82static inline bool tick_resume_check_broadcast(void) { return false; }
@@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
128/* Functions related to oneshot broadcasting */ 126/* Functions related to oneshot broadcasting */
129#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) 127#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
130extern void tick_broadcast_switch_to_oneshot(void); 128extern void tick_broadcast_switch_to_oneshot(void);
131extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
132extern int tick_broadcast_oneshot_active(void); 129extern int tick_broadcast_oneshot_active(void);
133extern void tick_check_oneshot_broadcast_this_cpu(void); 130extern void tick_check_oneshot_broadcast_this_cpu(void);
134bool tick_broadcast_oneshot_available(void); 131bool tick_broadcast_oneshot_available(void);
135extern struct cpumask *tick_get_broadcast_oneshot_mask(void); 132extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
136#else /* !(BROADCAST && ONESHOT): */ 133#else /* !(BROADCAST && ONESHOT): */
137static inline void tick_broadcast_switch_to_oneshot(void) { } 134static inline void tick_broadcast_switch_to_oneshot(void) { }
138static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
139static inline int tick_broadcast_oneshot_active(void) { return 0; } 135static inline int tick_broadcast_oneshot_active(void) { return 0; }
140static inline void tick_check_oneshot_broadcast_this_cpu(void) { } 136static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
141static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } 137static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
142#endif /* !(BROADCAST && ONESHOT) */ 138#endif /* !(BROADCAST && ONESHOT) */
143 139
140#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU)
141extern void tick_broadcast_offline(unsigned int cpu);
142#else
143static inline void tick_broadcast_offline(unsigned int cpu) { }
144#endif
145
144/* NO_HZ_FULL internal */ 146/* NO_HZ_FULL internal */
145#ifdef CONFIG_NO_HZ_FULL 147#ifdef CONFIG_NO_HZ_FULL
146extern void tick_nohz_init(void); 148extern void tick_nohz_init(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6fa52cd6df0b..f4ee1a3428ae 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
121 * into a long sleep. If two CPUs happen to assign themselves to 121 * into a long sleep. If two CPUs happen to assign themselves to
122 * this duty, then the jiffies update is still serialized by 122 * this duty, then the jiffies update is still serialized by
123 * jiffies_lock. 123 * jiffies_lock.
124 *
 125 * If nohz_full is enabled, this should not happen because the
 126 * tick_do_timer_cpu owner never relinquishes the duty.
124 */ 127 */
125 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) 128 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
126 && !tick_nohz_full_cpu(cpu)) 129#ifdef CONFIG_NO_HZ_FULL
130 WARN_ON(tick_nohz_full_running);
131#endif
127 tick_do_timer_cpu = cpu; 132 tick_do_timer_cpu = cpu;
133 }
128#endif 134#endif
129 135
130 /* Check, if the jiffies need an update */ 136 /* Check, if the jiffies need an update */
@@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
395static int tick_nohz_cpu_down(unsigned int cpu) 401static int tick_nohz_cpu_down(unsigned int cpu)
396{ 402{
397 /* 403 /*
398 * The boot CPU handles housekeeping duty (unbound timers, 404 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
399 * workqueues, timekeeping, ...) on behalf of full dynticks 405 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
400 * CPUs. It must remain online when nohz full is enabled. 406 * CPUs. It must remain online when nohz full is enabled.
401 */ 407 */
402 if (tick_nohz_full_running && tick_do_timer_cpu == cpu) 408 if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
@@ -423,12 +429,15 @@ void __init tick_nohz_init(void)
423 return; 429 return;
424 } 430 }
425 431
426 cpu = smp_processor_id(); 432 if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
433 !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
434 cpu = smp_processor_id();
427 435
428 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { 436 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
429 pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", 437 pr_warn("NO_HZ: Clearing %d from nohz_full range "
430 cpu); 438 "for timekeeping\n", cpu);
431 cpumask_clear_cpu(cpu, tick_nohz_full_mask); 439 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
440 }
432 } 441 }
433 442
434 for_each_cpu(cpu, tick_nohz_full_mask) 443 for_each_cpu(cpu, tick_nohz_full_mask)
@@ -645,7 +654,8 @@ static inline bool local_timer_softirq_pending(void)
645static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) 654static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
646{ 655{
647 u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; 656 u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
648 unsigned long seq, basejiff; 657 unsigned long basejiff;
658 unsigned int seq;
649 659
650 /* Read jiffies and the time when jiffies were updated last */ 660 /* Read jiffies and the time when jiffies were updated last */
651 do { 661 do {
@@ -904,8 +914,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
904 /* 914 /*
905 * Boot safety: make sure the timekeeping duty has been 915 * Boot safety: make sure the timekeeping duty has been
906 * assigned before entering dyntick-idle mode, 916 * assigned before entering dyntick-idle mode,
 917 * i.e. while tick_do_timer_cpu is still TICK_DO_TIMER_BOOT.
907 */ 918 */
908 if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) 919 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
920 return false;
921
922 /* Should not happen for nohz-full */
923 if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
909 return false; 924 return false;
910 } 925 }
911 926
@@ -1023,6 +1038,18 @@ bool tick_nohz_idle_got_tick(void)
1023} 1038}
1024 1039
1025/** 1040/**
1041 * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
 1042 * or the tick, whichever expires first. Note that, if the tick has been
1043 * stopped, it returns the next hrtimer.
1044 *
1045 * Called from power state control code with interrupts disabled
1046 */
1047ktime_t tick_nohz_get_next_hrtimer(void)
1048{
1049 return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
1050}
1051
1052/**
1026 * tick_nohz_get_sleep_length - return the expected length of the current sleep 1053 * tick_nohz_get_sleep_length - return the expected length of the current sleep
1027 * @delta_next: duration until the next event if the tick cannot be stopped 1054 * @delta_next: duration until the next event if the tick cannot be stopped
1028 * 1055 *
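
The new tick_nohz_get_next_hrtimer() simply reports the per-CPU tick device's next programmed event, which after the tick has been stopped is the next hrtimer rather than the periodic tick. A hypothetical caller on the idle/power-management side (the helper name and the delta computation are ours, not part of the patch):

static s64 example_ns_until_next_event(void)
{
        /* must be called with interrupts disabled, per the comment above */
        ktime_t next = tick_nohz_get_next_hrtimer();

        return ktime_to_ns(ktime_sub(next, ktime_get()));
}
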
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 6de959a854b2..4fb06527cf64 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -24,12 +24,19 @@ enum tick_nohz_mode {
24 * struct tick_sched - sched tick emulation and no idle tick control/stats 24 * struct tick_sched - sched tick emulation and no idle tick control/stats
25 * @sched_timer: hrtimer to schedule the periodic tick in high 25 * @sched_timer: hrtimer to schedule the periodic tick in high
26 * resolution mode 26 * resolution mode
27 * @check_clocks: Notification mechanism about clocksource changes
28 * @nohz_mode: Mode - one state of tick_nohz_mode
29 * @inidle: Indicator that the CPU is in the tick idle mode
30 * @tick_stopped: Indicator that the idle tick has been stopped
31 * @idle_active: Indicator that the CPU is actively in the tick idle mode;
 32 * it is reset during irq handling phases.
33 * @do_timer_lst: CPU was the last one doing do_timer before going idle
34 * @got_idle_tick: Tick timer function has run with @inidle set
27 * @last_tick: Store the last tick expiry time when the tick 35 * @last_tick: Store the last tick expiry time when the tick
28 * timer is modified for nohz sleeps. This is necessary 36 * timer is modified for nohz sleeps. This is necessary
29 * to resume the tick timer operation in the timeline 37 * to resume the tick timer operation in the timeline
30 * when the CPU returns from nohz sleep. 38 * when the CPU returns from nohz sleep.
31 * @next_tick: Next tick to be fired when in dynticks mode. 39 * @next_tick: Next tick to be fired when in dynticks mode.
32 * @tick_stopped: Indicator that the idle tick has been stopped
33 * @idle_jiffies: jiffies at the entry to idle for idle time accounting 40 * @idle_jiffies: jiffies at the entry to idle for idle time accounting
34 * @idle_calls: Total number of idle calls 41 * @idle_calls: Total number of idle calls
35 * @idle_sleeps: Number of idle calls, where the sched tick was stopped 42 * @idle_sleeps: Number of idle calls, where the sched tick was stopped
@@ -40,8 +47,8 @@ enum tick_nohz_mode {
40 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding 47 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
41 * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) 48 * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
42 * @timer_expires_base: Base time clock monotonic for @timer_expires 49 * @timer_expires_base: Base time clock monotonic for @timer_expires
43 * @do_timer_lst: CPU was the last one doing do_timer before going idle 50 * @next_timer: Expiry time of next expiring timer for debugging purpose only
44 * @got_idle_tick: Tick timer function has run with @inidle set 51 * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
45 */ 52 */
46struct tick_sched { 53struct tick_sched {
47 struct hrtimer sched_timer; 54 struct hrtimer sched_timer;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index c3f756f8534b..86656bbac232 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
171 static int firsttime = 1; 171 static int firsttime = 1;
172 int error = 0; 172 int error = 0;
173 173
174 if (tv && !timespec64_valid(tv)) 174 if (tv && !timespec64_valid_settod(tv))
175 return -EINVAL; 175 return -EINVAL;
176 176
177 error = security_settime64(tv, tz); 177 error = security_settime64(tv, tz);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f986e1918d12..5716e28bfa3c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -720,7 +720,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
720void ktime_get_real_ts64(struct timespec64 *ts) 720void ktime_get_real_ts64(struct timespec64 *ts)
721{ 721{
722 struct timekeeper *tk = &tk_core.timekeeper; 722 struct timekeeper *tk = &tk_core.timekeeper;
723 unsigned long seq; 723 unsigned int seq;
724 u64 nsecs; 724 u64 nsecs;
725 725
726 WARN_ON(timekeeping_suspended); 726 WARN_ON(timekeeping_suspended);
@@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
829ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) 829ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
830{ 830{
831 ktime_t *offset = offsets[offs]; 831 ktime_t *offset = offsets[offs];
832 unsigned long seq; 832 unsigned int seq;
833 ktime_t tconv; 833 ktime_t tconv;
834 834
835 do { 835 do {
@@ -960,7 +960,7 @@ time64_t __ktime_get_real_seconds(void)
960void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) 960void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
961{ 961{
962 struct timekeeper *tk = &tk_core.timekeeper; 962 struct timekeeper *tk = &tk_core.timekeeper;
963 unsigned long seq; 963 unsigned int seq;
964 ktime_t base_raw; 964 ktime_t base_raw;
965 ktime_t base_real; 965 ktime_t base_real;
966 u64 nsec_raw; 966 u64 nsec_raw;
@@ -1122,7 +1122,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
1122 ktime_t base_real, base_raw; 1122 ktime_t base_real, base_raw;
1123 u64 nsec_real, nsec_raw; 1123 u64 nsec_real, nsec_raw;
1124 u8 cs_was_changed_seq; 1124 u8 cs_was_changed_seq;
1125 unsigned long seq; 1125 unsigned int seq;
1126 bool do_interp; 1126 bool do_interp;
1127 int ret; 1127 int ret;
1128 1128
@@ -1221,7 +1221,7 @@ int do_settimeofday64(const struct timespec64 *ts)
1221 unsigned long flags; 1221 unsigned long flags;
1222 int ret = 0; 1222 int ret = 0;
1223 1223
1224 if (!timespec64_valid_strict(ts)) 1224 if (!timespec64_valid_settod(ts))
1225 return -EINVAL; 1225 return -EINVAL;
1226 1226
1227 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1227 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1278,7 +1278,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts)
1278 /* Make sure the proposed value is valid */ 1278 /* Make sure the proposed value is valid */
1279 tmp = timespec64_add(tk_xtime(tk), *ts); 1279 tmp = timespec64_add(tk_xtime(tk), *ts);
1280 if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || 1280 if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
1281 !timespec64_valid_strict(&tmp)) { 1281 !timespec64_valid_settod(&tmp)) {
1282 ret = -EINVAL; 1282 ret = -EINVAL;
1283 goto error; 1283 goto error;
1284 } 1284 }
@@ -1409,7 +1409,7 @@ int timekeeping_notify(struct clocksource *clock)
1409void ktime_get_raw_ts64(struct timespec64 *ts) 1409void ktime_get_raw_ts64(struct timespec64 *ts)
1410{ 1410{
1411 struct timekeeper *tk = &tk_core.timekeeper; 1411 struct timekeeper *tk = &tk_core.timekeeper;
1412 unsigned long seq; 1412 unsigned int seq;
1413 u64 nsecs; 1413 u64 nsecs;
1414 1414
1415 do { 1415 do {
@@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64);
1431int timekeeping_valid_for_hres(void) 1431int timekeeping_valid_for_hres(void)
1432{ 1432{
1433 struct timekeeper *tk = &tk_core.timekeeper; 1433 struct timekeeper *tk = &tk_core.timekeeper;
1434 unsigned long seq; 1434 unsigned int seq;
1435 int ret; 1435 int ret;
1436 1436
1437 do { 1437 do {
@@ -1450,7 +1450,7 @@ int timekeeping_valid_for_hres(void)
1450u64 timekeeping_max_deferment(void) 1450u64 timekeeping_max_deferment(void)
1451{ 1451{
1452 struct timekeeper *tk = &tk_core.timekeeper; 1452 struct timekeeper *tk = &tk_core.timekeeper;
1453 unsigned long seq; 1453 unsigned int seq;
1454 u64 ret; 1454 u64 ret;
1455 1455
1456 do { 1456 do {
@@ -1527,7 +1527,7 @@ void __init timekeeping_init(void)
1527 unsigned long flags; 1527 unsigned long flags;
1528 1528
1529 read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); 1529 read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
1530 if (timespec64_valid_strict(&wall_time) && 1530 if (timespec64_valid_settod(&wall_time) &&
1531 timespec64_to_ns(&wall_time) > 0) { 1531 timespec64_to_ns(&wall_time) > 0) {
1532 persistent_clock_exists = true; 1532 persistent_clock_exists = true;
1533 } else if (timespec64_to_ns(&wall_time) != 0) { 1533 } else if (timespec64_to_ns(&wall_time) != 0) {
@@ -2150,7 +2150,7 @@ EXPORT_SYMBOL_GPL(getboottime64);
2150void ktime_get_coarse_real_ts64(struct timespec64 *ts) 2150void ktime_get_coarse_real_ts64(struct timespec64 *ts)
2151{ 2151{
2152 struct timekeeper *tk = &tk_core.timekeeper; 2152 struct timekeeper *tk = &tk_core.timekeeper;
2153 unsigned long seq; 2153 unsigned int seq;
2154 2154
2155 do { 2155 do {
2156 seq = read_seqcount_begin(&tk_core.seq); 2156 seq = read_seqcount_begin(&tk_core.seq);
@@ -2164,7 +2164,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
2164{ 2164{
2165 struct timekeeper *tk = &tk_core.timekeeper; 2165 struct timekeeper *tk = &tk_core.timekeeper;
2166 struct timespec64 now, mono; 2166 struct timespec64 now, mono;
2167 unsigned long seq; 2167 unsigned int seq;
2168 2168
2169 do { 2169 do {
2170 seq = read_seqcount_begin(&tk_core.seq); 2170 seq = read_seqcount_begin(&tk_core.seq);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 7a9b4eb7a1d5..141ab3ab0354 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -14,6 +14,13 @@ extern u64 timekeeping_max_deferment(void);
14extern void timekeeping_warp_clock(void); 14extern void timekeeping_warp_clock(void);
15extern int timekeeping_suspend(void); 15extern int timekeeping_suspend(void);
16extern void timekeeping_resume(void); 16extern void timekeeping_resume(void);
17#ifdef CONFIG_GENERIC_SCHED_CLOCK
18extern int sched_clock_suspend(void);
19extern void sched_clock_resume(void);
20#else
21static inline int sched_clock_suspend(void) { return 0; }
22static inline void sched_clock_resume(void) { }
23#endif
17 24
18extern void do_timer(unsigned long ticks); 25extern void do_timer(unsigned long ticks);
19extern void update_wall_time(void); 26extern void update_wall_time(void);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2fce056f8a49..343c7ba33b1c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
536 hlist_add_head(&timer->entry, base->vectors + idx); 536 hlist_add_head(&timer->entry, base->vectors + idx);
537 __set_bit(idx, base->pending_map); 537 __set_bit(idx, base->pending_map);
538 timer_set_idx(timer, idx); 538 timer_set_idx(timer, idx);
539
540 trace_timer_start(timer, timer->expires, timer->flags);
539} 541}
540 542
541static void 543static void
@@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer)
757 trace_timer_init(timer); 759 trace_timer_init(timer);
758} 760}
759 761
760static inline void
761debug_activate(struct timer_list *timer, unsigned long expires)
762{
763 debug_timer_activate(timer);
764 trace_timer_start(timer, expires, timer->flags);
765}
766
767static inline void debug_deactivate(struct timer_list *timer) 762static inline void debug_deactivate(struct timer_list *timer)
768{ 763{
769 debug_timer_deactivate(timer); 764 debug_timer_deactivate(timer);
@@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
1037 } 1032 }
1038 } 1033 }
1039 1034
1040 debug_activate(timer, expires); 1035 debug_timer_activate(timer);
1041 1036
1042 timer->expires = expires; 1037 timer->expires = expires;
1043 /* 1038 /*
@@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
1171 } 1166 }
1172 forward_timer_base(base); 1167 forward_timer_base(base);
1173 1168
1174 debug_activate(timer, timer->expires); 1169 debug_timer_activate(timer);
1175 internal_add_timer(base, timer); 1170 internal_add_timer(base, timer);
1176 raw_spin_unlock_irqrestore(&base->lock, flags); 1171 raw_spin_unlock_irqrestore(&base->lock, flags);
1177} 1172}
@@ -1298,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer)
1298EXPORT_SYMBOL(del_timer_sync); 1293EXPORT_SYMBOL(del_timer_sync);
1299#endif 1294#endif
1300 1295
1301static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) 1296static void call_timer_fn(struct timer_list *timer,
1297 void (*fn)(struct timer_list *),
1298 unsigned long baseclk)
1302{ 1299{
1303 int count = preempt_count(); 1300 int count = preempt_count();
1304 1301
@@ -1321,14 +1318,14 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list
1321 */ 1318 */
1322 lock_map_acquire(&lockdep_map); 1319 lock_map_acquire(&lockdep_map);
1323 1320
1324 trace_timer_expire_entry(timer); 1321 trace_timer_expire_entry(timer, baseclk);
1325 fn(timer); 1322 fn(timer);
1326 trace_timer_expire_exit(timer); 1323 trace_timer_expire_exit(timer);
1327 1324
1328 lock_map_release(&lockdep_map); 1325 lock_map_release(&lockdep_map);
1329 1326
1330 if (count != preempt_count()) { 1327 if (count != preempt_count()) {
1331 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1328 WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
1332 fn, count, preempt_count()); 1329 fn, count, preempt_count());
1333 /* 1330 /*
1334 * Restore the preempt count. That gives us a decent 1331 * Restore the preempt count. That gives us a decent
@@ -1342,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list
1342 1339
1343static void expire_timers(struct timer_base *base, struct hlist_head *head) 1340static void expire_timers(struct timer_base *base, struct hlist_head *head)
1344{ 1341{
1342 /*
1343 * This value is required only for tracing. base->clk was
1344 * incremented directly before expire_timers was called. But expiry
1345 * is related to the old base->clk value.
1346 */
1347 unsigned long baseclk = base->clk - 1;
1348
1345 while (!hlist_empty(head)) { 1349 while (!hlist_empty(head)) {
1346 struct timer_list *timer; 1350 struct timer_list *timer;
1347 void (*fn)(struct timer_list *); 1351 void (*fn)(struct timer_list *);
@@ -1355,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
1355 1359
1356 if (timer->flags & TIMER_IRQSAFE) { 1360 if (timer->flags & TIMER_IRQSAFE) {
1357 raw_spin_unlock(&base->lock); 1361 raw_spin_unlock(&base->lock);
1358 call_timer_fn(timer, fn); 1362 call_timer_fn(timer, fn, baseclk);
1359 raw_spin_lock(&base->lock); 1363 raw_spin_lock(&base->lock);
1360 } else { 1364 } else {
1361 raw_spin_unlock_irq(&base->lock); 1365 raw_spin_unlock_irq(&base->lock);
1362 call_timer_fn(timer, fn); 1366 call_timer_fn(timer, fn, baseclk);
1363 raw_spin_lock_irq(&base->lock); 1367 raw_spin_lock_irq(&base->lock);
1364 } 1368 }
1365 } 1369 }
diff --git a/kernel/torture.c b/kernel/torture.c
index 8faa1a9aaeb9..17b2be9bde12 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -88,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
88 88
89 if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) 89 if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
90 return false; 90 return false;
91 if (num_online_cpus() <= 1)
92 return false; /* Can't offline the last CPU. */
91 93
92 if (verbose > 1) 94 if (verbose > 1)
93 pr_alert("%s" TORTURE_FLAG 95 pr_alert("%s" TORTURE_FLAG
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d64c00afceb5..94b0e37d90ef 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,8 @@
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/error-injection.h> 15#include <linux/error-injection.h>
16 16
17#include <asm/tlb.h>
18
17#include "trace_probe.h" 19#include "trace_probe.h"
18#include "trace.h" 20#include "trace.h"
19 21
@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
163 * access_ok() should prevent writing to non-user memory, but in 165 * access_ok() should prevent writing to non-user memory, but in
164 * some situations (nommu, temporary switch, etc) access_ok() does 166 * some situations (nommu, temporary switch, etc) access_ok() does
165 * not provide enough validation, hence the check on KERNEL_DS. 167 * not provide enough validation, hence the check on KERNEL_DS.
168 *
169 * nmi_uaccess_okay() ensures the probe is not run in an interim
170 * state, when the task or mm are switched. This is specifically
171 * required to prevent the use of temporary mm.
166 */ 172 */
167 173
168 if (unlikely(in_interrupt() || 174 if (unlikely(in_interrupt() ||
@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
170 return -EPERM; 176 return -EPERM;
171 if (unlikely(uaccess_kernel())) 177 if (unlikely(uaccess_kernel()))
172 return -EPERM; 178 return -EPERM;
179 if (unlikely(!nmi_uaccess_okay()))
180 return -EPERM;
173 if (!access_ok(unsafe_ptr, size)) 181 if (!access_ok(unsafe_ptr, size))
174 return -EPERM; 182 return -EPERM;
175 183
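
The added nmi_uaccess_okay() test closes a window where bpf_probe_write_user() could run while the CPU is mid task/mm switch (notably with a temporary mm loaded), a state in which access_ok() alone is not a sufficient guard. A hypothetical helper showing where the new check sits relative to the address validation; the in_interrupt()/uaccess_kernel() checks of the surrounding function are elided:

#include <linux/uaccess.h>
#include <asm/tlb.h>

static bool example_probe_write_allowed(void __user *unsafe_ptr, u32 size)
{
        if (unlikely(!nmi_uaccess_okay()))
                return false;           /* task or mm switch in progress */

        return access_ok(unsafe_ptr, size);
}
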
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa79323331b2..b920358dd8f7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,6 +33,7 @@
33#include <linux/list.h> 33#include <linux/list.h>
34#include <linux/hash.h> 34#include <linux/hash.h>
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/kprobes.h>
36 37
37#include <trace/events/sched.h> 38#include <trace/events/sched.h>
38 39
@@ -1992,7 +1993,7 @@ static void print_bug_type(void)
1992 * modifying the code. @failed should be one of either: 1993 * modifying the code. @failed should be one of either:
1993 * EFAULT - if the problem happens on reading the @ip address 1994 * EFAULT - if the problem happens on reading the @ip address
1994 * EINVAL - if what is read at @ip is not what was expected 1995 * EINVAL - if what is read at @ip is not what was expected
1995 * EPERM - if the problem happens on writting to the @ip address 1996 * EPERM - if the problem happens on writing to the @ip address
1996 */ 1997 */
1997void ftrace_bug(int failed, struct dyn_ftrace *rec) 1998void ftrace_bug(int failed, struct dyn_ftrace *rec)
1998{ 1999{
@@ -2391,7 +2392,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
2391 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 2392 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
2392 } 2393 }
2393 2394
2394 return -1; /* unknow ftrace bug */ 2395 return -1; /* unknown ftrace bug */
2395} 2396}
2396 2397
2397void __weak ftrace_replace_code(int mod_flags) 2398void __weak ftrace_replace_code(int mod_flags)
@@ -3004,7 +3005,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
3004 int cnt; 3005 int cnt;
3005 3006
3006 if (!num_to_init) 3007 if (!num_to_init)
3007 return 0; 3008 return NULL;
3008 3009
3009 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); 3010 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
3010 if (!pg) 3011 if (!pg)
@@ -4755,7 +4756,7 @@ static int
4755ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, 4756ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
4756 int reset, int enable) 4757 int reset, int enable)
4757{ 4758{
4758 return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); 4759 return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
4759} 4760}
4760 4761
4761/** 4762/**
@@ -5463,7 +5464,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops,
5463 5464
5464/* 5465/*
5465 * The name "destroy_filter_files" is really a misnomer. Although 5466 * The name "destroy_filter_files" is really a misnomer. Although
5466 * in the future, it may actualy delete the files, but this is 5467 * in the future, it may actually delete the files, but this is
5467 * really intended to make sure the ops passed in are disabled 5468 * really intended to make sure the ops passed in are disabled
5468 * and that when this function returns, the caller is free to 5469 * and that when this function returns, the caller is free to
5469 * free the ops. 5470 * free the ops.
@@ -5786,7 +5787,7 @@ void ftrace_module_enable(struct module *mod)
5786 /* 5787 /*
5787 * If the tracing is enabled, go ahead and enable the record. 5788 * If the tracing is enabled, go ahead and enable the record.
5788 * 5789 *
5789 * The reason not to enable the record immediatelly is the 5790 * The reason not to enable the record immediately is the
5790 * inherent check of ftrace_make_nop/ftrace_make_call for 5791 * inherent check of ftrace_make_nop/ftrace_make_call for
5791 * correct previous instructions. Making first the NOP 5792 * correct previous instructions. Making first the NOP
5792 * conversion puts the module to the correct state, thus 5793 * conversion puts the module to the correct state, thus
@@ -6246,7 +6247,7 @@ void ftrace_reset_array_ops(struct trace_array *tr)
6246 tr->ops->func = ftrace_stub; 6247 tr->ops->func = ftrace_stub;
6247} 6248}
6248 6249
6249static inline void 6250static nokprobe_inline void
6250__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 6251__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
6251 struct ftrace_ops *ignored, struct pt_regs *regs) 6252 struct ftrace_ops *ignored, struct pt_regs *regs)
6252{ 6253{
@@ -6306,11 +6307,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
6306{ 6307{
6307 __ftrace_ops_list_func(ip, parent_ip, NULL, regs); 6308 __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
6308} 6309}
6310NOKPROBE_SYMBOL(ftrace_ops_list_func);
6309#else 6311#else
6310static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) 6312static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
6311{ 6313{
6312 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); 6314 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
6313} 6315}
6316NOKPROBE_SYMBOL(ftrace_ops_no_ops);
6314#endif 6317#endif
6315 6318
6316/* 6319/*
@@ -6337,6 +6340,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
6337 preempt_enable_notrace(); 6340 preempt_enable_notrace();
6338 trace_clear_recursion(bit); 6341 trace_clear_recursion(bit);
6339} 6342}
6343NOKPROBE_SYMBOL(ftrace_ops_assist_func);
6340 6344
6341/** 6345/**
6342 * ftrace_ops_get_func - get the function a trampoline should call 6346 * ftrace_ops_get_func - get the function a trampoline should call
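
The ftrace.c hunks above annotate the ops-list dispatch path with nokprobe_inline and NOKPROBE_SYMBOL() so that a kprobe can never land inside the code kprobes itself relies on. A minimal kernel-style sketch of the same annotation pattern (function names here are hypothetical; kernel-only code, not a standalone program):

#include <linux/kprobes.h>

/* Hypothetical helper that runs inside the tracing fast path. */
static nokprobe_inline void my_trace_dispatch(unsigned long ip,
                                              unsigned long parent_ip)
{
        /* work that must never itself be hit by a kprobe */
}

static void my_trace_entry(unsigned long ip, unsigned long parent_ip)
{
        my_trace_dispatch(ip, parent_ip);
}
/* Tell kprobes to refuse probes placed on this symbol. */
NOKPROBE_SYMBOL(my_trace_entry);
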
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41b6f96e5366..4ee8d8aa3d0f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -762,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
762 762
763 preempt_disable_notrace(); 763 preempt_disable_notrace();
764 time = rb_time_stamp(buffer); 764 time = rb_time_stamp(buffer);
765 preempt_enable_no_resched_notrace(); 765 preempt_enable_notrace();
766 766
767 return time; 767 return time;
768} 768}
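
The single ring_buffer.c change pairs preempt_disable_notrace() with the matching preempt_enable_notrace() instead of the no-resched variant, so reading the time stamp no longer skips a rescheduling point on the way out. A rough kernel-style sketch of the intended pairing (illustrative only):

#include <linux/preempt.h>
#include <linux/types.h>

static u64 read_clock_sample(u64 (*clock)(void))
{
        u64 t;

        preempt_disable_notrace();      /* keep the read on one CPU ...   */
        t = clock();
        preempt_enable_notrace();       /* ... and allow preemption again */

        return t;
}
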
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 21153e64bf1c..ec439999f387 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps;
159#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ 159#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
160 160
161static int tracing_set_tracer(struct trace_array *tr, const char *buf); 161static int tracing_set_tracer(struct trace_array *tr, const char *buf);
162static void ftrace_trace_userstack(struct ring_buffer *buffer,
163 unsigned long flags, int pc);
162 164
163#define MAX_TRACER_SIZE 100 165#define MAX_TRACER_SIZE 100
164static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 166static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -496,8 +498,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
496 * not modified. 498 * not modified.
497 */ 499 */
498 pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); 500 pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
499 if (!pid_list) 501 if (!pid_list) {
502 trace_parser_put(&parser);
500 return -ENOMEM; 503 return -ENOMEM;
504 }
501 505
502 pid_list->pid_max = READ_ONCE(pid_max); 506 pid_list->pid_max = READ_ONCE(pid_max);
503 507
@@ -507,6 +511,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
507 511
508 pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); 512 pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
509 if (!pid_list->pids) { 513 if (!pid_list->pids) {
514 trace_parser_put(&parser);
510 kfree(pid_list); 515 kfree(pid_list);
511 return -ENOMEM; 516 return -ENOMEM;
512 } 517 }
@@ -2749,12 +2754,21 @@ trace_function(struct trace_array *tr,
2749 2754
2750#ifdef CONFIG_STACKTRACE 2755#ifdef CONFIG_STACKTRACE
2751 2756
2752#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) 2757/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
2758#define FTRACE_KSTACK_NESTING 4
2759
2760#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING)
2761
2753struct ftrace_stack { 2762struct ftrace_stack {
2754 unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; 2763 unsigned long calls[FTRACE_KSTACK_ENTRIES];
2755}; 2764};
2756 2765
2757static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); 2766
2767struct ftrace_stacks {
2768 struct ftrace_stack stacks[FTRACE_KSTACK_NESTING];
2769};
2770
2771static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
2758static DEFINE_PER_CPU(int, ftrace_stack_reserve); 2772static DEFINE_PER_CPU(int, ftrace_stack_reserve);
2759 2773
2760static void __ftrace_trace_stack(struct ring_buffer *buffer, 2774static void __ftrace_trace_stack(struct ring_buffer *buffer,
@@ -2763,13 +2777,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
2763{ 2777{
2764 struct trace_event_call *call = &event_kernel_stack; 2778 struct trace_event_call *call = &event_kernel_stack;
2765 struct ring_buffer_event *event; 2779 struct ring_buffer_event *event;
2780 unsigned int size, nr_entries;
2781 struct ftrace_stack *fstack;
2766 struct stack_entry *entry; 2782 struct stack_entry *entry;
2767 struct stack_trace trace; 2783 int stackidx;
2768 int use_stack;
2769 int size = FTRACE_STACK_ENTRIES;
2770
2771 trace.nr_entries = 0;
2772 trace.skip = skip;
2773 2784
2774 /* 2785 /*
2775 * Add one, for this function and the call to save_stack_trace() 2786 * Add one, for this function and the call to save_stack_trace()
@@ -2777,7 +2788,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
2777 */ 2788 */
2778#ifndef CONFIG_UNWINDER_ORC 2789#ifndef CONFIG_UNWINDER_ORC
2779 if (!regs) 2790 if (!regs)
2780 trace.skip++; 2791 skip++;
2781#endif 2792#endif
2782 2793
2783 /* 2794 /*
@@ -2788,53 +2799,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
2788 */ 2799 */
2789 preempt_disable_notrace(); 2800 preempt_disable_notrace();
2790 2801
2791 use_stack = __this_cpu_inc_return(ftrace_stack_reserve); 2802 stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
2803
2804 /* This should never happen. If it does, yell once and skip */
2805 if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
2806 goto out;
2807
2792 /* 2808 /*
2793 * We don't need any atomic variables, just a barrier. 2809 * The above __this_cpu_inc_return() is 'atomic' cpu local. An
2794 * If an interrupt comes in, we don't care, because it would 2810 * interrupt will either see the value pre increment or post
2795 * have exited and put the counter back to what we want. 2811 * increment. If the interrupt happens pre increment it will have
2796 * We just need a barrier to keep gcc from moving things 2812 * restored the counter when it returns. We just need a barrier to
2797 * around. 2813 * keep gcc from moving things around.
2798 */ 2814 */
2799 barrier(); 2815 barrier();
2800 if (use_stack == 1) {
2801 trace.entries = this_cpu_ptr(ftrace_stack.calls);
2802 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
2803 2816
2804 if (regs) 2817 fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx;
2805 save_stack_trace_regs(regs, &trace); 2818 size = ARRAY_SIZE(fstack->calls);
2806 else
2807 save_stack_trace(&trace);
2808
2809 if (trace.nr_entries > size)
2810 size = trace.nr_entries;
2811 } else
2812 /* From now on, use_stack is a boolean */
2813 use_stack = 0;
2814 2819
2815 size *= sizeof(unsigned long); 2820 if (regs) {
2821 nr_entries = stack_trace_save_regs(regs, fstack->calls,
2822 size, skip);
2823 } else {
2824 nr_entries = stack_trace_save(fstack->calls, size, skip);
2825 }
2816 2826
2827 size = nr_entries * sizeof(unsigned long);
2817 event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, 2828 event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
2818 sizeof(*entry) + size, flags, pc); 2829 sizeof(*entry) + size, flags, pc);
2819 if (!event) 2830 if (!event)
2820 goto out; 2831 goto out;
2821 entry = ring_buffer_event_data(event); 2832 entry = ring_buffer_event_data(event);
2822 2833
2823 memset(&entry->caller, 0, size); 2834 memcpy(&entry->caller, fstack->calls, size);
2824 2835 entry->size = nr_entries;
2825 if (use_stack)
2826 memcpy(&entry->caller, trace.entries,
2827 trace.nr_entries * sizeof(unsigned long));
2828 else {
2829 trace.max_entries = FTRACE_STACK_ENTRIES;
2830 trace.entries = entry->caller;
2831 if (regs)
2832 save_stack_trace_regs(regs, &trace);
2833 else
2834 save_stack_trace(&trace);
2835 }
2836
2837 entry->size = trace.nr_entries;
2838 2836
2839 if (!call_filter_check_discard(call, entry, buffer, event)) 2837 if (!call_filter_check_discard(call, entry, buffer, event))
2840 __buffer_unlock_commit(buffer, event); 2838 __buffer_unlock_commit(buffer, event);
@@ -2904,15 +2902,15 @@ void trace_dump_stack(int skip)
2904} 2902}
2905EXPORT_SYMBOL_GPL(trace_dump_stack); 2903EXPORT_SYMBOL_GPL(trace_dump_stack);
2906 2904
2905#ifdef CONFIG_USER_STACKTRACE_SUPPORT
2907static DEFINE_PER_CPU(int, user_stack_count); 2906static DEFINE_PER_CPU(int, user_stack_count);
2908 2907
2909void 2908static void
2910ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 2909ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
2911{ 2910{
2912 struct trace_event_call *call = &event_user_stack; 2911 struct trace_event_call *call = &event_user_stack;
2913 struct ring_buffer_event *event; 2912 struct ring_buffer_event *event;
2914 struct userstack_entry *entry; 2913 struct userstack_entry *entry;
2915 struct stack_trace trace;
2916 2914
2917 if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) 2915 if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
2918 return; 2916 return;
@@ -2943,12 +2941,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
2943 entry->tgid = current->tgid; 2941 entry->tgid = current->tgid;
2944 memset(&entry->caller, 0, sizeof(entry->caller)); 2942 memset(&entry->caller, 0, sizeof(entry->caller));
2945 2943
2946 trace.nr_entries = 0; 2944 stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
2947 trace.max_entries = FTRACE_STACK_ENTRIES;
2948 trace.skip = 0;
2949 trace.entries = entry->caller;
2950
2951 save_stack_trace_user(&trace);
2952 if (!call_filter_check_discard(call, entry, buffer, event)) 2945 if (!call_filter_check_discard(call, entry, buffer, event))
2953 __buffer_unlock_commit(buffer, event); 2946 __buffer_unlock_commit(buffer, event);
2954 2947
@@ -2957,13 +2950,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
2957 out: 2950 out:
2958 preempt_enable(); 2951 preempt_enable();
2959} 2952}
2960 2953#else /* CONFIG_USER_STACKTRACE_SUPPORT */
2961#ifdef UNUSED 2954static void ftrace_trace_userstack(struct ring_buffer *buffer,
2962static void __trace_userstack(struct trace_array *tr, unsigned long flags) 2955 unsigned long flags, int pc)
2963{ 2956{
2964 ftrace_trace_userstack(tr, flags, preempt_count());
2965} 2957}
2966#endif /* UNUSED */ 2958#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
2967 2959
2968#endif /* CONFIG_STACKTRACE */ 2960#endif /* CONFIG_STACKTRACE */
2969 2961
@@ -7025,35 +7017,43 @@ struct buffer_ref {
7025 struct ring_buffer *buffer; 7017 struct ring_buffer *buffer;
7026 void *page; 7018 void *page;
7027 int cpu; 7019 int cpu;
7028 int ref; 7020 refcount_t refcount;
7029}; 7021};
7030 7022
7023static void buffer_ref_release(struct buffer_ref *ref)
7024{
7025 if (!refcount_dec_and_test(&ref->refcount))
7026 return;
7027 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
7028 kfree(ref);
7029}
7030
7031static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, 7031static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
7032 struct pipe_buffer *buf) 7032 struct pipe_buffer *buf)
7033{ 7033{
7034 struct buffer_ref *ref = (struct buffer_ref *)buf->private; 7034 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
7035 7035
7036 if (--ref->ref) 7036 buffer_ref_release(ref);
7037 return;
7038
7039 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
7040 kfree(ref);
7041 buf->private = 0; 7037 buf->private = 0;
7042} 7038}
7043 7039
7044static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, 7040static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
7045 struct pipe_buffer *buf) 7041 struct pipe_buffer *buf)
7046{ 7042{
7047 struct buffer_ref *ref = (struct buffer_ref *)buf->private; 7043 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
7048 7044
7049 ref->ref++; 7045 if (refcount_read(&ref->refcount) > INT_MAX/2)
7046 return false;
7047
7048 refcount_inc(&ref->refcount);
7049 return true;
7050} 7050}
7051 7051
7052/* Pipe buffer operations for a buffer. */ 7052/* Pipe buffer operations for a buffer. */
7053static const struct pipe_buf_operations buffer_pipe_buf_ops = { 7053static const struct pipe_buf_operations buffer_pipe_buf_ops = {
7054 .confirm = generic_pipe_buf_confirm, 7054 .confirm = generic_pipe_buf_confirm,
7055 .release = buffer_pipe_buf_release, 7055 .release = buffer_pipe_buf_release,
7056 .steal = generic_pipe_buf_steal, 7056 .steal = generic_pipe_buf_nosteal,
7057 .get = buffer_pipe_buf_get, 7057 .get = buffer_pipe_buf_get,
7058}; 7058};
7059 7059
@@ -7066,11 +7066,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
7066 struct buffer_ref *ref = 7066 struct buffer_ref *ref =
7067 (struct buffer_ref *)spd->partial[i].private; 7067 (struct buffer_ref *)spd->partial[i].private;
7068 7068
7069 if (--ref->ref) 7069 buffer_ref_release(ref);
7070 return;
7071
7072 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
7073 kfree(ref);
7074 spd->partial[i].private = 0; 7070 spd->partial[i].private = 0;
7075} 7071}
7076 7072
@@ -7125,7 +7121,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
7125 break; 7121 break;
7126 } 7122 }
7127 7123
7128 ref->ref = 1; 7124 refcount_set(&ref->refcount, 1);
7129 ref->buffer = iter->trace_buffer->buffer; 7125 ref->buffer = iter->trace_buffer->buffer;
7130 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); 7126 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
7131 if (IS_ERR(ref->page)) { 7127 if (IS_ERR(ref->page)) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d80cee49e0eb..639047b259d7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -782,17 +782,9 @@ void update_max_tr_single(struct trace_array *tr,
782#endif /* CONFIG_TRACER_MAX_TRACE */ 782#endif /* CONFIG_TRACER_MAX_TRACE */
783 783
784#ifdef CONFIG_STACKTRACE 784#ifdef CONFIG_STACKTRACE
785void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
786 int pc);
787
788void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 785void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
789 int pc); 786 int pc);
790#else 787#else
791static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
792 unsigned long flags, int pc)
793{
794}
795
796static inline void __trace_stack(struct trace_array *tr, unsigned long flags, 788static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
797 int skip, int pc) 789 int skip, int pc)
798{ 790{
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4ad967453b6f..3ea65cdff30d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
205void ftrace_likely_update(struct ftrace_likely_data *f, int val, 205void ftrace_likely_update(struct ftrace_likely_data *f, int val,
206 int expect, int is_constant) 206 int expect, int is_constant)
207{ 207{
208 unsigned long flags = user_access_save();
209
208 /* A constant is always correct */ 210 /* A constant is always correct */
209 if (is_constant) { 211 if (is_constant) {
210 f->constant++; 212 f->constant++;
@@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
223 f->data.correct++; 225 f->data.correct++;
224 else 226 else
225 f->data.incorrect++; 227 f->data.incorrect++;
228
229 user_access_restore(flags);
226} 230}
227EXPORT_SYMBOL(ftrace_likely_update); 231EXPORT_SYMBOL(ftrace_likely_update);
228 232
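
The trace_branch.c change brackets the branch-profiling bookkeeping with user_access_save()/user_access_restore(), so ftrace_likely_update() can run from inside a user_access_begin() region without disturbing the architecture's user-access state. A kernel-style sketch of that save/restore bracket (hypothetical helper, illustrative only):

#include <linux/uaccess.h>

static void count_branch_outcome(unsigned long *hits)
{
        unsigned long flags = user_access_save();       /* stash AC/PAN state */

        (*hits)++;                                      /* instrumented work  */

        user_access_restore(flags);                     /* put it back as-is  */
}
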
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index dd1f43588d70..fa100ed3b4de 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
74static int create_dyn_event(int argc, char **argv) 74static int create_dyn_event(int argc, char **argv)
75{ 75{
76 struct dyn_event_operations *ops; 76 struct dyn_event_operations *ops;
77 int ret; 77 int ret = -ENODEV;
78 78
79 if (argv[0][0] == '-' || argv[0][0] == '!') 79 if (argv[0][0] == '-' || argv[0][0] == '!')
80 return dyn_event_release(argc, argv, NULL); 80 return dyn_event_release(argc, argv, NULL);
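
The trace_dynevent.c fix pre-initializes ret to -ENODEV so create_dyn_event() returns a defined error when no registered operation recognizes the command, instead of whatever happened to be on the stack. The same defensive default in a small self-contained C program (the op table and handlers are invented for illustration):

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct op {
        const char *prefix;
        int (*create)(const char *arg);
};

static int create_probe(const char *arg)
{
        printf("probe: %s\n", arg);
        return 0;
}

static const struct op ops[] = {
        { "p:", create_probe },
};

static int create_event(const char *cmd)
{
        int ret = -ENODEV;      /* default: nobody recognized the command */
        size_t i;

        for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++) {
                if (strncmp(cmd, ops[i].prefix, strlen(ops[i].prefix)) == 0) {
                        ret = ops[i].create(cmd + strlen(ops[i].prefix));
                        break;
                }
        }
        return ret;
}

int main(void)
{
        printf("%d\n", create_event("p:my_event"));     /* 0       */
        printf("%d\n", create_event("x:unknown"));      /* -ENODEV */
        return 0;
}
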
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index ca46339f3009..a1d20421f4b0 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -3713,7 +3713,6 @@ static void track_data_destroy(struct hist_trigger_data *hist_data,
3713 struct trace_event_file *file = hist_data->event_file; 3713 struct trace_event_file *file = hist_data->event_file;
3714 3714
3715 destroy_hist_field(data->track_data.track_var, 0); 3715 destroy_hist_field(data->track_data.track_var, 0);
3716 destroy_hist_field(data->track_data.var_ref, 0);
3717 3716
3718 if (data->action == ACTION_SNAPSHOT) { 3717 if (data->action == ACTION_SNAPSHOT) {
3719 struct track_data *track_data; 3718 struct track_data *track_data;
@@ -5187,7 +5186,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
5187 u64 var_ref_vals[TRACING_MAP_VARS_MAX]; 5186 u64 var_ref_vals[TRACING_MAP_VARS_MAX];
5188 char compound_key[HIST_KEY_SIZE_MAX]; 5187 char compound_key[HIST_KEY_SIZE_MAX];
5189 struct tracing_map_elt *elt = NULL; 5188 struct tracing_map_elt *elt = NULL;
5190 struct stack_trace stacktrace;
5191 struct hist_field *key_field; 5189 struct hist_field *key_field;
5192 u64 field_contents; 5190 u64 field_contents;
5193 void *key = NULL; 5191 void *key = NULL;
@@ -5199,14 +5197,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
5199 key_field = hist_data->fields[i]; 5197 key_field = hist_data->fields[i];
5200 5198
5201 if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { 5199 if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
5202 stacktrace.max_entries = HIST_STACKTRACE_DEPTH; 5200 memset(entries, 0, HIST_STACKTRACE_SIZE);
5203 stacktrace.entries = entries; 5201 stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
5204 stacktrace.nr_entries = 0; 5202 HIST_STACKTRACE_SKIP);
5205 stacktrace.skip = HIST_STACKTRACE_SKIP;
5206
5207 memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE);
5208 save_stack_trace(&stacktrace);
5209
5210 key = entries; 5203 key = entries;
5211 } else { 5204 } else {
5212 field_contents = key_field->fn(key_field, elt, rbe, rec); 5205 field_contents = key_field->fn(key_field, elt, rbe, rec);
@@ -5247,7 +5240,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
5247 unsigned int i; 5240 unsigned int i;
5248 5241
5249 for (i = 0; i < max_entries; i++) { 5242 for (i = 0; i < max_entries; i++) {
5250 if (stacktrace_entries[i] == ULONG_MAX) 5243 if (!stacktrace_entries[i])
5251 return; 5244 return;
5252 5245
5253 seq_printf(m, "%*c", 1 + spaces, ' '); 5246 seq_printf(m, "%*c", 1 + spaces, ' ');
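
Both trace_events_hist.c hunks switch from the old struct stack_trace interface to stack_trace_save(), which takes the destination array, a maximum depth and a skip count and returns how many entries it stored; the hist code zeroes the buffer up front, so unused slots read back as zero rather than as a ULONG_MAX terminator. A kernel-style sketch of the new call shape (depth and skip values are made up):

#include <linux/printk.h>
#include <linux/stacktrace.h>

#define MY_STACK_DEPTH  16

static void record_call_site(void)
{
        unsigned long entries[MY_STACK_DEPTH] = { 0 };
        unsigned int nr;

        /* Capture up to MY_STACK_DEPTH return addresses, skipping this frame. */
        nr = stack_trace_save(entries, MY_STACK_DEPTH, 1);

        /* Only the first 'nr' slots are valid; the rest stay zero. */
        pr_info("captured %u stack entries\n", nr);
}
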
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index eec648a0d673..5d16f73898db 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,44 +18,32 @@
18 18
19#include "trace.h" 19#include "trace.h"
20 20
21static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = 21#define STACK_TRACE_ENTRIES 500
22 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
23unsigned stack_trace_index[STACK_TRACE_ENTRIES];
24 22
25/* 23static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
26 * Reserve one entry for the passed in ip. This will allow 24static unsigned stack_trace_index[STACK_TRACE_ENTRIES];
27 * us to remove most or all of the stack size overhead
28 * added by the stack tracer itself.
29 */
30struct stack_trace stack_trace_max = {
31 .max_entries = STACK_TRACE_ENTRIES - 1,
32 .entries = &stack_dump_trace[0],
33};
34 25
35unsigned long stack_trace_max_size; 26static unsigned int stack_trace_nr_entries;
36arch_spinlock_t stack_trace_max_lock = 27static unsigned long stack_trace_max_size;
28static arch_spinlock_t stack_trace_max_lock =
37 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 29 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
38 30
39DEFINE_PER_CPU(int, disable_stack_tracer); 31DEFINE_PER_CPU(int, disable_stack_tracer);
40static DEFINE_MUTEX(stack_sysctl_mutex); 32static DEFINE_MUTEX(stack_sysctl_mutex);
41 33
42int stack_tracer_enabled; 34int stack_tracer_enabled;
43static int last_stack_tracer_enabled;
44 35
45void stack_trace_print(void) 36static void print_max_stack(void)
46{ 37{
47 long i; 38 long i;
48 int size; 39 int size;
49 40
50 pr_emerg(" Depth Size Location (%d entries)\n" 41 pr_emerg(" Depth Size Location (%d entries)\n"
51 " ----- ---- --------\n", 42 " ----- ---- --------\n",
52 stack_trace_max.nr_entries); 43 stack_trace_nr_entries);
53 44
54 for (i = 0; i < stack_trace_max.nr_entries; i++) { 45 for (i = 0; i < stack_trace_nr_entries; i++) {
55 if (stack_dump_trace[i] == ULONG_MAX) 46 if (i + 1 == stack_trace_nr_entries)
56 break;
57 if (i+1 == stack_trace_max.nr_entries ||
58 stack_dump_trace[i+1] == ULONG_MAX)
59 size = stack_trace_index[i]; 47 size = stack_trace_index[i];
60 else 48 else
61 size = stack_trace_index[i] - stack_trace_index[i+1]; 49 size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -65,16 +53,7 @@ void stack_trace_print(void)
65 } 53 }
66} 54}
67 55
68/* 56static void check_stack(unsigned long ip, unsigned long *stack)
69 * When arch-specific code overrides this function, the following
70 * data should be filled up, assuming stack_trace_max_lock is held to
71 * prevent concurrent updates.
72 * stack_trace_index[]
73 * stack_trace_max
74 * stack_trace_max_size
75 */
76void __weak
77check_stack(unsigned long ip, unsigned long *stack)
78{ 57{
79 unsigned long this_size, flags; unsigned long *p, *top, *start; 58 unsigned long this_size, flags; unsigned long *p, *top, *start;
80 static int tracer_frame; 59 static int tracer_frame;
@@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack)
110 89
111 stack_trace_max_size = this_size; 90 stack_trace_max_size = this_size;
112 91
113 stack_trace_max.nr_entries = 0; 92 stack_trace_nr_entries = stack_trace_save(stack_dump_trace,
114 stack_trace_max.skip = 0; 93 ARRAY_SIZE(stack_dump_trace) - 1,
115 94 0);
116 save_stack_trace(&stack_trace_max);
117 95
118 /* Skip over the overhead of the stack tracer itself */ 96 /* Skip over the overhead of the stack tracer itself */
119 for (i = 0; i < stack_trace_max.nr_entries; i++) { 97 for (i = 0; i < stack_trace_nr_entries; i++) {
120 if (stack_dump_trace[i] == ip) 98 if (stack_dump_trace[i] == ip)
121 break; 99 break;
122 } 100 }
@@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack)
125 * Some archs may not have the passed in ip in the dump. 103 * Some archs may not have the passed in ip in the dump.
126 * If that happens, we need to show everything. 104 * If that happens, we need to show everything.
127 */ 105 */
128 if (i == stack_trace_max.nr_entries) 106 if (i == stack_trace_nr_entries)
129 i = 0; 107 i = 0;
130 108
131 /* 109 /*
@@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack)
143 * loop will only happen once. This code only takes place 121 * loop will only happen once. This code only takes place
144 * on a new max, so it is far from a fast path. 122 * on a new max, so it is far from a fast path.
145 */ 123 */
146 while (i < stack_trace_max.nr_entries) { 124 while (i < stack_trace_nr_entries) {
147 int found = 0; 125 int found = 0;
148 126
149 stack_trace_index[x] = this_size; 127 stack_trace_index[x] = this_size;
150 p = start; 128 p = start;
151 129
152 for (; p < top && i < stack_trace_max.nr_entries; p++) { 130 for (; p < top && i < stack_trace_nr_entries; p++) {
153 if (stack_dump_trace[i] == ULONG_MAX)
154 break;
155 /* 131 /*
156 * The READ_ONCE_NOCHECK is used to let KASAN know that 132 * The READ_ONCE_NOCHECK is used to let KASAN know that
157 * this is not a stack-out-of-bounds error. 133 * this is not a stack-out-of-bounds error.
@@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack)
182 i++; 158 i++;
183 } 159 }
184 160
185 stack_trace_max.nr_entries = x; 161 stack_trace_nr_entries = x;
186 for (; x < i; x++)
187 stack_dump_trace[x] = ULONG_MAX;
188 162
189 if (task_stack_end_corrupted(current)) { 163 if (task_stack_end_corrupted(current)) {
190 stack_trace_print(); 164 print_max_stack();
191 BUG(); 165 BUG();
192 } 166 }
193 167
@@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos)
286{ 260{
287 long n = *pos - 1; 261 long n = *pos - 1;
288 262
289 if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) 263 if (n >= stack_trace_nr_entries)
290 return NULL; 264 return NULL;
291 265
292 m->private = (void *)n; 266 m->private = (void *)n;
@@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v)
350 seq_printf(m, " Depth Size Location" 324 seq_printf(m, " Depth Size Location"
351 " (%d entries)\n" 325 " (%d entries)\n"
352 " ----- ---- --------\n", 326 " ----- ---- --------\n",
353 stack_trace_max.nr_entries); 327 stack_trace_nr_entries);
354 328
355 if (!stack_tracer_enabled && !stack_trace_max_size) 329 if (!stack_tracer_enabled && !stack_trace_max_size)
356 print_disabled(m); 330 print_disabled(m);
@@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v)
360 334
361 i = *(long *)v; 335 i = *(long *)v;
362 336
363 if (i >= stack_trace_max.nr_entries || 337 if (i >= stack_trace_nr_entries)
364 stack_dump_trace[i] == ULONG_MAX)
365 return 0; 338 return 0;
366 339
367 if (i+1 == stack_trace_max.nr_entries || 340 if (i + 1 == stack_trace_nr_entries)
368 stack_dump_trace[i+1] == ULONG_MAX)
369 size = stack_trace_index[i]; 341 size = stack_trace_index[i];
370 else 342 else
371 size = stack_trace_index[i] - stack_trace_index[i+1]; 343 size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write,
422 void __user *buffer, size_t *lenp, 394 void __user *buffer, size_t *lenp,
423 loff_t *ppos) 395 loff_t *ppos)
424{ 396{
397 int was_enabled;
425 int ret; 398 int ret;
426 399
427 mutex_lock(&stack_sysctl_mutex); 400 mutex_lock(&stack_sysctl_mutex);
401 was_enabled = !!stack_tracer_enabled;
428 402
429 ret = proc_dointvec(table, write, buffer, lenp, ppos); 403 ret = proc_dointvec(table, write, buffer, lenp, ppos);
430 404
431 if (ret || !write || 405 if (ret || !write || (was_enabled == !!stack_tracer_enabled))
432 (last_stack_tracer_enabled == !!stack_tracer_enabled))
433 goto out; 406 goto out;
434 407
435 last_stack_tracer_enabled = !!stack_tracer_enabled;
436
437 if (stack_tracer_enabled) 408 if (stack_tracer_enabled)
438 register_ftrace_function(&trace_ops); 409 register_ftrace_function(&trace_ops);
439 else 410 else
440 unregister_ftrace_function(&trace_ops); 411 unregister_ftrace_function(&trace_ops);
441
442 out: 412 out:
443 mutex_unlock(&stack_sysctl_mutex); 413 mutex_unlock(&stack_sysctl_mutex);
444 return ret; 414 return ret;
@@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str)
454 strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); 424 strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
455 425
456 stack_tracer_enabled = 1; 426 stack_tracer_enabled = 1;
457 last_stack_tracer_enabled = 1;
458 return 1; 427 return 1;
459} 428}
460__setup("stacktrace", enable_stacktrace); 429__setup("stacktrace", enable_stacktrace);
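
The rewritten stack_trace_sysctl() snapshots the previous enable state and only registers or unregisters the ftrace_ops when the value actually flips, which lets the separate last_stack_tracer_enabled shadow variable go away. A small self-contained C analog of the act-only-on-transition pattern (the register/unregister hooks are stand-ins):

#include <stdbool.h>
#include <stdio.h>

static int tracer_enabled;

static void register_tracer(void)   { puts("register"); }
static void unregister_tracer(void) { puts("unregister"); }

/* Apply a new setting, acting only when the boolean state really changes. */
static void set_tracer_enabled(int val)
{
        bool was_enabled = !!tracer_enabled;

        tracer_enabled = val;
        if (was_enabled == !!tracer_enabled)
                return;                 /* no transition: nothing to do */

        if (tracer_enabled)
                register_tracer();
        else
                unregister_tracer();
}

int main(void)
{
        set_tracer_enabled(1);  /* register   */
        set_tracer_enabled(5);  /* no change  */
        set_tracer_enabled(0);  /* unregister */
        return 0;
}
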
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f93a56d2db27..fa8fbff736d6 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
314 struct ring_buffer_event *event; 314 struct ring_buffer_event *event;
315 struct ring_buffer *buffer; 315 struct ring_buffer *buffer;
316 unsigned long irq_flags; 316 unsigned long irq_flags;
317 unsigned long args[6];
317 int pc; 318 int pc;
318 int syscall_nr; 319 int syscall_nr;
319 int size; 320 int size;
@@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
347 348
348 entry = ring_buffer_event_data(event); 349 entry = ring_buffer_event_data(event);
349 entry->nr = syscall_nr; 350 entry->nr = syscall_nr;
350 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 351 syscall_get_arguments(current, regs, args);
352 memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
351 353
352 event_trigger_unlock_commit(trace_file, buffer, event, entry, 354 event_trigger_unlock_commit(trace_file, buffer, event, entry,
353 irq_flags, pc); 355 irq_flags, pc);
@@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
583 struct syscall_metadata *sys_data; 585 struct syscall_metadata *sys_data;
584 struct syscall_trace_enter *rec; 586 struct syscall_trace_enter *rec;
585 struct hlist_head *head; 587 struct hlist_head *head;
588 unsigned long args[6];
586 bool valid_prog_array; 589 bool valid_prog_array;
587 int syscall_nr; 590 int syscall_nr;
588 int rctx; 591 int rctx;
@@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
613 return; 616 return;
614 617
615 rec->nr = syscall_nr; 618 rec->nr = syscall_nr;
616 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 619 syscall_get_arguments(current, regs, args);
617 (unsigned long *)&rec->args); 620 memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
618 621
619 if ((valid_prog_array && 622 if ((valid_prog_array &&
620 !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || 623 !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
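
With the new calling convention, syscall_get_arguments() always fills a six-entry scratch array and the tracer then copies only the nb_args that the syscall actually declares into its record. A tiny self-contained C illustration of that fetch-all-then-copy-subset step (values and sizes are placeholders):

#include <stdio.h>
#include <string.h>

#define MAX_SYSCALL_ARGS 6

/* Stand-in for syscall_get_arguments(): always fills all six slots. */
static void get_all_args(unsigned long args[MAX_SYSCALL_ARGS])
{
        unsigned long i;

        for (i = 0; i < MAX_SYSCALL_ARGS; i++)
                args[i] = 100 + i;
}

struct enter_record {
        int nr;
        unsigned long args[MAX_SYSCALL_ARGS];
};

int main(void)
{
        unsigned long args[MAX_SYSCALL_ARGS];
        struct enter_record rec = { .nr = 42 };
        unsigned int nb_args = 3;       /* args this syscall declares */

        get_all_args(args);
        /* Copy only the declared arguments into the trace record. */
        memcpy(rec.args, args, sizeof(unsigned long) * nb_args);

        printf("nr=%d arg0=%lu arg2=%lu\n", rec.nr, rec.args[0], rec.args[2]);
        return 0;
}
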
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8fbfda94a67b..7f9e7b9306fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -42,9 +42,9 @@ int __read_mostly watchdog_user_enabled = 1;
42int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; 42int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
43int __read_mostly soft_watchdog_user_enabled = 1; 43int __read_mostly soft_watchdog_user_enabled = 1;
44int __read_mostly watchdog_thresh = 10; 44int __read_mostly watchdog_thresh = 10;
45int __read_mostly nmi_watchdog_available; 45static int __read_mostly nmi_watchdog_available;
46 46
47struct cpumask watchdog_allowed_mask __read_mostly; 47static struct cpumask watchdog_allowed_mask __read_mostly;
48 48
49struct cpumask watchdog_cpumask __read_mostly; 49struct cpumask watchdog_cpumask __read_mostly;
50unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); 50unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -554,13 +554,15 @@ static void softlockup_start_all(void)
554 554
555int lockup_detector_online_cpu(unsigned int cpu) 555int lockup_detector_online_cpu(unsigned int cpu)
556{ 556{
557 watchdog_enable(cpu); 557 if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
558 watchdog_enable(cpu);
558 return 0; 559 return 0;
559} 560}
560 561
561int lockup_detector_offline_cpu(unsigned int cpu) 562int lockup_detector_offline_cpu(unsigned int cpu)
562{ 563{
563 watchdog_disable(cpu); 564 if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
565 watchdog_disable(cpu);
564 return 0; 566 return 0;
565} 567}
566 568
@@ -588,7 +590,7 @@ static void lockup_detector_reconfigure(void)
588 * Create the watchdog thread infrastructure and configure the detector(s). 590 * Create the watchdog thread infrastructure and configure the detector(s).
589 * 591 *
590 * The threads are not unparked as watchdog_allowed_mask is empty. When 592 * The threads are not unparked as watchdog_allowed_mask is empty. When
591 * the threads are sucessfully initialized, take the proper locks and 593 * the threads are successfully initialized, take the proper locks and
592 * unpark the threads in the watchdog_cpumask if the watchdog is enabled. 594 * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
593 */ 595 */
594static __init void lockup_detector_setup(void) 596static __init void lockup_detector_setup(void)
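
The watchdog.c hotplug callbacks now check watchdog_allowed_mask before enabling or disabling the per-CPU watchdog, so a CPU outside the configured mask is left untouched as it goes on or offline. A rough kernel-style sketch of that membership guard (all names hypothetical):

#include <linux/cpumask.h>

static struct cpumask my_allowed_mask;

static void my_percpu_enable(unsigned int cpu)  { /* arm the per-CPU timer */ }
static void my_percpu_disable(unsigned int cpu) { /* disarm it            */ }

static int my_online_cpu(unsigned int cpu)
{
        /* Only act on CPUs the administrator actually selected. */
        if (cpumask_test_cpu(cpu, &my_allowed_mask))
                my_percpu_enable(cpu);
        return 0;
}

static int my_offline_cpu(unsigned int cpu)
{
        if (cpumask_test_cpu(cpu, &my_allowed_mask))
                my_percpu_disable(cpu);
        return 0;
}
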
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 71381168dede..247bf0b1582c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
135 if (__this_cpu_read(hard_watchdog_warn) == true) 135 if (__this_cpu_read(hard_watchdog_warn) == true)
136 return; 136 return;
137 137
138 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); 138 pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
139 this_cpu);
139 print_modules(); 140 print_modules();
140 print_irqtrace_events(current); 141 print_irqtrace_events(current);
141 if (regs) 142 if (regs)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4026d1871407..faf7622246da 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -841,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool)
841} 841}
842 842
843/** 843/**
844 * wq_worker_waking_up - a worker is waking up 844 * wq_worker_running - a worker is running again
845 * @task: task waking up 845 * @task: task waking up
846 * @cpu: CPU @task is waking up to
847 * 846 *
848 * This function is called during try_to_wake_up() when a worker is 847 * This function is called when a worker returns from schedule()
849 * being awoken.
850 *
851 * CONTEXT:
852 * spin_lock_irq(rq->lock)
853 */ 848 */
854void wq_worker_waking_up(struct task_struct *task, int cpu) 849void wq_worker_running(struct task_struct *task)
855{ 850{
856 struct worker *worker = kthread_data(task); 851 struct worker *worker = kthread_data(task);
857 852
858 if (!(worker->flags & WORKER_NOT_RUNNING)) { 853 if (!worker->sleeping)
859 WARN_ON_ONCE(worker->pool->cpu != cpu); 854 return;
855 if (!(worker->flags & WORKER_NOT_RUNNING))
860 atomic_inc(&worker->pool->nr_running); 856 atomic_inc(&worker->pool->nr_running);
861 } 857 worker->sleeping = 0;
862} 858}
863 859
864/** 860/**
865 * wq_worker_sleeping - a worker is going to sleep 861 * wq_worker_sleeping - a worker is going to sleep
866 * @task: task going to sleep 862 * @task: task going to sleep
867 * 863 *
868 * This function is called during schedule() when a busy worker is 864 * This function is called from schedule() when a busy worker is
869 * going to sleep. Worker on the same cpu can be woken up by 865 * going to sleep.
870 * returning pointer to its task.
871 *
872 * CONTEXT:
873 * spin_lock_irq(rq->lock)
874 *
875 * Return:
876 * Worker task on @cpu to wake up, %NULL if none.
877 */ 866 */
878struct task_struct *wq_worker_sleeping(struct task_struct *task) 867void wq_worker_sleeping(struct task_struct *task)
879{ 868{
880 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 869 struct worker *next, *worker = kthread_data(task);
881 struct worker_pool *pool; 870 struct worker_pool *pool;
882 871
883 /* 872 /*
@@ -886,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
886 * checking NOT_RUNNING. 875 * checking NOT_RUNNING.
887 */ 876 */
888 if (worker->flags & WORKER_NOT_RUNNING) 877 if (worker->flags & WORKER_NOT_RUNNING)
889 return NULL; 878 return;
890 879
891 pool = worker->pool; 880 pool = worker->pool;
892 881
893 /* this can only happen on the local cpu */ 882 if (WARN_ON_ONCE(worker->sleeping))
894 if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) 883 return;
895 return NULL; 884
885 worker->sleeping = 1;
886 spin_lock_irq(&pool->lock);
896 887
897 /* 888 /*
898 * The counterpart of the following dec_and_test, implied mb, 889 * The counterpart of the following dec_and_test, implied mb,
@@ -906,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
906 * lock is safe. 897 * lock is safe.
907 */ 898 */
908 if (atomic_dec_and_test(&pool->nr_running) && 899 if (atomic_dec_and_test(&pool->nr_running) &&
909 !list_empty(&pool->worklist)) 900 !list_empty(&pool->worklist)) {
910 to_wakeup = first_idle_worker(pool); 901 next = first_idle_worker(pool);
911 return to_wakeup ? to_wakeup->task : NULL; 902 if (next)
903 wake_up_process(next->task);
904 }
905 spin_unlock_irq(&pool->lock);
912} 906}
913 907
914/** 908/**
@@ -2277,7 +2271,7 @@ __acquires(&pool->lock)
2277 2271
2278 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2272 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2279 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 2273 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2280 " last function: %pf\n", 2274 " last function: %ps\n",
2281 current->comm, preempt_count(), task_pid_nr(current), 2275 current->comm, preempt_count(), task_pid_nr(current),
2282 worker->current_func); 2276 worker->current_func);
2283 debug_show_held_locks(current); 2277 debug_show_held_locks(current);
@@ -2596,11 +2590,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,
2596 worker = current_wq_worker(); 2590 worker = current_wq_worker();
2597 2591
2598 WARN_ONCE(current->flags & PF_MEMALLOC, 2592 WARN_ONCE(current->flags & PF_MEMALLOC,
2599 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", 2593 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
2600 current->pid, current->comm, target_wq->name, target_func); 2594 current->pid, current->comm, target_wq->name, target_func);
2601 WARN_ONCE(worker && ((worker->current_pwq->wq->flags & 2595 WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
2602 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), 2596 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
2603 "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", 2597 "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
2604 worker->current_pwq->wq->name, worker->current_func, 2598 worker->current_pwq->wq->name, worker->current_func,
2605 target_wq->name, target_func); 2599 target_wq->name, target_func);
2606} 2600}
@@ -4266,7 +4260,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
4266 INIT_LIST_HEAD(&wq->list); 4260 INIT_LIST_HEAD(&wq->list);
4267 4261
4268 if (alloc_and_link_pwqs(wq) < 0) 4262 if (alloc_and_link_pwqs(wq) < 0)
4269 goto err_free_wq; 4263 goto err_unreg_lockdep;
4270 4264
4271 if (wq_online && init_rescuer(wq) < 0) 4265 if (wq_online && init_rescuer(wq) < 0)
4272 goto err_destroy; 4266 goto err_destroy;
@@ -4292,9 +4286,10 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
4292 4286
4293 return wq; 4287 return wq;
4294 4288
4295err_free_wq: 4289err_unreg_lockdep:
4296 wq_unregister_lockdep(wq); 4290 wq_unregister_lockdep(wq);
4297 wq_free_lockdep(wq); 4291 wq_free_lockdep(wq);
4292err_free_wq:
4298 free_workqueue_attrs(wq->unbound_attrs); 4293 free_workqueue_attrs(wq->unbound_attrs);
4299 kfree(wq); 4294 kfree(wq);
4300 return NULL; 4295 return NULL;
@@ -4586,7 +4581,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
4586 probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); 4581 probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
4587 4582
4588 if (fn || name[0] || desc[0]) { 4583 if (fn || name[0] || desc[0]) {
4589 printk("%sWorkqueue: %s %pf", log_lvl, name, fn); 4584 printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
4590 if (strcmp(name, desc)) 4585 if (strcmp(name, desc))
4591 pr_cont(" (%s)", desc); 4586 pr_cont(" (%s)", desc);
4592 pr_cont("\n"); 4587 pr_cont("\n");
@@ -4611,7 +4606,7 @@ static void pr_cont_work(bool comma, struct work_struct *work)
4611 pr_cont("%s BAR(%d)", comma ? "," : "", 4606 pr_cont("%s BAR(%d)", comma ? "," : "",
4612 task_pid_nr(barr->task)); 4607 task_pid_nr(barr->task));
4613 } else { 4608 } else {
4614 pr_cont("%s %pf", comma ? "," : "", work->func); 4609 pr_cont("%s %ps", comma ? "," : "", work->func);
4615 } 4610 }
4616} 4611}
4617 4612
@@ -4643,7 +4638,7 @@ static void show_pwq(struct pool_workqueue *pwq)
4643 if (worker->current_pwq != pwq) 4638 if (worker->current_pwq != pwq)
4644 continue; 4639 continue;
4645 4640
4646 pr_cont("%s %d%s:%pf", comma ? "," : "", 4641 pr_cont("%s %d%s:%ps", comma ? "," : "",
4647 task_pid_nr(worker->task), 4642 task_pid_nr(worker->task),
4648 worker == pwq->wq->rescuer ? "(RESCUER)" : "", 4643 worker == pwq->wq->rescuer ? "(RESCUER)" : "",
4649 worker->current_func); 4644 worker->current_func);
@@ -4928,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool)
4928 * 4923 *
4929 * WRITE_ONCE() is necessary because @worker->flags may be 4924 * WRITE_ONCE() is necessary because @worker->flags may be
4930 * tested without holding any lock in 4925 * tested without holding any lock in
4931 * wq_worker_waking_up(). Without it, NOT_RUNNING test may 4926 * wq_worker_running(). Without it, NOT_RUNNING test may
4932 * fail incorrectly leading to premature concurrency 4927 * fail incorrectly leading to premature concurrency
4933 * management operations. 4928 * management operations.
4934 */ 4929 */
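
The workqueue.c rework drops the wq_worker_waking_up()/wq_worker_sleeping() pair that handed a task pointer back to the scheduler and replaces it with wq_worker_running()/wq_worker_sleeping(): a per-worker sleeping flag keeps the nr_running accounting balanced across repeated calls, and the idle-worker wakeup now happens inside wq_worker_sleeping() itself under pool->lock. A tiny self-contained C analog of the balanced-flag idea (names invented for illustration):

#include <stdio.h>

struct worker {
        int sleeping;           /* 1 while we have given our slot back */
};

static int nr_running = 1;      /* this worker currently counts as running */

/* Called when the worker blocks: give the concurrency slot back once. */
static void worker_sleeping(struct worker *w)
{
        if (w->sleeping)
                return;         /* already accounted for this sleep */
        w->sleeping = 1;
        nr_running--;
}

/* Called when the worker resumes: reclaim the slot exactly once. */
static void worker_running(struct worker *w)
{
        if (!w->sleeping)
                return;
        nr_running++;
        w->sleeping = 0;
}

int main(void)
{
        struct worker w = { 0 };

        worker_sleeping(&w);
        worker_sleeping(&w);    /* harmless: no double decrement */
        worker_running(&w);
        worker_running(&w);     /* harmless: no double increment */

        printf("nr_running=%d\n", nr_running);  /* 1 */
        return 0;
}
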
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index cb68b03ca89a..498de0e909a4 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -44,6 +44,7 @@ struct worker {
44 unsigned long last_active; /* L: last active timestamp */ 44 unsigned long last_active; /* L: last active timestamp */
45 unsigned int flags; /* X: flags */ 45 unsigned int flags; /* X: flags */
46 int id; /* I: worker id */ 46 int id; /* I: worker id */
47 int sleeping; /* None */
47 48
48 /* 49 /*
49 * Opaque string set with work_set_desc(). Printed out with task 50 * Opaque string set with work_set_desc(). Printed out with task
@@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void)
72 * Scheduler hooks for concurrency managed workqueue. Only to be used from 73 * Scheduler hooks for concurrency managed workqueue. Only to be used from
73 * sched/ and workqueue.c. 74 * sched/ and workqueue.c.
74 */ 75 */
75void wq_worker_waking_up(struct task_struct *task, int cpu); 76void wq_worker_running(struct task_struct *task);
76struct task_struct *wq_worker_sleeping(struct task_struct *task); 77void wq_worker_sleeping(struct task_struct *task);
77work_func_t wq_worker_last_func(struct task_struct *task); 78work_func_t wq_worker_last_func(struct task_struct *task);
78 79
79#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ 80#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */