aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.freezer1
-rw-r--r--kernel/Kconfig.hz1
-rw-r--r--kernel/Kconfig.locks1
-rw-r--r--kernel/Kconfig.preempt1
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/async.c6
-rw-r--r--kernel/audit.c42
-rw-r--r--kernel/audit.h23
-rw-r--r--kernel/audit_fsnotify.c11
-rw-r--r--kernel/audit_watch.c15
-rw-r--r--kernel/auditfilter.c77
-rw-r--r--kernel/auditsc.c42
-rw-r--r--kernel/backtracetest.c6
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/arraymap.c28
-rw-r--r--kernel/bpf/bpf_lru_list.c5
-rw-r--r--kernel/bpf/bpf_lru_list.h5
-rw-r--r--kernel/bpf/btf.c12
-rw-r--r--kernel/bpf/cgroup.c453
-rw-r--r--kernel/bpf/core.c67
-rw-r--r--kernel/bpf/cpumap.c119
-rw-r--r--kernel/bpf/devmap.c142
-rw-r--r--kernel/bpf/disasm.c10
-rw-r--r--kernel/bpf/disasm.h10
-rw-r--r--kernel/bpf/hashtab.c47
-rw-r--r--kernel/bpf/helpers.c10
-rw-r--r--kernel/bpf/inode.c7
-rw-r--r--kernel/bpf/local_storage.c13
-rw-r--r--kernel/bpf/lpm_trie.c22
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/map_in_map.h5
-rw-r--r--kernel/bpf/percpu_freelist.c5
-rw-r--r--kernel/bpf/percpu_freelist.h5
-rw-r--r--kernel/bpf/queue_stack_maps.c13
-rw-r--r--kernel/bpf/reuseport_array.c17
-rw-r--r--kernel/bpf/stackmap.c33
-rw-r--r--kernel/bpf/syscall.c147
-rw-r--r--kernel/bpf/tnum.c1
-rw-r--r--kernel/bpf/verifier.c1305
-rw-r--r--kernel/bpf/xskmap.c22
-rw-r--r--kernel/cgroup/cgroup-v1.c1
-rw-r--r--kernel/cgroup/cgroup.c269
-rw-r--r--kernel/cgroup/cpuset.c19
-rw-r--r--kernel/cgroup/pids.c5
-rw-r--r--kernel/cgroup/rdma.c5
-rw-r--r--kernel/cgroup/rstat.c1
-rw-r--r--kernel/compat.c8
-rw-r--r--kernel/context_tracking.c1
-rw-r--r--kernel/cpu.c19
-rw-r--r--kernel/cpu_pm.c11
-rw-r--r--kernel/crash_core.c4
-rw-r--r--kernel/crash_dump.c1
-rw-r--r--kernel/cred.c28
-rw-r--r--kernel/debug/Makefile1
-rw-r--r--kernel/debug/gdbstub.c9
-rw-r--r--kernel/debug/kdb/Makefile1
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/delayacct.c11
-rw-r--r--kernel/dma/Kconfig1
-rw-r--r--kernel/dma/contiguous.c56
-rw-r--r--kernel/dma/debug.c14
-rw-r--r--kernel/dma/direct.c55
-rw-r--r--kernel/dma/mapping.c12
-rw-r--r--kernel/dma/remap.c16
-rw-r--r--kernel/dma/swiotlb.c26
-rw-r--r--kernel/events/core.c79
-rw-r--r--kernel/events/internal.h4
-rw-r--r--kernel/events/ring_buffer.c64
-rw-r--r--kernel/events/uprobes.c8
-rw-r--r--kernel/exit.c9
-rw-r--r--kernel/extable.c14
-rw-r--r--kernel/fail_function.c23
-rw-r--r--kernel/fork.c315
-rw-r--r--kernel/freezer.c1
-rw-r--r--kernel/futex.c84
-rw-r--r--kernel/gcov/Kconfig4
-rw-r--r--kernel/gcov/Makefile5
-rw-r--r--kernel/gcov/base.c86
-rw-r--r--kernel/gcov/clang.c581
-rw-r--r--kernel/gcov/fs.c24
-rw-r--r--kernel/gcov/gcc_3_4.c12
-rw-r--r--kernel/gcov/gcc_4_7.c12
-rw-r--r--kernel/gcov/gcc_base.c86
-rw-r--r--kernel/gcov/gcov.h5
-rwxr-xr-xkernel/gen_kheaders.sh (renamed from kernel/gen_ikh_data.sh)56
-rw-r--r--kernel/hung_task.c1
-rw-r--r--kernel/iomem.c2
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/affinity.c12
-rw-r--r--kernel/irq/autoprobe.c6
-rw-r--r--kernel/irq/chip.c37
-rw-r--r--kernel/irq/cpuhotplug.c2
-rw-r--r--kernel/irq/internals.h26
-rw-r--r--kernel/irq/irqdesc.c16
-rw-r--r--kernel/irq/irqdomain.c6
-rw-r--r--kernel/irq/manage.c90
-rw-r--r--kernel/irq/timings.c453
-rw-r--r--kernel/irq_work.c1
-rw-r--r--kernel/jump_label.c65
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c13
-rw-r--r--kernel/kheaders.c40
-rw-r--r--kernel/kprobes.c40
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/latencytop.c14
-rw-r--r--kernel/livepatch/Kconfig1
-rw-r--r--kernel/livepatch/Makefile1
-rw-r--r--kernel/livepatch/core.c28
-rw-r--r--kernel/livepatch/patch.c14
-rw-r--r--kernel/livepatch/shadow.c14
-rw-r--r--kernel/livepatch/transition.c25
-rw-r--r--kernel/locking/Makefile2
-rw-r--r--kernel/locking/lock_events.h7
-rw-r--r--kernel/locking/lock_events_list.h12
-rw-r--r--kernel/locking/lockdep.c743
-rw-r--r--kernel/locking/lockdep_internals.h36
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/mutex.c1
-rw-r--r--kernel/locking/percpu-rwsem.c3
-rw-r--r--kernel/locking/qrwlock.c11
-rw-r--r--kernel/locking/qspinlock.c11
-rw-r--r--kernel/locking/qspinlock_stat.h10
-rw-r--r--kernel/locking/rtmutex.c1
-rw-r--r--kernel/locking/rwsem-xadd.c729
-rw-r--r--kernel/locking/rwsem.c1453
-rw-r--r--kernel/locking/rwsem.h306
-rw-r--r--kernel/locking/semaphore.c3
-rw-r--r--kernel/locking/test-ww_mutex.c15
-rw-r--r--kernel/memremap.c23
-rw-r--r--kernel/module-internal.h8
-rw-r--r--kernel/module.c40
-rw-r--r--kernel/module_signing.c6
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/nsproxy.c6
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/params.c14
-rw-r--r--kernel/pid.c73
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/energy_model.c2
-rw-r--r--kernel/power/hibernate.c16
-rw-r--r--kernel/power/main.c4
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/poweroff.c3
-rw-r--r--kernel/power/qos.c1
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/suspend.c16
-rw-r--r--kernel/power/suspend_test.c3
-rw-r--r--kernel/power/swap.c7
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h14
-rw-r--r--kernel/printk/printk.c23
-rw-r--r--kernel/printk/printk_safe.c14
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c28
-rw-r--r--kernel/rcu/Kconfig1
-rw-r--r--kernel/rcu/Kconfig.debug1
-rw-r--r--kernel/rcu/rcu.h14
-rw-r--r--kernel/rcu/rcutorture.c96
-rw-r--r--kernel/rcu/srcutree.c69
-rw-r--r--kernel/rcu/sync.c214
-rw-r--r--kernel/rcu/tree.c172
-rw-r--r--kernel/rcu/tree.h6
-rw-r--r--kernel/rcu/tree_exp.h53
-rw-r--r--kernel/rcu/tree_plugin.h195
-rw-r--r--kernel/rcu/tree_stall.h4
-rw-r--r--kernel/rcu/update.c13
-rw-r--r--kernel/reboot.c21
-rw-r--r--kernel/resource.c1
-rw-r--r--kernel/rseq.c4
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/clock.c1
-rw-r--r--kernel/sched/core.c534
-rw-r--r--kernel/sched/cpudeadline.c10
-rw-r--r--kernel/sched/cpufreq_schedutil.c24
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/cputime.c1
-rw-r--r--kernel/sched/deadline.c10
-rw-r--r--kernel/sched/debug.c48
-rw-r--r--kernel/sched/fair.c628
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/isolation.c1
-rw-r--r--kernel/sched/membarrier.c11
-rw-r--r--kernel/sched/pelt.c13
-rw-r--r--kernel/sched/pelt.h2
-rw-r--r--kernel/sched/psi.c617
-rw-r--r--kernel/sched/rt.c8
-rw-r--r--kernel/sched/sched-pelt.h2
-rw-r--r--kernel/sched/sched.h134
-rw-r--r--kernel/sched/topology.c18
-rw-r--r--kernel/sched/wait.c9
-rw-r--r--kernel/sched/wait_bit.c1
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c283
-rw-r--r--kernel/smp.c13
-rw-r--r--kernel/smpboot.c1
-rw-r--r--kernel/softirq.c5
-rw-r--r--kernel/stacktrace.c13
-rw-r--r--kernel/stop_machine.c22
-rw-r--r--kernel/sys.c62
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c102
-rw-r--r--kernel/taskstats.c12
-rw-r--r--kernel/test_kprobes.c11
-rw-r--r--kernel/time/Kconfig1
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/alarmtimer.c1
-rw-r--r--kernel/time/clocksource.c4
-rw-r--r--kernel/time/hrtimer.c8
-rw-r--r--kernel/time/ntp.c4
-rw-r--r--kernel/time/posix-timers.c13
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/time.c4
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/time/timer_list.c36
-rw-r--r--kernel/time/vsyscall.c129
-rw-r--r--kernel/torture.c23
-rw-r--r--kernel/trace/Kconfig1
-rw-r--r--kernel/trace/blktrace.c6
-rw-r--r--kernel/trace/bpf_trace.c202
-rw-r--r--kernel/trace/ftrace.c21
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c451
-rw-r--r--kernel/trace/trace.h32
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_events_filter.c92
-rw-r--r--kernel/trace/trace_events_hist.c279
-rw-r--r--kernel/trace/trace_events_trigger.c3
-rw-r--r--kernel/trace/trace_hwlat.c2
-rw-r--r--kernel/trace/trace_kdb.c67
-rw-r--r--kernel/trace/trace_kprobe.c77
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/trace/trace_probe.c291
-rw-r--r--kernel/trace/trace_probe.h78
-rw-r--r--kernel/trace/trace_probe_tmpl.h2
-rw-r--r--kernel/trace/trace_selftest.c5
-rw-r--r--kernel/trace/trace_uprobe.c74
-rw-r--r--kernel/tracepoint.c15
-rw-r--r--kernel/tsacct.c13
-rw-r--r--kernel/ucount.c7
-rw-r--r--kernel/umh.c1
-rw-r--r--kernel/up.c4
-rw-r--r--kernel/user-return-notifier.c1
-rw-r--r--kernel/user.c16
-rw-r--r--kernel/user_namespace.c16
-rw-r--r--kernel/utsname.c6
-rw-r--r--kernel/utsname_sysctl.c6
-rw-r--r--kernel/workqueue.c29
257 files changed, 10205 insertions, 4708 deletions
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
index a3bb4cb52539..68646feefb3d 100644
--- a/kernel/Kconfig.freezer
+++ b/kernel/Kconfig.freezer
@@ -1,2 +1,3 @@
1# SPDX-License-Identifier: GPL-2.0-only
1config FREEZER 2config FREEZER
2 def_bool PM_SLEEP || CGROUP_FREEZER 3 def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..38ef6d06888e 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# Timer Interrupt Frequency Configuration 3# Timer Interrupt Frequency Configuration
3# 4#
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index bf770d7556f7..e0852dc333ac 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# The ARCH_INLINE foo is necessary because select ignores "depends on" 3# The ARCH_INLINE foo is necessary because select ignores "depends on"
3# 4#
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0fee5fe6c899..dc0b682ec2d9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1 2
2choice 3choice
3 prompt "Preemption Model" 4 prompt "Preemption Model"
diff --git a/kernel/Makefile b/kernel/Makefile
index 298437bb2c6a..a8d923b5481b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
71obj-$(CONFIG_USER_NS) += user_namespace.o 71obj-$(CONFIG_USER_NS) += user_namespace.o
72obj-$(CONFIG_PID_NS) += pid_namespace.o 72obj-$(CONFIG_PID_NS) += pid_namespace.o
73obj-$(CONFIG_IKCONFIG) += configs.o 73obj-$(CONFIG_IKCONFIG) += configs.o
74obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o 74obj-$(CONFIG_IKHEADERS) += kheaders.o
75obj-$(CONFIG_SMP) += stop_machine.o 75obj-$(CONFIG_SMP) += stop_machine.o
76obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 76obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
77obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 77obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
127$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz 127$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
128 128
129quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz 129quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
130cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ 130cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
131$(obj)/kheaders_data.tar.xz: FORCE 131$(obj)/kheaders_data.tar.xz: FORCE
132 $(call cmd,genikh) 132 $(call cmd,genikh)
133 133
diff --git a/kernel/async.c b/kernel/async.c
index 12c332e4e13e..4f9c1d614016 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -1,13 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * async.c: Asynchronous function calls for boot performance 3 * async.c: Asynchronous function calls for boot performance
3 * 4 *
4 * (C) Copyright 2009 Intel Corporation 5 * (C) Copyright 2009 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com> 6 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */ 7 */
12 8
13 9
diff --git a/kernel/audit.c b/kernel/audit.c
index b96bf69183f4..da8dc0db5bd3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* audit.c -- Auditing support 2/* audit.c -- Auditing support
2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. 3 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
3 * System-call specific features have moved to auditsc.c 4 * System-call specific features have moved to auditsc.c
@@ -5,20 +6,6 @@
5 * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. 6 * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
6 * All Rights Reserved. 7 * All Rights Reserved.
7 * 8 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 *
22 * Written by Rickard E. (Rik) Faith <faith@redhat.com> 9 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
23 * 10 *
24 * Goals: 1) Integrate fully with Security Modules. 11 * Goals: 1) Integrate fully with Security Modules.
@@ -2274,6 +2261,33 @@ out:
2274} 2261}
2275 2262
2276/** 2263/**
2264 * audit_signal_info - record signal info for shutting down audit subsystem
2265 * @sig: signal value
2266 * @t: task being signaled
2267 *
2268 * If the audit subsystem is being terminated, record the task (pid)
2269 * and uid that is doing that.
2270 */
2271int audit_signal_info(int sig, struct task_struct *t)
2272{
2273 kuid_t uid = current_uid(), auid;
2274
2275 if (auditd_test_task(t) &&
2276 (sig == SIGTERM || sig == SIGHUP ||
2277 sig == SIGUSR1 || sig == SIGUSR2)) {
2278 audit_sig_pid = task_tgid_nr(current);
2279 auid = audit_get_loginuid(current);
2280 if (uid_valid(auid))
2281 audit_sig_uid = auid;
2282 else
2283 audit_sig_uid = uid;
2284 security_task_getsecid(current, &audit_sig_sid);
2285 }
2286
2287 return audit_signal_info_syscall(t);
2288}
2289
2290/**
2277 * audit_log_end - end one audit record 2291 * audit_log_end - end one audit record
2278 * @ab: the audit_buffer 2292 * @ab: the audit_buffer
2279 * 2293 *
diff --git a/kernel/audit.h b/kernel/audit.h
index 2071725a999f..6fb7160412d4 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -1,22 +1,9 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* audit -- definition of audit_context structure and supporting types 2/* audit -- definition of audit_context structure and supporting types
2 * 3 *
3 * Copyright 2003-2004 Red Hat, Inc. 4 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation 6 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 7 */
21 8
22#include <linux/fs.h> 9#include <linux/fs.h>
@@ -299,7 +286,7 @@ extern const char *audit_tree_path(struct audit_tree *tree);
299extern void audit_put_tree(struct audit_tree *tree); 286extern void audit_put_tree(struct audit_tree *tree);
300extern void audit_kill_trees(struct audit_context *context); 287extern void audit_kill_trees(struct audit_context *context);
301 288
302extern int audit_signal_info(int sig, struct task_struct *t); 289extern int audit_signal_info_syscall(struct task_struct *t);
303extern void audit_filter_inodes(struct task_struct *tsk, 290extern void audit_filter_inodes(struct task_struct *tsk,
304 struct audit_context *ctx); 291 struct audit_context *ctx);
305extern struct list_head *audit_killed_trees(void); 292extern struct list_head *audit_killed_trees(void);
@@ -330,7 +317,11 @@ extern struct list_head *audit_killed_trees(void);
330#define audit_tree_path(rule) "" /* never called */ 317#define audit_tree_path(rule) "" /* never called */
331#define audit_kill_trees(context) BUG() 318#define audit_kill_trees(context) BUG()
332 319
333#define audit_signal_info(s, t) AUDIT_DISABLED 320static inline int audit_signal_info_syscall(struct task_struct *t)
321{
322 return 0;
323}
324
334#define audit_filter_inodes(t, c) AUDIT_DISABLED 325#define audit_filter_inodes(t, c) AUDIT_DISABLED
335#endif /* CONFIG_AUDITSYSCALL */ 326#endif /* CONFIG_AUDITSYSCALL */
336 327
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index b5737b826951..f0d243318452 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -1,18 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* audit_fsnotify.c -- tracking inodes 2/* audit_fsnotify.c -- tracking inodes
2 * 3 *
3 * Copyright 2003-2009,2014-2015 Red Hat, Inc. 4 * Copyright 2003-2009,2014-2015 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation 6 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 */ 7 */
17 8
18#include <linux/kernel.h> 9#include <linux/kernel.h>
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index b50c574223fa..1f31c2f1e6fc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -1,22 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* audit_watch.c -- watching inodes 2/* audit_watch.c -- watching inodes
2 * 3 *
3 * Copyright 2003-2009 Red Hat, Inc. 4 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation 6 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 7 */
21 8
22#include <linux/file.h> 9#include <linux/file.h>
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 303fb04770ce..b0126e9c0743 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1,22 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* auditfilter.c -- filtering of audit events 2/* auditfilter.c -- filtering of audit events
2 * 3 *
3 * Copyright 2003-2004 Red Hat, Inc. 4 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation 6 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 7 */
21 8
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -335,7 +322,7 @@ static u32 audit_to_op(u32 op)
335/* check if an audit field is valid */ 322/* check if an audit field is valid */
336static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) 323static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
337{ 324{
338 switch(f->type) { 325 switch (f->type) {
339 case AUDIT_MSGTYPE: 326 case AUDIT_MSGTYPE:
340 if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && 327 if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
341 entry->rule.listnr != AUDIT_FILTER_USER) 328 entry->rule.listnr != AUDIT_FILTER_USER)
@@ -347,7 +334,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
347 break; 334 break;
348 } 335 }
349 336
350 switch(entry->rule.listnr) { 337 switch (entry->rule.listnr) {
351 case AUDIT_FILTER_FS: 338 case AUDIT_FILTER_FS:
352 switch(f->type) { 339 switch(f->type) {
353 case AUDIT_FSTYPE: 340 case AUDIT_FSTYPE:
@@ -358,9 +345,16 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
358 } 345 }
359 } 346 }
360 347
361 switch(f->type) { 348 /* Check for valid field type and op */
362 default: 349 switch (f->type) {
363 return -EINVAL; 350 case AUDIT_ARG0:
351 case AUDIT_ARG1:
352 case AUDIT_ARG2:
353 case AUDIT_ARG3:
354 case AUDIT_PERS: /* <uapi/linux/personality.h> */
355 case AUDIT_DEVMINOR:
356 /* all ops are valid */
357 break;
364 case AUDIT_UID: 358 case AUDIT_UID:
365 case AUDIT_EUID: 359 case AUDIT_EUID:
366 case AUDIT_SUID: 360 case AUDIT_SUID:
@@ -373,46 +367,53 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
373 case AUDIT_FSGID: 367 case AUDIT_FSGID:
374 case AUDIT_OBJ_GID: 368 case AUDIT_OBJ_GID:
375 case AUDIT_PID: 369 case AUDIT_PID:
376 case AUDIT_PERS:
377 case AUDIT_MSGTYPE: 370 case AUDIT_MSGTYPE:
378 case AUDIT_PPID: 371 case AUDIT_PPID:
379 case AUDIT_DEVMAJOR: 372 case AUDIT_DEVMAJOR:
380 case AUDIT_DEVMINOR:
381 case AUDIT_EXIT: 373 case AUDIT_EXIT:
382 case AUDIT_SUCCESS: 374 case AUDIT_SUCCESS:
383 case AUDIT_INODE: 375 case AUDIT_INODE:
384 case AUDIT_SESSIONID: 376 case AUDIT_SESSIONID:
377 case AUDIT_SUBJ_SEN:
378 case AUDIT_SUBJ_CLR:
379 case AUDIT_OBJ_LEV_LOW:
380 case AUDIT_OBJ_LEV_HIGH:
381 case AUDIT_SADDR_FAM:
385 /* bit ops are only useful on syscall args */ 382 /* bit ops are only useful on syscall args */
386 if (f->op == Audit_bitmask || f->op == Audit_bittest) 383 if (f->op == Audit_bitmask || f->op == Audit_bittest)
387 return -EINVAL; 384 return -EINVAL;
388 break; 385 break;
389 case AUDIT_ARG0:
390 case AUDIT_ARG1:
391 case AUDIT_ARG2:
392 case AUDIT_ARG3:
393 case AUDIT_SUBJ_USER: 386 case AUDIT_SUBJ_USER:
394 case AUDIT_SUBJ_ROLE: 387 case AUDIT_SUBJ_ROLE:
395 case AUDIT_SUBJ_TYPE: 388 case AUDIT_SUBJ_TYPE:
396 case AUDIT_SUBJ_SEN:
397 case AUDIT_SUBJ_CLR:
398 case AUDIT_OBJ_USER: 389 case AUDIT_OBJ_USER:
399 case AUDIT_OBJ_ROLE: 390 case AUDIT_OBJ_ROLE:
400 case AUDIT_OBJ_TYPE: 391 case AUDIT_OBJ_TYPE:
401 case AUDIT_OBJ_LEV_LOW:
402 case AUDIT_OBJ_LEV_HIGH:
403 case AUDIT_WATCH: 392 case AUDIT_WATCH:
404 case AUDIT_DIR: 393 case AUDIT_DIR:
405 case AUDIT_FILTERKEY: 394 case AUDIT_FILTERKEY:
406 break;
407 case AUDIT_LOGINUID_SET: 395 case AUDIT_LOGINUID_SET:
408 if ((f->val != 0) && (f->val != 1))
409 return -EINVAL;
410 /* FALL THROUGH */
411 case AUDIT_ARCH: 396 case AUDIT_ARCH:
412 case AUDIT_FSTYPE: 397 case AUDIT_FSTYPE:
398 case AUDIT_PERM:
399 case AUDIT_FILETYPE:
400 case AUDIT_FIELD_COMPARE:
401 case AUDIT_EXE:
402 /* only equal and not equal valid ops */
413 if (f->op != Audit_not_equal && f->op != Audit_equal) 403 if (f->op != Audit_not_equal && f->op != Audit_equal)
414 return -EINVAL; 404 return -EINVAL;
415 break; 405 break;
406 default:
407 /* field not recognized */
408 return -EINVAL;
409 }
410
411 /* Check for select valid field values */
412 switch (f->type) {
413 case AUDIT_LOGINUID_SET:
414 if ((f->val != 0) && (f->val != 1))
415 return -EINVAL;
416 break;
416 case AUDIT_PERM: 417 case AUDIT_PERM:
417 if (f->val & ~15) 418 if (f->val & ~15)
418 return -EINVAL; 419 return -EINVAL;
@@ -425,11 +426,14 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
425 if (f->val > AUDIT_MAX_FIELD_COMPARE) 426 if (f->val > AUDIT_MAX_FIELD_COMPARE)
426 return -EINVAL; 427 return -EINVAL;
427 break; 428 break;
428 case AUDIT_EXE: 429 case AUDIT_SADDR_FAM:
429 if (f->op != Audit_not_equal && f->op != Audit_equal) 430 if (f->val >= AF_MAX)
430 return -EINVAL; 431 return -EINVAL;
431 break; 432 break;
433 default:
434 break;
432 } 435 }
436
433 return 0; 437 return 0;
434} 438}
435 439
@@ -1203,7 +1207,6 @@ int audit_comparator(u32 left, u32 op, u32 right)
1203 case Audit_bittest: 1207 case Audit_bittest:
1204 return ((left & right) == right); 1208 return ((left & right) == right);
1205 default: 1209 default:
1206 BUG();
1207 return 0; 1210 return 0;
1208 } 1211 }
1209} 1212}
@@ -1226,7 +1229,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
1226 case Audit_bitmask: 1229 case Audit_bitmask:
1227 case Audit_bittest: 1230 case Audit_bittest:
1228 default: 1231 default:
1229 BUG();
1230 return 0; 1232 return 0;
1231 } 1233 }
1232} 1234}
@@ -1249,7 +1251,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
1249 case Audit_bitmask: 1251 case Audit_bitmask:
1250 case Audit_bittest: 1252 case Audit_bittest:
1251 default: 1253 default:
1252 BUG();
1253 return 0; 1254 return 0;
1254 } 1255 }
1255} 1256}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 95ae27edd417..4effe01ebbe2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -601,12 +601,20 @@ static int audit_filter_rules(struct task_struct *tsk,
601 } 601 }
602 break; 602 break;
603 case AUDIT_WATCH: 603 case AUDIT_WATCH:
604 if (name) 604 if (name) {
605 result = audit_watch_compare(rule->watch, name->ino, name->dev); 605 result = audit_watch_compare(rule->watch,
606 name->ino,
607 name->dev);
608 if (f->op == Audit_not_equal)
609 result = !result;
610 }
606 break; 611 break;
607 case AUDIT_DIR: 612 case AUDIT_DIR:
608 if (ctx) 613 if (ctx) {
609 result = match_tree_refs(ctx, rule->tree); 614 result = match_tree_refs(ctx, rule->tree);
615 if (f->op == Audit_not_equal)
616 result = !result;
617 }
610 break; 618 break;
611 case AUDIT_LOGINUID: 619 case AUDIT_LOGINUID:
612 result = audit_uid_comparator(audit_get_loginuid(tsk), 620 result = audit_uid_comparator(audit_get_loginuid(tsk),
@@ -615,6 +623,11 @@ static int audit_filter_rules(struct task_struct *tsk,
615 case AUDIT_LOGINUID_SET: 623 case AUDIT_LOGINUID_SET:
616 result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); 624 result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
617 break; 625 break;
626 case AUDIT_SADDR_FAM:
627 if (ctx->sockaddr)
628 result = audit_comparator(ctx->sockaddr->ss_family,
629 f->op, f->val);
630 break;
618 case AUDIT_SUBJ_USER: 631 case AUDIT_SUBJ_USER:
619 case AUDIT_SUBJ_ROLE: 632 case AUDIT_SUBJ_ROLE:
620 case AUDIT_SUBJ_TYPE: 633 case AUDIT_SUBJ_TYPE:
@@ -684,9 +697,13 @@ static int audit_filter_rules(struct task_struct *tsk,
684 break; 697 break;
685 case AUDIT_PERM: 698 case AUDIT_PERM:
686 result = audit_match_perm(ctx, f->val); 699 result = audit_match_perm(ctx, f->val);
700 if (f->op == Audit_not_equal)
701 result = !result;
687 break; 702 break;
688 case AUDIT_FILETYPE: 703 case AUDIT_FILETYPE:
689 result = audit_match_filetype(ctx, f->val); 704 result = audit_match_filetype(ctx, f->val);
705 if (f->op == Audit_not_equal)
706 result = !result;
690 break; 707 break;
691 case AUDIT_FIELD_COMPARE: 708 case AUDIT_FIELD_COMPARE:
692 result = audit_field_compare(tsk, cred, f, ctx, name); 709 result = audit_field_compare(tsk, cred, f, ctx, name);
@@ -2360,30 +2377,17 @@ void __audit_ptrace(struct task_struct *t)
2360} 2377}
2361 2378
2362/** 2379/**
2363 * audit_signal_info - record signal info for shutting down audit subsystem 2380 * audit_signal_info_syscall - record signal info for syscalls
2364 * @sig: signal value
2365 * @t: task being signaled 2381 * @t: task being signaled
2366 * 2382 *
2367 * If the audit subsystem is being terminated, record the task (pid) 2383 * If the audit subsystem is being terminated, record the task (pid)
2368 * and uid that is doing that. 2384 * and uid that is doing that.
2369 */ 2385 */
2370int audit_signal_info(int sig, struct task_struct *t) 2386int audit_signal_info_syscall(struct task_struct *t)
2371{ 2387{
2372 struct audit_aux_data_pids *axp; 2388 struct audit_aux_data_pids *axp;
2373 struct audit_context *ctx = audit_context(); 2389 struct audit_context *ctx = audit_context();
2374 kuid_t uid = current_uid(), auid, t_uid = task_uid(t); 2390 kuid_t t_uid = task_uid(t);
2375
2376 if (auditd_test_task(t) &&
2377 (sig == SIGTERM || sig == SIGHUP ||
2378 sig == SIGUSR1 || sig == SIGUSR2)) {
2379 audit_sig_pid = task_tgid_nr(current);
2380 auid = audit_get_loginuid(current);
2381 if (uid_valid(auid))
2382 audit_sig_uid = auid;
2383 else
2384 audit_sig_uid = uid;
2385 security_task_getsecid(current, &audit_sig_sid);
2386 }
2387 2391
2388 if (!audit_signals || audit_dummy_context()) 2392 if (!audit_signals || audit_dummy_context())
2389 return 0; 2393 return 0;
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a563c8fdad0d..a2a97fa3071b 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -1,13 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Simple stack backtrace regression test module 3 * Simple stack backtrace regression test module
3 * 4 *
4 * (C) Copyright 2008 Intel Corporation 5 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com> 6 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */ 7 */
12 8
13#include <linux/completion.h> 9#include <linux/completion.h>
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4c2fa3ac56f6..29d781061cd5 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,6 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2obj-y := core.o 2obj-y := core.o
3CFLAGS_core.o += $(call cc-disable-warning, override-init)
3 4
4obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o 5obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
5obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o 6obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 584636c9e2eb..1c65ce0098a9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,14 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016,2017 Facebook 3 * Copyright (c) 2016,2017 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */ 4 */
13#include <linux/bpf.h> 5#include <linux/bpf.h>
14#include <linux/btf.h> 6#include <linux/btf.h>
@@ -83,6 +75,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
83 u32 elem_size, index_mask, max_entries; 75 u32 elem_size, index_mask, max_entries;
84 bool unpriv = !capable(CAP_SYS_ADMIN); 76 bool unpriv = !capable(CAP_SYS_ADMIN);
85 u64 cost, array_size, mask64; 77 u64 cost, array_size, mask64;
78 struct bpf_map_memory mem;
86 struct bpf_array *array; 79 struct bpf_array *array;
87 80
88 elem_size = round_up(attr->value_size, 8); 81 elem_size = round_up(attr->value_size, 8);
@@ -116,32 +109,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
116 109
117 /* make sure there is no u32 overflow later in round_up() */ 110 /* make sure there is no u32 overflow later in round_up() */
118 cost = array_size; 111 cost = array_size;
119 if (cost >= U32_MAX - PAGE_SIZE) 112 if (percpu)
120 return ERR_PTR(-ENOMEM);
121 if (percpu) {
122 cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); 113 cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
123 if (cost >= U32_MAX - PAGE_SIZE)
124 return ERR_PTR(-ENOMEM);
125 }
126 cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
127 114
128 ret = bpf_map_precharge_memlock(cost); 115 ret = bpf_map_charge_init(&mem, cost);
129 if (ret < 0) 116 if (ret < 0)
130 return ERR_PTR(ret); 117 return ERR_PTR(ret);
131 118
132 /* allocate all map elements and zero-initialize them */ 119 /* allocate all map elements and zero-initialize them */
133 array = bpf_map_area_alloc(array_size, numa_node); 120 array = bpf_map_area_alloc(array_size, numa_node);
134 if (!array) 121 if (!array) {
122 bpf_map_charge_finish(&mem);
135 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
124 }
136 array->index_mask = index_mask; 125 array->index_mask = index_mask;
137 array->map.unpriv_array = unpriv; 126 array->map.unpriv_array = unpriv;
138 127
139 /* copy mandatory map attributes */ 128 /* copy mandatory map attributes */
140 bpf_map_init_from_attr(&array->map, attr); 129 bpf_map_init_from_attr(&array->map, attr);
141 array->map.pages = cost; 130 bpf_map_charge_move(&array->map.memory, &mem);
142 array->elem_size = elem_size; 131 array->elem_size = elem_size;
143 132
144 if (percpu && bpf_array_alloc_percpu(array)) { 133 if (percpu && bpf_array_alloc_percpu(array)) {
134 bpf_map_charge_finish(&array->map.memory);
145 bpf_map_area_free(array); 135 bpf_map_area_free(array);
146 return ERR_PTR(-ENOMEM); 136 return ERR_PTR(-ENOMEM);
147 } 137 }
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a138..1b6b9349cb85 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -1,8 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2016 Facebook 2/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#include <linux/cpumask.h> 4#include <linux/cpumask.h>
8#include <linux/spinlock.h> 5#include <linux/spinlock.h>
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..f02504640e18 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -1,8 +1,5 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
1/* Copyright (c) 2016 Facebook 2/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#ifndef __BPF_LRU_LIST_H_ 4#ifndef __BPF_LRU_LIST_H_
8#define __BPF_LRU_LIST_H_ 5#define __BPF_LRU_LIST_H_
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cad09858a5f2..546ebee39e2a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1928,8 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
1928 /* Check array->index_type */ 1928 /* Check array->index_type */
1929 index_type_id = array->index_type; 1929 index_type_id = array->index_type;
1930 index_type = btf_type_by_id(btf, index_type_id); 1930 index_type = btf_type_by_id(btf, index_type_id);
1931 if (btf_type_is_resolve_source_only(index_type) || 1931 if (btf_type_nosize_or_null(index_type) ||
1932 btf_type_nosize_or_null(index_type)) { 1932 btf_type_is_resolve_source_only(index_type)) {
1933 btf_verifier_log_type(env, v->t, "Invalid index"); 1933 btf_verifier_log_type(env, v->t, "Invalid index");
1934 return -EINVAL; 1934 return -EINVAL;
1935 } 1935 }
@@ -1948,8 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
1948 /* Check array->type */ 1948 /* Check array->type */
1949 elem_type_id = array->type; 1949 elem_type_id = array->type;
1950 elem_type = btf_type_by_id(btf, elem_type_id); 1950 elem_type = btf_type_by_id(btf, elem_type_id);
1951 if (btf_type_is_resolve_source_only(elem_type) || 1951 if (btf_type_nosize_or_null(elem_type) ||
1952 btf_type_nosize_or_null(elem_type)) { 1952 btf_type_is_resolve_source_only(elem_type)) {
1953 btf_verifier_log_type(env, v->t, 1953 btf_verifier_log_type(env, v->t,
1954 "Invalid elem"); 1954 "Invalid elem");
1955 return -EINVAL; 1955 return -EINVAL;
@@ -2170,8 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
2170 const struct btf_type *member_type = btf_type_by_id(env->btf, 2170 const struct btf_type *member_type = btf_type_by_id(env->btf,
2171 member_type_id); 2171 member_type_id);
2172 2172
2173 if (btf_type_is_resolve_source_only(member_type) || 2173 if (btf_type_nosize_or_null(member_type) ||
2174 btf_type_nosize_or_null(member_type)) { 2174 btf_type_is_resolve_source_only(member_type)) {
2175 btf_verifier_log_member(env, v->t, member, 2175 btf_verifier_log_member(env, v->t, member,
2176 "Invalid member"); 2176 "Invalid member");
2177 return -EINVAL; 2177 return -EINVAL;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index fcde0f7b2585..0a00eaca6fae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1,11 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Functions to manage eBPF programs attached to cgroups 3 * Functions to manage eBPF programs attached to cgroups
3 * 4 *
4 * Copyright (c) 2016 Daniel Mack 5 * Copyright (c) 2016 Daniel Mack
5 *
6 * This file is subject to the terms and conditions of version 2 of the GNU
7 * General Public License. See the file COPYING in the main directory of the
8 * Linux distribution for more details.
9 */ 6 */
10 7
11#include <linux/kernel.h> 8#include <linux/kernel.h>
@@ -18,19 +15,34 @@
18#include <linux/bpf.h> 15#include <linux/bpf.h>
19#include <linux/bpf-cgroup.h> 16#include <linux/bpf-cgroup.h>
20#include <net/sock.h> 17#include <net/sock.h>
18#include <net/bpf_sk_storage.h>
19
20#include "../cgroup/cgroup-internal.h"
21 21
22DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); 22DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
23EXPORT_SYMBOL(cgroup_bpf_enabled_key); 23EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 24
25void cgroup_bpf_offline(struct cgroup *cgrp)
26{
27 cgroup_get(cgrp);
28 percpu_ref_kill(&cgrp->bpf.refcnt);
29}
30
25/** 31/**
26 * cgroup_bpf_put() - put references of all bpf programs 32 * cgroup_bpf_release() - put references of all bpf programs and
27 * @cgrp: the cgroup to modify 33 * release all cgroup bpf data
34 * @work: work structure embedded into the cgroup to modify
28 */ 35 */
29void cgroup_bpf_put(struct cgroup *cgrp) 36static void cgroup_bpf_release(struct work_struct *work)
30{ 37{
38 struct cgroup *cgrp = container_of(work, struct cgroup,
39 bpf.release_work);
31 enum bpf_cgroup_storage_type stype; 40 enum bpf_cgroup_storage_type stype;
41 struct bpf_prog_array *old_array;
32 unsigned int type; 42 unsigned int type;
33 43
44 mutex_lock(&cgroup_mutex);
45
34 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { 46 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
35 struct list_head *progs = &cgrp->bpf.progs[type]; 47 struct list_head *progs = &cgrp->bpf.progs[type];
36 struct bpf_prog_list *pl, *tmp; 48 struct bpf_prog_list *pl, *tmp;
@@ -45,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp)
45 kfree(pl); 57 kfree(pl);
46 static_branch_dec(&cgroup_bpf_enabled_key); 58 static_branch_dec(&cgroup_bpf_enabled_key);
47 } 59 }
48 bpf_prog_array_free(cgrp->bpf.effective[type]); 60 old_array = rcu_dereference_protected(
61 cgrp->bpf.effective[type],
62 lockdep_is_held(&cgroup_mutex));
63 bpf_prog_array_free(old_array);
49 } 64 }
65
66 mutex_unlock(&cgroup_mutex);
67
68 percpu_ref_exit(&cgrp->bpf.refcnt);
69 cgroup_put(cgrp);
70}
71
72/**
73 * cgroup_bpf_release_fn() - callback used to schedule releasing
74 * of bpf cgroup data
75 * @ref: percpu ref counter structure
76 */
77static void cgroup_bpf_release_fn(struct percpu_ref *ref)
78{
79 struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
80
81 INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
82 queue_work(system_wq, &cgrp->bpf.release_work);
50} 83}
51 84
52/* count number of elements in the list. 85/* count number of elements in the list.
@@ -101,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
101 */ 134 */
102static int compute_effective_progs(struct cgroup *cgrp, 135static int compute_effective_progs(struct cgroup *cgrp,
103 enum bpf_attach_type type, 136 enum bpf_attach_type type,
104 struct bpf_prog_array __rcu **array) 137 struct bpf_prog_array **array)
105{ 138{
106 enum bpf_cgroup_storage_type stype; 139 enum bpf_cgroup_storage_type stype;
107 struct bpf_prog_array *progs; 140 struct bpf_prog_array *progs;
@@ -139,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
139 } 172 }
140 } while ((p = cgroup_parent(p))); 173 } while ((p = cgroup_parent(p)));
141 174
142 rcu_assign_pointer(*array, progs); 175 *array = progs;
143 return 0; 176 return 0;
144} 177}
145 178
146static void activate_effective_progs(struct cgroup *cgrp, 179static void activate_effective_progs(struct cgroup *cgrp,
147 enum bpf_attach_type type, 180 enum bpf_attach_type type,
148 struct bpf_prog_array __rcu *array) 181 struct bpf_prog_array *old_array)
149{ 182{
150 struct bpf_prog_array __rcu *old_array; 183 rcu_swap_protected(cgrp->bpf.effective[type], old_array,
151 184 lockdep_is_held(&cgroup_mutex));
152 old_array = xchg(&cgrp->bpf.effective[type], array);
153 /* free prog array after grace period, since __cgroup_bpf_run_*() 185 /* free prog array after grace period, since __cgroup_bpf_run_*()
154 * might be still walking the array 186 * might be still walking the array
155 */ 187 */
@@ -166,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
166 * that array below is variable length 198 * that array below is variable length
167 */ 199 */
168#define NR ARRAY_SIZE(cgrp->bpf.effective) 200#define NR ARRAY_SIZE(cgrp->bpf.effective)
169 struct bpf_prog_array __rcu *arrays[NR] = {}; 201 struct bpf_prog_array *arrays[NR] = {};
170 int i; 202 int ret, i;
203
204 ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
205 GFP_KERNEL);
206 if (ret)
207 return ret;
171 208
172 for (i = 0; i < NR; i++) 209 for (i = 0; i < NR; i++)
173 INIT_LIST_HEAD(&cgrp->bpf.progs[i]); 210 INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -183,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
183cleanup: 220cleanup:
184 for (i = 0; i < NR; i++) 221 for (i = 0; i < NR; i++)
185 bpf_prog_array_free(arrays[i]); 222 bpf_prog_array_free(arrays[i]);
223
224 percpu_ref_exit(&cgrp->bpf.refcnt);
225
186 return -ENOMEM; 226 return -ENOMEM;
187} 227}
188 228
@@ -196,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp,
196 css_for_each_descendant_pre(css, &cgrp->self) { 236 css_for_each_descendant_pre(css, &cgrp->self) {
197 struct cgroup *desc = container_of(css, struct cgroup, self); 237 struct cgroup *desc = container_of(css, struct cgroup, self);
198 238
239 if (percpu_ref_is_zero(&desc->bpf.refcnt))
240 continue;
241
199 err = compute_effective_progs(desc, type, &desc->bpf.inactive); 242 err = compute_effective_progs(desc, type, &desc->bpf.inactive);
200 if (err) 243 if (err)
201 goto cleanup; 244 goto cleanup;
@@ -205,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp,
205 css_for_each_descendant_pre(css, &cgrp->self) { 248 css_for_each_descendant_pre(css, &cgrp->self) {
206 struct cgroup *desc = container_of(css, struct cgroup, self); 249 struct cgroup *desc = container_of(css, struct cgroup, self);
207 250
251 if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
252 if (unlikely(desc->bpf.inactive)) {
253 bpf_prog_array_free(desc->bpf.inactive);
254 desc->bpf.inactive = NULL;
255 }
256 continue;
257 }
258
208 activate_effective_progs(desc, type, desc->bpf.inactive); 259 activate_effective_progs(desc, type, desc->bpf.inactive);
209 desc->bpf.inactive = NULL; 260 desc->bpf.inactive = NULL;
210 } 261 }
@@ -444,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
444 enum bpf_attach_type type = attr->query.attach_type; 495 enum bpf_attach_type type = attr->query.attach_type;
445 struct list_head *progs = &cgrp->bpf.progs[type]; 496 struct list_head *progs = &cgrp->bpf.progs[type];
446 u32 flags = cgrp->bpf.flags[type]; 497 u32 flags = cgrp->bpf.flags[type];
498 struct bpf_prog_array *effective;
447 int cnt, ret = 0, i; 499 int cnt, ret = 0, i;
448 500
501 effective = rcu_dereference_protected(cgrp->bpf.effective[type],
502 lockdep_is_held(&cgroup_mutex));
503
449 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) 504 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
450 cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); 505 cnt = bpf_prog_array_length(effective);
451 else 506 else
452 cnt = prog_list_length(progs); 507 cnt = prog_list_length(progs);
453 508
@@ -464,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
464 } 519 }
465 520
466 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { 521 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
467 return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], 522 return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
468 prog_ids, cnt);
469 } else { 523 } else {
470 struct bpf_prog_list *pl; 524 struct bpf_prog_list *pl;
471 u32 id; 525 u32 id;
@@ -548,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
548 * The program type passed in via @type must be suitable for network 602 * The program type passed in via @type must be suitable for network
549 * filtering. No further check is performed to assert that. 603 * filtering. No further check is performed to assert that.
550 * 604 *
551 * This function will return %-EPERM if any if an attached program was found 605 * For egress packets, this function can return:
552 * and if it returned != 1 during execution. In all other cases, 0 is returned. 606 * NET_XMIT_SUCCESS (0) - continue with packet output
607 * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
608 * NET_XMIT_CN (2) - continue with packet output and notify TCP
609 * to call cwr
610 * -EPERM - drop packet
611 *
612 * For ingress packets, this function will return -EPERM if any
613 * attached program was found and if it returned != 1 during execution.
614 * Otherwise 0 is returned.
553 */ 615 */
554int __cgroup_bpf_run_filter_skb(struct sock *sk, 616int __cgroup_bpf_run_filter_skb(struct sock *sk,
555 struct sk_buff *skb, 617 struct sk_buff *skb,
@@ -575,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
575 /* compute pointers for the bpf prog */ 637 /* compute pointers for the bpf prog */
576 bpf_compute_and_save_data_end(skb, &saved_data_end); 638 bpf_compute_and_save_data_end(skb, &saved_data_end);
577 639
578 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 640 if (type == BPF_CGROUP_INET_EGRESS) {
579 __bpf_prog_run_save_cb); 641 ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
642 cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
643 } else {
644 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
645 __bpf_prog_run_save_cb);
646 ret = (ret == 1 ? 0 : -EPERM);
647 }
580 bpf_restore_data_end(skb, saved_data_end); 648 bpf_restore_data_end(skb, saved_data_end);
581 __skb_pull(skb, offset); 649 __skb_pull(skb, offset);
582 skb->sk = save_sk; 650 skb->sk = save_sk;
583 return ret == 1 ? 0 : -EPERM; 651
652 return ret;
584} 653}
585EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 654EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
586 655
@@ -870,6 +939,190 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
870} 939}
871EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); 940EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
872 941
942#ifdef CONFIG_NET
943static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
944 enum bpf_attach_type attach_type)
945{
946 struct bpf_prog_array *prog_array;
947 bool empty;
948
949 rcu_read_lock();
950 prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
951 empty = bpf_prog_array_is_empty(prog_array);
952 rcu_read_unlock();
953
954 return empty;
955}
956
957static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
958{
959 if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
960 return -EINVAL;
961
962 ctx->optval = kzalloc(max_optlen, GFP_USER);
963 if (!ctx->optval)
964 return -ENOMEM;
965
966 ctx->optval_end = ctx->optval + max_optlen;
967 ctx->optlen = max_optlen;
968
969 return 0;
970}
971
972static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
973{
974 kfree(ctx->optval);
975}
976
977int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
978 int *optname, char __user *optval,
979 int *optlen, char **kernel_optval)
980{
981 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
982 struct bpf_sockopt_kern ctx = {
983 .sk = sk,
984 .level = *level,
985 .optname = *optname,
986 };
987 int ret;
988
989 /* Opportunistic check to see whether we have any BPF program
990 * attached to the hook so we don't waste time allocating
991 * memory and locking the socket.
992 */
993 if (!cgroup_bpf_enabled ||
994 __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
995 return 0;
996
997 ret = sockopt_alloc_buf(&ctx, *optlen);
998 if (ret)
999 return ret;
1000
1001 if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
1002 ret = -EFAULT;
1003 goto out;
1004 }
1005
1006 lock_sock(sk);
1007 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
1008 &ctx, BPF_PROG_RUN);
1009 release_sock(sk);
1010
1011 if (!ret) {
1012 ret = -EPERM;
1013 goto out;
1014 }
1015
1016 if (ctx.optlen == -1) {
1017 /* optlen set to -1, bypass kernel */
1018 ret = 1;
1019 } else if (ctx.optlen > *optlen || ctx.optlen < -1) {
1020 /* optlen is out of bounds */
1021 ret = -EFAULT;
1022 } else {
1023 /* optlen within bounds, run kernel handler */
1024 ret = 0;
1025
1026 /* export any potential modifications */
1027 *level = ctx.level;
1028 *optname = ctx.optname;
1029 *optlen = ctx.optlen;
1030 *kernel_optval = ctx.optval;
1031 }
1032
1033out:
1034 if (ret)
1035 sockopt_free_buf(&ctx);
1036 return ret;
1037}
1038EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
1039
1040int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1041 int optname, char __user *optval,
1042 int __user *optlen, int max_optlen,
1043 int retval)
1044{
1045 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1046 struct bpf_sockopt_kern ctx = {
1047 .sk = sk,
1048 .level = level,
1049 .optname = optname,
1050 .retval = retval,
1051 };
1052 int ret;
1053
1054 /* Opportunistic check to see whether we have any BPF program
1055 * attached to the hook so we don't waste time allocating
1056 * memory and locking the socket.
1057 */
1058 if (!cgroup_bpf_enabled ||
1059 __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
1060 return retval;
1061
1062 ret = sockopt_alloc_buf(&ctx, max_optlen);
1063 if (ret)
1064 return ret;
1065
1066 if (!retval) {
1067 /* If kernel getsockopt finished successfully,
1068 * copy whatever was returned to the user back
1069 * into our temporary buffer. Set optlen to the
1070 * one that kernel returned as well to let
1071 * BPF programs inspect the value.
1072 */
1073
1074 if (get_user(ctx.optlen, optlen)) {
1075 ret = -EFAULT;
1076 goto out;
1077 }
1078
1079 if (ctx.optlen > max_optlen)
1080 ctx.optlen = max_optlen;
1081
1082 if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
1083 ret = -EFAULT;
1084 goto out;
1085 }
1086 }
1087
1088 lock_sock(sk);
1089 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
1090 &ctx, BPF_PROG_RUN);
1091 release_sock(sk);
1092
1093 if (!ret) {
1094 ret = -EPERM;
1095 goto out;
1096 }
1097
1098 if (ctx.optlen > max_optlen) {
1099 ret = -EFAULT;
1100 goto out;
1101 }
1102
1103 /* BPF programs only allowed to set retval to 0, not some
1104 * arbitrary value.
1105 */
1106 if (ctx.retval != 0 && ctx.retval != retval) {
1107 ret = -EFAULT;
1108 goto out;
1109 }
1110
1111 if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1112 put_user(ctx.optlen, optlen)) {
1113 ret = -EFAULT;
1114 goto out;
1115 }
1116
1117 ret = ctx.retval;
1118
1119out:
1120 sockopt_free_buf(&ctx);
1121 return ret;
1122}
1123EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
1124#endif
1125
873static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, 1126static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
874 size_t *lenp) 1127 size_t *lenp)
875{ 1128{
@@ -1130,3 +1383,155 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
1130 1383
1131const struct bpf_prog_ops cg_sysctl_prog_ops = { 1384const struct bpf_prog_ops cg_sysctl_prog_ops = {
1132}; 1385};
1386
1387static const struct bpf_func_proto *
1388cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1389{
1390 switch (func_id) {
1391#ifdef CONFIG_NET
1392 case BPF_FUNC_sk_storage_get:
1393 return &bpf_sk_storage_get_proto;
1394 case BPF_FUNC_sk_storage_delete:
1395 return &bpf_sk_storage_delete_proto;
1396#endif
1397#ifdef CONFIG_INET
1398 case BPF_FUNC_tcp_sock:
1399 return &bpf_tcp_sock_proto;
1400#endif
1401 default:
1402 return cgroup_base_func_proto(func_id, prog);
1403 }
1404}
1405
1406static bool cg_sockopt_is_valid_access(int off, int size,
1407 enum bpf_access_type type,
1408 const struct bpf_prog *prog,
1409 struct bpf_insn_access_aux *info)
1410{
1411 const int size_default = sizeof(__u32);
1412
1413 if (off < 0 || off >= sizeof(struct bpf_sockopt))
1414 return false;
1415
1416 if (off % size != 0)
1417 return false;
1418
1419 if (type == BPF_WRITE) {
1420 switch (off) {
1421 case offsetof(struct bpf_sockopt, retval):
1422 if (size != size_default)
1423 return false;
1424 return prog->expected_attach_type ==
1425 BPF_CGROUP_GETSOCKOPT;
1426 case offsetof(struct bpf_sockopt, optname):
1427 /* fallthrough */
1428 case offsetof(struct bpf_sockopt, level):
1429 if (size != size_default)
1430 return false;
1431 return prog->expected_attach_type ==
1432 BPF_CGROUP_SETSOCKOPT;
1433 case offsetof(struct bpf_sockopt, optlen):
1434 return size == size_default;
1435 default:
1436 return false;
1437 }
1438 }
1439
1440 switch (off) {
1441 case offsetof(struct bpf_sockopt, sk):
1442 if (size != sizeof(__u64))
1443 return false;
1444 info->reg_type = PTR_TO_SOCKET;
1445 break;
1446 case offsetof(struct bpf_sockopt, optval):
1447 if (size != sizeof(__u64))
1448 return false;
1449 info->reg_type = PTR_TO_PACKET;
1450 break;
1451 case offsetof(struct bpf_sockopt, optval_end):
1452 if (size != sizeof(__u64))
1453 return false;
1454 info->reg_type = PTR_TO_PACKET_END;
1455 break;
1456 case offsetof(struct bpf_sockopt, retval):
1457 if (size != size_default)
1458 return false;
1459 return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
1460 default:
1461 if (size != size_default)
1462 return false;
1463 break;
1464 }
1465 return true;
1466}
1467
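The write rules above are easiest to read from the program side: a setsockopt program may rewrite level, optname and optlen but not retval, while a getsockopt program may write retval and optlen only. A hedged illustration, not from the patch; the socket option constants are the common asm-generic values and are only for show:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/setsockopt")
int rewrite_optname(struct bpf_sockopt *ctx)
{
	/* Allowed here because expected_attach_type == BPF_CGROUP_SETSOCKOPT;
	 * a write to ctx->retval in this program would be rejected by
	 * cg_sockopt_is_valid_access().
	 */
	if (ctx->level == 1 /* SOL_SOCKET */ && ctx->optname == 15 /* SO_REUSEPORT */)
		ctx->optname = 2;	/* SO_REUSEADDR */

	return 1;			/* let the (possibly rewritten) setsockopt proceed */
}

char _license[] SEC("license") = "GPL";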
1468#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
1469 T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
1470 si->dst_reg, si->src_reg, \
1471 offsetof(struct bpf_sockopt_kern, F))
1472
1473static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
1474 const struct bpf_insn *si,
1475 struct bpf_insn *insn_buf,
1476 struct bpf_prog *prog,
1477 u32 *target_size)
1478{
1479 struct bpf_insn *insn = insn_buf;
1480
1481 switch (si->off) {
1482 case offsetof(struct bpf_sockopt, sk):
1483 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
1484 break;
1485 case offsetof(struct bpf_sockopt, level):
1486 if (type == BPF_WRITE)
1487 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
1488 else
1489 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
1490 break;
1491 case offsetof(struct bpf_sockopt, optname):
1492 if (type == BPF_WRITE)
1493 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
1494 else
1495 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
1496 break;
1497 case offsetof(struct bpf_sockopt, optlen):
1498 if (type == BPF_WRITE)
1499 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
1500 else
1501 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
1502 break;
1503 case offsetof(struct bpf_sockopt, retval):
1504 if (type == BPF_WRITE)
1505 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
1506 else
1507 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
1508 break;
1509 case offsetof(struct bpf_sockopt, optval):
1510 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
1511 break;
1512 case offsetof(struct bpf_sockopt, optval_end):
1513 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
1514 break;
1515 }
1516
1517 return insn - insn_buf;
1518}
1519
1520static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
1521 bool direct_write,
1522 const struct bpf_prog *prog)
1523{
1524 /* Nothing to do for sockopt argument. The data is kzalloc'ated.
1525 */
1526 return 0;
1527}
1528
1529const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
1530 .get_func_proto = cg_sockopt_func_proto,
1531 .is_valid_access = cg_sockopt_is_valid_access,
1532 .convert_ctx_access = cg_sockopt_convert_ctx_access,
1533 .gen_prologue = cg_sockopt_get_prologue,
1534};
1535
1536const struct bpf_prog_ops cg_sockopt_prog_ops = {
1537};
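To round out the picture, attaching one of these programs is a plain BPF_PROG_ATTACH against a cgroup fd. A sketch assuming libbpf; prog_fd and cgroup_fd are obtained elsewhere, for example via bpf_object__load() and open() on a cgroup v2 directory:

#include <bpf/bpf.h>

int attach_sockopt_progs(int getsockopt_prog_fd, int setsockopt_prog_fd,
			 int cgroup_fd)
{
	int err;

	err = bpf_prog_attach(getsockopt_prog_fd, cgroup_fd,
			      BPF_CGROUP_GETSOCKOPT, BPF_F_ALLOW_MULTI);
	if (err)
		return err;

	return bpf_prog_attach(setsockopt_prog_fd, cgroup_fd,
			       BPF_CGROUP_SETSOCKOPT, BPF_F_ALLOW_MULTI);
}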
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 242a643af82f..16079550db6d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Linux Socket Filter - Kernel level socket filtering 3 * Linux Socket Filter - Kernel level socket filtering
3 * 4 *
@@ -12,11 +13,6 @@
12 * Alexei Starovoitov <ast@plumgrid.com> 13 * Alexei Starovoitov <ast@plumgrid.com>
13 * Daniel Borkmann <dborkman@redhat.com> 14 * Daniel Borkmann <dborkman@redhat.com>
14 * 15 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 *
20 * Andi Kleen - Fix a few bad bugs and races. 16 * Andi Kleen - Fix a few bad bugs and races.
21 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 17 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22 */ 18 */
@@ -1368,10 +1364,10 @@ select_insn:
1368 insn++; 1364 insn++;
1369 CONT; 1365 CONT;
1370 ALU_ARSH_X: 1366 ALU_ARSH_X:
1371 DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); 1367 DST = (u64) (u32) (((s32) DST) >> SRC);
1372 CONT; 1368 CONT;
1373 ALU_ARSH_K: 1369 ALU_ARSH_K:
1374 DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); 1370 DST = (u64) (u32) (((s32) DST) >> IMM);
1375 CONT; 1371 CONT;
1376 ALU64_ARSH_X: 1372 ALU64_ARSH_X:
1377 (*(s64 *) &DST) >>= SRC; 1373 (*(s64 *) &DST) >>= SRC;
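On little-endian hosts both ALU_ARSH forms arithmetic-shift the low 32 bits of DST and zero-extend the result to 64 bits; the difference is that *(s32 *)&DST picks up whichever four bytes come first in memory (the high half on big-endian), while (s32)DST always truncates to the low 32 bits, which is presumably what this hunk fixes. A stand-alone check of the two expressions, not kernel code:

#include <stdint.h>
#include <stdio.h>

static uint64_t arsh32_old(uint64_t dst, uint32_t shift)
{
	return (uint64_t)(uint32_t)((*(int32_t *)&dst) >> shift);
}

static uint64_t arsh32_new(uint64_t dst, uint32_t shift)
{
	return (uint64_t)(uint32_t)(((int32_t)dst) >> shift);
}

int main(void)
{
	uint64_t dst = 0xffffffff80000000ULL;	/* negative value in the low word */

	/* Prints "f8000000 f8000000" on little-endian; on big-endian the
	 * old form would have read the high word instead.
	 */
	printf("%llx %llx\n",
	       (unsigned long long)arsh32_old(dst, 4),
	       (unsigned long long)arsh32_new(dst, 4));
	return 0;
}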
@@ -1795,38 +1791,42 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
1795 return &empty_prog_array.hdr; 1791 return &empty_prog_array.hdr;
1796} 1792}
1797 1793
1798void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) 1794void bpf_prog_array_free(struct bpf_prog_array *progs)
1799{ 1795{
1800 if (!progs || 1796 if (!progs || progs == &empty_prog_array.hdr)
1801 progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
1802 return; 1797 return;
1803 kfree_rcu(progs, rcu); 1798 kfree_rcu(progs, rcu);
1804} 1799}
1805 1800
1806int bpf_prog_array_length(struct bpf_prog_array __rcu *array) 1801int bpf_prog_array_length(struct bpf_prog_array *array)
1807{ 1802{
1808 struct bpf_prog_array_item *item; 1803 struct bpf_prog_array_item *item;
1809 u32 cnt = 0; 1804 u32 cnt = 0;
1810 1805
1811 rcu_read_lock(); 1806 for (item = array->items; item->prog; item++)
1812 item = rcu_dereference(array)->items;
1813 for (; item->prog; item++)
1814 if (item->prog != &dummy_bpf_prog.prog) 1807 if (item->prog != &dummy_bpf_prog.prog)
1815 cnt++; 1808 cnt++;
1816 rcu_read_unlock();
1817 return cnt; 1809 return cnt;
1818} 1810}
1819 1811
1812bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
1813{
1814 struct bpf_prog_array_item *item;
1815
1816 for (item = array->items; item->prog; item++)
1817 if (item->prog != &dummy_bpf_prog.prog)
1818 return false;
1819 return true;
1820}
1820 1821
1821static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, 1822static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
1822 u32 *prog_ids, 1823 u32 *prog_ids,
1823 u32 request_cnt) 1824 u32 request_cnt)
1824{ 1825{
1825 struct bpf_prog_array_item *item; 1826 struct bpf_prog_array_item *item;
1826 int i = 0; 1827 int i = 0;
1827 1828
1828 item = rcu_dereference_check(array, 1)->items; 1829 for (item = array->items; item->prog; item++) {
1829 for (; item->prog; item++) {
1830 if (item->prog == &dummy_bpf_prog.prog) 1830 if (item->prog == &dummy_bpf_prog.prog)
1831 continue; 1831 continue;
1832 prog_ids[i] = item->prog->aux->id; 1832 prog_ids[i] = item->prog->aux->id;
@@ -1839,7 +1839,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
1839 return !!(item->prog); 1839 return !!(item->prog);
1840} 1840}
1841 1841
1842int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, 1842int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
1843 __u32 __user *prog_ids, u32 cnt) 1843 __u32 __user *prog_ids, u32 cnt)
1844{ 1844{
1845 unsigned long err = 0; 1845 unsigned long err = 0;
@@ -1850,18 +1850,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
1850 * cnt = bpf_prog_array_length(); 1850 * cnt = bpf_prog_array_length();
1851 * if (cnt > 0) 1851 * if (cnt > 0)
1852 * bpf_prog_array_copy_to_user(..., cnt); 1852 * bpf_prog_array_copy_to_user(..., cnt);
1853 * so below kcalloc doesn't need extra cnt > 0 check, but 1853 * so below kcalloc doesn't need extra cnt > 0 check.
1854 * bpf_prog_array_length() releases rcu lock and
1855 * prog array could have been swapped with empty or larger array,
1856 * so always copy 'cnt' prog_ids to the user.
1857 * In a rare race the user will see zero prog_ids
1858 */ 1854 */
1859 ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); 1855 ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
1860 if (!ids) 1856 if (!ids)
1861 return -ENOMEM; 1857 return -ENOMEM;
1862 rcu_read_lock();
1863 nospc = bpf_prog_array_copy_core(array, ids, cnt); 1858 nospc = bpf_prog_array_copy_core(array, ids, cnt);
1864 rcu_read_unlock();
1865 err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); 1859 err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
1866 kfree(ids); 1860 kfree(ids);
1867 if (err) 1861 if (err)
@@ -1871,19 +1865,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
1871 return 0; 1865 return 0;
1872} 1866}
1873 1867
1874void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, 1868void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
1875 struct bpf_prog *old_prog) 1869 struct bpf_prog *old_prog)
1876{ 1870{
1877 struct bpf_prog_array_item *item = array->items; 1871 struct bpf_prog_array_item *item;
1878 1872
1879 for (; item->prog; item++) 1873 for (item = array->items; item->prog; item++)
1880 if (item->prog == old_prog) { 1874 if (item->prog == old_prog) {
1881 WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); 1875 WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
1882 break; 1876 break;
1883 } 1877 }
1884} 1878}
1885 1879
1886int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, 1880int bpf_prog_array_copy(struct bpf_prog_array *old_array,
1887 struct bpf_prog *exclude_prog, 1881 struct bpf_prog *exclude_prog,
1888 struct bpf_prog *include_prog, 1882 struct bpf_prog *include_prog,
1889 struct bpf_prog_array **new_array) 1883 struct bpf_prog_array **new_array)
@@ -1947,7 +1941,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
1947 return 0; 1941 return 0;
1948} 1942}
1949 1943
1950int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, 1944int bpf_prog_array_copy_info(struct bpf_prog_array *array,
1951 u32 *prog_ids, u32 request_cnt, 1945 u32 *prog_ids, u32 request_cnt,
1952 u32 *prog_cnt) 1946 u32 *prog_cnt)
1953{ 1947{
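With the __rcu annotation and the internal rcu_read_lock()/rcu_dereference() calls dropped from these helpers, the RCU pointer is expected to be resolved by the caller, either under rcu_read_lock() or while holding the mutex that protects updates, before the plain bpf_prog_array pointer is passed down. A sketch of that calling convention; the names are illustrative, not from the patch:

#include <linux/bpf.h>
#include <linux/rcupdate.h>

/* 'progs' is the __rcu-annotated pointer embedded in some containing
 * object, for example cgroup_bpf.effective[type].
 */
static int count_attached_progs(struct bpf_prog_array __rcu **progs)
{
	struct bpf_prog_array *array;
	int cnt;

	rcu_read_lock();
	array = rcu_dereference(*progs);
	cnt = array ? bpf_prog_array_length(array) : 0;
	rcu_read_unlock();

	return cnt;
}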
@@ -2090,6 +2084,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
2090 return false; 2084 return false;
2091} 2085}
2092 2086
2087/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
2088 * analysis code and wants explicit zero extension inserted by verifier.
2089 * Otherwise, return FALSE.
2090 */
2091bool __weak bpf_jit_needs_zext(void)
2092{
2093 return false;
2094}
2095
2093/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call 2096/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
2094 * skb_copy_bits(), so provide a weak definition of it for NET-less config. 2097 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
2095 */ 2098 */
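The new __weak bpf_jit_needs_zext() defaults to false; an arch JIT that wants the verifier to insert explicit zero-extension instructions for 32-bit sub-register writes, instead of emitting them itself, overrides it. A minimal sketch of such an override, assumed to live in the arch's bpf_jit_comp.c with the declaration coming from linux/filter.h:

#include <linux/filter.h>

/* Opt this JIT into the verifier's explicit zero-extension insertion
 * for 32-bit sub-register defines.
 */
bool bpf_jit_needs_zext(void)
{
	return true;
}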
@@ -2101,10 +2104,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
2101 2104
2102DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); 2105DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
2103EXPORT_SYMBOL(bpf_stats_enabled_key); 2106EXPORT_SYMBOL(bpf_stats_enabled_key);
2104int sysctl_bpf_stats_enabled __read_mostly;
2105 2107
2106/* All definitions of tracepoints related to BPF. */ 2108/* All definitions of tracepoints related to BPF. */
2107#define CREATE_TRACE_POINTS 2109#define CREATE_TRACE_POINTS
2108#include <linux/bpf_trace.h> 2110#include <linux/bpf_trace.h>
2109 2111
2110EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); 2112EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
2113EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index cf727d77c6c6..ef49e17ae47c 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -1,7 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* bpf/cpumap.c 2/* bpf/cpumap.c
2 * 3 *
3 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. 4 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
4 * Released under terms in GPL version 2. See COPYING.
5 */ 5 */
6 6
7/* The 'cpumap' is primarily used as a backend map for XDP BPF helper 7/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
@@ -32,14 +32,19 @@
32 32
33/* General idea: XDP packets getting XDP redirected to another CPU, 33/* General idea: XDP packets getting XDP redirected to another CPU,
34 * will maximum be stored/queued for one driver ->poll() call. It is 34 * will maximum be stored/queued for one driver ->poll() call. It is
35 * guaranteed that setting flush bit and flush operation happen on 35 * guaranteed that queueing the frame and the flush operation happen on
36 * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() 36 * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
37 * which queue in bpf_cpu_map_entry contains packets. 37 * which queue in bpf_cpu_map_entry contains packets.
38 */ 38 */
39 39
40#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ 40#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
41struct bpf_cpu_map_entry;
42struct bpf_cpu_map;
43
41struct xdp_bulk_queue { 44struct xdp_bulk_queue {
42 void *q[CPU_MAP_BULK_SIZE]; 45 void *q[CPU_MAP_BULK_SIZE];
46 struct list_head flush_node;
47 struct bpf_cpu_map_entry *obj;
43 unsigned int count; 48 unsigned int count;
44}; 49};
45 50
@@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
52 /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ 57 /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
53 struct xdp_bulk_queue __percpu *bulkq; 58 struct xdp_bulk_queue __percpu *bulkq;
54 59
60 struct bpf_cpu_map *cmap;
61
55 /* Queue with potential multi-producers, and single-consumer kthread */ 62 /* Queue with potential multi-producers, and single-consumer kthread */
56 struct ptr_ring *queue; 63 struct ptr_ring *queue;
57 struct task_struct *kthread; 64 struct task_struct *kthread;
@@ -65,23 +72,17 @@ struct bpf_cpu_map {
65 struct bpf_map map; 72 struct bpf_map map;
66 /* Below members specific for map type */ 73 /* Below members specific for map type */
67 struct bpf_cpu_map_entry **cpu_map; 74 struct bpf_cpu_map_entry **cpu_map;
68 unsigned long __percpu *flush_needed; 75 struct list_head __percpu *flush_list;
69}; 76};
70 77
71static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, 78static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
72 struct xdp_bulk_queue *bq, bool in_napi_ctx);
73
74static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
75{
76 return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
77}
78 79
79static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) 80static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
80{ 81{
81 struct bpf_cpu_map *cmap; 82 struct bpf_cpu_map *cmap;
82 int err = -ENOMEM; 83 int err = -ENOMEM;
84 int ret, cpu;
83 u64 cost; 85 u64 cost;
84 int ret;
85 86
86 if (!capable(CAP_SYS_ADMIN)) 87 if (!capable(CAP_SYS_ADMIN))
87 return ERR_PTR(-EPERM); 88 return ERR_PTR(-EPERM);
@@ -105,23 +106,21 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
105 106
106 /* make sure page count doesn't overflow */ 107 /* make sure page count doesn't overflow */
107 cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); 108 cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
108 cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); 109 cost += sizeof(struct list_head) * num_possible_cpus();
109 if (cost >= U32_MAX - PAGE_SIZE)
110 goto free_cmap;
111 cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
112 110
113 /* Notice returns -EPERM on if map size is larger than memlock limit */ 111 /* Notice returns -EPERM on if map size is larger than memlock limit */
114 ret = bpf_map_precharge_memlock(cmap->map.pages); 112 ret = bpf_map_charge_init(&cmap->map.memory, cost);
115 if (ret) { 113 if (ret) {
116 err = ret; 114 err = ret;
117 goto free_cmap; 115 goto free_cmap;
118 } 116 }
119 117
120 /* A per cpu bitfield with a bit per possible CPU in map */ 118 cmap->flush_list = alloc_percpu(struct list_head);
121 cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), 119 if (!cmap->flush_list)
122 __alignof__(unsigned long)); 120 goto free_charge;
123 if (!cmap->flush_needed) 121
124 goto free_cmap; 122 for_each_possible_cpu(cpu)
123 INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
125 124
126 /* Alloc array for possible remote "destination" CPUs */ 125 /* Alloc array for possible remote "destination" CPUs */
127 cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * 126 cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
@@ -132,7 +131,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
132 131
133 return &cmap->map; 132 return &cmap->map;
134free_percpu: 133free_percpu:
135 free_percpu(cmap->flush_needed); 134 free_percpu(cmap->flush_list);
135free_charge:
136 bpf_map_charge_finish(&cmap->map.memory);
136free_cmap: 137free_cmap:
137 kfree(cmap); 138 kfree(cmap);
138 return ERR_PTR(err); 139 return ERR_PTR(err);
@@ -209,6 +210,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
209 * - RX ring dev queue index (skb_record_rx_queue) 210 * - RX ring dev queue index (skb_record_rx_queue)
210 */ 211 */
211 212
213 /* Until page_pool get SKB return path, release DMA here */
214 xdp_release_frame(xdpf);
215
212 /* Allow SKB to reuse area used by xdp_frame */ 216 /* Allow SKB to reuse area used by xdp_frame */
213 xdp_scrub_frame(xdpf); 217 xdp_scrub_frame(xdpf);
214 218
@@ -332,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
332{ 336{
333 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 337 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
334 struct bpf_cpu_map_entry *rcpu; 338 struct bpf_cpu_map_entry *rcpu;
335 int numa, err; 339 struct xdp_bulk_queue *bq;
340 int numa, err, i;
336 341
337 /* Have map->numa_node, but choose node of redirect target CPU */ 342 /* Have map->numa_node, but choose node of redirect target CPU */
338 numa = cpu_to_node(cpu); 343 numa = cpu_to_node(cpu);
@@ -347,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
347 if (!rcpu->bulkq) 352 if (!rcpu->bulkq)
348 goto free_rcu; 353 goto free_rcu;
349 354
355 for_each_possible_cpu(i) {
356 bq = per_cpu_ptr(rcpu->bulkq, i);
357 bq->obj = rcpu;
358 }
359
350 /* Alloc queue */ 360 /* Alloc queue */
351 rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); 361 rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
352 if (!rcpu->queue) 362 if (!rcpu->queue)
@@ -403,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
403 struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); 413 struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
404 414
405 /* No concurrent bq_enqueue can run at this point */ 415 /* No concurrent bq_enqueue can run at this point */
406 bq_flush_to_queue(rcpu, bq, false); 416 bq_flush_to_queue(bq, false);
407 } 417 }
408 free_percpu(rcpu->bulkq); 418 free_percpu(rcpu->bulkq);
409 /* Cannot kthread_stop() here, last put free rcpu resources */ 419 /* Cannot kthread_stop() here, last put free rcpu resources */
@@ -486,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
486 rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); 496 rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
487 if (!rcpu) 497 if (!rcpu)
488 return -ENOMEM; 498 return -ENOMEM;
499 rcpu->cmap = cmap;
489 } 500 }
490 rcu_read_lock(); 501 rcu_read_lock();
491 __cpu_map_entry_replace(cmap, key_cpu, rcpu); 502 __cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -512,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map)
512 synchronize_rcu(); 523 synchronize_rcu();
513 524
514 /* To ensure all pending flush operations have completed wait for flush 525 /* To ensure all pending flush operations have completed wait for flush
515 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. 526 * list be empty on _all_ cpus. Because the above synchronize_rcu()
516 * Because the above synchronize_rcu() ensures the map is disconnected 527 * ensures the map is disconnected from the program we can assume no new
517 * from the program we can assume no new bits will be set. 528 * items will be added to the list.
518 */ 529 */
519 for_each_online_cpu(cpu) { 530 for_each_online_cpu(cpu) {
520 unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); 531 struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
521 532
522 while (!bitmap_empty(bitmap, cmap->map.max_entries)) 533 while (!list_empty(flush_list))
523 cond_resched(); 534 cond_resched();
524 } 535 }
525 536
@@ -536,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map)
536 /* bq flush and cleanup happens after RCU graze-period */ 547 /* bq flush and cleanup happens after RCU graze-period */
537 __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ 548 __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
538 } 549 }
539 free_percpu(cmap->flush_needed); 550 free_percpu(cmap->flush_list);
540 bpf_map_area_free(cmap->cpu_map); 551 bpf_map_area_free(cmap->cpu_map);
541 kfree(cmap); 552 kfree(cmap);
542} 553}
@@ -588,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = {
588 .map_check_btf = map_check_no_btf, 599 .map_check_btf = map_check_no_btf,
589}; 600};
590 601
591static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, 602static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
592 struct xdp_bulk_queue *bq, bool in_napi_ctx)
593{ 603{
604 struct bpf_cpu_map_entry *rcpu = bq->obj;
594 unsigned int processed = 0, drops = 0; 605 unsigned int processed = 0, drops = 0;
595 const int to_cpu = rcpu->cpu; 606 const int to_cpu = rcpu->cpu;
596 struct ptr_ring *q; 607 struct ptr_ring *q;
@@ -619,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
619 bq->count = 0; 630 bq->count = 0;
620 spin_unlock(&q->producer_lock); 631 spin_unlock(&q->producer_lock);
621 632
633 __list_del_clearprev(&bq->flush_node);
634
622 /* Feedback loop via tracepoints */ 635 /* Feedback loop via tracepoints */
623 trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); 636 trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
624 return 0; 637 return 0;
@@ -629,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
629 */ 642 */
630static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) 643static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
631{ 644{
645 struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
632 struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); 646 struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
633 647
634 if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) 648 if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
635 bq_flush_to_queue(rcpu, bq, true); 649 bq_flush_to_queue(bq, true);
636 650
637 /* Notice, xdp_buff/page MUST be queued here, long enough for 651 /* Notice, xdp_buff/page MUST be queued here, long enough for
638 * driver to code invoking us to finished, due to driver 652 * driver to code invoking us to finished, due to driver
@@ -644,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
644 * operation, when completing napi->poll call. 658 * operation, when completing napi->poll call.
645 */ 659 */
646 bq->q[bq->count++] = xdpf; 660 bq->q[bq->count++] = xdpf;
661
662 if (!bq->flush_node.prev)
663 list_add(&bq->flush_node, flush_list);
664
647 return 0; 665 return 0;
648} 666}
649 667
@@ -663,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
663 return 0; 681 return 0;
664} 682}
665 683
666void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
667{
668 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
669 unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
670
671 __set_bit(bit, bitmap);
672}
673
674void __cpu_map_flush(struct bpf_map *map) 684void __cpu_map_flush(struct bpf_map *map)
675{ 685{
676 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); 686 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
677 unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); 687 struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
678 u32 bit; 688 struct xdp_bulk_queue *bq, *tmp;
679
680 /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
681 * and __cpu_map_flush() happen on same CPU. Thus, the percpu
682 * bitmap indicate which percpu bulkq have packets.
683 */
684 for_each_set_bit(bit, bitmap, map->max_entries) {
685 struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
686 struct xdp_bulk_queue *bq;
687
688 /* This is possible if entry is removed by user space
689 * between xdp redirect and flush op.
690 */
691 if (unlikely(!rcpu))
692 continue;
693
694 __clear_bit(bit, bitmap);
695 689
696 /* Flush all frames in bulkq to real queue */ 690 list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
697 bq = this_cpu_ptr(rcpu->bulkq); 691 bq_flush_to_queue(bq, true);
698 bq_flush_to_queue(rcpu, bq, true);
699 692
700 /* If already running, costs spin_lock_irqsave + smb_mb */ 693 /* If already running, costs spin_lock_irqsave + smb_mb */
701 wake_up_process(rcpu->kthread); 694 wake_up_process(bq->obj->kthread);
702 } 695 }
703} 696}
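The net effect of the cpumap changes is that, instead of scanning a per-CPU bitmap sized by max_entries, the flush path walks a per-CPU list containing only the bulk queues that actually hold frames: bq_enqueue() links a queue at most once (the !bq->flush_node.prev test relies on the zeroed percpu allocation), and bq_flush_to_queue() unlinks it with __list_del_clearprev() so it can be re-armed. The devmap hunks that follow apply the same pattern. Boiled down, it looks roughly like this sketch built on kernel list primitives, not the patch itself:

#include <linux/list.h>
#include <linux/percpu.h>

struct bulk_queue {
	struct list_head flush_node;	/* linked while frames are pending */
	unsigned int count;
	void *q[8];
};

/* Each per-CPU head must be INIT_LIST_HEAD()'d at init time; the bulk
 * queues themselves start zeroed, so flush_node.prev == NULL means
 * "not on any flush list yet".
 */
static DEFINE_PER_CPU(struct list_head, flush_list);

static void enqueue(struct bulk_queue *bq, void *frame)
{
	bq->q[bq->count++] = frame;

	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, this_cpu_ptr(&flush_list));
}

static void flush(void)
{
	struct bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, this_cpu_ptr(&flush_list), flush_node) {
		/* hand bq->q[0..count) to its destination here, then reset */
		bq->count = 0;
		__list_del_clearprev(&bq->flush_node);	/* prev back to NULL */
	}
}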
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..d83cf8ccc872 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -1,13 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io 2/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */ 3 */
12 4
13/* Devmaps primary use is as a backend map for XDP BPF helper call 5/* Devmaps primary use is as a backend map for XDP BPF helper call
@@ -25,9 +17,8 @@
25 * datapath always has a valid copy. However, the datapath does a "flush" 17 * datapath always has a valid copy. However, the datapath does a "flush"
26 * operation that pushes any pending packets in the driver outside the RCU 18 * operation that pushes any pending packets in the driver outside the RCU
27 * critical section. Each bpf_dtab_netdev tracks these pending operations using 19 * critical section. Each bpf_dtab_netdev tracks these pending operations using
28 * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed 20 * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
29 * until all bits are cleared indicating outstanding flush operations have 21 * this list is empty, indicating outstanding flush operations have completed.
30 * completed.
31 * 22 *
32 * BPF syscalls may race with BPF program calls on any of the update, delete 23 * BPF syscalls may race with BPF program calls on any of the update, delete
33 * or lookup operations. As noted above the xchg() operation also keep the 24 * or lookup operations. As noted above the xchg() operation also keep the
@@ -56,9 +47,13 @@
56 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) 47 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
57 48
58#define DEV_MAP_BULK_SIZE 16 49#define DEV_MAP_BULK_SIZE 16
50struct bpf_dtab_netdev;
51
59struct xdp_bulk_queue { 52struct xdp_bulk_queue {
60 struct xdp_frame *q[DEV_MAP_BULK_SIZE]; 53 struct xdp_frame *q[DEV_MAP_BULK_SIZE];
54 struct list_head flush_node;
61 struct net_device *dev_rx; 55 struct net_device *dev_rx;
56 struct bpf_dtab_netdev *obj;
62 unsigned int count; 57 unsigned int count;
63}; 58};
64 59
@@ -73,22 +68,17 @@ struct bpf_dtab_netdev {
73struct bpf_dtab { 68struct bpf_dtab {
74 struct bpf_map map; 69 struct bpf_map map;
75 struct bpf_dtab_netdev **netdev_map; 70 struct bpf_dtab_netdev **netdev_map;
76 unsigned long __percpu *flush_needed; 71 struct list_head __percpu *flush_list;
77 struct list_head list; 72 struct list_head list;
78}; 73};
79 74
80static DEFINE_SPINLOCK(dev_map_lock); 75static DEFINE_SPINLOCK(dev_map_lock);
81static LIST_HEAD(dev_map_list); 76static LIST_HEAD(dev_map_list);
82 77
83static u64 dev_map_bitmap_size(const union bpf_attr *attr)
84{
85 return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
86}
87
88static struct bpf_map *dev_map_alloc(union bpf_attr *attr) 78static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
89{ 79{
90 struct bpf_dtab *dtab; 80 struct bpf_dtab *dtab;
91 int err = -EINVAL; 81 int err, cpu;
92 u64 cost; 82 u64 cost;
93 83
94 if (!capable(CAP_NET_ADMIN)) 84 if (!capable(CAP_NET_ADMIN))
@@ -99,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
99 attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) 89 attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
100 return ERR_PTR(-EINVAL); 90 return ERR_PTR(-EINVAL);
101 91
92 /* Lookup returns a pointer straight to dev->ifindex, so make sure the
93 * verifier prevents writes from the BPF side
94 */
95 attr->map_flags |= BPF_F_RDONLY_PROG;
96
102 dtab = kzalloc(sizeof(*dtab), GFP_USER); 97 dtab = kzalloc(sizeof(*dtab), GFP_USER);
103 if (!dtab) 98 if (!dtab)
104 return ERR_PTR(-ENOMEM); 99 return ERR_PTR(-ENOMEM);
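Forcing BPF_F_RDONLY_PROG here pairs with the lookup support added elsewhere in this series: an XDP program can call bpf_map_lookup_elem() on a devmap and, per the comment above, gets a pointer straight at dev->ifindex, so the verifier must treat it as read-only. A rough sketch of how that reads from the program side, assuming the legacy bpf_map_def map definition style used by the selftests of this era:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") tx_port = {
	.type		= BPF_MAP_TYPE_DEVMAP,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u32),
	.max_entries	= 4,
};

SEC("xdp")
int xdp_peek_egress(struct xdp_md *ctx)
{
	__u32 key = 0;
	__u32 *ifindex;

	/* Read-only view of the target netdev's ifindex; a store through
	 * this pointer would be rejected because the map is flagged
	 * BPF_F_RDONLY_PROG above.
	 */
	ifindex = bpf_map_lookup_elem(&tx_port, &key);
	if (!ifindex)
		return XDP_PASS;

	return bpf_redirect_map(&tx_port, key, 0);
}

char _license[] SEC("license") = "GPL";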
@@ -107,39 +102,39 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
107 102
108 /* make sure page count doesn't overflow */ 103 /* make sure page count doesn't overflow */
109 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); 104 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
110 cost += dev_map_bitmap_size(attr) * num_possible_cpus(); 105 cost += sizeof(struct list_head) * num_possible_cpus();
111 if (cost >= U32_MAX - PAGE_SIZE)
112 goto free_dtab;
113 106
114 dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 107 /* if map size is larger than memlock limit, reject it */
115 108 err = bpf_map_charge_init(&dtab->map.memory, cost);
116 /* if map size is larger than memlock limit, reject it early */
117 err = bpf_map_precharge_memlock(dtab->map.pages);
118 if (err) 109 if (err)
119 goto free_dtab; 110 goto free_dtab;
120 111
121 err = -ENOMEM; 112 err = -ENOMEM;
122 113
123 /* A per cpu bitfield with a bit per possible net device */ 114 dtab->flush_list = alloc_percpu(struct list_head);
124 dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), 115 if (!dtab->flush_list)
125 __alignof__(unsigned long), 116 goto free_charge;
126 GFP_KERNEL | __GFP_NOWARN); 117
127 if (!dtab->flush_needed) 118 for_each_possible_cpu(cpu)
128 goto free_dtab; 119 INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
129 120
130 dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * 121 dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
131 sizeof(struct bpf_dtab_netdev *), 122 sizeof(struct bpf_dtab_netdev *),
132 dtab->map.numa_node); 123 dtab->map.numa_node);
133 if (!dtab->netdev_map) 124 if (!dtab->netdev_map)
134 goto free_dtab; 125 goto free_percpu;
135 126
136 spin_lock(&dev_map_lock); 127 spin_lock(&dev_map_lock);
137 list_add_tail_rcu(&dtab->list, &dev_map_list); 128 list_add_tail_rcu(&dtab->list, &dev_map_list);
138 spin_unlock(&dev_map_lock); 129 spin_unlock(&dev_map_lock);
139 130
140 return &dtab->map; 131 return &dtab->map;
132
133free_percpu:
134 free_percpu(dtab->flush_list);
135free_charge:
136 bpf_map_charge_finish(&dtab->map.memory);
141free_dtab: 137free_dtab:
142 free_percpu(dtab->flush_needed);
143 kfree(dtab); 138 kfree(dtab);
144 return ERR_PTR(err); 139 return ERR_PTR(err);
145} 140}
@@ -164,15 +159,18 @@ static void dev_map_free(struct bpf_map *map)
164 bpf_clear_redirect_map(map); 159 bpf_clear_redirect_map(map);
165 synchronize_rcu(); 160 synchronize_rcu();
166 161
162 /* Make sure prior __dev_map_entry_free() have completed. */
163 rcu_barrier();
164
167 /* To ensure all pending flush operations have completed wait for flush 165 /* To ensure all pending flush operations have completed wait for flush
168 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. 166 * list to empty on _all_ cpus.
169 * Because the above synchronize_rcu() ensures the map is disconnected 167 * Because the above synchronize_rcu() ensures the map is disconnected
170 * from the program we can assume no new bits will be set. 168 * from the program we can assume no new items will be added.
171 */ 169 */
172 for_each_online_cpu(cpu) { 170 for_each_online_cpu(cpu) {
173 unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); 171 struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
174 172
175 while (!bitmap_empty(bitmap, dtab->map.max_entries)) 173 while (!list_empty(flush_list))
176 cond_resched(); 174 cond_resched();
177 } 175 }
178 176
@@ -183,11 +181,12 @@ static void dev_map_free(struct bpf_map *map)
183 if (!dev) 181 if (!dev)
184 continue; 182 continue;
185 183
184 free_percpu(dev->bulkq);
186 dev_put(dev->dev); 185 dev_put(dev->dev);
187 kfree(dev); 186 kfree(dev);
188 } 187 }
189 188
190 free_percpu(dtab->flush_needed); 189 free_percpu(dtab->flush_list);
191 bpf_map_area_free(dtab->netdev_map); 190 bpf_map_area_free(dtab->netdev_map);
192 kfree(dtab); 191 kfree(dtab);
193} 192}
@@ -209,18 +208,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
209 return 0; 208 return 0;
210} 209}
211 210
212void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) 211static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
213{
214 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
215 unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
216
217 __set_bit(bit, bitmap);
218}
219
220static int bq_xmit_all(struct bpf_dtab_netdev *obj,
221 struct xdp_bulk_queue *bq, u32 flags,
222 bool in_napi_ctx) 212 bool in_napi_ctx)
223{ 213{
214 struct bpf_dtab_netdev *obj = bq->obj;
224 struct net_device *dev = obj->dev; 215 struct net_device *dev = obj->dev;
225 int sent = 0, drops = 0, err = 0; 216 int sent = 0, drops = 0, err = 0;
226 int i; 217 int i;
@@ -247,6 +238,7 @@ out:
247 trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, 238 trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
248 sent, drops, bq->dev_rx, dev, err); 239 sent, drops, bq->dev_rx, dev, err);
249 bq->dev_rx = NULL; 240 bq->dev_rx = NULL;
241 __list_del_clearprev(&bq->flush_node);
250 return 0; 242 return 0;
251error: 243error:
252 /* If ndo_xdp_xmit fails with an errno, no frames have been 244 /* If ndo_xdp_xmit fails with an errno, no frames have been
@@ -269,30 +261,19 @@ error:
269 * from the driver before returning from its napi->poll() routine. The poll() 261 * from the driver before returning from its napi->poll() routine. The poll()
270 * routine is called either from busy_poll context or net_rx_action signaled 262 * routine is called either from busy_poll context or net_rx_action signaled
271 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the 263 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
272 * net device can be torn down. On devmap tear down we ensure the ctx bitmap 264 * net device can be torn down. On devmap tear down we ensure the flush list
273 * is zeroed before completing to ensure all flush operations have completed. 265 * is empty before completing to ensure all flush operations have completed.
274 */ 266 */
275void __dev_map_flush(struct bpf_map *map) 267void __dev_map_flush(struct bpf_map *map)
276{ 268{
277 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 269 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
278 unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); 270 struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
279 u32 bit; 271 struct xdp_bulk_queue *bq, *tmp;
280
281 for_each_set_bit(bit, bitmap, map->max_entries) {
282 struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
283 struct xdp_bulk_queue *bq;
284
285 /* This is possible if the dev entry is removed by user space
286 * between xdp redirect and flush op.
287 */
288 if (unlikely(!dev))
289 continue;
290
291 __clear_bit(bit, bitmap);
292 272
293 bq = this_cpu_ptr(dev->bulkq); 273 rcu_read_lock();
294 bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); 274 list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
295 } 275 bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
276 rcu_read_unlock();
296} 277}
297 278
298/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or 279/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -318,10 +299,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
318 struct net_device *dev_rx) 299 struct net_device *dev_rx)
319 300
320{ 301{
302 struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
321 struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); 303 struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
322 304
323 if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) 305 if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
324 bq_xmit_all(obj, bq, 0, true); 306 bq_xmit_all(bq, 0, true);
325 307
326 /* Ingress dev_rx will be the same for all xdp_frame's in 308 /* Ingress dev_rx will be the same for all xdp_frame's in
327 * bulk_queue, because bq stored per-CPU and must be flushed 309 * bulk_queue, because bq stored per-CPU and must be flushed
@@ -331,6 +313,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
331 bq->dev_rx = dev_rx; 313 bq->dev_rx = dev_rx;
332 314
333 bq->q[bq->count++] = xdpf; 315 bq->q[bq->count++] = xdpf;
316
317 if (!bq->flush_node.prev)
318 list_add(&bq->flush_node, flush_list);
319
334 return 0; 320 return 0;
335} 321}
336 322
@@ -381,17 +367,14 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
381{ 367{
382 if (dev->dev->netdev_ops->ndo_xdp_xmit) { 368 if (dev->dev->netdev_ops->ndo_xdp_xmit) {
383 struct xdp_bulk_queue *bq; 369 struct xdp_bulk_queue *bq;
384 unsigned long *bitmap;
385
386 int cpu; 370 int cpu;
387 371
372 rcu_read_lock();
388 for_each_online_cpu(cpu) { 373 for_each_online_cpu(cpu) {
389 bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
390 __clear_bit(dev->bit, bitmap);
391
392 bq = per_cpu_ptr(dev->bulkq, cpu); 374 bq = per_cpu_ptr(dev->bulkq, cpu);
393 bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); 375 bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
394 } 376 }
377 rcu_read_unlock();
395 } 378 }
396} 379}
397 380
@@ -436,8 +419,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
436 struct net *net = current->nsproxy->net_ns; 419 struct net *net = current->nsproxy->net_ns;
437 gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 420 gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
438 struct bpf_dtab_netdev *dev, *old_dev; 421 struct bpf_dtab_netdev *dev, *old_dev;
439 u32 i = *(u32 *)key;
440 u32 ifindex = *(u32 *)value; 422 u32 ifindex = *(u32 *)value;
423 struct xdp_bulk_queue *bq;
424 u32 i = *(u32 *)key;
425 int cpu;
441 426
442 if (unlikely(map_flags > BPF_EXIST)) 427 if (unlikely(map_flags > BPF_EXIST))
443 return -EINVAL; 428 return -EINVAL;
@@ -460,6 +445,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
460 return -ENOMEM; 445 return -ENOMEM;
461 } 446 }
462 447
448 for_each_possible_cpu(cpu) {
449 bq = per_cpu_ptr(dev->bulkq, cpu);
450 bq->obj = dev;
451 }
452
463 dev->dev = dev_get_by_index(net, ifindex); 453 dev->dev = dev_get_by_index(net, ifindex);
464 if (!dev->dev) { 454 if (!dev->dev) {
465 free_percpu(dev->bulkq); 455 free_percpu(dev->bulkq);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index d9ce383c0f9c..b44d8c447afd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,14 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016 Facebook 3 * Copyright (c) 2016 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */ 4 */
13 5
14#include <linux/bpf.h> 6#include <linux/bpf.h>
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e1324a834a24..e546b18d27da 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,14 +1,6 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016 Facebook 3 * Copyright (c) 2016 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */ 4 */
13 5
14#ifndef __BPF_DISASM_H__ 6#ifndef __BPF_DISASM_H__
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 192d32e77db3..22066a62c8c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,14 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016 Facebook 3 * Copyright (c) 2016 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */ 4 */
13#include <linux/bpf.h> 5#include <linux/bpf.h>
14#include <linux/btf.h> 6#include <linux/btf.h>
@@ -360,14 +352,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
360 else 352 else
361 cost += (u64) htab->elem_size * num_possible_cpus(); 353 cost += (u64) htab->elem_size * num_possible_cpus();
362 354
363 if (cost >= U32_MAX - PAGE_SIZE) 355 /* if map size is larger than memlock limit, reject it */
364 /* make sure page count doesn't overflow */ 356 err = bpf_map_charge_init(&htab->map.memory, cost);
365 goto free_htab;
366
367 htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
368
369 /* if map size is larger than memlock limit, reject it early */
370 err = bpf_map_precharge_memlock(htab->map.pages);
371 if (err) 357 if (err)
372 goto free_htab; 358 goto free_htab;
373 359
@@ -376,7 +362,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
376 sizeof(struct bucket), 362 sizeof(struct bucket),
377 htab->map.numa_node); 363 htab->map.numa_node);
378 if (!htab->buckets) 364 if (!htab->buckets)
379 goto free_htab; 365 goto free_charge;
380 366
381 if (htab->map.map_flags & BPF_F_ZERO_SEED) 367 if (htab->map.map_flags & BPF_F_ZERO_SEED)
382 htab->hashrnd = 0; 368 htab->hashrnd = 0;
@@ -409,6 +395,8 @@ free_prealloc:
409 prealloc_destroy(htab); 395 prealloc_destroy(htab);
410free_buckets: 396free_buckets:
411 bpf_map_area_free(htab->buckets); 397 bpf_map_area_free(htab->buckets);
398free_charge:
399 bpf_map_charge_finish(&htab->map.memory);
412free_htab: 400free_htab:
413 kfree(htab); 401 kfree(htab);
414 return ERR_PTR(err); 402 return ERR_PTR(err);
@@ -527,18 +515,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
527 return insn - insn_buf; 515 return insn - insn_buf;
528} 516}
529 517
530static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) 518static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
519 void *key, const bool mark)
531{ 520{
532 struct htab_elem *l = __htab_map_lookup_elem(map, key); 521 struct htab_elem *l = __htab_map_lookup_elem(map, key);
533 522
534 if (l) { 523 if (l) {
535 bpf_lru_node_set_ref(&l->lru_node); 524 if (mark)
525 bpf_lru_node_set_ref(&l->lru_node);
536 return l->key + round_up(map->key_size, 8); 526 return l->key + round_up(map->key_size, 8);
537 } 527 }
538 528
539 return NULL; 529 return NULL;
540} 530}
541 531
532static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
533{
534 return __htab_lru_map_lookup_elem(map, key, true);
535}
536
537static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
538{
539 return __htab_lru_map_lookup_elem(map, key, false);
540}
541
542static u32 htab_lru_map_gen_lookup(struct bpf_map *map, 542static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
543 struct bpf_insn *insn_buf) 543 struct bpf_insn *insn_buf)
544{ 544{
@@ -1250,6 +1250,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
1250 .map_free = htab_map_free, 1250 .map_free = htab_map_free,
1251 .map_get_next_key = htab_map_get_next_key, 1251 .map_get_next_key = htab_map_get_next_key,
1252 .map_lookup_elem = htab_lru_map_lookup_elem, 1252 .map_lookup_elem = htab_lru_map_lookup_elem,
1253 .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
1253 .map_update_elem = htab_lru_map_update_elem, 1254 .map_update_elem = htab_lru_map_update_elem,
1254 .map_delete_elem = htab_lru_map_delete_elem, 1255 .map_delete_elem = htab_lru_map_delete_elem,
1255 .map_gen_lookup = htab_lru_map_gen_lookup, 1256 .map_gen_lookup = htab_lru_map_gen_lookup,
@@ -1281,7 +1282,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1281 1282
1282int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) 1283int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
1283{ 1284{
1284 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1285 struct htab_elem *l; 1285 struct htab_elem *l;
1286 void __percpu *pptr; 1286 void __percpu *pptr;
1287 int ret = -ENOENT; 1287 int ret = -ENOENT;
@@ -1297,8 +1297,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
1297 l = __htab_map_lookup_elem(map, key); 1297 l = __htab_map_lookup_elem(map, key);
1298 if (!l) 1298 if (!l)
1299 goto out; 1299 goto out;
1300 if (htab_is_lru(htab)) 1300 /* We do not mark LRU map element here in order to not mess up
1301 bpf_lru_node_set_ref(&l->lru_node); 1301 * eviction heuristics when user space does a map walk.
1302 */
1302 pptr = htab_elem_get_ptr(l, map->key_size); 1303 pptr = htab_elem_get_ptr(l, map->key_size);
1303 for_each_possible_cpu(cpu) { 1304 for_each_possible_cpu(cpu) {
1304 bpf_long_memcpy(value + off, 1305 bpf_long_memcpy(value + off,
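The practical effect is on user-space map walks: a bpf_map_lookup_elem() issued through the syscall is routed to the new map_lookup_elem_sys_only callback when a map provides one, so dumping an LRU hash no longer promotes every visited element and no longer shields stale entries from eviction. A sketch of such a walk; it assumes libbpf, an LRU hash map fd obtained elsewhere, and a 4-byte value purely for illustration:

#include <bpf/bpf.h>
#include <linux/types.h>

static void dump_lru_map(int map_fd)
{
	__u32 cur, next, value;
	__u32 *prev = NULL;

	while (bpf_map_get_next_key(map_fd, prev, &next) == 0) {
		/* Syscall-side lookup: no longer sets the LRU reference
		 * bit, so this walk does not skew eviction decisions.
		 */
		if (bpf_map_lookup_elem(map_fd, &next, &value) == 0)
			/* consume key/value here */;

		cur = next;
		prev = &cur;
	}
}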
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 4266ffde07ca..5e28718928ca 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1,13 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */ 3 */
12#include <linux/bpf.h> 4#include <linux/bpf.h>
13#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index bc53e5b20ddc..cc0d0cf114e3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Minimal file system backend for holding eBPF maps and programs, 3 * Minimal file system backend for holding eBPF maps and programs,
3 * used by bpf(2) object pinning. 4 * used by bpf(2) object pinning.
@@ -5,10 +6,6 @@
5 * Authors: 6 * Authors:
6 * 7 *
7 * Daniel Borkmann <daniel@iogearbox.net> 8 * Daniel Borkmann <daniel@iogearbox.net>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation.
12 */ 9 */
13 10
14#include <linux/init.h> 11#include <linux/init.h>
@@ -518,7 +515,7 @@ out:
518static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) 515static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
519{ 516{
520 struct bpf_prog *prog; 517 struct bpf_prog *prog;
521 int ret = inode_permission(inode, MAY_READ | MAY_WRITE); 518 int ret = inode_permission(inode, MAY_READ);
522 if (ret) 519 if (ret)
523 return ERR_PTR(ret); 520 return ERR_PTR(ret);
524 521
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 980e8f1f6cb5..addd6fdceec8 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -272,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
272{ 272{
273 int numa_node = bpf_map_attr_numa_node(attr); 273 int numa_node = bpf_map_attr_numa_node(attr);
274 struct bpf_cgroup_storage_map *map; 274 struct bpf_cgroup_storage_map *map;
275 struct bpf_map_memory mem;
276 int ret;
275 277
276 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) 278 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
277 return ERR_PTR(-EINVAL); 279 return ERR_PTR(-EINVAL);
@@ -290,13 +292,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
290 /* max_entries is not used and enforced to be 0 */ 292 /* max_entries is not used and enforced to be 0 */
291 return ERR_PTR(-EINVAL); 293 return ERR_PTR(-EINVAL);
292 294
295 ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
296 if (ret < 0)
297 return ERR_PTR(ret);
298
293 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), 299 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
294 __GFP_ZERO | GFP_USER, numa_node); 300 __GFP_ZERO | GFP_USER, numa_node);
295 if (!map) 301 if (!map) {
302 bpf_map_charge_finish(&mem);
296 return ERR_PTR(-ENOMEM); 303 return ERR_PTR(-ENOMEM);
304 }
297 305
298 map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), 306 bpf_map_charge_move(&map->map.memory, &mem);
299 PAGE_SIZE) >> PAGE_SHIFT;
300 307
301 /* copy mandatory map attributes */ 308 /* copy mandatory map attributes */
302 bpf_map_init_from_attr(&map->map, attr); 309 bpf_map_init_from_attr(&map->map, attr);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e61630c2e50b..56e6c75d354d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -1,12 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Longest prefix match list implementation 3 * Longest prefix match list implementation
3 * 4 *
4 * Copyright (c) 2016,2017 Daniel Mack 5 * Copyright (c) 2016,2017 Daniel Mack
5 * Copyright (c) 2016 David Herrmann 6 * Copyright (c) 2016 David Herrmann
6 *
7 * This file is subject to the terms and conditions of version 2 of the GNU
8 * General Public License. See the file COPYING in the main directory of the
9 * Linux distribution for more details.
10 */ 7 */
11 8
12#include <linux/bpf.h> 9#include <linux/bpf.h>
@@ -573,14 +570,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
573 cost_per_node = sizeof(struct lpm_trie_node) + 570 cost_per_node = sizeof(struct lpm_trie_node) +
574 attr->value_size + trie->data_size; 571 attr->value_size + trie->data_size;
575 cost += (u64) attr->max_entries * cost_per_node; 572 cost += (u64) attr->max_entries * cost_per_node;
576 if (cost >= U32_MAX - PAGE_SIZE) {
577 ret = -E2BIG;
578 goto out_err;
579 }
580
581 trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
582 573
583 ret = bpf_map_precharge_memlock(trie->map.pages); 574 ret = bpf_map_charge_init(&trie->map.memory, cost);
584 if (ret) 575 if (ret)
585 goto out_err; 576 goto out_err;
586 577
@@ -716,9 +707,14 @@ find_leftmost:
716 * have exact two children, so this function will never return NULL. 707 * have exact two children, so this function will never return NULL.
717 */ 708 */
718 for (node = search_root; node;) { 709 for (node = search_root; node;) {
719 if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) 710 if (node->flags & LPM_TREE_NODE_FLAG_IM) {
711 node = rcu_dereference(node->child[0]);
712 } else {
720 next_node = node; 713 next_node = node;
721 node = rcu_dereference(node->child[0]); 714 node = rcu_dereference(node->child[0]);
715 if (!node)
716 node = rcu_dereference(next_node->child[1]);
717 }
722 } 718 }
723do_copy: 719do_copy:
724 next_key->prefixlen = next_node->prefixlen; 720 next_key->prefixlen = next_node->prefixlen;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 3dff41403583..fab4fb134547 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -1,8 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2017 Facebook 2/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#include <linux/slab.h> 4#include <linux/slab.h>
8#include <linux/bpf.h> 5#include <linux/bpf.h>
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 6183db9ec08c..a507bf6ef8b9 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -1,8 +1,5 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
1/* Copyright (c) 2017 Facebook 2/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#ifndef __MAP_IN_MAP_H__ 4#ifndef __MAP_IN_MAP_H__
8#define __MAP_IN_MAP_H__ 5#define __MAP_IN_MAP_H__
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 0c1b4ba9e90e..6e090140b924 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -1,8 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2016 Facebook 2/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#include "percpu_freelist.h" 4#include "percpu_freelist.h"
8 5
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index c3960118e617..fbf8a8a28979 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -1,8 +1,5 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
1/* Copyright (c) 2016 Facebook 2/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#ifndef __PERCPU_FREELIST_H__ 4#ifndef __PERCPU_FREELIST_H__
8#define __PERCPU_FREELIST_H__ 5#define __PERCPU_FREELIST_H__
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 0b140d236889..f697647ceb54 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -67,29 +67,28 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
67static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) 67static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
68{ 68{
69 int ret, numa_node = bpf_map_attr_numa_node(attr); 69 int ret, numa_node = bpf_map_attr_numa_node(attr);
70 struct bpf_map_memory mem = {0};
70 struct bpf_queue_stack *qs; 71 struct bpf_queue_stack *qs;
71 u64 size, queue_size, cost; 72 u64 size, queue_size, cost;
72 73
73 size = (u64) attr->max_entries + 1; 74 size = (u64) attr->max_entries + 1;
74 cost = queue_size = sizeof(*qs) + size * attr->value_size; 75 cost = queue_size = sizeof(*qs) + size * attr->value_size;
75 if (cost >= U32_MAX - PAGE_SIZE)
76 return ERR_PTR(-E2BIG);
77 76
78 cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 77 ret = bpf_map_charge_init(&mem, cost);
79
80 ret = bpf_map_precharge_memlock(cost);
81 if (ret < 0) 78 if (ret < 0)
82 return ERR_PTR(ret); 79 return ERR_PTR(ret);
83 80
84 qs = bpf_map_area_alloc(queue_size, numa_node); 81 qs = bpf_map_area_alloc(queue_size, numa_node);
85 if (!qs) 82 if (!qs) {
83 bpf_map_charge_finish(&mem);
86 return ERR_PTR(-ENOMEM); 84 return ERR_PTR(-ENOMEM);
85 }
87 86
88 memset(qs, 0, sizeof(*qs)); 87 memset(qs, 0, sizeof(*qs));
89 88
90 bpf_map_init_from_attr(&qs->map, attr); 89 bpf_map_init_from_attr(&qs->map, attr);
91 90
92 qs->map.pages = cost; 91 bpf_map_charge_move(&qs->map.memory, &mem);
93 qs->size = size; 92 qs->size = size;
94 93
95 raw_spin_lock_init(&qs->lock); 94 raw_spin_lock_init(&qs->lock);
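The map allocators in this diff all converge on the same accounting shape: compute the cost in bytes, charge the memlock rlimit up front with bpf_map_charge_init() (which now does its own page rounding and overflow check, hence the dropped U32_MAX - PAGE_SIZE tests), release the charge with bpf_map_charge_finish() on any later failure, and hand it to the map with bpf_map_charge_move() once allocation succeeds. Condensed into one hypothetical allocator, the pattern is:

#include <linux/bpf.h>
#include <linux/err.h>

struct example_map {
	struct bpf_map map;
	/* map payload follows */
};

static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_map_memory mem = {0};
	struct example_map *m;
	u64 cost;
	int err;

	cost = sizeof(*m) + (u64)attr->max_entries * attr->value_size;

	/* Charge before allocating; oversized or over-limit maps are rejected here. */
	err = bpf_map_charge_init(&mem, cost);
	if (err)
		return ERR_PTR(err);

	m = bpf_map_area_alloc(cost, numa_node);
	if (!m) {
		bpf_map_charge_finish(&mem);	/* give the charge back */
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&m->map, attr);
	bpf_map_charge_move(&m->map.memory, &mem);	/* the map owns the charge now */

	return &m->map;
}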
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..50c083ba978c 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -151,7 +151,8 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 {
 	int err, numa_node = bpf_map_attr_numa_node(attr);
 	struct reuseport_array *array;
-	u64 cost, array_size;
+	struct bpf_map_memory mem;
+	u64 array_size;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -159,24 +160,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	array_size = sizeof(*array);
 	array_size += (u64)attr->max_entries * sizeof(struct sock *);
 
-	/* make sure there is no u32 overflow later in round_up() */
-	cost = array_size;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-ENOMEM);
-	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	err = bpf_map_precharge_memlock(cost);
+	err = bpf_map_charge_init(&mem, array_size);
 	if (err)
 		return ERR_PTR(err);
 
 	/* allocate all map elements and zero-initialize them */
 	array = bpf_map_area_alloc(array_size, numa_node);
-	if (!array)
+	if (!array) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.pages = cost;
+	bpf_map_charge_move(&array->map.memory, &mem);
 
 	return &array->map;
 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 950ab2f28922..052580c33d26 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
  */
 #include <linux/bpf.h>
 #include <linux/jhash.h>
@@ -89,6 +86,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 {
 	u32 value_size = attr->value_size;
 	struct bpf_stack_map *smap;
+	struct bpf_map_memory mem;
 	u64 cost, n_buckets;
 	int err;
 
@@ -116,40 +114,37 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	n_buckets = roundup_pow_of_two(attr->max_entries);
 
 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-E2BIG);
+	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+	err = bpf_map_charge_init(&mem, cost);
+	if (err)
+		return ERR_PTR(err);
 
 	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
-	if (!smap)
+	if (!smap) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
-
-	err = -E2BIG;
-	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_smap;
+	}
 
 	bpf_map_init_from_attr(&smap->map, attr);
 	smap->map.value_size = value_size;
 	smap->n_buckets = n_buckets;
-	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	err = bpf_map_precharge_memlock(smap->map.pages);
-	if (err)
-		goto free_smap;
 
 	err = get_callchain_buffers(sysctl_perf_event_max_stack);
 	if (err)
-		goto free_smap;
+		goto free_charge;
 
 	err = prealloc_elems_and_freelist(smap);
 	if (err)
 		goto put_buffers;
 
+	bpf_map_charge_move(&smap->map.memory, &mem);
+
 	return &smap->map;
 
 put_buffers:
 	put_callchain_buffers();
-free_smap:
+free_charge:
+	bpf_map_charge_finish(&mem);
 	bpf_map_area_free(smap);
 	return ERR_PTR(err);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ad3ccf82f31d..5d141f16f6fa 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
@@ -188,19 +180,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
 	map->numa_node = bpf_map_attr_numa_node(attr);
 }
 
-int bpf_map_precharge_memlock(u32 pages)
-{
-	struct user_struct *user = get_current_user();
-	unsigned long memlock_limit, cur;
-
-	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	cur = atomic_long_read(&user->locked_vm);
-	free_uid(user);
-	if (cur + pages > memlock_limit)
-		return -EPERM;
-	return 0;
-}
-
 static int bpf_charge_memlock(struct user_struct *user, u32 pages)
 {
 	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -214,45 +193,62 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages)
 
 static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
 {
-	atomic_long_sub(pages, &user->locked_vm);
+	if (user)
+		atomic_long_sub(pages, &user->locked_vm);
 }
 
-static int bpf_map_init_memlock(struct bpf_map *map)
+int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
 {
-	struct user_struct *user = get_current_user();
+	u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
+	struct user_struct *user;
 	int ret;
 
-	ret = bpf_charge_memlock(user, map->pages);
+	if (size >= U32_MAX - PAGE_SIZE)
+		return -E2BIG;
+
+	user = get_current_user();
+	ret = bpf_charge_memlock(user, pages);
 	if (ret) {
 		free_uid(user);
 		return ret;
 	}
-	map->user = user;
-	return ret;
+
+	mem->pages = pages;
+	mem->user = user;
+
+	return 0;
 }
 
-static void bpf_map_release_memlock(struct bpf_map *map)
+void bpf_map_charge_finish(struct bpf_map_memory *mem)
 {
-	struct user_struct *user = map->user;
-	bpf_uncharge_memlock(user, map->pages);
-	free_uid(user);
+	bpf_uncharge_memlock(mem->user, mem->pages);
+	free_uid(mem->user);
+}
+
+void bpf_map_charge_move(struct bpf_map_memory *dst,
+			 struct bpf_map_memory *src)
+{
+	*dst = *src;
+
+	/* Make sure src will not be used for the redundant uncharging. */
+	memset(src, 0, sizeof(struct bpf_map_memory));
 }
 
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
 {
 	int ret;
 
-	ret = bpf_charge_memlock(map->user, pages);
+	ret = bpf_charge_memlock(map->memory.user, pages);
 	if (ret)
 		return ret;
-	map->pages += pages;
+	map->memory.pages += pages;
 	return ret;
 }
 
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
 {
-	bpf_uncharge_memlock(map->user, pages);
-	map->pages -= pages;
+	bpf_uncharge_memlock(map->memory.user, pages);
+	map->memory.pages -= pages;
 }
 
 static int bpf_map_alloc_id(struct bpf_map *map)
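All of the converted map allocators earlier in this diff follow the same shape with this new API: charge the full size up front, allocate, hand the charge to the map on success, and drop it on every error path. A minimal sketch of that pattern for a hypothetical map type (struct my_map and my_map_alloc are illustrative only, not part of this series; this is kernel-internal code that would only build in-tree against <linux/bpf.h>):

struct my_map {
	struct bpf_map map;	/* embedded generic map, must come first */
	/* ...implementation-specific state... */
};

static struct bpf_map *my_map_alloc(union bpf_attr *attr)
{
	u64 cost = sizeof(struct my_map) +
		   (u64)attr->max_entries * attr->value_size;
	struct bpf_map_memory mem;
	struct my_map *m;
	int err;

	/* charge against RLIMIT_MEMLOCK before any allocation happens */
	err = bpf_map_charge_init(&mem, cost);
	if (err)
		return ERR_PTR(err);

	m = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
	if (!m) {
		/* nothing owns the charge yet, so release it here */
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&m->map, attr);
	/* transfer the charge; bpf_map_free_deferred() releases it later */
	bpf_map_charge_move(&m->map.memory, &mem);
	return &m->map;
}
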
@@ -303,11 +299,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
+	struct bpf_map_memory mem;
 
-	bpf_map_release_memlock(map);
+	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
+	bpf_map_charge_finish(&mem);
 }
 
 static void bpf_map_put_uref(struct bpf_map *map)
@@ -395,7 +393,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->value_size,
 		   map->max_entries,
 		   map->map_flags,
-		   map->pages * 1ULL << PAGE_SHIFT,
+		   map->memory.pages * 1ULL << PAGE_SHIFT,
 		   map->id,
 		   READ_ONCE(map->frozen));
 
@@ -549,6 +547,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_map_memory mem;
 	struct bpf_map *map;
 	int f_flags;
 	int err;
@@ -573,7 +572,7 @@ static int map_create(union bpf_attr *attr)
 
 	err = bpf_obj_name_cpy(map->name, attr->map_name);
 	if (err)
-		goto free_map_nouncharge;
+		goto free_map;
 
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
@@ -583,20 +582,20 @@ static int map_create(union bpf_attr *attr)
 
 	if (!attr->btf_value_type_id) {
 		err = -EINVAL;
-		goto free_map_nouncharge;
+		goto free_map;
 	}
 
 	btf = btf_get_by_fd(attr->btf_fd);
 	if (IS_ERR(btf)) {
 		err = PTR_ERR(btf);
-		goto free_map_nouncharge;
+		goto free_map;
 	}
 
 	err = map_check_btf(map, btf, attr->btf_key_type_id,
 			    attr->btf_value_type_id);
 	if (err) {
 		btf_put(btf);
-		goto free_map_nouncharge;
+		goto free_map;
 	}
 
 	map->btf = btf;
@@ -608,15 +607,11 @@ static int map_create(union bpf_attr *attr)
 
 	err = security_bpf_map_alloc(map);
 	if (err)
-		goto free_map_nouncharge;
-
-	err = bpf_map_init_memlock(map);
-	if (err)
-		goto free_map_sec;
+		goto free_map;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
-		goto free_map;
+		goto free_map_sec;
 
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
@@ -632,13 +627,13 @@ static int map_create(union bpf_attr *attr)
 
 	return err;
 
-free_map:
-	bpf_map_release_memlock(map);
 free_map_sec:
 	security_bpf_map_free(map);
-free_map_nouncharge:
+free_map:
 	btf_put(map->btf);
+	bpf_map_charge_move(&mem, &map->memory);
 	map->ops->map_free(map);
+	bpf_map_charge_finish(&mem);
 	return err;
 }
 
@@ -808,7 +803,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = map->ops->map_peek_elem(map, value);
 	} else {
 		rcu_read_lock();
-		ptr = map->ops->map_lookup_elem(map, key);
+		if (map->ops->map_lookup_elem_sys_only)
+			ptr = map->ops->map_lookup_elem_sys_only(map, key);
+		else
+			ptr = map->ops->map_lookup_elem(map, key);
 		if (IS_ERR(ptr)) {
 			err = PTR_ERR(ptr);
 		} else if (!ptr) {
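The map_lookup_elem_sys_only() hook above lets a map type give the BPF_MAP_LOOKUP_ELEM syscall its own read path, e.g. so an LRU-style map does not promote an element just because userspace peeked at it. A sketch of how a map implementation might wire it up (the my_map_* names are hypothetical; only the ops plumbing is shown):

static void *my_map_lookup_elem(struct bpf_map *map, void *key)
{
	/* lookup used by programs; may touch eviction/usage state */
	return NULL;	/* placeholder */
}

static void *my_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
{
	/* lookup used by the syscall path; strictly side-effect free */
	return NULL;	/* placeholder */
}

const struct bpf_map_ops my_map_ops = {
	.map_lookup_elem	  = my_map_lookup_elem,
	.map_lookup_elem_sys_only = my_map_lookup_elem_sys_only,
	/* ...remaining callbacks... */
};
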
@@ -1578,6 +1576,24 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_CONNECT:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UDP4_RECVMSG:
+		case BPF_CGROUP_UDP6_RECVMSG:
+			return 0;
+		default:
+			return -EINVAL;
+		}
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_INET_INGRESS:
+		case BPF_CGROUP_INET_EGRESS:
+			return 0;
+		default:
+			return -EINVAL;
+		}
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_SETSOCKOPT:
+		case BPF_CGROUP_GETSOCKOPT:
 			return 0;
 		default:
 			return -EINVAL;
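For the new sock_addr hooks this means userspace has to load the program with a matching expected_attach_type, otherwise the check above returns -EINVAL. A hedged sketch using the raw bpf(2) syscall (a trivial "return 1" program, no error handling; assumes a uapi linux/bpf.h new enough to define BPF_CGROUP_UDP4_RECVMSG):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_recvmsg4_prog(void)
{
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K,
		  .dst_reg = BPF_REG_0, .imm = 1 },	/* r0 = 1 */
		{ .code = BPF_JMP | BPF_EXIT },		/* exit    */
	};
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
	attr.expected_attach_type = BPF_CGROUP_UDP4_RECVMSG;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = sizeof(insns) / sizeof(insns[0]);
	attr.license = (__u64)(unsigned long)"GPL";

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
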
@@ -1601,7 +1617,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (CHECK_ATTR(BPF_PROG_LOAD))
 		return -EINVAL;
 
-	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
+	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
+				 BPF_F_ANY_ALIGNMENT |
+				 BPF_F_TEST_RND_HI32))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
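BPF_F_TEST_RND_HI32 is a pure testing flag that pairs with the verifier's new 32-bit zero-extension tracking: when set, the upper 32 bits of sub-register definitions get randomized, so selftests can catch code that wrongly relies on them being zero. Continuing the load sketch above, it only needs one extra line before the syscall (assumes the flag exists in the installed uapi header):

	attr.prog_flags |= BPF_F_TEST_RND_HI32;
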
@@ -1671,7 +1689,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (err < 0)
 		goto free_prog;
 
-	prog->aux->load_time = ktime_get_boot_ns();
+	prog->aux->load_time = ktime_get_boottime_ns();
 	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
 	if (err)
 		goto free_prog;
@@ -1830,7 +1848,12 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	switch (prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		return prog->enforce_expected_attach_type &&
+		       prog->expected_attach_type != attach_type ?
+		       -EINVAL : 0;
 	default:
 		return 0;
 	}
@@ -1872,6 +1895,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1896,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1957,6 +1986,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1977,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -2008,9 +2043,13 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
 		break;
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 938d41211be7..ca52b9642943 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* tnum: tracked (or tristate) numbers
  *
  * A tnum tracks knowledge about the bits of a value. Each bit can be either
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 95f9354495ad..a2e763703c30 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2016 Facebook
  * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
  */
 #include <uapi/linux/btf.h>
 #include <linux/kernel.h>
@@ -176,7 +168,7 @@ struct bpf_verifier_stack_elem {
 	struct bpf_verifier_stack_elem *next;
 };
 
-#define BPF_COMPLEXITY_LIMIT_STACK	1024
+#define BPF_COMPLEXITY_LIMIT_JMP_SEQ	8192
 #define BPF_COMPLEXITY_LIMIT_STATES	64
 
 #define BPF_MAP_PTR_UNPRIV	1UL
@@ -334,7 +326,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
 {
 	return type == PTR_TO_SOCKET ||
 		type == PTR_TO_SOCK_COMMON ||
-		type == PTR_TO_TCP_SOCK;
+		type == PTR_TO_TCP_SOCK ||
+		type == PTR_TO_XDP_SOCK;
 }
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
@@ -406,6 +399,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_TCP_SOCK]	= "tcp_sock",
 	[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
+	[PTR_TO_XDP_SOCK]	= "xdp_sock",
 };
 
 static char slot_type_char[] = {
@@ -453,12 +447,12 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		verbose(env, " R%d", i);
 		print_liveness(env, reg->live);
 		verbose(env, "=%s", reg_type_str[t]);
+		if (t == SCALAR_VALUE && reg->precise)
+			verbose(env, "P");
 		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
 		    tnum_is_const(reg->var_off)) {
 			/* reg->off should be 0 for SCALAR_VALUE */
 			verbose(env, "%lld", reg->var_off.value + reg->off);
-			if (t == PTR_TO_STACK)
-				verbose(env, ",call_%d", func(env, reg)->callsite);
 		} else {
 			verbose(env, "(id=%d", reg->id);
 			if (reg_type_may_be_refcounted_or_null(t))
@@ -520,11 +514,17 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			continue;
 		verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
 		print_liveness(env, state->stack[i].spilled_ptr.live);
-		if (state->stack[i].slot_type[0] == STACK_SPILL)
-			verbose(env, "=%s",
-				reg_type_str[state->stack[i].spilled_ptr.type]);
-		else
+		if (state->stack[i].slot_type[0] == STACK_SPILL) {
+			reg = &state->stack[i].spilled_ptr;
+			t = reg->type;
+			verbose(env, "=%s", reg_type_str[t]);
+			if (t == SCALAR_VALUE && reg->precise)
+				verbose(env, "P");
+			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
+				verbose(env, "%lld", reg->var_off.value + reg->off);
+		} else {
 			verbose(env, "=%s", types_buf);
+		}
 	}
 	if (state->acquired_refs && state->refs[0].id) {
 		verbose(env, " refs=%d", state->refs[0].id);
@@ -673,6 +673,13 @@ static void free_func_state(struct bpf_func_state *state)
 	kfree(state);
 }
 
+static void clear_jmp_history(struct bpf_verifier_state *state)
+{
+	kfree(state->jmp_history);
+	state->jmp_history = NULL;
+	state->jmp_history_cnt = 0;
+}
+
 static void free_verifier_state(struct bpf_verifier_state *state,
 				bool free_self)
 {
@@ -682,6 +689,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
 		free_func_state(state->frame[i]);
 		state->frame[i] = NULL;
 	}
+	clear_jmp_history(state);
 	if (free_self)
 		kfree(state);
 }
@@ -709,8 +717,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 			      const struct bpf_verifier_state *src)
 {
 	struct bpf_func_state *dst;
+	u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
 	int i, err;
 
+	if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
+		kfree(dst_state->jmp_history);
+		dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
+		if (!dst_state->jmp_history)
+			return -ENOMEM;
+	}
+	memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
+	dst_state->jmp_history_cnt = src->jmp_history_cnt;
+
 	/* if dst has more stack frames than src frame, free them */
 	for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
 		free_func_state(dst_state->frame[i]);
@@ -719,6 +737,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
 	dst_state->active_spin_lock = src->active_spin_lock;
+	dst_state->branches = src->branches;
+	dst_state->parent = src->parent;
+	dst_state->first_insn_idx = src->first_insn_idx;
+	dst_state->last_insn_idx = src->last_insn_idx;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -734,6 +756,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	return 0;
 }
 
+static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	while (st) {
+		u32 br = --st->branches;
+
+		/* WARN_ON(br > 1) technically makes sense here,
+		 * but see comment in push_stack(), hence:
+		 */
+		WARN_ONCE((int)br < 0,
+			  "BUG update_branch_counts:branches_to_explore=%d\n",
+			  br);
+		if (br)
+			break;
+		st = st->parent;
+	}
+}
+
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 		     int *insn_idx)
 {
@@ -782,10 +821,23 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	if (err)
 		goto err;
 	elem->st.speculative |= speculative;
-	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
-		verbose(env, "BPF program is too complex\n");
+	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+		verbose(env, "The sequence of %d jumps is too complex.\n",
+			env->stack_size);
 		goto err;
 	}
+	if (elem->st.parent) {
+		++elem->st.parent->branches;
+		/* WARN_ON(branches > 2) technically makes sense here,
+		 * but
+		 * 1. speculative states will bump 'branches' for non-branch
+		 *    instructions
+		 * 2. is_state_visited() heuristics may decide not to create
+		 *    a new state for a sequence of branches and all such current
+		 *    and cloned states will be pointing to a single parent state
+		 *    which might have large 'branches' count.
+		 */
+	}
 	return &elem->st;
 err:
 	free_verifier_state(env->cur_state, true);
@@ -933,6 +985,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
 	reg->smax_value = S64_MAX;
 	reg->umin_value = 0;
 	reg->umax_value = U64_MAX;
+
+	/* constant backtracking is enabled for root only for now */
+	reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
 }
 
 /* Mark a register as having a completely unknown (scalar) value. */
@@ -981,6 +1036,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
 	__mark_reg_not_init(regs + regno);
 }
 
+#define DEF_NOT_SUBREG	(0)
 static void init_reg_state(struct bpf_verifier_env *env,
 			   struct bpf_func_state *state)
 {
@@ -991,6 +1047,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
 		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
 		regs[i].parent = NULL;
+		regs[i].subreg_def = DEF_NOT_SUBREG;
 	}
 
 	/* frame pointer */
@@ -1136,7 +1193,7 @@ next:
  */
 static int mark_reg_read(struct bpf_verifier_env *env,
 			 const struct bpf_reg_state *state,
-			 struct bpf_reg_state *parent)
+			 struct bpf_reg_state *parent, u8 flag)
 {
 	bool writes = parent == state->parent; /* Observe write marks */
 	int cnt = 0;
@@ -1151,17 +1208,26 @@ static int mark_reg_read(struct bpf_verifier_env *env,
 				parent->var_off.value, parent->off);
 			return -EFAULT;
 		}
-		if (parent->live & REG_LIVE_READ)
+		/* The first condition is more likely to be true than the
+		 * second, check it first.
+		 */
+		if ((parent->live & REG_LIVE_READ) == flag ||
+		    parent->live & REG_LIVE_READ64)
 			/* The parentage chain never changes and
 			 * this parent was already marked as LIVE_READ.
 			 * There is no need to keep walking the chain again and
 			 * keep re-marking all parents as LIVE_READ.
 			 * This case happens when the same register is read
 			 * multiple times without writes into it in-between.
+			 * Also, if parent has the stronger REG_LIVE_READ64 set,
+			 * then no need to set the weak REG_LIVE_READ32.
 			 */
 			break;
 		/* ... then we depend on parent's value */
-		parent->live |= REG_LIVE_READ;
+		parent->live |= flag;
+		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
+		if (flag == REG_LIVE_READ64)
+			parent->live &= ~REG_LIVE_READ32;
 		state = parent;
 		parent = state->parent;
 		writes = true;
@@ -1173,12 +1239,129 @@ static int mark_reg_read(struct bpf_verifier_env *env,
1173 return 0; 1239 return 0;
1174} 1240}
1175 1241
1242/* This function is supposed to be used by the following 32-bit optimization
1243 * code only. It returns TRUE if the source or destination register operates
1244 * on 64-bit, otherwise return FALSE.
1245 */
1246static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
1247 u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
1248{
1249 u8 code, class, op;
1250
1251 code = insn->code;
1252 class = BPF_CLASS(code);
1253 op = BPF_OP(code);
1254 if (class == BPF_JMP) {
1255 /* BPF_EXIT for "main" will reach here. Return TRUE
1256 * conservatively.
1257 */
1258 if (op == BPF_EXIT)
1259 return true;
1260 if (op == BPF_CALL) {
1261 /* BPF to BPF call will reach here because of marking
1262 * caller saved clobber with DST_OP_NO_MARK for which we
1263 * don't care about the register def because they are anyway
1264 * marked as NOT_INIT already.
1265 */
1266 if (insn->src_reg == BPF_PSEUDO_CALL)
1267 return false;
1268 /* Helper call will reach here because of arg type
1269 * check, conservatively return TRUE.
1270 */
1271 if (t == SRC_OP)
1272 return true;
1273
1274 return false;
1275 }
1276 }
1277
1278 if (class == BPF_ALU64 || class == BPF_JMP ||
1279 /* BPF_END always use BPF_ALU class. */
1280 (class == BPF_ALU && op == BPF_END && insn->imm == 64))
1281 return true;
1282
1283 if (class == BPF_ALU || class == BPF_JMP32)
1284 return false;
1285
1286 if (class == BPF_LDX) {
1287 if (t != SRC_OP)
1288 return BPF_SIZE(code) == BPF_DW;
1289 /* LDX source must be ptr. */
1290 return true;
1291 }
1292
1293 if (class == BPF_STX) {
1294 if (reg->type != SCALAR_VALUE)
1295 return true;
1296 return BPF_SIZE(code) == BPF_DW;
1297 }
1298
1299 if (class == BPF_LD) {
1300 u8 mode = BPF_MODE(code);
1301
1302 /* LD_IMM64 */
1303 if (mode == BPF_IMM)
1304 return true;
1305
1306 /* Both LD_IND and LD_ABS return 32-bit data. */
1307 if (t != SRC_OP)
1308 return false;
1309
1310 /* Implicit ctx ptr. */
1311 if (regno == BPF_REG_6)
1312 return true;
1313
1314 /* Explicit source could be any width. */
1315 return true;
1316 }
1317
1318 if (class == BPF_ST)
1319 /* The only source register for BPF_ST is a ptr. */
1320 return true;
1321
1322 /* Conservatively return true at default. */
1323 return true;
1324}
1325
1326/* Return TRUE if INSN doesn't have explicit value define. */
1327static bool insn_no_def(struct bpf_insn *insn)
1328{
1329 u8 class = BPF_CLASS(insn->code);
1330
1331 return (class == BPF_JMP || class == BPF_JMP32 ||
1332 class == BPF_STX || class == BPF_ST);
1333}
1334
1335/* Return TRUE if INSN has defined any 32-bit value explicitly. */
1336static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
1337{
1338 if (insn_no_def(insn))
1339 return false;
1340
1341 return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
1342}
1343
1344static void mark_insn_zext(struct bpf_verifier_env *env,
1345 struct bpf_reg_state *reg)
1346{
1347 s32 def_idx = reg->subreg_def;
1348
1349 if (def_idx == DEF_NOT_SUBREG)
1350 return;
1351
1352 env->insn_aux_data[def_idx - 1].zext_dst = true;
1353 /* The dst will be zero extended, so won't be sub-register anymore. */
1354 reg->subreg_def = DEF_NOT_SUBREG;
1355}
1356
1176static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, 1357static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1177 enum reg_arg_type t) 1358 enum reg_arg_type t)
1178{ 1359{
1179 struct bpf_verifier_state *vstate = env->cur_state; 1360 struct bpf_verifier_state *vstate = env->cur_state;
1180 struct bpf_func_state *state = vstate->frame[vstate->curframe]; 1361 struct bpf_func_state *state = vstate->frame[vstate->curframe];
1362 struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
1181 struct bpf_reg_state *reg, *regs = state->regs; 1363 struct bpf_reg_state *reg, *regs = state->regs;
1364 bool rw64;
1182 1365
1183 if (regno >= MAX_BPF_REG) { 1366 if (regno >= MAX_BPF_REG) {
1184 verbose(env, "R%d is invalid\n", regno); 1367 verbose(env, "R%d is invalid\n", regno);
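The helpers added above exist so the verifier can tell 32-bit definitions apart from 64-bit ones; the read marks (REG_LIVE_READ32 vs. REG_LIVE_READ64) then decide whether a definition really needs its upper half zero-extended. A small illustration using the kernel's insn macros from include/linux/filter.h (the comments describe how these instructions are classified; this is an illustration, not a test case from this series):

static const struct bpf_insn zext_example[] = {
	BPF_MOV32_IMM(BPF_REG_1, 5),		/* 32-bit def: r1's subreg_def is recorded   */
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),	/* 64-bit read of r1: REG_LIVE_READ64, so    */
						/* zext_dst is set on the defining insn and  */
						/* an explicit zero-extension is inserted if */
						/* the JIT does not provide one for free     */
	BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
	BPF_EXIT_INSN(),
};
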
@@ -1186,6 +1369,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1186 } 1369 }
1187 1370
1188 reg = &regs[regno]; 1371 reg = &regs[regno];
1372 rw64 = is_reg64(env, insn, regno, reg, t);
1189 if (t == SRC_OP) { 1373 if (t == SRC_OP) {
1190 /* check whether register used as source operand can be read */ 1374 /* check whether register used as source operand can be read */
1191 if (reg->type == NOT_INIT) { 1375 if (reg->type == NOT_INIT) {
@@ -1196,7 +1380,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1196 if (regno == BPF_REG_FP) 1380 if (regno == BPF_REG_FP)
1197 return 0; 1381 return 0;
1198 1382
1199 return mark_reg_read(env, reg, reg->parent); 1383 if (rw64)
1384 mark_insn_zext(env, reg);
1385
1386 return mark_reg_read(env, reg, reg->parent,
1387 rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
1200 } else { 1388 } else {
1201 /* check whether register used as dest operand can be written to */ 1389 /* check whether register used as dest operand can be written to */
1202 if (regno == BPF_REG_FP) { 1390 if (regno == BPF_REG_FP) {
@@ -1204,12 +1392,441 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1204 return -EACCES; 1392 return -EACCES;
1205 } 1393 }
1206 reg->live |= REG_LIVE_WRITTEN; 1394 reg->live |= REG_LIVE_WRITTEN;
1395 reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
1207 if (t == DST_OP) 1396 if (t == DST_OP)
1208 mark_reg_unknown(env, regs, regno); 1397 mark_reg_unknown(env, regs, regno);
1209 } 1398 }
1210 return 0; 1399 return 0;
1211} 1400}
1212 1401
1402/* for any branch, call, exit record the history of jmps in the given state */
1403static int push_jmp_history(struct bpf_verifier_env *env,
1404 struct bpf_verifier_state *cur)
1405{
1406 u32 cnt = cur->jmp_history_cnt;
1407 struct bpf_idx_pair *p;
1408
1409 cnt++;
1410 p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
1411 if (!p)
1412 return -ENOMEM;
1413 p[cnt - 1].idx = env->insn_idx;
1414 p[cnt - 1].prev_idx = env->prev_insn_idx;
1415 cur->jmp_history = p;
1416 cur->jmp_history_cnt = cnt;
1417 return 0;
1418}
1419
1420/* Backtrack one insn at a time. If idx is not at the top of recorded
1421 * history then previous instruction came from straight line execution.
1422 */
1423static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
1424 u32 *history)
1425{
1426 u32 cnt = *history;
1427
1428 if (cnt && st->jmp_history[cnt - 1].idx == i) {
1429 i = st->jmp_history[cnt - 1].prev_idx;
1430 (*history)--;
1431 } else {
1432 i--;
1433 }
1434 return i;
1435}
1436
1437/* For given verifier state backtrack_insn() is called from the last insn to
1438 * the first insn. Its purpose is to compute a bitmask of registers and
1439 * stack slots that needs precision in the parent verifier state.
1440 */
1441static int backtrack_insn(struct bpf_verifier_env *env, int idx,
1442 u32 *reg_mask, u64 *stack_mask)
1443{
1444 const struct bpf_insn_cbs cbs = {
1445 .cb_print = verbose,
1446 .private_data = env,
1447 };
1448 struct bpf_insn *insn = env->prog->insnsi + idx;
1449 u8 class = BPF_CLASS(insn->code);
1450 u8 opcode = BPF_OP(insn->code);
1451 u8 mode = BPF_MODE(insn->code);
1452 u32 dreg = 1u << insn->dst_reg;
1453 u32 sreg = 1u << insn->src_reg;
1454 u32 spi;
1455
1456 if (insn->code == 0)
1457 return 0;
1458 if (env->log.level & BPF_LOG_LEVEL) {
1459 verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
1460 verbose(env, "%d: ", idx);
1461 print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
1462 }
1463
1464 if (class == BPF_ALU || class == BPF_ALU64) {
1465 if (!(*reg_mask & dreg))
1466 return 0;
1467 if (opcode == BPF_MOV) {
1468 if (BPF_SRC(insn->code) == BPF_X) {
1469 /* dreg = sreg
1470 * dreg needs precision after this insn
1471 * sreg needs precision before this insn
1472 */
1473 *reg_mask &= ~dreg;
1474 *reg_mask |= sreg;
1475 } else {
1476 /* dreg = K
1477 * dreg needs precision after this insn.
1478 * Corresponding register is already marked
1479 * as precise=true in this verifier state.
1480 * No further markings in parent are necessary
1481 */
1482 *reg_mask &= ~dreg;
1483 }
1484 } else {
1485 if (BPF_SRC(insn->code) == BPF_X) {
1486 /* dreg += sreg
1487 * both dreg and sreg need precision
1488 * before this insn
1489 */
1490 *reg_mask |= sreg;
1491 } /* else dreg += K
1492 * dreg still needs precision before this insn
1493 */
1494 }
1495 } else if (class == BPF_LDX) {
1496 if (!(*reg_mask & dreg))
1497 return 0;
1498 *reg_mask &= ~dreg;
1499
1500 /* scalars can only be spilled into stack w/o losing precision.
1501 * Load from any other memory can be zero extended.
1502 * The desire to keep that precision is already indicated
1503 * by 'precise' mark in corresponding register of this state.
1504 * No further tracking necessary.
1505 */
1506 if (insn->src_reg != BPF_REG_FP)
1507 return 0;
1508 if (BPF_SIZE(insn->code) != BPF_DW)
1509 return 0;
1510
1511 /* dreg = *(u64 *)[fp - off] was a fill from the stack.
1512 * that [fp - off] slot contains scalar that needs to be
1513 * tracked with precision
1514 */
1515 spi = (-insn->off - 1) / BPF_REG_SIZE;
1516 if (spi >= 64) {
1517 verbose(env, "BUG spi %d\n", spi);
1518 WARN_ONCE(1, "verifier backtracking bug");
1519 return -EFAULT;
1520 }
1521 *stack_mask |= 1ull << spi;
1522 } else if (class == BPF_STX) {
1523 if (*reg_mask & dreg)
1524 /* stx shouldn't be using _scalar_ dst_reg
1525 * to access memory. It means backtracking
1526 * encountered a case of pointer subtraction.
1527 */
1528 return -ENOTSUPP;
1529 /* scalars can only be spilled into stack */
1530 if (insn->dst_reg != BPF_REG_FP)
1531 return 0;
1532 if (BPF_SIZE(insn->code) != BPF_DW)
1533 return 0;
1534 spi = (-insn->off - 1) / BPF_REG_SIZE;
1535 if (spi >= 64) {
1536 verbose(env, "BUG spi %d\n", spi);
1537 WARN_ONCE(1, "verifier backtracking bug");
1538 return -EFAULT;
1539 }
1540 if (!(*stack_mask & (1ull << spi)))
1541 return 0;
1542 *stack_mask &= ~(1ull << spi);
1543 *reg_mask |= sreg;
1544 } else if (class == BPF_JMP || class == BPF_JMP32) {
1545 if (opcode == BPF_CALL) {
1546 if (insn->src_reg == BPF_PSEUDO_CALL)
1547 return -ENOTSUPP;
1548 /* regular helper call sets R0 */
1549 *reg_mask &= ~1;
1550 if (*reg_mask & 0x3f) {
1551 /* if backtracing was looking for registers R1-R5
1552 * they should have been found already.
1553 */
1554 verbose(env, "BUG regs %x\n", *reg_mask);
1555 WARN_ONCE(1, "verifier backtracking bug");
1556 return -EFAULT;
1557 }
1558 } else if (opcode == BPF_EXIT) {
1559 return -ENOTSUPP;
1560 }
1561 } else if (class == BPF_LD) {
1562 if (!(*reg_mask & dreg))
1563 return 0;
1564 *reg_mask &= ~dreg;
1565 /* It's ld_imm64 or ld_abs or ld_ind.
1566 * For ld_imm64 no further tracking of precision
1567 * into parent is necessary
1568 */
1569 if (mode == BPF_IND || mode == BPF_ABS)
1570 /* to be analyzed */
1571 return -ENOTSUPP;
1572 } else if (class == BPF_ST) {
1573 if (*reg_mask & dreg)
1574 /* likely pointer subtraction */
1575 return -ENOTSUPP;
1576 }
1577 return 0;
1578}
1579
1580/* the scalar precision tracking algorithm:
1581 * . at the start all registers have precise=false.
1582 * . scalar ranges are tracked as normal through alu and jmp insns.
1583 * . once precise value of the scalar register is used in:
1584 * . ptr + scalar alu
1585 * . if (scalar cond K|scalar)
1586 * . helper_call(.., scalar, ...) where ARG_CONST is expected
1587 * backtrack through the verifier states and mark all registers and
1588 * stack slots with spilled constants that these scalar registers
1589 * should be precise.
1590 * . during state pruning two registers (or spilled stack slots)
1591 * are equivalent if both are not precise.
1592 *
1593 * Note the verifier cannot simply walk register parentage chain,
1594 * since many different registers and stack slots could have been
1595 * used to compute single precise scalar.
1596 *
1597 * The approach of starting with precise=true for all registers and then
1598 * backtrack to mark a register as not precise when the verifier detects
1599 * that program doesn't care about specific value (e.g., when helper
1600 * takes register as ARG_ANYTHING parameter) is not safe.
1601 *
1602 * It's ok to walk single parentage chain of the verifier states.
1603 * It's possible that this backtracking will go all the way till 1st insn.
1604 * All other branches will be explored for needing precision later.
1605 *
1606 * The backtracking needs to deal with cases like:
1607 * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
1608 * r9 -= r8
1609 * r5 = r9
1610 * if r5 > 0x79f goto pc+7
1611 * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
1612 * r5 += 1
1613 * ...
1614 * call bpf_perf_event_output#25
1615 * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
1616 *
1617 * and this case:
1618 * r6 = 1
1619 * call foo // uses callee's r6 inside to compute r0
1620 * r0 += r6
1621 * if r0 == 0 goto
1622 *
1623 * to track above reg_mask/stack_mask needs to be independent for each frame.
1624 *
1625 * Also if parent's curframe > frame where backtracking started,
1626 * the verifier needs to mark registers in both frames, otherwise callees
1627 * may incorrectly prune callers. This is similar to
1628 * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
1629 *
1630 * For now backtracking falls back into conservative marking.
1631 */
1632static void mark_all_scalars_precise(struct bpf_verifier_env *env,
1633 struct bpf_verifier_state *st)
1634{
1635 struct bpf_func_state *func;
1636 struct bpf_reg_state *reg;
1637 int i, j;
1638
1639 /* big hammer: mark all scalars precise in this path.
1640 * pop_stack may still get !precise scalars.
1641 */
1642 for (; st; st = st->parent)
1643 for (i = 0; i <= st->curframe; i++) {
1644 func = st->frame[i];
1645 for (j = 0; j < BPF_REG_FP; j++) {
1646 reg = &func->regs[j];
1647 if (reg->type != SCALAR_VALUE)
1648 continue;
1649 reg->precise = true;
1650 }
1651 for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
1652 if (func->stack[j].slot_type[0] != STACK_SPILL)
1653 continue;
1654 reg = &func->stack[j].spilled_ptr;
1655 if (reg->type != SCALAR_VALUE)
1656 continue;
1657 reg->precise = true;
1658 }
1659 }
1660}
1661
1662static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
1663 int spi)
1664{
1665 struct bpf_verifier_state *st = env->cur_state;
1666 int first_idx = st->first_insn_idx;
1667 int last_idx = env->insn_idx;
1668 struct bpf_func_state *func;
1669 struct bpf_reg_state *reg;
1670 u32 reg_mask = regno >= 0 ? 1u << regno : 0;
1671 u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
1672 bool skip_first = true;
1673 bool new_marks = false;
1674 int i, err;
1675
1676 if (!env->allow_ptr_leaks)
1677 /* backtracking is root only for now */
1678 return 0;
1679
1680 func = st->frame[st->curframe];
1681 if (regno >= 0) {
1682 reg = &func->regs[regno];
1683 if (reg->type != SCALAR_VALUE) {
1684 WARN_ONCE(1, "backtracing misuse");
1685 return -EFAULT;
1686 }
1687 if (!reg->precise)
1688 new_marks = true;
1689 else
1690 reg_mask = 0;
1691 reg->precise = true;
1692 }
1693
1694 while (spi >= 0) {
1695 if (func->stack[spi].slot_type[0] != STACK_SPILL) {
1696 stack_mask = 0;
1697 break;
1698 }
1699 reg = &func->stack[spi].spilled_ptr;
1700 if (reg->type != SCALAR_VALUE) {
1701 stack_mask = 0;
1702 break;
1703 }
1704 if (!reg->precise)
1705 new_marks = true;
1706 else
1707 stack_mask = 0;
1708 reg->precise = true;
1709 break;
1710 }
1711
1712 if (!new_marks)
1713 return 0;
1714 if (!reg_mask && !stack_mask)
1715 return 0;
1716 for (;;) {
1717 DECLARE_BITMAP(mask, 64);
1718 u32 history = st->jmp_history_cnt;
1719
1720 if (env->log.level & BPF_LOG_LEVEL)
1721 verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
1722 for (i = last_idx;;) {
1723 if (skip_first) {
1724 err = 0;
1725 skip_first = false;
1726 } else {
1727 err = backtrack_insn(env, i, &reg_mask, &stack_mask);
1728 }
1729 if (err == -ENOTSUPP) {
1730 mark_all_scalars_precise(env, st);
1731 return 0;
1732 } else if (err) {
1733 return err;
1734 }
1735 if (!reg_mask && !stack_mask)
1736 /* Found assignment(s) into tracked register in this state.
1737 * Since this state is already marked, just return.
1738 * Nothing to be tracked further in the parent state.
1739 */
1740 return 0;
1741 if (i == first_idx)
1742 break;
1743 i = get_prev_insn_idx(st, i, &history);
1744 if (i >= env->prog->len) {
1745 /* This can happen if backtracking reached insn 0
1746 * and there are still reg_mask or stack_mask
1747 * to backtrack.
1748 * It means the backtracking missed the spot where
1749 * particular register was initialized with a constant.
1750 */
1751 verbose(env, "BUG backtracking idx %d\n", i);
1752 WARN_ONCE(1, "verifier backtracking bug");
1753 return -EFAULT;
1754 }
1755 }
1756 st = st->parent;
1757 if (!st)
1758 break;
1759
1760 new_marks = false;
1761 func = st->frame[st->curframe];
1762 bitmap_from_u64(mask, reg_mask);
1763 for_each_set_bit(i, mask, 32) {
1764 reg = &func->regs[i];
1765 if (reg->type != SCALAR_VALUE) {
1766 reg_mask &= ~(1u << i);
1767 continue;
1768 }
1769 if (!reg->precise)
1770 new_marks = true;
1771 reg->precise = true;
1772 }
1773
1774 bitmap_from_u64(mask, stack_mask);
1775 for_each_set_bit(i, mask, 64) {
1776 if (i >= func->allocated_stack / BPF_REG_SIZE) {
1777 /* This can happen if backtracking
1778 * is propagating stack precision where
1779 * caller has larger stack frame
1780 * than callee, but backtrack_insn() should
1781 * have returned -ENOTSUPP.
1782 */
1783 verbose(env, "BUG spi %d stack_size %d\n",
1784 i, func->allocated_stack);
1785 WARN_ONCE(1, "verifier backtracking bug");
1786 return -EFAULT;
1787 }
1788
1789 if (func->stack[i].slot_type[0] != STACK_SPILL) {
1790 stack_mask &= ~(1ull << i);
1791 continue;
1792 }
1793 reg = &func->stack[i].spilled_ptr;
1794 if (reg->type != SCALAR_VALUE) {
1795 stack_mask &= ~(1ull << i);
1796 continue;
1797 }
1798 if (!reg->precise)
1799 new_marks = true;
1800 reg->precise = true;
1801 }
1802 if (env->log.level & BPF_LOG_LEVEL) {
1803 print_verifier_state(env, func);
1804 verbose(env, "parent %s regs=%x stack=%llx marks\n",
1805 new_marks ? "didn't have" : "already had",
1806 reg_mask, stack_mask);
1807 }
1808
1809 if (!reg_mask && !stack_mask)
1810 break;
1811 if (!new_marks)
1812 break;
1813
1814 last_idx = st->last_insn_idx;
1815 first_idx = st->first_insn_idx;
1816 }
1817 return 0;
1818}
1819
1820static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
1821{
1822 return __mark_chain_precision(env, regno, -1);
1823}
1824
1825static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
1826{
1827 return __mark_chain_precision(env, -1, spi);
1828}
1829
1213static bool is_spillable_regtype(enum bpf_reg_type type) 1830static bool is_spillable_regtype(enum bpf_reg_type type)
1214{ 1831{
1215 switch (type) { 1832 switch (type) {
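To make the precision machinery above concrete: it only kicks in when the exact bounds of a scalar are what make the program safe, for instance when the scalar ends up as the size argument of a helper. A hedged sketch in BPF C (compiled with clang -target bpf; SEC() and the helper declaration come from the selftests' bpf_helpers.h; this is not a test from this series):

#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("socket")
int len_clamp_example(struct __sk_buff *skb)
{
	char buf[64];
	__u64 len = skb->len;		/* unknown scalar */

	if (len < 1 || len > sizeof(buf))
		len = 1;		/* now provably in [1, 64] */

	/* 'len' feeds a size argument, so the verifier backtracks and marks
	 * everything that produced it as precise; unrelated scalars stay
	 * imprecise and state pruning can still treat them as equivalent.
	 */
	if (bpf_skb_load_bytes(skb, 0, buf, len) < 0)
		return 0;
	return buf[0];
}

char _license[] SEC("license") = "GPL";
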
@@ -1228,6 +1845,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
1228 case PTR_TO_SOCK_COMMON_OR_NULL: 1845 case PTR_TO_SOCK_COMMON_OR_NULL:
1229 case PTR_TO_TCP_SOCK: 1846 case PTR_TO_TCP_SOCK:
1230 case PTR_TO_TCP_SOCK_OR_NULL: 1847 case PTR_TO_TCP_SOCK_OR_NULL:
1848 case PTR_TO_XDP_SOCK:
1231 return true; 1849 return true;
1232 default: 1850 default:
1233 return false; 1851 return false;
@@ -1240,6 +1858,23 @@ static bool register_is_null(struct bpf_reg_state *reg)
1240 return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); 1858 return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
1241} 1859}
1242 1860
1861static bool register_is_const(struct bpf_reg_state *reg)
1862{
1863 return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
1864}
1865
1866static void save_register_state(struct bpf_func_state *state,
1867 int spi, struct bpf_reg_state *reg)
1868{
1869 int i;
1870
1871 state->stack[spi].spilled_ptr = *reg;
1872 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
1873
1874 for (i = 0; i < BPF_REG_SIZE; i++)
1875 state->stack[spi].slot_type[i] = STACK_SPILL;
1876}
1877
1243/* check_stack_read/write functions track spill/fill of registers, 1878/* check_stack_read/write functions track spill/fill of registers,
1244 * stack boundary and alignment are checked in check_mem_access() 1879 * stack boundary and alignment are checked in check_mem_access()
1245 */ 1880 */
@@ -1249,7 +1884,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
1249{ 1884{
1250 struct bpf_func_state *cur; /* state of the current function */ 1885 struct bpf_func_state *cur; /* state of the current function */
1251 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; 1886 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
1252 enum bpf_reg_type type; 1887 u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
1888 struct bpf_reg_state *reg = NULL;
1253 1889
1254 err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), 1890 err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
1255 state->acquired_refs, true); 1891 state->acquired_refs, true);
@@ -1266,27 +1902,48 @@ static int check_stack_write(struct bpf_verifier_env *env,
1266 } 1902 }
1267 1903
1268 cur = env->cur_state->frame[env->cur_state->curframe]; 1904 cur = env->cur_state->frame[env->cur_state->curframe];
1269 if (value_regno >= 0 && 1905 if (value_regno >= 0)
1270 is_spillable_regtype((type = cur->regs[value_regno].type))) { 1906 reg = &cur->regs[value_regno];
1271 1907
1908 if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
1909 !register_is_null(reg) && env->allow_ptr_leaks) {
1910 if (dst_reg != BPF_REG_FP) {
1911 /* The backtracking logic can only recognize explicit
1912 * stack slot address like [fp - 8]. Other spill of
1913 * scalar via a different register has to be conservative.
1914 * Backtrack from here and mark all registers as precise
1915 * that contributed into 'reg' being a constant.
1916 */
1917 err = mark_chain_precision(env, value_regno);
1918 if (err)
1919 return err;
1920 }
1921 save_register_state(state, spi, reg);
1922 } else if (reg && is_spillable_regtype(reg->type)) {
1272 /* register containing pointer is being spilled into stack */ 1923 /* register containing pointer is being spilled into stack */
1273 if (size != BPF_REG_SIZE) { 1924 if (size != BPF_REG_SIZE) {
1925 verbose_linfo(env, insn_idx, "; ");
1274 verbose(env, "invalid size of register spill\n"); 1926 verbose(env, "invalid size of register spill\n");
1275 return -EACCES; 1927 return -EACCES;
1276 } 1928 }
1277 1929
1278 if (state != cur && type == PTR_TO_STACK) { 1930 if (state != cur && reg->type == PTR_TO_STACK) {
1279 verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); 1931 verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
1280 return -EINVAL; 1932 return -EINVAL;
1281 } 1933 }
1282 1934
1283 /* save register state */ 1935 if (!env->allow_ptr_leaks) {
1284 state->stack[spi].spilled_ptr = cur->regs[value_regno]; 1936 bool sanitize = false;
1285 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
1286 1937
1287 for (i = 0; i < BPF_REG_SIZE; i++) { 1938 if (state->stack[spi].slot_type[0] == STACK_SPILL &&
1288 if (state->stack[spi].slot_type[i] == STACK_MISC && 1939 register_is_const(&state->stack[spi].spilled_ptr))
1289 !env->allow_ptr_leaks) { 1940 sanitize = true;
1941 for (i = 0; i < BPF_REG_SIZE; i++)
1942 if (state->stack[spi].slot_type[i] == STACK_MISC) {
1943 sanitize = true;
1944 break;
1945 }
1946 if (sanitize) {
1290 int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; 1947 int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
1291 int soff = (-spi - 1) * BPF_REG_SIZE; 1948 int soff = (-spi - 1) * BPF_REG_SIZE;
1292 1949
@@ -1309,8 +1966,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
1309 } 1966 }
1310 *poff = soff; 1967 *poff = soff;
1311 } 1968 }
1312 state->stack[spi].slot_type[i] = STACK_SPILL;
1313 } 1969 }
1970 save_register_state(state, spi, reg);
1314 } else { 1971 } else {
1315 u8 type = STACK_MISC; 1972 u8 type = STACK_MISC;
1316 1973
@@ -1333,9 +1990,13 @@ static int check_stack_write(struct bpf_verifier_env *env,
1333 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; 1990 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
1334 1991
1335 /* when we zero initialize stack slots mark them as such */ 1992 /* when we zero initialize stack slots mark them as such */
1336 if (value_regno >= 0 && 1993 if (reg && register_is_null(reg)) {
1337 register_is_null(&cur->regs[value_regno])) 1994 /* backtracking doesn't work for STACK_ZERO yet. */
1995 err = mark_chain_precision(env, value_regno);
1996 if (err)
1997 return err;
1338 type = STACK_ZERO; 1998 type = STACK_ZERO;
1999 }
1339 2000
1340 /* Mark slots affected by this stack write. */ 2001 /* Mark slots affected by this stack write. */
1341 for (i = 0; i < size; i++) 2002 for (i = 0; i < size; i++)
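For readers following the new spill handling above: save_register_state() and the register_is_*() predicates it pairs with are introduced earlier in this patch. The sketch below is a restatement inferred from the call sites shown here, not an authoritative copy of those hunks.

static bool register_is_null(struct bpf_reg_state *reg)
{
	return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
}

static bool register_is_const(struct bpf_reg_state *reg)
{
	return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
}

/* Keep the whole bpf_reg_state of a spilled register, so a later fill can
 * restore the exact value (and its precision mark) instead of degrading
 * the slot to STACK_MISC.
 */
static void save_register_state(struct bpf_func_state *state,
				int spi, struct bpf_reg_state *reg)
{
	int i;

	state->stack[spi].spilled_ptr = *reg;
	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;

	for (i = 0; i < BPF_REG_SIZE; i++)
		state->stack[spi].slot_type[i] = STACK_SPILL;
}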
@@ -1352,6 +2013,7 @@ static int check_stack_read(struct bpf_verifier_env *env,
1352 struct bpf_verifier_state *vstate = env->cur_state; 2013 struct bpf_verifier_state *vstate = env->cur_state;
1353 struct bpf_func_state *state = vstate->frame[vstate->curframe]; 2014 struct bpf_func_state *state = vstate->frame[vstate->curframe];
1354 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; 2015 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
2016 struct bpf_reg_state *reg;
1355 u8 *stype; 2017 u8 *stype;
1356 2018
1357 if (reg_state->allocated_stack <= slot) { 2019 if (reg_state->allocated_stack <= slot) {
@@ -1360,11 +2022,21 @@ static int check_stack_read(struct bpf_verifier_env *env,
1360 return -EACCES; 2022 return -EACCES;
1361 } 2023 }
1362 stype = reg_state->stack[spi].slot_type; 2024 stype = reg_state->stack[spi].slot_type;
2025 reg = &reg_state->stack[spi].spilled_ptr;
1363 2026
1364 if (stype[0] == STACK_SPILL) { 2027 if (stype[0] == STACK_SPILL) {
1365 if (size != BPF_REG_SIZE) { 2028 if (size != BPF_REG_SIZE) {
1366 verbose(env, "invalid size of register spill\n"); 2029 if (reg->type != SCALAR_VALUE) {
1367 return -EACCES; 2030 verbose_linfo(env, env->insn_idx, "; ");
2031 verbose(env, "invalid size of register fill\n");
2032 return -EACCES;
2033 }
2034 if (value_regno >= 0) {
2035 mark_reg_unknown(env, state->regs, value_regno);
2036 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
2037 }
2038 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
2039 return 0;
1368 } 2040 }
1369 for (i = 1; i < BPF_REG_SIZE; i++) { 2041 for (i = 1; i < BPF_REG_SIZE; i++) {
1370 if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { 2042 if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
@@ -1375,16 +2047,14 @@ static int check_stack_read(struct bpf_verifier_env *env,
1375 2047
1376 if (value_regno >= 0) { 2048 if (value_regno >= 0) {
1377 /* restore register state from stack */ 2049 /* restore register state from stack */
1378 state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; 2050 state->regs[value_regno] = *reg;
1379 /* mark reg as written since spilled pointer state likely 2051 /* mark reg as written since spilled pointer state likely
1380 * has its liveness marks cleared by is_state_visited() 2052 * has its liveness marks cleared by is_state_visited()
1381 * which resets stack/reg liveness for state transitions 2053 * which resets stack/reg liveness for state transitions
1382 */ 2054 */
1383 state->regs[value_regno].live |= REG_LIVE_WRITTEN; 2055 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
1384 } 2056 }
1385 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 2057 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
1386 reg_state->stack[spi].spilled_ptr.parent);
1387 return 0;
1388 } else { 2058 } else {
1389 int zeros = 0; 2059 int zeros = 0;
1390 2060
@@ -1399,22 +2069,32 @@ static int check_stack_read(struct bpf_verifier_env *env,
1399 off, i, size); 2069 off, i, size);
1400 return -EACCES; 2070 return -EACCES;
1401 } 2071 }
1402 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 2072 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
1403 reg_state->stack[spi].spilled_ptr.parent);
1404 if (value_regno >= 0) { 2073 if (value_regno >= 0) {
1405 if (zeros == size) { 2074 if (zeros == size) {
1406 /* any size read into register is zero extended, 2075 /* any size read into register is zero extended,
1407 * so the whole register == const_zero 2076 * so the whole register == const_zero
1408 */ 2077 */
1409 __mark_reg_const_zero(&state->regs[value_regno]); 2078 __mark_reg_const_zero(&state->regs[value_regno]);
2079 /* backtracking doesn't support STACK_ZERO yet,
2080 * so mark it precise here, so that later
2081 * backtracking can stop here.
2082 * Backtracking may not need this if this register
2083 * doesn't participate in pointer adjustment.
2084 * Forward propagation of precise flag is not
2085 * necessary either. This mark is only to stop
2086 * backtracking. Any register that contributed
2087 * to const 0 was marked precise before spill.
2088 */
2089 state->regs[value_regno].precise = true;
1410 } else { 2090 } else {
1411 /* have read misc data from the stack */ 2091 /* have read misc data from the stack */
1412 mark_reg_unknown(env, state->regs, value_regno); 2092 mark_reg_unknown(env, state->regs, value_regno);
1413 } 2093 }
1414 state->regs[value_regno].live |= REG_LIVE_WRITTEN; 2094 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
1415 } 2095 }
1416 return 0;
1417 } 2096 }
2097 return 0;
1418} 2098}
1419 2099
1420static int check_stack_access(struct bpf_verifier_env *env, 2100static int check_stack_access(struct bpf_verifier_env *env,
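A hedged BPF-C illustration of what the constant spill/fill tracking above enables. Assumptions of the example, not of this patch: clang actually spills `bound` to fp-8, and the header paths/section name follow current libbpf conventions.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("socket")
int const_spill_fill(struct __sk_buff *skb)
{
	/* Forced through the stack; with precise constant-spill tracking the
	 * verifier still knows bound == 64 after it is loaded back and used
	 * to cap the value returned below.
	 */
	volatile __u64 bound = 64;
	__u64 off = skb->len & 0xff;

	if (off < bound)
		return (int)off;
	return 0;
}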
@@ -1580,6 +2260,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
1580 2260
1581 env->seen_direct_write = true; 2261 env->seen_direct_write = true;
1582 return true; 2262 return true;
2263
2264 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2265 if (t == BPF_WRITE)
2266 env->seen_direct_write = true;
2267
2268 return true;
2269
1583 default: 2270 default:
1584 return false; 2271 return false;
1585 } 2272 }
@@ -1706,6 +2393,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
1706 case PTR_TO_TCP_SOCK: 2393 case PTR_TO_TCP_SOCK:
1707 valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); 2394 valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
1708 break; 2395 break;
2396 case PTR_TO_XDP_SOCK:
2397 valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
2398 break;
1709 default: 2399 default:
1710 valid = false; 2400 valid = false;
1711 } 2401 }
@@ -1870,6 +2560,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
1870 case PTR_TO_TCP_SOCK: 2560 case PTR_TO_TCP_SOCK:
1871 pointer_desc = "tcp_sock "; 2561 pointer_desc = "tcp_sock ";
1872 break; 2562 break;
2563 case PTR_TO_XDP_SOCK:
2564 pointer_desc = "xdp_sock ";
2565 break;
1873 default: 2566 default:
1874 break; 2567 break;
1875 } 2568 }
@@ -2109,6 +2802,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2109 value_regno); 2802 value_regno);
2110 if (reg_type_may_be_null(reg_type)) 2803 if (reg_type_may_be_null(reg_type))
2111 regs[value_regno].id = ++env->id_gen; 2804 regs[value_regno].id = ++env->id_gen;
2805			/* A load of a ctx field could have an actual load
2806			 * size different from the one encoded in the insn.
2807			 * When the dst is a pointer, it is certainly not a
2808			 * sub-register.
2809			 */
2810 regs[value_regno].subreg_def = DEF_NOT_SUBREG;
2112 } 2811 }
2113 regs[value_regno].type = reg_type; 2812 regs[value_regno].type = reg_type;
2114 } 2813 }
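The subreg_def update above feeds the 32-bit zero-extension pass added later in this patch. A sketch of the consumer side, inferred from the mark_insn_zext() call in propagate_liveness() further down; treat the details as assumptions:

/* Each register remembers the index of the insn that last defined it as a
 * 32-bit sub-register (stored as insn_idx + 1, see check_alu_op()), or
 * DEF_NOT_SUBREG for a full 64-bit definition. When a 64-bit read is
 * propagated to such a register, the defining insn must zero-extend.
 */
static void mark_insn_zext(struct bpf_verifier_env *env,
			   struct bpf_reg_state *reg)
{
	s32 def_idx = reg->subreg_def;

	if (def_idx == DEF_NOT_SUBREG)
		return;

	env->insn_aux_data[def_idx - 1].zext_dst = true;
	/* The dst will be zero extended, so it is no longer a sub-register. */
	reg->subreg_def = DEF_NOT_SUBREG;
}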
@@ -2263,7 +2962,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
2263{ 2962{
2264 struct bpf_reg_state *reg = reg_state(env, regno); 2963 struct bpf_reg_state *reg = reg_state(env, regno);
2265 struct bpf_func_state *state = func(env, reg); 2964 struct bpf_func_state *state = func(env, reg);
2266 int err, min_off, max_off, i, slot, spi; 2965 int err, min_off, max_off, i, j, slot, spi;
2267 2966
2268 if (reg->type != PTR_TO_STACK) { 2967 if (reg->type != PTR_TO_STACK) {
2269 /* Allow zero-byte read from NULL, regardless of pointer type */ 2968 /* Allow zero-byte read from NULL, regardless of pointer type */
@@ -2351,6 +3050,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
2351 *stype = STACK_MISC; 3050 *stype = STACK_MISC;
2352 goto mark; 3051 goto mark;
2353 } 3052 }
3053 if (state->stack[spi].slot_type[0] == STACK_SPILL &&
3054 state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
3055 __mark_reg_unknown(&state->stack[spi].spilled_ptr);
3056 for (j = 0; j < BPF_REG_SIZE; j++)
3057 state->stack[spi].slot_type[j] = STACK_MISC;
3058 goto mark;
3059 }
3060
2354err: 3061err:
2355 if (tnum_is_const(reg->var_off)) { 3062 if (tnum_is_const(reg->var_off)) {
2356 verbose(env, "invalid indirect read from stack off %d+%d size %d\n", 3063 verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
@@ -2368,7 +3075,8 @@ mark:
2368 * the whole slot to be marked as 'read' 3075 * the whole slot to be marked as 'read'
2369 */ 3076 */
2370 mark_reg_read(env, &state->stack[spi].spilled_ptr, 3077 mark_reg_read(env, &state->stack[spi].spilled_ptr,
2371 state->stack[spi].spilled_ptr.parent); 3078 state->stack[spi].spilled_ptr.parent,
3079 REG_LIVE_READ64);
2372 } 3080 }
2373 return update_stack_depth(env, state, min_off); 3081 return update_stack_depth(env, state, min_off);
2374} 3082}
@@ -2701,6 +3409,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
2701 err = check_helper_mem_access(env, regno - 1, 3409 err = check_helper_mem_access(env, regno - 1,
2702 reg->umax_value, 3410 reg->umax_value,
2703 zero_size_allowed, meta); 3411 zero_size_allowed, meta);
3412 if (!err)
3413 err = mark_chain_precision(env, regno);
2704 } else if (arg_type_is_int_ptr(arg_type)) { 3414 } else if (arg_type_is_int_ptr(arg_type)) {
2705 int size = int_ptr_type_to_size(arg_type); 3415 int size = int_ptr_type_to_size(arg_type);
2706 3416
@@ -2749,22 +3459,23 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
2749 if (func_id != BPF_FUNC_get_local_storage) 3459 if (func_id != BPF_FUNC_get_local_storage)
2750 goto error; 3460 goto error;
2751 break; 3461 break;
2752 /* devmap returns a pointer to a live net_device ifindex that we cannot
2753 * allow to be modified from bpf side. So do not allow lookup elements
2754 * for now.
2755 */
2756 case BPF_MAP_TYPE_DEVMAP: 3462 case BPF_MAP_TYPE_DEVMAP:
2757 if (func_id != BPF_FUNC_redirect_map) 3463 if (func_id != BPF_FUNC_redirect_map &&
3464 func_id != BPF_FUNC_map_lookup_elem)
2758 goto error; 3465 goto error;
2759 break; 3466 break;
2760 /* Restrict bpf side of cpumap and xskmap, open when use-cases 3467 /* Restrict bpf side of cpumap and xskmap, open when use-cases
2761 * appear. 3468 * appear.
2762 */ 3469 */
2763 case BPF_MAP_TYPE_CPUMAP: 3470 case BPF_MAP_TYPE_CPUMAP:
2764 case BPF_MAP_TYPE_XSKMAP:
2765 if (func_id != BPF_FUNC_redirect_map) 3471 if (func_id != BPF_FUNC_redirect_map)
2766 goto error; 3472 goto error;
2767 break; 3473 break;
3474 case BPF_MAP_TYPE_XSKMAP:
3475 if (func_id != BPF_FUNC_redirect_map &&
3476 func_id != BPF_FUNC_map_lookup_elem)
3477 goto error;
3478 break;
2768 case BPF_MAP_TYPE_ARRAY_OF_MAPS: 3479 case BPF_MAP_TYPE_ARRAY_OF_MAPS:
2769 case BPF_MAP_TYPE_HASH_OF_MAPS: 3480 case BPF_MAP_TYPE_HASH_OF_MAPS:
2770 if (func_id != BPF_FUNC_map_lookup_elem) 3481 if (func_id != BPF_FUNC_map_lookup_elem)
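With DEVMAP and XSKMAP element lookups now permitted from BPF programs, an XDP program can inspect a slot before redirecting. A hedged sketch follows; the BTF-style map definition and header paths are conventions of the example rather than part of this patch, and the lookup result is the read-only PTR_TO_XDP_SOCK introduced by the verifier changes above.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_XSKMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u32);
} xsks_map SEC(".maps");

SEC("xdp")
int xdp_sock_redirect(struct xdp_md *ctx)
{
	__u32 qid = ctx->rx_queue_index;

	/* A NULL check is all that is needed; the pointed-to xdp_sock is not
	 * writable from BPF.
	 */
	if (bpf_map_lookup_elem(&xsks_map, &qid))
		return bpf_redirect_map(&xsks_map, qid, 0);
	return XDP_PASS;
}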
@@ -3332,6 +4043,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3332 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); 4043 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
3333 } 4044 }
3334 4045
4046 /* helper call returns 64-bit value. */
4047 regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
4048
3335 /* update return register (already marked as written above) */ 4049 /* update return register (already marked as written above) */
3336 if (fn->ret_type == RET_INTEGER) { 4050 if (fn->ret_type == RET_INTEGER) {
3337 /* sets type to SCALAR_VALUE */ 4051 /* sets type to SCALAR_VALUE */
@@ -3652,6 +4366,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
3652 case PTR_TO_SOCK_COMMON_OR_NULL: 4366 case PTR_TO_SOCK_COMMON_OR_NULL:
3653 case PTR_TO_TCP_SOCK: 4367 case PTR_TO_TCP_SOCK:
3654 case PTR_TO_TCP_SOCK_OR_NULL: 4368 case PTR_TO_TCP_SOCK_OR_NULL:
4369 case PTR_TO_XDP_SOCK:
3655 verbose(env, "R%d pointer arithmetic on %s prohibited\n", 4370 verbose(env, "R%d pointer arithmetic on %s prohibited\n",
3656 dst, reg_type_str[ptr_reg->type]); 4371 dst, reg_type_str[ptr_reg->type]);
3657 return -EACCES; 4372 return -EACCES;
@@ -4129,6 +4844,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
4129 struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; 4844 struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
4130 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; 4845 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
4131 u8 opcode = BPF_OP(insn->code); 4846 u8 opcode = BPF_OP(insn->code);
4847 int err;
4132 4848
4133 dst_reg = &regs[insn->dst_reg]; 4849 dst_reg = &regs[insn->dst_reg];
4134 src_reg = NULL; 4850 src_reg = NULL;
@@ -4155,11 +4871,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
4155 * This is legal, but we have to reverse our 4871 * This is legal, but we have to reverse our
4156 * src/dest handling in computing the range 4872 * src/dest handling in computing the range
4157 */ 4873 */
4874 err = mark_chain_precision(env, insn->dst_reg);
4875 if (err)
4876 return err;
4158 return adjust_ptr_min_max_vals(env, insn, 4877 return adjust_ptr_min_max_vals(env, insn,
4159 src_reg, dst_reg); 4878 src_reg, dst_reg);
4160 } 4879 }
4161 } else if (ptr_reg) { 4880 } else if (ptr_reg) {
4162 /* pointer += scalar */ 4881 /* pointer += scalar */
4882 err = mark_chain_precision(env, insn->src_reg);
4883 if (err)
4884 return err;
4163 return adjust_ptr_min_max_vals(env, insn, 4885 return adjust_ptr_min_max_vals(env, insn,
4164 dst_reg, src_reg); 4886 dst_reg, src_reg);
4165 } 4887 }
@@ -4263,6 +4985,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
4263 */ 4985 */
4264 *dst_reg = *src_reg; 4986 *dst_reg = *src_reg;
4265 dst_reg->live |= REG_LIVE_WRITTEN; 4987 dst_reg->live |= REG_LIVE_WRITTEN;
4988 dst_reg->subreg_def = DEF_NOT_SUBREG;
4266 } else { 4989 } else {
4267 /* R1 = (u32) R2 */ 4990 /* R1 = (u32) R2 */
4268 if (is_pointer_value(env, insn->src_reg)) { 4991 if (is_pointer_value(env, insn->src_reg)) {
@@ -4273,6 +4996,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
4273 } else if (src_reg->type == SCALAR_VALUE) { 4996 } else if (src_reg->type == SCALAR_VALUE) {
4274 *dst_reg = *src_reg; 4997 *dst_reg = *src_reg;
4275 dst_reg->live |= REG_LIVE_WRITTEN; 4998 dst_reg->live |= REG_LIVE_WRITTEN;
4999 dst_reg->subreg_def = env->insn_idx + 1;
4276 } else { 5000 } else {
4277 mark_reg_unknown(env, regs, 5001 mark_reg_unknown(env, regs,
4278 insn->dst_reg); 5002 insn->dst_reg);
@@ -4889,6 +5613,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
4889 if (reg->map_ptr->inner_map_meta) { 5613 if (reg->map_ptr->inner_map_meta) {
4890 reg->type = CONST_PTR_TO_MAP; 5614 reg->type = CONST_PTR_TO_MAP;
4891 reg->map_ptr = reg->map_ptr->inner_map_meta; 5615 reg->map_ptr = reg->map_ptr->inner_map_meta;
5616 } else if (reg->map_ptr->map_type ==
5617 BPF_MAP_TYPE_XSKMAP) {
5618 reg->type = PTR_TO_XDP_SOCK;
4892 } else { 5619 } else {
4893 reg->type = PTR_TO_MAP_VALUE; 5620 reg->type = PTR_TO_MAP_VALUE;
4894 } 5621 }
@@ -5060,9 +5787,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
5060 struct bpf_verifier_state *this_branch = env->cur_state; 5787 struct bpf_verifier_state *this_branch = env->cur_state;
5061 struct bpf_verifier_state *other_branch; 5788 struct bpf_verifier_state *other_branch;
5062 struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; 5789 struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
5063 struct bpf_reg_state *dst_reg, *other_branch_regs; 5790 struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
5064 u8 opcode = BPF_OP(insn->code); 5791 u8 opcode = BPF_OP(insn->code);
5065 bool is_jmp32; 5792 bool is_jmp32;
5793 int pred = -1;
5066 int err; 5794 int err;
5067 5795
5068 /* Only conditional jumps are expected to reach here. */ 5796 /* Only conditional jumps are expected to reach here. */
@@ -5087,6 +5815,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
5087 insn->src_reg); 5815 insn->src_reg);
5088 return -EACCES; 5816 return -EACCES;
5089 } 5817 }
5818 src_reg = &regs[insn->src_reg];
5090 } else { 5819 } else {
5091 if (insn->src_reg != BPF_REG_0) { 5820 if (insn->src_reg != BPF_REG_0) {
5092 verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); 5821 verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -5102,20 +5831,29 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
5102 dst_reg = &regs[insn->dst_reg]; 5831 dst_reg = &regs[insn->dst_reg];
5103 is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; 5832 is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
5104 5833
5105 if (BPF_SRC(insn->code) == BPF_K) { 5834 if (BPF_SRC(insn->code) == BPF_K)
5106 int pred = is_branch_taken(dst_reg, insn->imm, opcode, 5835 pred = is_branch_taken(dst_reg, insn->imm,
5107 is_jmp32); 5836 opcode, is_jmp32);
5108 5837 else if (src_reg->type == SCALAR_VALUE &&
5109 if (pred == 1) { 5838 tnum_is_const(src_reg->var_off))
5110 /* only follow the goto, ignore fall-through */ 5839 pred = is_branch_taken(dst_reg, src_reg->var_off.value,
5111 *insn_idx += insn->off; 5840 opcode, is_jmp32);
5112 return 0; 5841 if (pred >= 0) {
5113 } else if (pred == 0) { 5842 err = mark_chain_precision(env, insn->dst_reg);
5114 /* only follow fall-through branch, since 5843 if (BPF_SRC(insn->code) == BPF_X && !err)
5115 * that's where the program will go 5844 err = mark_chain_precision(env, insn->src_reg);
5116 */ 5845 if (err)
5117 return 0; 5846 return err;
5118 } 5847 }
5848 if (pred == 1) {
5849 /* only follow the goto, ignore fall-through */
5850 *insn_idx += insn->off;
5851 return 0;
5852 } else if (pred == 0) {
5853 /* only follow fall-through branch, since
5854 * that's where the program will go
5855 */
5856 return 0;
5119 } 5857 }
5120 5858
5121 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, 5859 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
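The BPF_X prediction added above means a compare between two registers whose values are both known can be resolved at verification time, so only the live arm is explored. A hedged illustration; whether clang emits a register-register compare here is an assumption, and the volatile only keeps the constant out of the immediate field.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("socket")
int dead_branch(struct __sk_buff *skb)
{
	volatile __u32 a = 4;
	__u32 b = 8;

	if (a > b)			/* both operands known: branch never taken */
		return skb->len;	/* not explored from this branch */
	return 0;
}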
@@ -5352,21 +6090,32 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
5352 * Already marked as written above. 6090 * Already marked as written above.
5353 */ 6091 */
5354 mark_reg_unknown(env, regs, BPF_REG_0); 6092 mark_reg_unknown(env, regs, BPF_REG_0);
6093 /* ld_abs load up to 32-bit skb data. */
6094 regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
5355 return 0; 6095 return 0;
5356} 6096}
5357 6097
5358static int check_return_code(struct bpf_verifier_env *env) 6098static int check_return_code(struct bpf_verifier_env *env)
5359{ 6099{
6100 struct tnum enforce_attach_type_range = tnum_unknown;
5360 struct bpf_reg_state *reg; 6101 struct bpf_reg_state *reg;
5361 struct tnum range = tnum_range(0, 1); 6102 struct tnum range = tnum_range(0, 1);
5362 6103
5363 switch (env->prog->type) { 6104 switch (env->prog->type) {
6105 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
6106 if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
6107 env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
6108 range = tnum_range(1, 1);
5364 case BPF_PROG_TYPE_CGROUP_SKB: 6109 case BPF_PROG_TYPE_CGROUP_SKB:
6110 if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
6111 range = tnum_range(0, 3);
6112 enforce_attach_type_range = tnum_range(2, 3);
6113 }
5365 case BPF_PROG_TYPE_CGROUP_SOCK: 6114 case BPF_PROG_TYPE_CGROUP_SOCK:
5366 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
5367 case BPF_PROG_TYPE_SOCK_OPS: 6115 case BPF_PROG_TYPE_SOCK_OPS:
5368 case BPF_PROG_TYPE_CGROUP_DEVICE: 6116 case BPF_PROG_TYPE_CGROUP_DEVICE:
5369 case BPF_PROG_TYPE_CGROUP_SYSCTL: 6117 case BPF_PROG_TYPE_CGROUP_SYSCTL:
6118 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
5370 break; 6119 break;
5371 default: 6120 default:
5372 return 0; 6121 return 0;
@@ -5380,18 +6129,23 @@ static int check_return_code(struct bpf_verifier_env *env)
5380 } 6129 }
5381 6130
5382 if (!tnum_in(range, reg->var_off)) { 6131 if (!tnum_in(range, reg->var_off)) {
6132 char tn_buf[48];
6133
5383 verbose(env, "At program exit the register R0 "); 6134 verbose(env, "At program exit the register R0 ");
5384 if (!tnum_is_unknown(reg->var_off)) { 6135 if (!tnum_is_unknown(reg->var_off)) {
5385 char tn_buf[48];
5386
5387 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); 6136 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
5388 verbose(env, "has value %s", tn_buf); 6137 verbose(env, "has value %s", tn_buf);
5389 } else { 6138 } else {
5390 verbose(env, "has unknown scalar value"); 6139 verbose(env, "has unknown scalar value");
5391 } 6140 }
5392 verbose(env, " should have been 0 or 1\n"); 6141 tnum_strn(tn_buf, sizeof(tn_buf), range);
6142 verbose(env, " should have been in %s\n", tn_buf);
5393 return -EINVAL; 6143 return -EINVAL;
5394 } 6144 }
6145
6146 if (!tnum_is_unknown(enforce_attach_type_range) &&
6147 tnum_in(enforce_attach_type_range, reg->var_off))
6148 env->prog->enforce_expected_attach_type = 1;
5395 return 0; 6149 return 0;
5396} 6150}
5397 6151
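A hedged example of the relaxed return-code check for BPF_CGROUP_INET_EGRESS programs: any value in [0, 3] now verifies, and returning 2 or 3 sets enforce_expected_attach_type at load time. The runtime meaning of 2 and 3 (congestion-notification-style feedback) lives in the cgroup changes elsewhere in this series and is only assumed here.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("cgroup_skb/egress")
int egress_rc(struct __sk_buff *skb)
{
	if (skb->len > 1400)
		return 3;	/* accepted by the wider tnum_range(0, 3) */
	return 1;		/* plain "pass" */
}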
@@ -5435,14 +6189,33 @@ enum {
5435 BRANCH = 2, 6189 BRANCH = 2,
5436}; 6190};
5437 6191
5438#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) 6192static u32 state_htab_size(struct bpf_verifier_env *env)
6193{
6194 return env->prog->len;
6195}
6196
6197static struct bpf_verifier_state_list **explored_state(
6198 struct bpf_verifier_env *env,
6199 int idx)
6200{
6201 struct bpf_verifier_state *cur = env->cur_state;
6202 struct bpf_func_state *state = cur->frame[cur->curframe];
6203
6204 return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
6205}
6206
6207static void init_explored_state(struct bpf_verifier_env *env, int idx)
6208{
6209 env->insn_aux_data[idx].prune_point = true;
6210}
5439 6211
5440/* t, w, e - match pseudo-code above: 6212/* t, w, e - match pseudo-code above:
5441 * t - index of current instruction 6213 * t - index of current instruction
5442 * w - next instruction 6214 * w - next instruction
5443 * e - edge 6215 * e - edge
5444 */ 6216 */
5445static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) 6217static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
6218 bool loop_ok)
5446{ 6219{
5447 int *insn_stack = env->cfg.insn_stack; 6220 int *insn_stack = env->cfg.insn_stack;
5448 int *insn_state = env->cfg.insn_state; 6221 int *insn_state = env->cfg.insn_state;
@@ -5461,7 +6234,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
5461 6234
5462 if (e == BRANCH) 6235 if (e == BRANCH)
5463 /* mark branch target for state pruning */ 6236 /* mark branch target for state pruning */
5464 env->explored_states[w] = STATE_LIST_MARK; 6237 init_explored_state(env, w);
5465 6238
5466 if (insn_state[w] == 0) { 6239 if (insn_state[w] == 0) {
5467 /* tree-edge */ 6240 /* tree-edge */
@@ -5472,6 +6245,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
5472 insn_stack[env->cfg.cur_stack++] = w; 6245 insn_stack[env->cfg.cur_stack++] = w;
5473 return 1; 6246 return 1;
5474 } else if ((insn_state[w] & 0xF0) == DISCOVERED) { 6247 } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
6248 if (loop_ok && env->allow_ptr_leaks)
6249 return 0;
5475 verbose_linfo(env, t, "%d: ", t); 6250 verbose_linfo(env, t, "%d: ", t);
5476 verbose_linfo(env, w, "%d: ", w); 6251 verbose_linfo(env, w, "%d: ", w);
5477 verbose(env, "back-edge from insn %d to %d\n", t, w); 6252 verbose(env, "back-edge from insn %d to %d\n", t, w);
@@ -5523,16 +6298,17 @@ peek_stack:
5523 if (opcode == BPF_EXIT) { 6298 if (opcode == BPF_EXIT) {
5524 goto mark_explored; 6299 goto mark_explored;
5525 } else if (opcode == BPF_CALL) { 6300 } else if (opcode == BPF_CALL) {
5526 ret = push_insn(t, t + 1, FALLTHROUGH, env); 6301 ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
5527 if (ret == 1) 6302 if (ret == 1)
5528 goto peek_stack; 6303 goto peek_stack;
5529 else if (ret < 0) 6304 else if (ret < 0)
5530 goto err_free; 6305 goto err_free;
5531 if (t + 1 < insn_cnt) 6306 if (t + 1 < insn_cnt)
5532 env->explored_states[t + 1] = STATE_LIST_MARK; 6307 init_explored_state(env, t + 1);
5533 if (insns[t].src_reg == BPF_PSEUDO_CALL) { 6308 if (insns[t].src_reg == BPF_PSEUDO_CALL) {
5534 env->explored_states[t] = STATE_LIST_MARK; 6309 init_explored_state(env, t);
5535 ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); 6310 ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
6311 env, false);
5536 if (ret == 1) 6312 if (ret == 1)
5537 goto peek_stack; 6313 goto peek_stack;
5538 else if (ret < 0) 6314 else if (ret < 0)
@@ -5545,26 +6321,31 @@ peek_stack:
5545 } 6321 }
5546 /* unconditional jump with single edge */ 6322 /* unconditional jump with single edge */
5547 ret = push_insn(t, t + insns[t].off + 1, 6323 ret = push_insn(t, t + insns[t].off + 1,
5548 FALLTHROUGH, env); 6324 FALLTHROUGH, env, true);
5549 if (ret == 1) 6325 if (ret == 1)
5550 goto peek_stack; 6326 goto peek_stack;
5551 else if (ret < 0) 6327 else if (ret < 0)
5552 goto err_free; 6328 goto err_free;
6329 /* unconditional jmp is not a good pruning point,
6330 * but it's marked, since backtracking needs
6331 * to record jmp history in is_state_visited().
6332 */
6333 init_explored_state(env, t + insns[t].off + 1);
5553 /* tell verifier to check for equivalent states 6334 /* tell verifier to check for equivalent states
5554 * after every call and jump 6335 * after every call and jump
5555 */ 6336 */
5556 if (t + 1 < insn_cnt) 6337 if (t + 1 < insn_cnt)
5557 env->explored_states[t + 1] = STATE_LIST_MARK; 6338 init_explored_state(env, t + 1);
5558 } else { 6339 } else {
5559 /* conditional jump with two edges */ 6340 /* conditional jump with two edges */
5560 env->explored_states[t] = STATE_LIST_MARK; 6341 init_explored_state(env, t);
5561 ret = push_insn(t, t + 1, FALLTHROUGH, env); 6342 ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
5562 if (ret == 1) 6343 if (ret == 1)
5563 goto peek_stack; 6344 goto peek_stack;
5564 else if (ret < 0) 6345 else if (ret < 0)
5565 goto err_free; 6346 goto err_free;
5566 6347
5567 ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); 6348 ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
5568 if (ret == 1) 6349 if (ret == 1)
5569 goto peek_stack; 6350 goto peek_stack;
5570 else if (ret < 0) 6351 else if (ret < 0)
@@ -5574,7 +6355,7 @@ peek_stack:
5574 /* all other non-branch instructions with single 6355 /* all other non-branch instructions with single
5575 * fall-through edge 6356 * fall-through edge
5576 */ 6357 */
5577 ret = push_insn(t, t + 1, FALLTHROUGH, env); 6358 ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
5578 if (ret == 1) 6359 if (ret == 1)
5579 goto peek_stack; 6360 goto peek_stack;
5580 else if (ret < 0) 6361 else if (ret < 0)
@@ -6005,12 +6786,12 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
6005 struct bpf_verifier_state_list *sl; 6786 struct bpf_verifier_state_list *sl;
6006 int i; 6787 int i;
6007 6788
6008 sl = env->explored_states[insn]; 6789 sl = *explored_state(env, insn);
6009 if (!sl) 6790 while (sl) {
6010 return; 6791 if (sl->state.branches)
6011 6792 goto next;
6012 while (sl != STATE_LIST_MARK) { 6793 if (sl->state.insn_idx != insn ||
6013 if (sl->state.curframe != cur->curframe) 6794 sl->state.curframe != cur->curframe)
6014 goto next; 6795 goto next;
6015 for (i = 0; i <= cur->curframe; i++) 6796 for (i = 0; i <= cur->curframe; i++)
6016 if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) 6797 if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
@@ -6050,6 +6831,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
6050 switch (rold->type) { 6831 switch (rold->type) {
6051 case SCALAR_VALUE: 6832 case SCALAR_VALUE:
6052 if (rcur->type == SCALAR_VALUE) { 6833 if (rcur->type == SCALAR_VALUE) {
6834 if (!rold->precise && !rcur->precise)
6835 return true;
6053 /* new val must satisfy old val knowledge */ 6836 /* new val must satisfy old val knowledge */
6054 return range_within(rold, rcur) && 6837 return range_within(rold, rcur) &&
6055 tnum_in(rold->var_off, rcur->var_off); 6838 tnum_in(rold->var_off, rcur->var_off);
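The early return added to the SCALAR_VALUE case above is the pruning payoff of precision tracking; in effect the scalar comparison in regsafe() now behaves like this restatement:

/* Restated from the hunk above, not new code: imprecise scalars always
 * match, precise ones still require the old range/tnum containment.
 */
static bool scalars_regsafe(struct bpf_reg_state *rold,
			    struct bpf_reg_state *rcur)
{
	if (!rold->precise && !rcur->precise)
		return true;
	/* new val must satisfy old val knowledge */
	return range_within(rold, rcur) &&
	       tnum_in(rold->var_off, rcur->var_off);
}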
@@ -6122,6 +6905,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
6122 case PTR_TO_SOCK_COMMON_OR_NULL: 6905 case PTR_TO_SOCK_COMMON_OR_NULL:
6123 case PTR_TO_TCP_SOCK: 6906 case PTR_TO_TCP_SOCK:
6124 case PTR_TO_TCP_SOCK_OR_NULL: 6907 case PTR_TO_TCP_SOCK_OR_NULL:
6908 case PTR_TO_XDP_SOCK:
6125 /* Only valid matches are exact, which memcmp() above 6909 /* Only valid matches are exact, which memcmp() above
6126 * would have accepted 6910 * would have accepted
6127 */ 6911 */
@@ -6292,20 +7076,33 @@ static bool states_equal(struct bpf_verifier_env *env,
6292 return true; 7076 return true;
6293} 7077}
6294 7078
7079/* Return 0 if no propagation happened. Return negative error code if error
7080 * happened. Otherwise, return the propagated bit.
7081 */
6295static int propagate_liveness_reg(struct bpf_verifier_env *env, 7082static int propagate_liveness_reg(struct bpf_verifier_env *env,
6296 struct bpf_reg_state *reg, 7083 struct bpf_reg_state *reg,
6297 struct bpf_reg_state *parent_reg) 7084 struct bpf_reg_state *parent_reg)
6298{ 7085{
7086 u8 parent_flag = parent_reg->live & REG_LIVE_READ;
7087 u8 flag = reg->live & REG_LIVE_READ;
6299 int err; 7088 int err;
6300 7089
6301 if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) 7090 /* When comes here, read flags of PARENT_REG or REG could be any of
7091 * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
7092 * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
7093 */
7094 if (parent_flag == REG_LIVE_READ64 ||
7095 /* Or if there is no read flag from REG. */
7096 !flag ||
7097 /* Or if the read flag from REG is the same as PARENT_REG. */
7098 parent_flag == flag)
6302 return 0; 7099 return 0;
6303 7100
6304 err = mark_reg_read(env, reg, parent_reg); 7101 err = mark_reg_read(env, reg, parent_reg, flag);
6305 if (err) 7102 if (err)
6306 return err; 7103 return err;
6307 7104
6308 return 0; 7105 return flag;
6309} 7106}
6310 7107
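The flag comparisons above rely on the liveness read bits being ordered, with READ64 subsuming READ32. The layout assumed here is sketched below; the exact values are an assumption, the authoritative definition being the bpf_verifier.h change in this same series.

enum bpf_reg_liveness {
	REG_LIVE_NONE = 0,	/* reg hasn't been read or written this branch */
	REG_LIVE_READ32 = 0x1,	/* reg was read as a 32-bit sub-register */
	REG_LIVE_READ64 = 0x2,	/* reg was read as a full 64-bit register */
	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
	REG_LIVE_WRITTEN = 0x4,	/* reg was written first, screening off later reads */
	REG_LIVE_DONE = 0x8,	/* liveness won't be updating this register anymore */
};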
6311/* A write screens off any subsequent reads; but write marks come from the 7108/* A write screens off any subsequent reads; but write marks come from the
@@ -6339,8 +7136,10 @@ static int propagate_liveness(struct bpf_verifier_env *env,
6339 for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { 7136 for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
6340 err = propagate_liveness_reg(env, &state_reg[i], 7137 err = propagate_liveness_reg(env, &state_reg[i],
6341 &parent_reg[i]); 7138 &parent_reg[i]);
6342 if (err) 7139 if (err < 0)
6343 return err; 7140 return err;
7141 if (err == REG_LIVE_READ64)
7142 mark_insn_zext(env, &parent_reg[i]);
6344 } 7143 }
6345 7144
6346 /* Propagate stack slots. */ 7145 /* Propagate stack slots. */
@@ -6350,32 +7149,132 @@ static int propagate_liveness(struct bpf_verifier_env *env,
6350 state_reg = &state->stack[i].spilled_ptr; 7149 state_reg = &state->stack[i].spilled_ptr;
6351 err = propagate_liveness_reg(env, state_reg, 7150 err = propagate_liveness_reg(env, state_reg,
6352 parent_reg); 7151 parent_reg);
6353 if (err) 7152 if (err < 0)
6354 return err; 7153 return err;
6355 } 7154 }
6356 } 7155 }
6357 return err; 7156 return 0;
6358} 7157}
6359 7158
7159/* find precise scalars in the previous equivalent state and
7160 * propagate them into the current state
7161 */
7162static int propagate_precision(struct bpf_verifier_env *env,
7163 const struct bpf_verifier_state *old)
7164{
7165 struct bpf_reg_state *state_reg;
7166 struct bpf_func_state *state;
7167 int i, err = 0;
7168
7169 state = old->frame[old->curframe];
7170 state_reg = state->regs;
7171 for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
7172 if (state_reg->type != SCALAR_VALUE ||
7173 !state_reg->precise)
7174 continue;
7175 if (env->log.level & BPF_LOG_LEVEL2)
7176 verbose(env, "propagating r%d\n", i);
7177 err = mark_chain_precision(env, i);
7178 if (err < 0)
7179 return err;
7180 }
7181
7182 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
7183 if (state->stack[i].slot_type[0] != STACK_SPILL)
7184 continue;
7185 state_reg = &state->stack[i].spilled_ptr;
7186 if (state_reg->type != SCALAR_VALUE ||
7187 !state_reg->precise)
7188 continue;
7189 if (env->log.level & BPF_LOG_LEVEL2)
7190 verbose(env, "propagating fp%d\n",
7191 (-i - 1) * BPF_REG_SIZE);
7192 err = mark_chain_precision_stack(env, i);
7193 if (err < 0)
7194 return err;
7195 }
7196 return 0;
7197}
7198
7199static bool states_maybe_looping(struct bpf_verifier_state *old,
7200 struct bpf_verifier_state *cur)
7201{
7202 struct bpf_func_state *fold, *fcur;
7203 int i, fr = cur->curframe;
7204
7205 if (old->curframe != fr)
7206 return false;
7207
7208 fold = old->frame[fr];
7209 fcur = cur->frame[fr];
7210 for (i = 0; i < MAX_BPF_REG; i++)
7211 if (memcmp(&fold->regs[i], &fcur->regs[i],
7212 offsetof(struct bpf_reg_state, parent)))
7213 return false;
7214 return true;
7215}
7216
7217
6360static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) 7218static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6361{ 7219{
6362 struct bpf_verifier_state_list *new_sl; 7220 struct bpf_verifier_state_list *new_sl;
6363 struct bpf_verifier_state_list *sl, **pprev; 7221 struct bpf_verifier_state_list *sl, **pprev;
6364 struct bpf_verifier_state *cur = env->cur_state, *new; 7222 struct bpf_verifier_state *cur = env->cur_state, *new;
6365 int i, j, err, states_cnt = 0; 7223 int i, j, err, states_cnt = 0;
7224 bool add_new_state = false;
6366 7225
6367 pprev = &env->explored_states[insn_idx]; 7226 cur->last_insn_idx = env->prev_insn_idx;
6368 sl = *pprev; 7227 if (!env->insn_aux_data[insn_idx].prune_point)
6369
6370 if (!sl)
6371 /* this 'insn_idx' instruction wasn't marked, so we will not 7228 /* this 'insn_idx' instruction wasn't marked, so we will not
6372 * be doing state search here 7229 * be doing state search here
6373 */ 7230 */
6374 return 0; 7231 return 0;
6375 7232
7233	/* bpf progs typically have a pruning point every 4 instructions
7234	 * http://vger.kernel.org/bpfconf2019.html#session-1
7235	 * Do not add a new state for future pruning if the verifier hasn't seen
7236	 * at least 2 jumps and at least 8 instructions.
7237	 * This heuristic helps decrease the 'total_states' and 'peak_states' metrics.
7238	 * In tests that amounts to up to a 50% reduction in total verifier
7239	 * memory consumption and a 20% verifier time speedup.
7240 */
7241 if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
7242 env->insn_processed - env->prev_insn_processed >= 8)
7243 add_new_state = true;
7244
7245 pprev = explored_state(env, insn_idx);
7246 sl = *pprev;
7247
6376 clean_live_states(env, insn_idx, cur); 7248 clean_live_states(env, insn_idx, cur);
6377 7249
6378 while (sl != STATE_LIST_MARK) { 7250 while (sl) {
7251 states_cnt++;
7252 if (sl->state.insn_idx != insn_idx)
7253 goto next;
7254 if (sl->state.branches) {
7255 if (states_maybe_looping(&sl->state, cur) &&
7256 states_equal(env, &sl->state, cur)) {
7257 verbose_linfo(env, insn_idx, "; ");
7258 verbose(env, "infinite loop detected at insn %d\n", insn_idx);
7259 return -EINVAL;
7260 }
7261 /* if the verifier is processing a loop, avoid adding new state
7262 * too often, since different loop iterations have distinct
7263 * states and may not help future pruning.
7264 * This threshold shouldn't be too low to make sure that
7265 * a loop with large bound will be rejected quickly.
7266 * The most abusive loop will be:
7267 * r1 += 1
7268 * if r1 < 1000000 goto pc-2
7269			 * 1M insn_processed limit / 100 == 10k peak states.
7270 * This threshold shouldn't be too high either, since states
7271 * at the end of the loop are likely to be useful in pruning.
7272 */
7273 if (env->jmps_processed - env->prev_jmps_processed < 20 &&
7274 env->insn_processed - env->prev_insn_processed < 100)
7275 add_new_state = false;
7276 goto miss;
7277 }
6379 if (states_equal(env, &sl->state, cur)) { 7278 if (states_equal(env, &sl->state, cur)) {
6380 sl->hit_cnt++; 7279 sl->hit_cnt++;
6381 /* reached equivalent register/stack state, 7280 /* reached equivalent register/stack state,
@@ -6389,12 +7288,27 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6389 * this state and will pop a new one. 7288 * this state and will pop a new one.
6390 */ 7289 */
6391 err = propagate_liveness(env, &sl->state, cur); 7290 err = propagate_liveness(env, &sl->state, cur);
7291
7292			/* if the previous state reached the exit with precision and the
7293			 * current state is equivalent to it (except for precision marks),
7294			 * the precision needs to be propagated back into
7295			 * the current state.
7296 */
7297 err = err ? : push_jmp_history(env, cur);
7298 err = err ? : propagate_precision(env, &sl->state);
6392 if (err) 7299 if (err)
6393 return err; 7300 return err;
6394 return 1; 7301 return 1;
6395 } 7302 }
6396 states_cnt++; 7303miss:
6397 sl->miss_cnt++; 7304 /* when new state is not going to be added do not increase miss count.
7305 * Otherwise several loop iterations will remove the state
7306 * recorded earlier. The goal of these heuristics is to have
7307 * states from some iterations of the loop (some in the beginning
7308 * and some at the end) to help pruning.
7309 */
7310 if (add_new_state)
7311 sl->miss_cnt++;
6398 /* heuristic to determine whether this state is beneficial 7312 /* heuristic to determine whether this state is beneficial
6399 * to keep checking from state equivalence point of view. 7313 * to keep checking from state equivalence point of view.
6400 * Higher numbers increase max_states_per_insn and verification time, 7314 * Higher numbers increase max_states_per_insn and verification time,
@@ -6406,6 +7320,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6406 */ 7320 */
6407 *pprev = sl->next; 7321 *pprev = sl->next;
6408 if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { 7322 if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
7323 u32 br = sl->state.branches;
7324
7325 WARN_ONCE(br,
7326 "BUG live_done but branches_to_explore %d\n",
7327 br);
6409 free_verifier_state(&sl->state, false); 7328 free_verifier_state(&sl->state, false);
6410 kfree(sl); 7329 kfree(sl);
6411 env->peak_states--; 7330 env->peak_states--;
@@ -6420,6 +7339,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6420 sl = *pprev; 7339 sl = *pprev;
6421 continue; 7340 continue;
6422 } 7341 }
7342next:
6423 pprev = &sl->next; 7343 pprev = &sl->next;
6424 sl = *pprev; 7344 sl = *pprev;
6425 } 7345 }
@@ -6428,20 +7348,27 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6428 env->max_states_per_insn = states_cnt; 7348 env->max_states_per_insn = states_cnt;
6429 7349
6430 if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) 7350 if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
6431 return 0; 7351 return push_jmp_history(env, cur);
6432 7352
6433 /* there were no equivalent states, remember current one. 7353 if (!add_new_state)
6434 * technically the current state is not proven to be safe yet, 7354 return push_jmp_history(env, cur);
7355
7356 /* There were no equivalent states, remember the current one.
7357 * Technically the current state is not proven to be safe yet,
6435 * but it will either reach outer most bpf_exit (which means it's safe) 7358 * but it will either reach outer most bpf_exit (which means it's safe)
6436 * or it will be rejected. Since there are no loops, we won't be 7359 * or it will be rejected. When there are no loops the verifier won't be
6437 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) 7360 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
6438 * again on the way to bpf_exit 7361 * again on the way to bpf_exit.
7362 * When looping the sl->state.branches will be > 0 and this state
7363 * will not be considered for equivalence until branches == 0.
6439 */ 7364 */
6440 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); 7365 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
6441 if (!new_sl) 7366 if (!new_sl)
6442 return -ENOMEM; 7367 return -ENOMEM;
6443 env->total_states++; 7368 env->total_states++;
6444 env->peak_states++; 7369 env->peak_states++;
7370 env->prev_jmps_processed = env->jmps_processed;
7371 env->prev_insn_processed = env->insn_processed;
6445 7372
6446 /* add new state to the head of linked list */ 7373 /* add new state to the head of linked list */
6447 new = &new_sl->state; 7374 new = &new_sl->state;
@@ -6451,8 +7378,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6451 kfree(new_sl); 7378 kfree(new_sl);
6452 return err; 7379 return err;
6453 } 7380 }
6454 new_sl->next = env->explored_states[insn_idx]; 7381 new->insn_idx = insn_idx;
6455 env->explored_states[insn_idx] = new_sl; 7382 WARN_ONCE(new->branches != 1,
7383 "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
7384
7385 cur->parent = new;
7386 cur->first_insn_idx = insn_idx;
7387 clear_jmp_history(cur);
7388 new_sl->next = *explored_state(env, insn_idx);
7389 *explored_state(env, insn_idx) = new_sl;
6456 /* connect new state to parentage chain. Current frame needs all 7390 /* connect new state to parentage chain. Current frame needs all
6457 * registers connected. Only r6 - r9 of the callers are alive (pushed 7391 * registers connected. Only r6 - r9 of the callers are alive (pushed
6458 * to the stack implicitly by JITs) so in callers' frames connect just 7392 * to the stack implicitly by JITs) so in callers' frames connect just
@@ -6460,17 +7394,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6460 * the state of the call instruction (with WRITTEN set), and r0 comes 7394 * the state of the call instruction (with WRITTEN set), and r0 comes
6461 * from callee with its full parentage chain, anyway. 7395 * from callee with its full parentage chain, anyway.
6462 */ 7396 */
6463 for (j = 0; j <= cur->curframe; j++)
6464 for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
6465 cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
6466 /* clear write marks in current state: the writes we did are not writes 7397 /* clear write marks in current state: the writes we did are not writes
6467 * our child did, so they don't screen off its reads from us. 7398 * our child did, so they don't screen off its reads from us.
6468 * (There are no read marks in current state, because reads always mark 7399 * (There are no read marks in current state, because reads always mark
6469 * their parent and current state never has children yet. Only 7400 * their parent and current state never has children yet. Only
6470 * explored_states can get read marks.) 7401 * explored_states can get read marks.)
6471 */ 7402 */
6472 for (i = 0; i < BPF_REG_FP; i++) 7403 for (j = 0; j <= cur->curframe; j++) {
6473 cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; 7404 for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
7405 cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
7406 for (i = 0; i < BPF_REG_FP; i++)
7407 cur->frame[j]->regs[i].live = REG_LIVE_NONE;
7408 }
6474 7409
6475 /* all stack frames are accessible from callee, clear them all */ 7410 /* all stack frames are accessible from callee, clear them all */
6476 for (j = 0; j <= cur->curframe; j++) { 7411 for (j = 0; j <= cur->curframe; j++) {
@@ -6497,6 +7432,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
6497 case PTR_TO_SOCK_COMMON_OR_NULL: 7432 case PTR_TO_SOCK_COMMON_OR_NULL:
6498 case PTR_TO_TCP_SOCK: 7433 case PTR_TO_TCP_SOCK:
6499 case PTR_TO_TCP_SOCK_OR_NULL: 7434 case PTR_TO_TCP_SOCK_OR_NULL:
7435 case PTR_TO_XDP_SOCK:
6500 return false; 7436 return false;
6501 default: 7437 default:
6502 return true; 7438 return true;
@@ -6528,6 +7464,7 @@ static int do_check(struct bpf_verifier_env *env)
6528 struct bpf_reg_state *regs; 7464 struct bpf_reg_state *regs;
6529 int insn_cnt = env->prog->len; 7465 int insn_cnt = env->prog->len;
6530 bool do_print_state = false; 7466 bool do_print_state = false;
7467 int prev_insn_idx = -1;
6531 7468
6532 env->prev_linfo = NULL; 7469 env->prev_linfo = NULL;
6533 7470
@@ -6536,6 +7473,7 @@ static int do_check(struct bpf_verifier_env *env)
6536 return -ENOMEM; 7473 return -ENOMEM;
6537 state->curframe = 0; 7474 state->curframe = 0;
6538 state->speculative = false; 7475 state->speculative = false;
7476 state->branches = 1;
6539 state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); 7477 state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
6540 if (!state->frame[0]) { 7478 if (!state->frame[0]) {
6541 kfree(state); 7479 kfree(state);
@@ -6552,6 +7490,7 @@ static int do_check(struct bpf_verifier_env *env)
6552 u8 class; 7490 u8 class;
6553 int err; 7491 int err;
6554 7492
7493 env->prev_insn_idx = prev_insn_idx;
6555 if (env->insn_idx >= insn_cnt) { 7494 if (env->insn_idx >= insn_cnt) {
6556 verbose(env, "invalid insn idx %d insn_cnt %d\n", 7495 verbose(env, "invalid insn idx %d insn_cnt %d\n",
6557 env->insn_idx, insn_cnt); 7496 env->insn_idx, insn_cnt);
@@ -6624,6 +7563,7 @@ static int do_check(struct bpf_verifier_env *env)
6624 7563
6625 regs = cur_regs(env); 7564 regs = cur_regs(env);
6626 env->insn_aux_data[env->insn_idx].seen = true; 7565 env->insn_aux_data[env->insn_idx].seen = true;
7566 prev_insn_idx = env->insn_idx;
6627 7567
6628 if (class == BPF_ALU || class == BPF_ALU64) { 7568 if (class == BPF_ALU || class == BPF_ALU64) {
6629 err = check_alu_op(env, insn); 7569 err = check_alu_op(env, insn);
@@ -6742,6 +7682,7 @@ static int do_check(struct bpf_verifier_env *env)
6742 } else if (class == BPF_JMP || class == BPF_JMP32) { 7682 } else if (class == BPF_JMP || class == BPF_JMP32) {
6743 u8 opcode = BPF_OP(insn->code); 7683 u8 opcode = BPF_OP(insn->code);
6744 7684
7685 env->jmps_processed++;
6745 if (opcode == BPF_CALL) { 7686 if (opcode == BPF_CALL) {
6746 if (BPF_SRC(insn->code) != BPF_K || 7687 if (BPF_SRC(insn->code) != BPF_K ||
6747 insn->off != 0 || 7688 insn->off != 0 ||
@@ -6796,7 +7737,6 @@ static int do_check(struct bpf_verifier_env *env)
6796 7737
6797 if (state->curframe) { 7738 if (state->curframe) {
6798 /* exit from nested function */ 7739 /* exit from nested function */
6799 env->prev_insn_idx = env->insn_idx;
6800 err = prepare_func_exit(env, &env->insn_idx); 7740 err = prepare_func_exit(env, &env->insn_idx);
6801 if (err) 7741 if (err)
6802 return err; 7742 return err;
@@ -6827,7 +7767,8 @@ static int do_check(struct bpf_verifier_env *env)
6827 if (err) 7767 if (err)
6828 return err; 7768 return err;
6829process_bpf_exit: 7769process_bpf_exit:
6830 err = pop_stack(env, &env->prev_insn_idx, 7770 update_branch_counts(env, env->cur_state);
7771 err = pop_stack(env, &prev_insn_idx,
6831 &env->insn_idx); 7772 &env->insn_idx);
6832 if (err < 0) { 7773 if (err < 0) {
6833 if (err != -ENOENT) 7774 if (err != -ENOENT)
@@ -7130,14 +8071,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
7130 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying 8071 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
7131 * [0, off) and [off, end) to new locations, so the patched range stays zero 8072 * [0, off) and [off, end) to new locations, so the patched range stays zero
7132 */ 8073 */
7133static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, 8074static int adjust_insn_aux_data(struct bpf_verifier_env *env,
7134 u32 off, u32 cnt) 8075 struct bpf_prog *new_prog, u32 off, u32 cnt)
7135{ 8076{
7136 struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; 8077 struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
8078 struct bpf_insn *insn = new_prog->insnsi;
8079 u32 prog_len;
7137 int i; 8080 int i;
7138 8081
8082	/* aux info at OFF always needs adjustment, whether or not the fast
8083	 * path (cnt == 1) is taken. There is no guarantee that INSN at OFF is
8084	 * the original insn of the old prog.
8085 */
8086 old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
8087
7139 if (cnt == 1) 8088 if (cnt == 1)
7140 return 0; 8089 return 0;
8090 prog_len = new_prog->len;
7141 new_data = vzalloc(array_size(prog_len, 8091 new_data = vzalloc(array_size(prog_len,
7142 sizeof(struct bpf_insn_aux_data))); 8092 sizeof(struct bpf_insn_aux_data)));
7143 if (!new_data) 8093 if (!new_data)
@@ -7145,8 +8095,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
7145 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); 8095 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
7146 memcpy(new_data + off + cnt - 1, old_data + off, 8096 memcpy(new_data + off + cnt - 1, old_data + off,
7147 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); 8097 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
7148 for (i = off; i < off + cnt - 1; i++) 8098 for (i = off; i < off + cnt - 1; i++) {
7149 new_data[i].seen = true; 8099 new_data[i].seen = true;
8100 new_data[i].zext_dst = insn_has_def32(env, insn + i);
8101 }
7150 env->insn_aux_data = new_data; 8102 env->insn_aux_data = new_data;
7151 vfree(old_data); 8103 vfree(old_data);
7152 return 0; 8104 return 0;
@@ -7179,7 +8131,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
7179 env->insn_aux_data[off].orig_idx); 8131 env->insn_aux_data[off].orig_idx);
7180 return NULL; 8132 return NULL;
7181 } 8133 }
7182 if (adjust_insn_aux_data(env, new_prog->len, off, len)) 8134 if (adjust_insn_aux_data(env, new_prog, off, len))
7183 return NULL; 8135 return NULL;
7184 adjust_subprog_starts(env, off, len); 8136 adjust_subprog_starts(env, off, len);
7185 return new_prog; 8137 return new_prog;
@@ -7443,6 +8395,84 @@ static int opt_remove_nops(struct bpf_verifier_env *env)
7443 return 0; 8395 return 0;
7444} 8396}
7445 8397
8398static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
8399 const union bpf_attr *attr)
8400{
8401 struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
8402 struct bpf_insn_aux_data *aux = env->insn_aux_data;
8403 int i, patch_len, delta = 0, len = env->prog->len;
8404 struct bpf_insn *insns = env->prog->insnsi;
8405 struct bpf_prog *new_prog;
8406 bool rnd_hi32;
8407
8408 rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
8409 zext_patch[1] = BPF_ZEXT_REG(0);
8410 rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
8411 rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
8412 rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
8413 for (i = 0; i < len; i++) {
8414 int adj_idx = i + delta;
8415 struct bpf_insn insn;
8416
8417 insn = insns[adj_idx];
8418 if (!aux[adj_idx].zext_dst) {
8419 u8 code, class;
8420 u32 imm_rnd;
8421
8422 if (!rnd_hi32)
8423 continue;
8424
8425 code = insn.code;
8426 class = BPF_CLASS(code);
8427 if (insn_no_def(&insn))
8428 continue;
8429
8430 /* NOTE: arg "reg" (the fourth one) is only used for
8431			 * BPF_STX, which has been ruled out by the check
8432			 * above, so it is safe to pass NULL here.
8433 */
8434 if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
8435 if (class == BPF_LD &&
8436 BPF_MODE(code) == BPF_IMM)
8437 i++;
8438 continue;
8439 }
8440
8441 /* ctx load could be transformed into wider load. */
8442 if (class == BPF_LDX &&
8443 aux[adj_idx].ptr_type == PTR_TO_CTX)
8444 continue;
8445
8446 imm_rnd = get_random_int();
8447 rnd_hi32_patch[0] = insn;
8448 rnd_hi32_patch[1].imm = imm_rnd;
8449 rnd_hi32_patch[3].dst_reg = insn.dst_reg;
8450 patch = rnd_hi32_patch;
8451 patch_len = 4;
8452 goto apply_patch_buffer;
8453 }
8454
8455 if (!bpf_jit_needs_zext())
8456 continue;
8457
8458 zext_patch[0] = insn;
8459 zext_patch[1].dst_reg = insn.dst_reg;
8460 zext_patch[1].src_reg = insn.dst_reg;
8461 patch = zext_patch;
8462 patch_len = 2;
8463apply_patch_buffer:
8464 new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
8465 if (!new_prog)
8466 return -ENOMEM;
8467 env->prog = new_prog;
8468 insns = new_prog->insnsi;
8469 aux = env->insn_aux_data;
8470 delta += patch_len - 1;
8471 }
8472
8473 return 0;
8474}
8475
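A hedged user-space sketch of opting into the hi32-poisoning test mode that opt_subreg_zext_lo32_rnd_hi32() implements above. Field usage follows the bpf(2) uapi this series extends, error handling is omitted, and the raw syscall is used only to keep the example self-contained.

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int load_prog_rnd_hi32(const struct bpf_insn *insns, __u32 insn_cnt)
{
	union bpf_attr attr = {};

	attr.prog_type	= BPF_PROG_TYPE_SOCKET_FILTER;
	attr.insns	= (__u64)(unsigned long)insns;
	attr.insn_cnt	= insn_cnt;
	attr.license	= (__u64)(unsigned long)"GPL";
	/* Ask the verifier to randomize the high 32 bits of every register it
	 * believes is only ever read as a 32-bit sub-register.
	 */
	attr.prog_flags	= BPF_F_TEST_RND_HI32;

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}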
7446/* convert load instructions that access fields of a context type into a 8476/* convert load instructions that access fields of a context type into a
7447 * sequence of instructions that access fields of the underlying structure: 8477 * sequence of instructions that access fields of the underlying structure:
7448 * struct __sk_buff -> struct sk_buff 8478 * struct __sk_buff -> struct sk_buff
@@ -7541,6 +8571,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
7541 case PTR_TO_TCP_SOCK: 8571 case PTR_TO_TCP_SOCK:
7542 convert_ctx_access = bpf_tcp_sock_convert_ctx_access; 8572 convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
7543 break; 8573 break;
8574 case PTR_TO_XDP_SOCK:
8575 convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
8576 break;
7544 default: 8577 default:
7545 continue; 8578 continue;
7546 } 8579 }
@@ -8130,16 +9163,15 @@ static void free_states(struct bpf_verifier_env *env)
8130 if (!env->explored_states) 9163 if (!env->explored_states)
8131 return; 9164 return;
8132 9165
8133 for (i = 0; i < env->prog->len; i++) { 9166 for (i = 0; i < state_htab_size(env); i++) {
8134 sl = env->explored_states[i]; 9167 sl = env->explored_states[i];
8135 9168
8136 if (sl) 9169 while (sl) {
8137 while (sl != STATE_LIST_MARK) { 9170 sln = sl->next;
8138 sln = sl->next; 9171 free_verifier_state(&sl->state, false);
8139 free_verifier_state(&sl->state, false); 9172 kfree(sl);
8140 kfree(sl); 9173 sl = sln;
8141 sl = sln; 9174 }
8142 }
8143 } 9175 }
8144 9176
8145 kvfree(env->explored_states); 9177 kvfree(env->explored_states);
@@ -8239,7 +9271,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
8239 goto skip_full_check; 9271 goto skip_full_check;
8240 } 9272 }
8241 9273
8242 env->explored_states = kvcalloc(env->prog->len, 9274 env->explored_states = kvcalloc(state_htab_size(env),
8243 sizeof(struct bpf_verifier_state_list *), 9275 sizeof(struct bpf_verifier_state_list *),
8244 GFP_USER); 9276 GFP_USER);
8245 ret = -ENOMEM; 9277 ret = -ENOMEM;
@@ -8294,6 +9326,15 @@ skip_full_check:
8294 if (ret == 0) 9326 if (ret == 0)
8295 ret = fixup_bpf_calls(env); 9327 ret = fixup_bpf_calls(env);
8296 9328
9329	/* do the 32-bit optimization after insn patching is done, so that the
9330	 * patched insns can be handled correctly.
9331 */
9332 if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
9333 ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
9334 env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
9335 : false;
9336 }
9337
8297 if (ret == 0) 9338 if (ret == 0)
8298 ret = fixup_call_args(env); 9339 ret = fixup_call_args(env);
8299 9340
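update_branch_counts(), called from process_bpf_exit above, is introduced earlier in this patch; its role, roughly, is sketched below. This is inferred from the branches/parent fields used in is_state_visited() and is not the exact hunk.

static void update_branch_counts(struct bpf_verifier_env *env,
				 struct bpf_verifier_state *st)
{
	while (st) {
		u32 br = --st->branches;

		/* A state with outstanding branches is still being explored
		 * and must not be used for pruning; only when the count hits
		 * zero does the drop propagate to the parent state.
		 */
		if (br)
			break;
		st = st->parent;
	}
}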
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..9bb96ace9fa1 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -17,8 +17,8 @@ struct xsk_map {
17 17
18static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) 18static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
19{ 19{
20 int cpu, err = -EINVAL;
21 struct xsk_map *m; 20 struct xsk_map *m;
21 int cpu, err;
22 u64 cost; 22 u64 cost;
23 23
24 if (!capable(CAP_NET_ADMIN)) 24 if (!capable(CAP_NET_ADMIN))
@@ -37,13 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
37 37
38 cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); 38 cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
39 cost += sizeof(struct list_head) * num_possible_cpus(); 39 cost += sizeof(struct list_head) * num_possible_cpus();
40 if (cost >= U32_MAX - PAGE_SIZE)
41 goto free_m;
42
43 m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
44 40
45	/* Notice that this returns -EPERM if the map size is larger than the memlock limit */ 41	/* Notice that this returns -EPERM if the map size is larger than the memlock limit */
46 err = bpf_map_precharge_memlock(m->map.pages); 42 err = bpf_map_charge_init(&m->map.memory, cost);
47 if (err) 43 if (err)
48 goto free_m; 44 goto free_m;
49 45
@@ -51,7 +47,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
51 47
52 m->flush_list = alloc_percpu(struct list_head); 48 m->flush_list = alloc_percpu(struct list_head);
53 if (!m->flush_list) 49 if (!m->flush_list)
54 goto free_m; 50 goto free_charge;
55 51
56 for_each_possible_cpu(cpu) 52 for_each_possible_cpu(cpu)
57 INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); 53 INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
@@ -65,6 +61,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
65 61
66free_percpu: 62free_percpu:
67 free_percpu(m->flush_list); 63 free_percpu(m->flush_list);
64free_charge:
65 bpf_map_charge_finish(&m->map.memory);
68free_m: 66free_m:
69 kfree(m); 67 kfree(m);
70 return ERR_PTR(err); 68 return ERR_PTR(err);
@@ -147,13 +145,18 @@ void __xsk_map_flush(struct bpf_map *map)
147 145
148 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { 146 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
149 xsk_flush(xs); 147 xsk_flush(xs);
150 __list_del(xs->flush_node.prev, xs->flush_node.next); 148 __list_del_clearprev(&xs->flush_node);
151 xs->flush_node.prev = NULL;
152 } 149 }
153} 150}
154 151
155static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) 152static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
156{ 153{
154 WARN_ON_ONCE(!rcu_read_lock_held());
155 return __xsk_map_lookup_elem(map, *(u32 *)key);
156}
157
158static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
159{
157 return ERR_PTR(-EOPNOTSUPP); 160 return ERR_PTR(-EOPNOTSUPP);
158} 161}
159 162
@@ -220,6 +223,7 @@ const struct bpf_map_ops xsk_map_ops = {
220 .map_free = xsk_map_free, 223 .map_free = xsk_map_free,
221 .map_get_next_key = xsk_map_get_next_key, 224 .map_get_next_key = xsk_map_get_next_key,
222 .map_lookup_elem = xsk_map_lookup_elem, 225 .map_lookup_elem = xsk_map_lookup_elem,
226 .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
223 .map_update_elem = xsk_map_update_elem, 227 .map_update_elem = xsk_map_update_elem,
224 .map_delete_elem = xsk_map_delete_elem, 228 .map_delete_elem = xsk_map_delete_elem,
225 .map_check_btf = map_check_no_btf, 229 .map_check_btf = map_check_no_btf,
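
The xskmap conversion above follows the new bpf_map_charge_init()/bpf_map_charge_finish() error-unwind pattern. A condensed, hedged skeleton of that ordering (only the two charge helpers are taken from this diff; the surrounding allocation code is illustrative):

static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	struct xsk_map *m;
	u64 cost;
	int err;

	m = kzalloc(sizeof(*m), GFP_USER);
	if (!m)
		return ERR_PTR(-ENOMEM);

	cost = (u64)attr->max_entries * sizeof(struct xdp_sock *);

	/* charge the memlock footprint up front; no manual U32_MAX check */
	err = bpf_map_charge_init(&m->map.memory, cost);
	if (err)
		goto free_m;

	m->flush_list = alloc_percpu(struct list_head);
	if (!m->flush_list) {
		err = -ENOMEM;
		goto free_charge;	/* undo the charge on any later failure */
	}

	return &m->map;

free_charge:
	bpf_map_charge_finish(&m->map.memory);
free_m:
	kfree(m);
	return ERR_PTR(err);
}
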
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 68ca5de7ec27..88006be40ea3 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include "cgroup-internal.h" 2#include "cgroup-internal.h"
2 3
3#include <linux/ctype.h> 4#include <linux/ctype.h>
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 327f37c9fdfa..300b0c416341 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
101 */ 101 */
102static DEFINE_SPINLOCK(cgroup_file_kn_lock); 102static DEFINE_SPINLOCK(cgroup_file_kn_lock);
103 103
104struct percpu_rw_semaphore cgroup_threadgroup_rwsem; 104DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
105 105
106#define cgroup_assert_mutex_or_rcu_locked() \ 106#define cgroup_assert_mutex_or_rcu_locked() \
107 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ 107 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
@@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];
215 215
216static int cgroup_apply_control(struct cgroup *cgrp); 216static int cgroup_apply_control(struct cgroup *cgrp);
217static void cgroup_finalize_control(struct cgroup *cgrp, int ret); 217static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
218static void css_task_iter_advance(struct css_task_iter *it); 218static void css_task_iter_skip(struct css_task_iter *it,
219 struct task_struct *task);
219static int cgroup_destroy_locked(struct cgroup *cgrp); 220static int cgroup_destroy_locked(struct cgroup *cgrp);
220static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, 221static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
221 struct cgroup_subsys *ss); 222 struct cgroup_subsys *ss);
@@ -738,6 +739,7 @@ struct css_set init_css_set = {
738 .dom_cset = &init_css_set, 739 .dom_cset = &init_css_set,
739 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 740 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
740 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 741 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
742 .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
741 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), 743 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
742 .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), 744 .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
743 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 745 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
843 cgroup_update_populated(link->cgrp, populated); 845 cgroup_update_populated(link->cgrp, populated);
844} 846}
845 847
848/*
849 * @task is leaving, advance task iterators which are pointing to it so
850 * that they can resume at the next position. Advancing an iterator might
851 * remove it from the list, use safe walk. See css_task_iter_skip() for
852 * details.
853 */
854static void css_set_skip_task_iters(struct css_set *cset,
855 struct task_struct *task)
856{
857 struct css_task_iter *it, *pos;
858
859 list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
860 css_task_iter_skip(it, task);
861}
862
846/** 863/**
847 * css_set_move_task - move a task from one css_set to another 864 * css_set_move_task - move a task from one css_set to another
848 * @task: task being moved 865 * @task: task being moved
@@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
868 css_set_update_populated(to_cset, true); 885 css_set_update_populated(to_cset, true);
869 886
870 if (from_cset) { 887 if (from_cset) {
871 struct css_task_iter *it, *pos;
872
873 WARN_ON_ONCE(list_empty(&task->cg_list)); 888 WARN_ON_ONCE(list_empty(&task->cg_list));
874 889
875 /* 890 css_set_skip_task_iters(from_cset, task);
876 * @task is leaving, advance task iterators which are
877 * pointing to it so that they can resume at the next
878 * position. Advancing an iterator might remove it from
879 * the list, use safe walk. See css_task_iter_advance*()
880 * for details.
881 */
882 list_for_each_entry_safe(it, pos, &from_cset->task_iters,
883 iters_node)
884 if (it->task_pos == &task->cg_list)
885 css_task_iter_advance(it);
886
887 list_del_init(&task->cg_list); 891 list_del_init(&task->cg_list);
888 if (!css_set_populated(from_cset)) 892 if (!css_set_populated(from_cset))
889 css_set_update_populated(from_cset, false); 893 css_set_update_populated(from_cset, false);
@@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1210 cset->dom_cset = cset; 1214 cset->dom_cset = cset;
1211 INIT_LIST_HEAD(&cset->tasks); 1215 INIT_LIST_HEAD(&cset->tasks);
1212 INIT_LIST_HEAD(&cset->mg_tasks); 1216 INIT_LIST_HEAD(&cset->mg_tasks);
1217 INIT_LIST_HEAD(&cset->dying_tasks);
1213 INIT_LIST_HEAD(&cset->task_iters); 1218 INIT_LIST_HEAD(&cset->task_iters);
1214 INIT_LIST_HEAD(&cset->threaded_csets); 1219 INIT_LIST_HEAD(&cset->threaded_csets);
1215 INIT_HLIST_NODE(&cset->hlist); 1220 INIT_HLIST_NODE(&cset->hlist);
@@ -1810,11 +1815,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1810 1815
1811enum cgroup2_param { 1816enum cgroup2_param {
1812 Opt_nsdelegate, 1817 Opt_nsdelegate,
1818 Opt_memory_localevents,
1813 nr__cgroup2_params 1819 nr__cgroup2_params
1814}; 1820};
1815 1821
1816static const struct fs_parameter_spec cgroup2_param_specs[] = { 1822static const struct fs_parameter_spec cgroup2_param_specs[] = {
1817 fsparam_flag ("nsdelegate", Opt_nsdelegate), 1823 fsparam_flag("nsdelegate", Opt_nsdelegate),
1824 fsparam_flag("memory_localevents", Opt_memory_localevents),
1818 {} 1825 {}
1819}; 1826};
1820 1827
@@ -1837,6 +1844,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
1837 case Opt_nsdelegate: 1844 case Opt_nsdelegate:
1838 ctx->flags |= CGRP_ROOT_NS_DELEGATE; 1845 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1839 return 0; 1846 return 0;
1847 case Opt_memory_localevents:
1848 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1849 return 0;
1840 } 1850 }
1841 return -EINVAL; 1851 return -EINVAL;
1842} 1852}
@@ -1848,6 +1858,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
1848 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; 1858 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1849 else 1859 else
1850 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; 1860 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1861
1862 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1863 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1864 else
1865 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1851 } 1866 }
1852} 1867}
1853 1868
@@ -1855,6 +1870,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
1855{ 1870{
1856 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) 1871 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1857 seq_puts(seq, ",nsdelegate"); 1872 seq_puts(seq, ",nsdelegate");
1873 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1874 seq_puts(seq, ",memory_localevents");
1858 return 0; 1875 return 0;
1859} 1876}
1860 1877
@@ -3540,17 +3557,84 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
3540#ifdef CONFIG_PSI 3557#ifdef CONFIG_PSI
3541static int cgroup_io_pressure_show(struct seq_file *seq, void *v) 3558static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3542{ 3559{
3543 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); 3560 struct cgroup *cgroup = seq_css(seq)->cgroup;
3561 struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
3562
3563 return psi_show(seq, psi, PSI_IO);
3544} 3564}
3545static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) 3565static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3546{ 3566{
3547 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); 3567 struct cgroup *cgroup = seq_css(seq)->cgroup;
3568 struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
3569
3570 return psi_show(seq, psi, PSI_MEM);
3548} 3571}
3549static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) 3572static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3550{ 3573{
3551 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); 3574 struct cgroup *cgroup = seq_css(seq)->cgroup;
3575 struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
3576
3577 return psi_show(seq, psi, PSI_CPU);
3578}
3579
3580static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3581 size_t nbytes, enum psi_res res)
3582{
3583 struct psi_trigger *new;
3584 struct cgroup *cgrp;
3585
3586 cgrp = cgroup_kn_lock_live(of->kn, false);
3587 if (!cgrp)
3588 return -ENODEV;
3589
3590 cgroup_get(cgrp);
3591 cgroup_kn_unlock(of->kn);
3592
3593 new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
3594 if (IS_ERR(new)) {
3595 cgroup_put(cgrp);
3596 return PTR_ERR(new);
3597 }
3598
3599 psi_trigger_replace(&of->priv, new);
3600
3601 cgroup_put(cgrp);
3602
3603 return nbytes;
3604}
3605
3606static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3607 char *buf, size_t nbytes,
3608 loff_t off)
3609{
3610 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3552} 3611}
3553#endif 3612
3613static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3614 char *buf, size_t nbytes,
3615 loff_t off)
3616{
3617 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3618}
3619
3620static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3621 char *buf, size_t nbytes,
3622 loff_t off)
3623{
3624 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3625}
3626
3627static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3628 poll_table *pt)
3629{
3630 return psi_trigger_poll(&of->priv, of->file, pt);
3631}
3632
3633static void cgroup_pressure_release(struct kernfs_open_file *of)
3634{
3635 psi_trigger_replace(&of->priv, NULL);
3636}
3637#endif /* CONFIG_PSI */
3554 3638
3555static int cgroup_freeze_show(struct seq_file *seq, void *v) 3639static int cgroup_freeze_show(struct seq_file *seq, void *v)
3556{ 3640{
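
With .write, .poll and .release wired up above, the cgroup2 pressure files double as PSI trigger endpoints. A minimal userspace sketch of arming and waiting on such a trigger (the "some <stall-us> <window-us>" trigger string and the mount path follow the PSI documentation and are assumptions here, not part of this diff):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* assumed path; depends on where cgroup2 is mounted */
	int fd = open("/sys/fs/cgroup/mygroup/memory.pressure",
		      O_RDWR | O_NONBLOCK);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
	/* trigger: 150ms of "some" memory stall within a 1s window */
	const char trig[] = "some 150000 1000000";

	if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
		perror("psi trigger");
		return 1;
	}
	while (poll(&pfd, 1, -1) > 0) {
		if (pfd.revents & POLLERR)
			break;			/* trigger file went away */
		if (pfd.revents & POLLPRI)
			puts("memory pressure event");
	}
	close(fd);
	return 0;
}
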
@@ -4142,6 +4226,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
4142 4226
4143 return NULL; 4227 return NULL;
4144} 4228}
4229EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4145 4230
4146/** 4231/**
4147 * css_rightmost_descendant - return the rightmost descendant of a css 4232 * css_rightmost_descendant - return the rightmost descendant of a css
@@ -4329,15 +4414,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
4329 it->task_pos = NULL; 4414 it->task_pos = NULL;
4330 return; 4415 return;
4331 } 4416 }
4332 } while (!css_set_populated(cset)); 4417 } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
4333 4418
4334 if (!list_empty(&cset->tasks)) 4419 if (!list_empty(&cset->tasks))
4335 it->task_pos = cset->tasks.next; 4420 it->task_pos = cset->tasks.next;
4336 else 4421 else if (!list_empty(&cset->mg_tasks))
4337 it->task_pos = cset->mg_tasks.next; 4422 it->task_pos = cset->mg_tasks.next;
4423 else
4424 it->task_pos = cset->dying_tasks.next;
4338 4425
4339 it->tasks_head = &cset->tasks; 4426 it->tasks_head = &cset->tasks;
4340 it->mg_tasks_head = &cset->mg_tasks; 4427 it->mg_tasks_head = &cset->mg_tasks;
4428 it->dying_tasks_head = &cset->dying_tasks;
4341 4429
4342 /* 4430 /*
4343 * We don't keep css_sets locked across iteration steps and thus 4431 * We don't keep css_sets locked across iteration steps and thus
@@ -4363,9 +4451,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
4363 list_add(&it->iters_node, &cset->task_iters); 4451 list_add(&it->iters_node, &cset->task_iters);
4364} 4452}
4365 4453
4454static void css_task_iter_skip(struct css_task_iter *it,
4455 struct task_struct *task)
4456{
4457 lockdep_assert_held(&css_set_lock);
4458
4459 if (it->task_pos == &task->cg_list) {
4460 it->task_pos = it->task_pos->next;
4461 it->flags |= CSS_TASK_ITER_SKIPPED;
4462 }
4463}
4464
4366static void css_task_iter_advance(struct css_task_iter *it) 4465static void css_task_iter_advance(struct css_task_iter *it)
4367{ 4466{
4368 struct list_head *next; 4467 struct task_struct *task;
4369 4468
4370 lockdep_assert_held(&css_set_lock); 4469 lockdep_assert_held(&css_set_lock);
4371repeat: 4470repeat:
@@ -4375,25 +4474,40 @@ repeat:
4375 * consumed first and then ->mg_tasks. After ->mg_tasks, 4474 * consumed first and then ->mg_tasks. After ->mg_tasks,
4376 * we move onto the next cset. 4475 * we move onto the next cset.
4377 */ 4476 */
4378 next = it->task_pos->next; 4477 if (it->flags & CSS_TASK_ITER_SKIPPED)
4379 4478 it->flags &= ~CSS_TASK_ITER_SKIPPED;
4380 if (next == it->tasks_head) 4479 else
4381 next = it->mg_tasks_head->next; 4480 it->task_pos = it->task_pos->next;
4382 4481
4383 if (next == it->mg_tasks_head) 4482 if (it->task_pos == it->tasks_head)
4483 it->task_pos = it->mg_tasks_head->next;
4484 if (it->task_pos == it->mg_tasks_head)
4485 it->task_pos = it->dying_tasks_head->next;
4486 if (it->task_pos == it->dying_tasks_head)
4384 css_task_iter_advance_css_set(it); 4487 css_task_iter_advance_css_set(it);
4385 else
4386 it->task_pos = next;
4387 } else { 4488 } else {
4388 /* called from start, proceed to the first cset */ 4489 /* called from start, proceed to the first cset */
4389 css_task_iter_advance_css_set(it); 4490 css_task_iter_advance_css_set(it);
4390 } 4491 }
4391 4492
4392 /* if PROCS, skip over tasks which aren't group leaders */ 4493 if (!it->task_pos)
4393 if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && 4494 return;
4394 !thread_group_leader(list_entry(it->task_pos, struct task_struct, 4495
4395 cg_list))) 4496 task = list_entry(it->task_pos, struct task_struct, cg_list);
4396 goto repeat; 4497
4498 if (it->flags & CSS_TASK_ITER_PROCS) {
4499 /* if PROCS, skip over tasks which aren't group leaders */
4500 if (!thread_group_leader(task))
4501 goto repeat;
4502
4503 /* and dying leaders w/o live member threads */
4504 if (!atomic_read(&task->signal->live))
4505 goto repeat;
4506 } else {
4507 /* skip all dying ones */
4508 if (task->flags & PF_EXITING)
4509 goto repeat;
4510 }
4397} 4511}
4398 4512
4399/** 4513/**
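
The CSS_TASK_ITER_SKIPPED handling above is easy to misread: a task removal may half-advance an iterator, and the next advance must consume the flag instead of stepping again, otherwise an entry is either visited twice or skipped. A self-contained sketch of the same pattern with illustrative names (not kernel API):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node { const char *name; struct node *next; };

struct iter {
	struct node *pos;	/* node the next call will return */
	bool skipped;		/* ->pos was already moved by a removal */
};

/* @n is being removed: move any iterator parked on it and flag the move */
static void iter_skip(struct iter *it, struct node *n)
{
	if (it->pos == n) {
		it->pos = n->next;
		it->skipped = true;
	}
}

static void iter_advance(struct iter *it)
{
	if (it->skipped)
		it->skipped = false;	/* removal already moved ->pos */
	else if (it->pos)
		it->pos = it->pos->next;
	/* the kernel version also filters dying tasks / non-leaders here */
}

static struct node *iter_next(struct iter *it)
{
	struct node *n;

	if (it->skipped)	/* half-advanced by a skip: finish it first */
		iter_advance(it);

	n = it->pos;
	if (n)
		iter_advance(it);
	return n;
}

int main(void)
{
	struct node c = { "c", NULL }, b = { "b", &c }, a = { "a", &b };
	struct iter it = { .pos = &a, .skipped = false };
	struct node *n;

	n = iter_next(&it);		/* "a"; iterator now parked on "b" */
	printf("%s\n", n->name);

	iter_skip(&it, &b);		/* "b" goes away mid-iteration */
	a.next = &c;

	while ((n = iter_next(&it)))	/* prints "c" exactly once */
		printf("%s\n", n->name);
	return 0;
}
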
@@ -4449,6 +4563,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
4449 4563
4450 spin_lock_irq(&css_set_lock); 4564 spin_lock_irq(&css_set_lock);
4451 4565
4566 /* @it may be half-advanced by skips, finish advancing */
4567 if (it->flags & CSS_TASK_ITER_SKIPPED)
4568 css_task_iter_advance(it);
4569
4452 if (it->task_pos) { 4570 if (it->task_pos) {
4453 it->cur_task = list_entry(it->task_pos, struct task_struct, 4571 it->cur_task = list_entry(it->task_pos, struct task_struct,
4454 cg_list); 4572 cg_list);
@@ -4743,20 +4861,26 @@ static struct cftype cgroup_base_files[] = {
4743#ifdef CONFIG_PSI 4861#ifdef CONFIG_PSI
4744 { 4862 {
4745 .name = "io.pressure", 4863 .name = "io.pressure",
4746 .flags = CFTYPE_NOT_ON_ROOT,
4747 .seq_show = cgroup_io_pressure_show, 4864 .seq_show = cgroup_io_pressure_show,
4865 .write = cgroup_io_pressure_write,
4866 .poll = cgroup_pressure_poll,
4867 .release = cgroup_pressure_release,
4748 }, 4868 },
4749 { 4869 {
4750 .name = "memory.pressure", 4870 .name = "memory.pressure",
4751 .flags = CFTYPE_NOT_ON_ROOT,
4752 .seq_show = cgroup_memory_pressure_show, 4871 .seq_show = cgroup_memory_pressure_show,
4872 .write = cgroup_memory_pressure_write,
4873 .poll = cgroup_pressure_poll,
4874 .release = cgroup_pressure_release,
4753 }, 4875 },
4754 { 4876 {
4755 .name = "cpu.pressure", 4877 .name = "cpu.pressure",
4756 .flags = CFTYPE_NOT_ON_ROOT,
4757 .seq_show = cgroup_cpu_pressure_show, 4878 .seq_show = cgroup_cpu_pressure_show,
4879 .write = cgroup_cpu_pressure_write,
4880 .poll = cgroup_pressure_poll,
4881 .release = cgroup_pressure_release,
4758 }, 4882 },
4759#endif 4883#endif /* CONFIG_PSI */
4760 { } /* terminate */ 4884 { } /* terminate */
4761}; 4885};
4762 4886
@@ -4882,8 +5006,6 @@ static void css_release_work_fn(struct work_struct *work)
4882 if (cgrp->kn) 5006 if (cgrp->kn)
4883 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, 5007 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
4884 NULL); 5008 NULL);
4885
4886 cgroup_bpf_put(cgrp);
4887 } 5009 }
4888 5010
4889 mutex_unlock(&cgroup_mutex); 5011 mutex_unlock(&cgroup_mutex);
@@ -5409,6 +5531,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5409 5531
5410 cgroup1_check_for_release(parent); 5532 cgroup1_check_for_release(parent);
5411 5533
5534 cgroup_bpf_offline(cgrp);
5535
5412 /* put the base reference */ 5536 /* put the base reference */
5413 percpu_ref_kill(&cgrp->self.refcnt); 5537 percpu_ref_kill(&cgrp->self.refcnt);
5414 5538
@@ -5543,7 +5667,6 @@ int __init cgroup_init(void)
5543 int ssid; 5667 int ssid;
5544 5668
5545 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); 5669 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5546 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5547 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 5670 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5548 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); 5671 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
5549 5672
@@ -5924,6 +6047,7 @@ void cgroup_exit(struct task_struct *tsk)
5924 if (!list_empty(&tsk->cg_list)) { 6047 if (!list_empty(&tsk->cg_list)) {
5925 spin_lock_irq(&css_set_lock); 6048 spin_lock_irq(&css_set_lock);
5926 css_set_move_task(tsk, cset, NULL, false); 6049 css_set_move_task(tsk, cset, NULL, false);
6050 list_add_tail(&tsk->cg_list, &cset->dying_tasks);
5927 cset->nr_tasks--; 6051 cset->nr_tasks--;
5928 6052
5929 WARN_ON_ONCE(cgroup_task_frozen(tsk)); 6053 WARN_ON_ONCE(cgroup_task_frozen(tsk));
@@ -5949,6 +6073,13 @@ void cgroup_release(struct task_struct *task)
5949 do_each_subsys_mask(ss, ssid, have_release_callback) { 6073 do_each_subsys_mask(ss, ssid, have_release_callback) {
5950 ss->release(task); 6074 ss->release(task);
5951 } while_each_subsys_mask(); 6075 } while_each_subsys_mask();
6076
6077 if (use_task_css_set_links) {
6078 spin_lock_irq(&css_set_lock);
6079 css_set_skip_task_iters(task_css_set(task), task);
6080 list_del_init(&task->cg_list);
6081 spin_unlock_irq(&css_set_lock);
6082 }
5952} 6083}
5953 6084
5954void cgroup_free(struct task_struct *task) 6085void cgroup_free(struct task_struct *task)
@@ -6110,6 +6241,48 @@ struct cgroup *cgroup_get_from_fd(int fd)
6110} 6241}
6111EXPORT_SYMBOL_GPL(cgroup_get_from_fd); 6242EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6112 6243
6244static u64 power_of_ten(int power)
6245{
6246 u64 v = 1;
6247 while (power--)
6248 v *= 10;
6249 return v;
6250}
6251
6252/**
6253 * cgroup_parse_float - parse a floating number
6254 * @input: input string
6255 * @dec_shift: number of decimal digits to shift
6256 * @v: output
6257 *
6258 * Parse a decimal floating point number in @input and store the result in
6259 * @v with decimal point right shifted @dec_shift times. For example, if
6260 * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
6261 * Returns 0 on success, -errno otherwise.
6262 *
6263 * There's nothing cgroup specific about this function except that it's
6264 * currently the only user.
6265 */
6266int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
6267{
6268 s64 whole, frac = 0;
6269 int fstart = 0, fend = 0, flen;
6270
6271 if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
6272 return -EINVAL;
6273 if (frac < 0)
6274 return -EINVAL;
6275
6276 flen = fend > fstart ? fend - fstart : 0;
6277 if (flen < dec_shift)
6278 frac *= power_of_ten(dec_shift - flen);
6279 else
6280 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
6281
6282 *v = whole * power_of_ten(dec_shift) + frac;
6283 return 0;
6284}
6285
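
The arithmetic in cgroup_parse_float() is easier to see with a concrete value; the userspace replica below (helper names are local to the example, not kernel symbols) parses "37.5" with a two-digit shift into 3750.

#include <stdint.h>
#include <stdio.h>

static uint64_t pow10u(int p) { uint64_t v = 1; while (p--) v *= 10; return v; }

static int parse_float(const char *input, unsigned dec_shift, long long *v)
{
	long long whole, frac = 0;
	int fstart = 0, fend = 0, flen;

	if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
		return -1;
	if (frac < 0)
		return -1;

	flen = fend > fstart ? fend - fstart : 0;
	if (flen < (int)dec_shift)
		frac *= pow10u(dec_shift - flen);		/* pad short fractions */
	else
		frac = (frac + pow10u(flen - dec_shift) / 2) /
		       pow10u(flen - dec_shift);		/* round long ones */

	*v = whole * pow10u(dec_shift) + frac;
	return 0;
}

int main(void)
{
	long long v;

	parse_float("37.5", 2, &v);	/* 37.5 shifted by two digits -> 3750 */
	printf("%lld\n", v);
	return 0;
}
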
6113/* 6286/*
6114 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data 6287 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
6115 * definition in cgroup-defs.h. 6288 * definition in cgroup-defs.h.
@@ -6148,6 +6321,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6148 * Don't use cgroup_get_live(). 6321 * Don't use cgroup_get_live().
6149 */ 6322 */
6150 cgroup_get(sock_cgroup_ptr(skcd)); 6323 cgroup_get(sock_cgroup_ptr(skcd));
6324 cgroup_bpf_get(sock_cgroup_ptr(skcd));
6151 return; 6325 return;
6152 } 6326 }
6153 6327
@@ -6159,6 +6333,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6159 cset = task_css_set(current); 6333 cset = task_css_set(current);
6160 if (likely(cgroup_tryget(cset->dfl_cgrp))) { 6334 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6161 skcd->val = (unsigned long)cset->dfl_cgrp; 6335 skcd->val = (unsigned long)cset->dfl_cgrp;
6336 cgroup_bpf_get(cset->dfl_cgrp);
6162 break; 6337 break;
6163 } 6338 }
6164 cpu_relax(); 6339 cpu_relax();
@@ -6169,7 +6344,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6169 6344
6170void cgroup_sk_free(struct sock_cgroup_data *skcd) 6345void cgroup_sk_free(struct sock_cgroup_data *skcd)
6171{ 6346{
6172 cgroup_put(sock_cgroup_ptr(skcd)); 6347 struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6348
6349 cgroup_bpf_put(cgrp);
6350 cgroup_put(cgrp);
6173} 6351}
6174 6352
6175#endif /* CONFIG_SOCK_CGROUP_DATA */ 6353#endif /* CONFIG_SOCK_CGROUP_DATA */
@@ -6252,7 +6430,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
6252static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, 6430static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
6253 char *buf) 6431 char *buf)
6254{ 6432{
6255 return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); 6433 return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
6256} 6434}
6257static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); 6435static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
6258 6436
@@ -6272,4 +6450,5 @@ static int __init cgroup_sysfs_init(void)
6272 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); 6450 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
6273} 6451}
6274subsys_initcall(cgroup_sysfs_init); 6452subsys_initcall(cgroup_sysfs_init);
6453
6275#endif /* CONFIG_SYSFS */ 6454#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6a1942ed781c..b3b02b9c4405 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -729,7 +729,7 @@ static inline int nr_cpusets(void)
729 * load balancing domains (sched domains) as specified by that partial 729 * load balancing domains (sched domains) as specified by that partial
730 * partition. 730 * partition.
731 * 731 *
732 * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt 732 * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
733 * for a background explanation of this. 733 * for a background explanation of this.
734 * 734 *
735 * Does not return errors, on the theory that the callers of this 735 * Does not return errors, on the theory that the callers of this
@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
2829 if (task_css_is_root(task, cpuset_cgrp_id)) 2829 if (task_css_is_root(task, cpuset_cgrp_id))
2830 return; 2830 return;
2831 2831
2832 set_cpus_allowed_ptr(task, &current->cpus_allowed); 2832 set_cpus_allowed_ptr(task, current->cpus_ptr);
2833 task->mems_allowed = current->mems_allowed; 2833 task->mems_allowed = current->mems_allowed;
2834} 2834}
2835 2835
@@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3254 spin_unlock_irqrestore(&callback_lock, flags); 3254 spin_unlock_irqrestore(&callback_lock, flags);
3255} 3255}
3256 3256
3257/**
3258 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
3259 * @tsk: pointer to task_struct with which the scheduler is struggling
3260 *
3261 * Description: In the case that the scheduler cannot find an allowed cpu in
3262 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
3263 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
3264 * which will not contain a sane cpumask during cases such as cpu hotplugging.
3265 * This is the absolute last resort for the scheduler and it is only used if
3266 * _every_ other avenue has been traveled.
3267 **/
3268
3257void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 3269void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
3258{ 3270{
3259 rcu_read_lock(); 3271 rcu_read_lock();
3260 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); 3272 do_set_cpus_allowed(tsk, is_in_v2_mode() ?
3273 task_cs(tsk)->cpus_allowed : cpu_possible_mask);
3261 rcu_read_unlock(); 3274 rcu_read_unlock();
3262 3275
3263 /* 3276 /*
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index c9960baaa14f..8e513a573fe9 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Process number limiting controller for cgroups. 3 * Process number limiting controller for cgroups.
3 * 4 *
@@ -25,10 +26,6 @@
25 * a superset of parent/child/pids.current. 26 * a superset of parent/child/pids.current.
26 * 27 *
27 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> 28 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
28 *
29 * This file is subject to the terms and conditions of version 2 of the GNU
30 * General Public License. See the file COPYING in the main directory of the
31 * Linux distribution for more details.
32 */ 29 */
33 30
34#include <linux/kernel.h> 31#include <linux/kernel.h>
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 1d75ae7f1cb7..ae042c347c64 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * RDMA resource limiting controller for cgroups. 3 * RDMA resource limiting controller for cgroups.
3 * 4 *
@@ -5,10 +6,6 @@
5 * additional RDMA resources after a certain limit is reached. 6 * additional RDMA resources after a certain limit is reached.
6 * 7 *
7 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> 8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
8 *
9 * This file is subject to the terms and conditions of version 2 of the GNU
10 * General Public License. See the file COPYING in the main directory of the
11 * Linux distribution for more details.
12 */ 9 */
13 10
14#include <linux/bitops.h> 11#include <linux/bitops.h>
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index bb95a35e8c2d..ca19b4c8acf5 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include "cgroup-internal.h" 2#include "cgroup-internal.h"
2 3
3#include <linux/sched/cputime.h> 4#include <linux/sched/cputime.h>
diff --git a/kernel/compat.c b/kernel/compat.c
index d8a36c6ad7c9..a2bc1d6ceb57 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/compat.c 3 * linux/kernel/compat.c
3 * 4 *
@@ -5,10 +6,6 @@
5 * on 64 bit kernels. 6 * on 64 bit kernels.
6 * 7 *
7 * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation 8 * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */ 9 */
13 10
14#include <linux/linkage.h> 11#include <linux/linkage.h>
@@ -346,8 +343,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
346 return -EFAULT; 343 return -EFAULT;
347 switch (_NSIG_WORDS) { 344 switch (_NSIG_WORDS) {
348 case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); 345 case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
346 /* fall through */
349 case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); 347 case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
348 /* fall through */
350 case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); 349 case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
350 /* fall through */
351 case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); 351 case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
352 } 352 }
353#else 353#else
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 9ad37b9e44a7..be01a4d627c9 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Context tracking: Probe on high level context boundaries such as kernel 3 * Context tracking: Probe on high level context boundaries such as kernel
3 * and userspace. This includes syscalls and exceptions entry/exit. 4 * and userspace. This includes syscalls and exceptions entry/exit.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f2ef10460698..e84c0873559e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -522,7 +522,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
522 /* 522 /*
523 * SMT soft disabling on X86 requires to bring the CPU out of the 523 * SMT soft disabling on X86 requires to bring the CPU out of the
524 * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The 524 * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
525 * CPU marked itself as booted_once in cpu_notify_starting() so the 525 * CPU marked itself as booted_once in notify_cpu_starting() so the
526 * cpu_smt_allowed() check will now return false if this is not the 526 * cpu_smt_allowed() check will now return false if this is not the
527 * primary sibling. 527 * primary sibling.
528 */ 528 */
@@ -1221,6 +1221,13 @@ int freeze_secondary_cpus(int primary)
1221 for_each_online_cpu(cpu) { 1221 for_each_online_cpu(cpu) {
1222 if (cpu == primary) 1222 if (cpu == primary)
1223 continue; 1223 continue;
1224
1225 if (pm_wakeup_pending()) {
1226 pr_info("Wakeup pending. Abort CPU freeze\n");
1227 error = -EBUSY;
1228 break;
1229 }
1230
1224 trace_suspend_resume(TPS("CPU_OFF"), cpu, true); 1231 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
1225 error = _cpu_down(cpu, 1, CPUHP_OFFLINE); 1232 error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
1226 trace_suspend_resume(TPS("CPU_OFF"), cpu, false); 1233 trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
@@ -1964,6 +1971,9 @@ static ssize_t write_cpuhp_fail(struct device *dev,
1964 if (ret) 1971 if (ret)
1965 return ret; 1972 return ret;
1966 1973
1974 if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
1975 return -EINVAL;
1976
1967 /* 1977 /*
1968 * Cannot fail STARTING/DYING callbacks. 1978 * Cannot fail STARTING/DYING callbacks.
1969 */ 1979 */
@@ -2061,7 +2071,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
2061 kobject_uevent(&dev->kobj, KOBJ_ONLINE); 2071 kobject_uevent(&dev->kobj, KOBJ_ONLINE);
2062} 2072}
2063 2073
2064static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) 2074int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
2065{ 2075{
2066 int cpu, ret = 0; 2076 int cpu, ret = 0;
2067 2077
@@ -2093,7 +2103,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
2093 return ret; 2103 return ret;
2094} 2104}
2095 2105
2096static int cpuhp_smt_enable(void) 2106int cpuhp_smt_enable(void)
2097{ 2107{
2098 int cpu, ret = 0; 2108 int cpu, ret = 0;
2099 2109
@@ -2339,6 +2349,9 @@ static int __init mitigations_parse_cmdline(char *arg)
2339 cpu_mitigations = CPU_MITIGATIONS_AUTO; 2349 cpu_mitigations = CPU_MITIGATIONS_AUTO;
2340 else if (!strcmp(arg, "auto,nosmt")) 2350 else if (!strcmp(arg, "auto,nosmt"))
2341 cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; 2351 cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
2352 else
2353 pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
2354 arg);
2342 2355
2343 return 0; 2356 return 0;
2344} 2357}
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 67b02e138a47..cbca6879ab7d 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -1,18 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2011 Google, Inc. 3 * Copyright (C) 2011 Google, Inc.
3 * 4 *
4 * Author: 5 * Author:
5 * Colin Cross <ccross@android.com> 6 * Colin Cross <ccross@android.com>
6 *
7 * This software is licensed under the terms of the GNU General Public
8 * License version 2, as published by the Free Software Foundation, and
9 * may be copied, distributed, and modified under those terms.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */ 7 */
17 8
18#include <linux/kernel.h> 9#include <linux/kernel.h>
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 093c9f917ed0..9f1557b98468 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -1,9 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * crash.c - kernel crash support code. 3 * crash.c - kernel crash support code.
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */ 5 */
8 6
9#include <linux/crash_core.h> 7#include <linux/crash_core.h>
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index b64e238b553b..9c23ae074b40 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/crash_dump.h> 3#include <linux/crash_dump.h>
3#include <linux/init.h> 4#include <linux/init.h>
diff --git a/kernel/cred.c b/kernel/cred.c
index 45d77284aed0..f9a0ce66c9c3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,12 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* Task credentials management - see Documentation/security/credentials.rst 2/* Task credentials management - see Documentation/security/credentials.rst
2 * 3 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 4 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 5 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */ 6 */
11#include <linux/export.h> 7#include <linux/export.h>
12#include <linux/cred.h> 8#include <linux/cred.h>
@@ -174,6 +170,11 @@ void exit_creds(struct task_struct *tsk)
174 validate_creds(cred); 170 validate_creds(cred);
175 alter_cred_subscribers(cred, -1); 171 alter_cred_subscribers(cred, -1);
176 put_cred(cred); 172 put_cred(cred);
173
174#ifdef CONFIG_KEYS_REQUEST_CACHE
175 key_put(current->cached_requested_key);
176 current->cached_requested_key = NULL;
177#endif
177} 178}
178 179
179/** 180/**
@@ -327,6 +328,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
327 struct cred *new; 328 struct cred *new;
328 int ret; 329 int ret;
329 330
331#ifdef CONFIG_KEYS_REQUEST_CACHE
332 p->cached_requested_key = NULL;
333#endif
334
330 if ( 335 if (
331#ifdef CONFIG_KEYS 336#ifdef CONFIG_KEYS
332 !p->cred->thread_keyring && 337 !p->cred->thread_keyring &&
@@ -450,14 +455,23 @@ int commit_creds(struct cred *new)
450 if (task->mm) 455 if (task->mm)
451 set_dumpable(task->mm, suid_dumpable); 456 set_dumpable(task->mm, suid_dumpable);
452 task->pdeath_signal = 0; 457 task->pdeath_signal = 0;
458 /*
459 * If a task drops privileges and becomes nondumpable,
460 * the dumpability change must become visible before
461 * the credential change; otherwise, a __ptrace_may_access()
462 * racing with this change may be able to attach to a task it
463 * shouldn't be able to attach to (as if the task had dropped
464 * privileges without becoming nondumpable).
465 * Pairs with a read barrier in __ptrace_may_access().
466 */
453 smp_wmb(); 467 smp_wmb();
454 } 468 }
455 469
456 /* alter the thread keyring */ 470 /* alter the thread keyring */
457 if (!uid_eq(new->fsuid, old->fsuid)) 471 if (!uid_eq(new->fsuid, old->fsuid))
458 key_fsuid_changed(task); 472 key_fsuid_changed(new);
459 if (!gid_eq(new->fsgid, old->fsgid)) 473 if (!gid_eq(new->fsgid, old->fsgid))
460 key_fsgid_changed(task); 474 key_fsgid_changed(new);
461 475
462 /* do it 476 /* do it
463 * RLIMIT_NPROC limits on user->processes have already been checked 477 * RLIMIT_NPROC limits on user->processes have already been checked
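
The ordering the new comment describes is the usual publish/observe pairing; a hedged C11 sketch of the idea (the reader side stands in for the barrier in __ptrace_may_access() that the comment references; none of this is the actual ptrace code):

#include <stdatomic.h>

static _Atomic int dumpable = 1;
static _Atomic(const void *) creds;

/* writer (commit_creds-like): publish the dumpability drop first */
static void writer(const void *new_cred)
{
	atomic_store_explicit(&dumpable, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* like smp_wmb() */
	atomic_store_explicit(&creds, new_cred, memory_order_relaxed);
}

/* reader (__ptrace_may_access-like): check in the opposite order */
static int reader(void)
{
	const void *c = atomic_load_explicit(&creds, memory_order_relaxed);

	atomic_thread_fence(memory_order_acquire);	/* like smp_rmb() */
	/* seeing the new creds guarantees also seeing dumpable == 0 */
	return c != NULL && atomic_load_explicit(&dumpable, memory_order_relaxed);
}
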
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
index a85edc339985..332ee6c6ec2c 100644
--- a/kernel/debug/Makefile
+++ b/kernel/debug/Makefile
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# Makefile for the linux kernel debugger 3# Makefile for the linux kernel debugger
3# 4#
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 7510dc687c0d..4b280fc7dd67 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
1033 return DBG_PASS_EVENT; 1033 return DBG_PASS_EVENT;
1034 } 1034 }
1035#endif 1035#endif
1036 /* Fall through */
1036 case 'C': /* Exception passing */ 1037 case 'C': /* Exception passing */
1037 tmp = gdb_cmd_exception_pass(ks); 1038 tmp = gdb_cmd_exception_pass(ks);
1038 if (tmp > 0) 1039 if (tmp > 0)
1039 goto default_handle; 1040 goto default_handle;
1040 if (tmp == 0) 1041 if (tmp == 0)
1041 break; 1042 break;
1042 /* Fall through on tmp < 0 */ 1043 /* Fall through - on tmp < 0 */
1043 case 'c': /* Continue packet */ 1044 case 'c': /* Continue packet */
1044 case 's': /* Single step packet */ 1045 case 's': /* Single step packet */
1045 if (kgdb_contthread && kgdb_contthread != current) { 1046 if (kgdb_contthread && kgdb_contthread != current) {
@@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
1048 break; 1049 break;
1049 } 1050 }
1050 dbg_activate_sw_breakpoints(); 1051 dbg_activate_sw_breakpoints();
1051 /* Fall through to default processing */ 1052 /* Fall through - to default processing */
1052 default: 1053 default:
1053default_handle: 1054default_handle:
1054 error = kgdb_arch_handle_exception(ks->ex_vector, 1055 error = kgdb_arch_handle_exception(ks->ex_vector,
@@ -1094,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1094 return error; 1095 return error;
1095 case 's': 1096 case 's':
1096 case 'c': 1097 case 'c':
1097 strcpy(remcom_in_buffer, cmd); 1098 strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
1098 return 0; 1099 return 0;
1099 case '$': 1100 case '$':
1100 strcpy(remcom_in_buffer, cmd); 1101 strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
1101 gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); 1102 gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
1102 gdbstub_prev_in_buf_pos = 0; 1103 gdbstub_prev_in_buf_pos = 0;
1103 return 0; 1104 return 0;
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
index d4fc58f4b88d..efac857c5511 100644
--- a/kernel/debug/kdb/Makefile
+++ b/kernel/debug/kdb/Makefile
@@ -6,7 +6,6 @@
6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
7# 7#
8 8
9CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
10obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o 9obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
11obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o 10obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
12 11
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 6a4b41484afe..3a5184eb6977 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -446,7 +446,7 @@ poll_again:
446char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) 446char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
447{ 447{
448 if (prompt && kdb_prompt_str != prompt) 448 if (prompt && kdb_prompt_str != prompt)
449 strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); 449 strscpy(kdb_prompt_str, prompt, CMD_BUFLEN);
450 kdb_printf(kdb_prompt_str); 450 kdb_printf(kdb_prompt_str);
451 kdb_nextline = 1; /* Prompt and input resets line number */ 451 kdb_nextline = 1; /* Prompt and input resets line number */
452 return kdb_read(buffer, bufsize); 452 return kdb_read(buffer, bufsize);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 82a3b32a7cfc..9ecfa37c7fbf 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv)
2522 kdb_printf("machine %s\n", init_uts_ns.name.machine); 2522 kdb_printf("machine %s\n", init_uts_ns.name.machine);
2523 kdb_printf("nodename %s\n", init_uts_ns.name.nodename); 2523 kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
2524 kdb_printf("domainname %s\n", init_uts_ns.name.domainname); 2524 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2525 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2526 2525
2527 now = __ktime_get_real_seconds(); 2526 now = __ktime_get_real_seconds();
2528 time64_to_tm(now, 0, &tm); 2527 time64_to_tm(now, 0, &tm);
@@ -2584,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv)
2584 diag = kdbgetularg(argv[3], &whichcpu); 2583 diag = kdbgetularg(argv[3], &whichcpu);
2585 if (diag) 2584 if (diag)
2586 return diag; 2585 return diag;
2587 if (!cpu_online(whichcpu)) { 2586 if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) {
2588 kdb_printf("cpu %ld is not online\n", whichcpu); 2587 kdb_printf("cpu %ld is not online\n", whichcpu);
2589 return KDB_BADCPUNUM; 2588 return KDB_BADCPUNUM;
2590 } 2589 }
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 50bf9b119bad..b8e6306e7e13 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len)
192 192
193 while ((name = kdb_walk_kallsyms(&pos))) { 193 while ((name = kdb_walk_kallsyms(&pos))) {
194 if (strncmp(name, prefix_name, prefix_len) == 0) { 194 if (strncmp(name, prefix_name, prefix_len) == 0) {
195 strcpy(ks_namebuf, name); 195 strscpy(ks_namebuf, name, sizeof(ks_namebuf));
196 /* Work out the longest name that matches the prefix */ 196 /* Work out the longest name that matches the prefix */
197 if (++number == 1) { 197 if (++number == 1) {
198 prev_len = min_t(int, max_len-1, 198 prev_len = min_t(int, max_len-1,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 2a12b988c717..27725754ac99 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -1,16 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* delayacct.c - per-task delay accounting 2/* delayacct.c - per-task delay accounting
2 * 3 *
3 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 */ 5 */
15 6
16#include <linux/sched.h> 7#include <linux/sched.h>
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 83d711f8d665..70f8f8d9200e 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1 2
2config HAS_DMA 3config HAS_DMA
3 bool 4 bool
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index b2a87905846d..bfc0c17f2a3d 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -214,6 +214,62 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
214 return cma_release(dev_get_cma_area(dev), pages, count); 214 return cma_release(dev_get_cma_area(dev), pages, count);
215} 215}
216 216
217/**
218 * dma_alloc_contiguous() - allocate contiguous pages
219 * @dev: Pointer to device for which the allocation is performed.
220 * @size: Requested allocation size.
221 * @gfp: Allocation flags.
222 *
 223 * This function allocates a contiguous memory buffer for the specified
 224 * device. It first tries to use the device-specific contiguous memory area
 225 * if available, or the default global one, and then falls back to normal pages.
 226 *
 227 * Note that it bypasses one-page allocations from the global area, as the
 228 * addresses within one page are always contiguous, so there is no need to
 229 * waste CMA pages on those; this also helps reduce fragmentation.
230 */
231struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
232{
233 int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
234 size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
235 size_t align = get_order(PAGE_ALIGN(size));
236 struct page *page = NULL;
237 struct cma *cma = NULL;
238
239 if (dev && dev->cma_area)
240 cma = dev->cma_area;
241 else if (count > 1)
242 cma = dma_contiguous_default_area;
243
244 /* CMA can be used only in the context which permits sleeping */
245 if (cma && gfpflags_allow_blocking(gfp)) {
246 align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
247 page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN);
248 }
249
250 /* Fallback allocation of normal pages */
251 if (!page)
252 page = alloc_pages_node(node, gfp, align);
253 return page;
254}
255
256/**
257 * dma_free_contiguous() - release allocated pages
258 * @dev: Pointer to device for which the pages were allocated.
259 * @page: Pointer to the allocated pages.
260 * @size: Size of allocated pages.
261 *
 262 * This function releases memory allocated by dma_alloc_contiguous(). As
 263 * cma_release() returns false when the provided pages do not belong to the
 264 * contiguous area and true otherwise, this function falls back to
 265 * __free_pages() on a false return.
266 */
267void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
268{
269 if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT))
270 __free_pages(page, get_order(size));
271}
272
217/* 273/*
218 * Support for reserved memory regions defined in device tree 274 * Support for reserved memory regions defined in device tree
219 */ 275 */
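
As quick orientation for how the new pair is consumed, the sketch below condenses the dma-direct usage that appears later in this diff; the wrapper function itself is illustrative only:

/* Illustrative caller: allocate, validate, and release via the new helpers. */
static struct page *alloc_dma_pages(struct device *dev, size_t size, gfp_t gfp)
{
	struct page *page;

	page = dma_alloc_contiguous(dev, size, gfp);	/* CMA or normal pages */
	if (!page)
		return NULL;

	if (!dma_coherent_ok(dev, page_to_phys(page), size)) {
		/* frees correctly whether the pages came from CMA or not */
		dma_free_contiguous(dev, page, size);
		return NULL;
	}
	return page;
}
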
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index badd77670d00..099002d84f46 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1,20 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2008 Advanced Micro Devices, Inc. 3 * Copyright (C) 2008 Advanced Micro Devices, Inc.
3 * 4 *
4 * Author: Joerg Roedel <joerg.roedel@amd.com> 5 * Author: Joerg Roedel <joerg.roedel@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 6 */
19 7
20#define pr_fmt(fmt) "DMA-API: " fmt 8#define pr_fmt(fmt) "DMA-API: " fmt
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 2c2772e9702a..b90e1aede743 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -96,8 +96,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
96struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, 96struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
97 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) 97 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
98{ 98{
99 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
100 int page_order = get_order(size);
101 struct page *page = NULL; 99 struct page *page = NULL;
102 u64 phys_mask; 100 u64 phys_mask;
103 101
@@ -109,20 +107,9 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
109 gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, 107 gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
110 &phys_mask); 108 &phys_mask);
111again: 109again:
112 /* CMA can be used only in the context which permits sleeping */ 110 page = dma_alloc_contiguous(dev, size, gfp);
113 if (gfpflags_allow_blocking(gfp)) {
114 page = dma_alloc_from_contiguous(dev, count, page_order,
115 gfp & __GFP_NOWARN);
116 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
117 dma_release_from_contiguous(dev, page, count);
118 page = NULL;
119 }
120 }
121 if (!page)
122 page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
123
124 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { 111 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
125 __free_pages(page, page_order); 112 dma_free_contiguous(dev, page, size);
126 page = NULL; 113 page = NULL;
127 114
128 if (IS_ENABLED(CONFIG_ZONE_DMA32) && 115 if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
@@ -151,10 +138,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
151 if (!page) 138 if (!page)
152 return NULL; 139 return NULL;
153 140
141 if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
142 /* remove any dirty cache lines on the kernel alias */
143 if (!PageHighMem(page))
144 arch_dma_prep_coherent(page, size);
145 /* return the page pointer as the opaque cookie */
146 return page;
147 }
148
154 if (PageHighMem(page)) { 149 if (PageHighMem(page)) {
155 /* 150 /*
156 * Depending on the cma= arguments and per-arch setup 151 * Depending on the cma= arguments and per-arch setup
157 * dma_alloc_from_contiguous could return highmem pages. 152 * dma_alloc_contiguous could return highmem pages.
158 * Without remapping there is no way to return them here, 153 * Without remapping there is no way to return them here,
159 * so log an error and fail. 154 * so log an error and fail.
160 */ 155 */
@@ -171,15 +166,19 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
171 *dma_handle = phys_to_dma(dev, page_to_phys(page)); 166 *dma_handle = phys_to_dma(dev, page_to_phys(page));
172 } 167 }
173 memset(ret, 0, size); 168 memset(ret, 0, size);
169
170 if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
171 dma_alloc_need_uncached(dev, attrs)) {
172 arch_dma_prep_coherent(page, size);
173 ret = uncached_kernel_address(ret);
174 }
175
174 return ret; 176 return ret;
175} 177}
176 178
177void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) 179void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
178{ 180{
179 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; 181 dma_free_contiguous(dev, page, size);
180
181 if (!dma_release_from_contiguous(dev, page, count))
182 __free_pages(page, get_order(size));
183} 182}
184 183
185void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, 184void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
@@ -187,15 +186,26 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
187{ 186{
188 unsigned int page_order = get_order(size); 187 unsigned int page_order = get_order(size);
189 188
189 if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
190 /* cpu_addr is a struct page cookie, not a kernel address */
191 __dma_direct_free_pages(dev, size, cpu_addr);
192 return;
193 }
194
190 if (force_dma_unencrypted()) 195 if (force_dma_unencrypted())
191 set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); 196 set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
197
198 if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
199 dma_alloc_need_uncached(dev, attrs))
200 cpu_addr = cached_kernel_address(cpu_addr);
192 __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); 201 __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
193} 202}
194 203
195void *dma_direct_alloc(struct device *dev, size_t size, 204void *dma_direct_alloc(struct device *dev, size_t size,
196 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) 205 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
197{ 206{
198 if (!dev_is_dma_coherent(dev)) 207 if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
208 dma_alloc_need_uncached(dev, attrs))
199 return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); 209 return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
200 return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); 210 return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
201} 211}
@@ -203,7 +213,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
203void dma_direct_free(struct device *dev, size_t size, 213void dma_direct_free(struct device *dev, size_t size,
204 void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) 214 void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
205{ 215{
206 if (!dev_is_dma_coherent(dev)) 216 if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
217 dma_alloc_need_uncached(dev, attrs))
207 arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); 218 arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
208 else 219 else
209 dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); 220 dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f7afdadb6770..1f628e7ac709 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -317,6 +317,12 @@ void arch_dma_set_mask(struct device *dev, u64 mask);
317 317
318int dma_set_mask(struct device *dev, u64 mask) 318int dma_set_mask(struct device *dev, u64 mask)
319{ 319{
320 /*
321 * Truncate the mask to the actually supported dma_addr_t width to
322 * avoid generating unsupportable addresses.
323 */
324 mask = (dma_addr_t)mask;
325
320 if (!dev->dma_mask || !dma_supported(dev, mask)) 326 if (!dev->dma_mask || !dma_supported(dev, mask))
321 return -EIO; 327 return -EIO;
322 328
@@ -330,6 +336,12 @@ EXPORT_SYMBOL(dma_set_mask);
330#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK 336#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
331int dma_set_coherent_mask(struct device *dev, u64 mask) 337int dma_set_coherent_mask(struct device *dev, u64 mask)
332{ 338{
339 /*
340 * Truncate the mask to the actually supported dma_addr_t width to
341 * avoid generating unsupportable addresses.
342 */
343 mask = (dma_addr_t)mask;
344
333 if (!dma_supported(dev, mask)) 345 if (!dma_supported(dev, mask))
334 return -EIO; 346 return -EIO;
335 347
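
The truncation added above relies on a plain conversion to dma_addr_t; a standalone illustration of its effect when dma_addr_t is 32 bits wide (the typedef is an assumption for the example):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t dma_addr_t;	/* assume a 32-bit DMA address type */

int main(void)
{
	uint64_t mask = ~0ULL;		/* driver asks for DMA_BIT_MASK(64) */

	mask = (dma_addr_t)mask;	/* clamped to what dma_addr_t can express */
	printf("%#llx\n", (unsigned long long)mask);	/* prints 0xffffffff */
	return 0;
}
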
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 7a723194ecbe..a594aec07882 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -158,6 +158,9 @@ out:
158 158
159bool dma_in_atomic_pool(void *start, size_t size) 159bool dma_in_atomic_pool(void *start, size_t size)
160{ 160{
161 if (unlikely(!atomic_pool))
162 return false;
163
161 return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); 164 return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
162} 165}
163 166
@@ -199,8 +202,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
199 202
200 size = PAGE_ALIGN(size); 203 size = PAGE_ALIGN(size);
201 204
202 if (!gfpflags_allow_blocking(flags) && 205 if (!gfpflags_allow_blocking(flags)) {
203 !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) {
204 ret = dma_alloc_from_pool(size, &page, flags); 206 ret = dma_alloc_from_pool(size, &page, flags);
205 if (!ret) 207 if (!ret)
206 return NULL; 208 return NULL;
@@ -214,11 +216,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
214 /* remove any dirty cache lines on the kernel alias */ 216 /* remove any dirty cache lines on the kernel alias */
215 arch_dma_prep_coherent(page, size); 217 arch_dma_prep_coherent(page, size);
216 218
217 if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
218 ret = page; /* opaque cookie */
219 goto done;
220 }
221
222 /* create a coherent mapping */ 219 /* create a coherent mapping */
223 ret = dma_common_contiguous_remap(page, size, VM_USERMAP, 220 ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
224 arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), 221 arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
@@ -237,10 +234,7 @@ done:
237void arch_dma_free(struct device *dev, size_t size, void *vaddr, 234void arch_dma_free(struct device *dev, size_t size, void *vaddr,
238 dma_addr_t dma_handle, unsigned long attrs) 235 dma_addr_t dma_handle, unsigned long attrs)
239{ 236{
240 if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { 237 if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
241 /* vaddr is a struct page cookie, not a kernel address */
242 __dma_direct_free_pages(dev, size, vaddr);
243 } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
244 phys_addr_t phys = dma_to_phys(dev, dma_handle); 238 phys_addr_t phys = dma_to_phys(dev, dma_handle);
245 struct page *page = pfn_to_page(__phys_to_pfn(phys)); 239 struct page *page = pfn_to_page(__phys_to_pfn(phys));
246 240
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 6f7619c1f877..62fa5a82a065 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Dynamic DMA mapping support. 3 * Dynamic DMA mapping support.
3 * 4 *
@@ -695,29 +696,12 @@ bool is_swiotlb_active(void)
695 696
696static int __init swiotlb_create_debugfs(void) 697static int __init swiotlb_create_debugfs(void)
697{ 698{
698 struct dentry *d_swiotlb_usage; 699 struct dentry *root;
699 struct dentry *ent;
700
701 d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL);
702
703 if (!d_swiotlb_usage)
704 return -ENOMEM;
705
706 ent = debugfs_create_ulong("io_tlb_nslabs", 0400,
707 d_swiotlb_usage, &io_tlb_nslabs);
708 if (!ent)
709 goto fail;
710
711 ent = debugfs_create_ulong("io_tlb_used", 0400,
712 d_swiotlb_usage, &io_tlb_used);
713 if (!ent)
714 goto fail;
715 700
701 root = debugfs_create_dir("swiotlb", NULL);
702 debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs);
703 debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used);
716 return 0; 704 return 0;
717
718fail:
719 debugfs_remove_recursive(d_swiotlb_usage);
720 return -ENOMEM;
721} 705}
722 706
723late_initcall(swiotlb_create_debugfs); 707late_initcall(swiotlb_create_debugfs);
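
The rewrite follows the convention that debugfs creation helpers are not error-checked: a failure leaves an error pointer that later debugfs calls accept and ignore, and debugfs problems must never affect normal operation. The same pattern in a self-contained initcall (names here are illustrative, not from the tree):

	#include <linux/debugfs.h>
	#include <linux/init.h>

	static unsigned long example_counter;

	static int __init example_debugfs_init(void)
	{
		struct dentry *dir;

		/* No NULL/IS_ERR checks needed; debugfs copes on its own. */
		dir = debugfs_create_dir("example", NULL);
		debugfs_create_ulong("counter", 0400, dir, &example_counter);
		return 0;
	}
	late_initcall(example_debugfs_init);
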
diff --git a/kernel/events/core.c b/kernel/events/core.c
index abbd4b3b96c2..785d708f8553 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2952,6 +2952,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2952 if (!ctx->nr_active || !(is_active & EVENT_ALL)) 2952 if (!ctx->nr_active || !(is_active & EVENT_ALL))
2953 return; 2953 return;
2954 2954
2955 /*
2956 * If we had been multiplexing, no rotations are necessary, now no events
2957 * are active.
2958 */
2959 ctx->rotate_necessary = 0;
2960
2955 perf_pmu_disable(ctx->pmu); 2961 perf_pmu_disable(ctx->pmu);
2956 if (is_active & EVENT_PINNED) { 2962 if (is_active & EVENT_PINNED) {
2957 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) 2963 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
@@ -3319,10 +3325,13 @@ static int flexible_sched_in(struct perf_event *event, void *data)
3319 return 0; 3325 return 0;
3320 3326
3321 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { 3327 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3322 if (!group_sched_in(event, sid->cpuctx, sid->ctx)) 3328 int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
3323 list_add_tail(&event->active_list, &sid->ctx->flexible_active); 3329 if (ret) {
3324 else
3325 sid->can_add_hw = 0; 3330 sid->can_add_hw = 0;
3331 sid->ctx->rotate_necessary = 1;
3332 return 0;
3333 }
3334 list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3326 } 3335 }
3327 3336
3328 return 0; 3337 return 0;
@@ -3690,24 +3699,17 @@ ctx_first_active(struct perf_event_context *ctx)
3690static bool perf_rotate_context(struct perf_cpu_context *cpuctx) 3699static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3691{ 3700{
3692 struct perf_event *cpu_event = NULL, *task_event = NULL; 3701 struct perf_event *cpu_event = NULL, *task_event = NULL;
3693 bool cpu_rotate = false, task_rotate = false; 3702 struct perf_event_context *task_ctx = NULL;
3694 struct perf_event_context *ctx = NULL; 3703 int cpu_rotate, task_rotate;
3695 3704
3696 /* 3705 /*
3697 * Since we run this from IRQ context, nobody can install new 3706 * Since we run this from IRQ context, nobody can install new
3698 * events, thus the event count values are stable. 3707 * events, thus the event count values are stable.
3699 */ 3708 */
3700 3709
3701 if (cpuctx->ctx.nr_events) { 3710 cpu_rotate = cpuctx->ctx.rotate_necessary;
3702 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 3711 task_ctx = cpuctx->task_ctx;
3703 cpu_rotate = true; 3712 task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
3704 }
3705
3706 ctx = cpuctx->task_ctx;
3707 if (ctx && ctx->nr_events) {
3708 if (ctx->nr_events != ctx->nr_active)
3709 task_rotate = true;
3710 }
3711 3713
3712 if (!(cpu_rotate || task_rotate)) 3714 if (!(cpu_rotate || task_rotate))
3713 return false; 3715 return false;
@@ -3716,7 +3718,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3716 perf_pmu_disable(cpuctx->ctx.pmu); 3718 perf_pmu_disable(cpuctx->ctx.pmu);
3717 3719
3718 if (task_rotate) 3720 if (task_rotate)
3719 task_event = ctx_first_active(ctx); 3721 task_event = ctx_first_active(task_ctx);
3720 if (cpu_rotate) 3722 if (cpu_rotate)
3721 cpu_event = ctx_first_active(&cpuctx->ctx); 3723 cpu_event = ctx_first_active(&cpuctx->ctx);
3722 3724
@@ -3724,17 +3726,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3724 * As per the order given at ctx_resched() first 'pop' task flexible 3726 * As per the order given at ctx_resched() first 'pop' task flexible
3725 * and then, if needed CPU flexible. 3727 * and then, if needed CPU flexible.
3726 */ 3728 */
3727 if (task_event || (ctx && cpu_event)) 3729 if (task_event || (task_ctx && cpu_event))
3728 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 3730 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
3729 if (cpu_event) 3731 if (cpu_event)
3730 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 3732 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3731 3733
3732 if (task_event) 3734 if (task_event)
3733 rotate_ctx(ctx, task_event); 3735 rotate_ctx(task_ctx, task_event);
3734 if (cpu_event) 3736 if (cpu_event)
3735 rotate_ctx(&cpuctx->ctx, cpu_event); 3737 rotate_ctx(&cpuctx->ctx, cpu_event);
3736 3738
3737 perf_event_sched_in(cpuctx, ctx, current); 3739 perf_event_sched_in(cpuctx, task_ctx, current);
3738 3740
3739 perf_pmu_enable(cpuctx->ctx.pmu); 3741 perf_pmu_enable(cpuctx->ctx.pmu);
3740 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3742 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
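
The rotation decision now keys off a per-context rotate_necessary hint instead of re-deriving it from nr_events vs. nr_active on every tick. A toy model of the bookkeeping (illustrative C, not kernel code): the flag is set the moment a flexible group fails to fit on the PMU and cleared when the context is scheduled out, so only genuinely overcommitted contexts pay for rotation.

	struct toy_ctx {
		int rotate_necessary;
	};

	static int toy_group_sched_in(struct toy_ctx *ctx, int fits_on_pmu)
	{
		if (!fits_on_pmu) {
			ctx->rotate_necessary = 1;	/* multiplexing is now required */
			return -1;
		}
		return 0;
	}

	static void toy_ctx_sched_out(struct toy_ctx *ctx)
	{
		ctx->rotate_necessary = 0;	/* nothing active, nothing to rotate */
	}

	static int toy_needs_rotation(const struct toy_ctx *ctx)
	{
		return ctx->rotate_necessary;
	}
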
@@ -5005,6 +5007,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
5005 if (perf_event_check_period(event, value)) 5007 if (perf_event_check_period(event, value))
5006 return -EINVAL; 5008 return -EINVAL;
5007 5009
5010 if (!event->attr.freq && (value & (1ULL << 63)))
5011 return -EINVAL;
5012
5008 event_function_call(event, __perf_event_period, &value); 5013 event_function_call(event, __perf_event_period, &value);
5009 5014
5010 return 0; 5015 return 0;
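
The new check appears to guard against periods with the top bit set, which turn negative once the value reaches the signed 64-bit arithmetic used for period accounting. A quick userspace demonstration of the problematic interpretation:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t period = 1ULL << 63;	/* smallest value the ioctl now rejects */

		/* Viewed as a signed 64-bit quantity this is hugely negative. */
		printf("as s64: %lld\n", (long long)period);
		return 0;
	}
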
@@ -5923,7 +5928,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
5923 if (user_mode(regs)) { 5928 if (user_mode(regs)) {
5924 regs_user->abi = perf_reg_abi(current); 5929 regs_user->abi = perf_reg_abi(current);
5925 regs_user->regs = regs; 5930 regs_user->regs = regs;
5926 } else if (current->mm) { 5931 } else if (!(current->flags & PF_KTHREAD)) {
5927 perf_get_regs_user(regs_user, regs, regs_user_copy); 5932 perf_get_regs_user(regs_user, regs, regs_user_copy);
5928 } else { 5933 } else {
5929 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 5934 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8532,9 +8537,9 @@ static int perf_tp_event_match(struct perf_event *event,
8532 if (event->hw.state & PERF_HES_STOPPED) 8537 if (event->hw.state & PERF_HES_STOPPED)
8533 return 0; 8538 return 0;
8534 /* 8539 /*
8535 * All tracepoints are from kernel-space. 8540 * If exclude_kernel, only trace user-space tracepoints (uprobes)
8536 */ 8541 */
8537 if (event->attr.exclude_kernel) 8542 if (event->attr.exclude_kernel && !user_mode(regs))
8538 return 0; 8543 return 0;
8539 8544
8540 if (!perf_tp_filter_match(event, data)) 8545 if (!perf_tp_filter_match(event, data))
@@ -9874,6 +9879,12 @@ static int pmu_dev_alloc(struct pmu *pmu)
9874 if (ret) 9879 if (ret)
9875 goto del_dev; 9880 goto del_dev;
9876 9881
9882 if (pmu->attr_update)
9883 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
9884
9885 if (ret)
9886 goto del_dev;
9887
9877out: 9888out:
9878 return ret; 9889 return ret;
9879 9890
@@ -10033,6 +10044,12 @@ void perf_pmu_unregister(struct pmu *pmu)
10033} 10044}
10034EXPORT_SYMBOL_GPL(perf_pmu_unregister); 10045EXPORT_SYMBOL_GPL(perf_pmu_unregister);
10035 10046
10047static inline bool has_extended_regs(struct perf_event *event)
10048{
10049 return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10050 (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10051}
10052
10036static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) 10053static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
10037{ 10054{
10038 struct perf_event_context *ctx = NULL; 10055 struct perf_event_context *ctx = NULL;
@@ -10064,12 +10081,16 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
10064 perf_event_ctx_unlock(event->group_leader, ctx); 10081 perf_event_ctx_unlock(event->group_leader, ctx);
10065 10082
10066 if (!ret) { 10083 if (!ret) {
10084 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
10085 has_extended_regs(event))
10086 ret = -EOPNOTSUPP;
10087
10067 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && 10088 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
10068 event_has_any_exclude_flag(event)) { 10089 event_has_any_exclude_flag(event))
10069 if (event->destroy)
10070 event->destroy(event);
10071 ret = -EINVAL; 10090 ret = -EINVAL;
10072 } 10091
10092 if (ret && event->destroy)
10093 event->destroy(event);
10073 } 10094 }
10074 10095
10075 if (ret) 10096 if (ret)
@@ -10680,11 +10701,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
10680 break; 10701 break;
10681 10702
10682 case CLOCK_BOOTTIME: 10703 case CLOCK_BOOTTIME:
10683 event->clock = &ktime_get_boot_ns; 10704 event->clock = &ktime_get_boottime_ns;
10684 break; 10705 break;
10685 10706
10686 case CLOCK_TAI: 10707 case CLOCK_TAI:
10687 event->clock = &ktime_get_tai_ns; 10708 event->clock = &ktime_get_clocktai_ns;
10688 break; 10709 break;
10689 10710
10690 default: 10711 default:
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 79c47076700a..3aef4191798c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -24,7 +24,7 @@ struct ring_buffer {
24 atomic_t poll; /* POLL_ for wakeups */ 24 atomic_t poll; /* POLL_ for wakeups */
25 25
26 local_t head; /* write position */ 26 local_t head; /* write position */
27 local_t nest; /* nested writers */ 27 unsigned int nest; /* nested writers */
28 local_t events; /* event limit */ 28 local_t events; /* event limit */
29 local_t wakeup; /* wakeup stamp */ 29 local_t wakeup; /* wakeup stamp */
30 local_t lost; /* nr records lost */ 30 local_t lost; /* nr records lost */
@@ -41,7 +41,7 @@ struct ring_buffer {
41 41
42 /* AUX area */ 42 /* AUX area */
43 long aux_head; 43 long aux_head;
44 local_t aux_nest; 44 unsigned int aux_nest;
45 long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ 45 long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
46 unsigned long aux_pgoff; 46 unsigned long aux_pgoff;
47 int aux_nr_pages; 47 int aux_nr_pages;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 674b35383491..ffb59a4ef4ff 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle)
38 struct ring_buffer *rb = handle->rb; 38 struct ring_buffer *rb = handle->rb;
39 39
40 preempt_disable(); 40 preempt_disable();
41 local_inc(&rb->nest); 41
42 /*
43 * Avoid an explicit LOAD/STORE such that architectures with memops
44 * can use them.
45 */
46 (*(volatile unsigned int *)&rb->nest)++;
42 handle->wakeup = local_read(&rb->wakeup); 47 handle->wakeup = local_read(&rb->wakeup);
43} 48}
44 49
@@ -46,17 +51,35 @@ static void perf_output_put_handle(struct perf_output_handle *handle)
46{ 51{
47 struct ring_buffer *rb = handle->rb; 52 struct ring_buffer *rb = handle->rb;
48 unsigned long head; 53 unsigned long head;
54 unsigned int nest;
55
56 /*
57 * If this isn't the outermost nesting, we don't have to update
58 * @rb->user_page->data_head.
59 */
60 nest = READ_ONCE(rb->nest);
61 if (nest > 1) {
62 WRITE_ONCE(rb->nest, nest - 1);
63 goto out;
64 }
49 65
50again: 66again:
67 /*
68 * In order to avoid publishing a head value that goes backwards,
69 * we must ensure the load of @rb->head happens after we've
70 * incremented @rb->nest.
71 *
72 * Otherwise we can observe a @rb->head value before one published
73 * by an IRQ/NMI happening between the load and the increment.
74 */
75 barrier();
51 head = local_read(&rb->head); 76 head = local_read(&rb->head);
52 77
53 /* 78 /*
54 * IRQ/NMI can happen here, which means we can miss a head update. 79 * IRQ/NMI can happen here and advance @rb->head, causing our
80 * load above to be stale.
55 */ 81 */
56 82
57 if (!local_dec_and_test(&rb->nest))
58 goto out;
59
60 /* 83 /*
61 * Since the mmap() consumer (userspace) can run on a different CPU: 84 * Since the mmap() consumer (userspace) can run on a different CPU:
62 * 85 *
@@ -84,14 +107,23 @@ again:
84 * See perf_output_begin(). 107 * See perf_output_begin().
85 */ 108 */
86 smp_wmb(); /* B, matches C */ 109 smp_wmb(); /* B, matches C */
87 rb->user_page->data_head = head; 110 WRITE_ONCE(rb->user_page->data_head, head);
88 111
89 /* 112 /*
90 * Now check if we missed an update -- rely on previous implied 113 * We must publish the head before decrementing the nest count,
91 * compiler barriers to force a re-read. 114 * otherwise an IRQ/NMI can publish a more recent head value and our
115 * write will (temporarily) publish a stale value.
92 */ 116 */
117 barrier();
118 WRITE_ONCE(rb->nest, 0);
119
120 /*
121 * Ensure we decrement @rb->nest before we validate the @rb->head.
122 * Otherwise we cannot be sure we caught the 'last' nested update.
123 */
124 barrier();
93 if (unlikely(head != local_read(&rb->head))) { 125 if (unlikely(head != local_read(&rb->head))) {
94 local_inc(&rb->nest); 126 WRITE_ONCE(rb->nest, 1);
95 goto again; 127 goto again;
96 } 128 }
97 129
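
Because nesting only happens on the local CPU (an IRQ/NMI interrupting the outer writer), the protocol above gets by with compiler barriers and ONCE accesses rather than atomics. A simplified, self-contained model of the publish sequence (head is a plain unsigned long here; the kernel keeps it in a local_t):

	#define barrier()	__asm__ __volatile__("" ::: "memory")
	#define WRITE_ONCE(x, v)	(*(volatile __typeof__(x) *)&(x) = (v))
	#define READ_ONCE(x)	(*(volatile __typeof__(x) *)&(x))

	struct buf {
		unsigned int nest;		/* nested writers, same CPU only */
		unsigned long head;		/* private write position */
		unsigned long data_head;	/* value published to userspace */
	};

	void put_handle(struct buf *rb)
	{
		unsigned int nest = READ_ONCE(rb->nest);
		unsigned long head;

		if (nest > 1) {			/* inner writer: outermost publishes */
			WRITE_ONCE(rb->nest, nest - 1);
			return;
		}
	again:
		barrier();			/* order the head load after the nest update */
		head = READ_ONCE(rb->head);
		WRITE_ONCE(rb->data_head, head);	/* kernel issues smp_wmb() first */
		barrier();			/* publish head before dropping nest */
		WRITE_ONCE(rb->nest, 0);
		barrier();			/* re-check only after nest is visible */
		if (head != READ_ONCE(rb->head)) {	/* an IRQ advanced head meanwhile */
			WRITE_ONCE(rb->nest, 1);
			goto again;
		}
	}
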
@@ -330,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
330 struct perf_event *output_event = event; 362 struct perf_event *output_event = event;
331 unsigned long aux_head, aux_tail; 363 unsigned long aux_head, aux_tail;
332 struct ring_buffer *rb; 364 struct ring_buffer *rb;
365 unsigned int nest;
333 366
334 if (output_event->parent) 367 if (output_event->parent)
335 output_event = output_event->parent; 368 output_event = output_event->parent;
@@ -360,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
360 if (!refcount_inc_not_zero(&rb->aux_refcount)) 393 if (!refcount_inc_not_zero(&rb->aux_refcount))
361 goto err; 394 goto err;
362 395
396 nest = READ_ONCE(rb->aux_nest);
363 /* 397 /*
364 * Nesting is not supported for AUX area, make sure nested 398 * Nesting is not supported for AUX area, make sure nested
365 * writers are caught early 399 * writers are caught early
366 */ 400 */
367 if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) 401 if (WARN_ON_ONCE(nest))
368 goto err_put; 402 goto err_put;
369 403
404 WRITE_ONCE(rb->aux_nest, nest + 1);
405
370 aux_head = rb->aux_head; 406 aux_head = rb->aux_head;
371 407
372 handle->rb = rb; 408 handle->rb = rb;
@@ -394,7 +430,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
394 if (!handle->size) { /* A, matches D */ 430 if (!handle->size) { /* A, matches D */
395 event->pending_disable = smp_processor_id(); 431 event->pending_disable = smp_processor_id();
396 perf_output_wakeup(handle); 432 perf_output_wakeup(handle);
397 local_set(&rb->aux_nest, 0); 433 WRITE_ONCE(rb->aux_nest, 0);
398 goto err_put; 434 goto err_put;
399 } 435 }
400 } 436 }
@@ -471,7 +507,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
471 perf_event_aux_event(handle->event, aux_head, size, 507 perf_event_aux_event(handle->event, aux_head, size,
472 handle->aux_flags); 508 handle->aux_flags);
473 509
474 rb->user_page->aux_head = rb->aux_head; 510 WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
475 if (rb_need_aux_wakeup(rb)) 511 if (rb_need_aux_wakeup(rb))
476 wakeup = true; 512 wakeup = true;
477 513
@@ -483,7 +519,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
483 519
484 handle->event = NULL; 520 handle->event = NULL;
485 521
486 local_set(&rb->aux_nest, 0); 522 WRITE_ONCE(rb->aux_nest, 0);
487 /* can't be last */ 523 /* can't be last */
488 rb_free_aux(rb); 524 rb_free_aux(rb);
489 ring_buffer_put(rb); 525 ring_buffer_put(rb);
@@ -503,7 +539,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
503 539
504 rb->aux_head += size; 540 rb->aux_head += size;
505 541
506 rb->user_page->aux_head = rb->aux_head; 542 WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
507 if (rb_need_aux_wakeup(rb)) { 543 if (rb_need_aux_wakeup(rb)) {
508 perf_output_wakeup(handle); 544 perf_output_wakeup(handle);
509 handle->wakeup = rb->aux_wakeup + rb->aux_watermark; 545 handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 78f61bfc6b79..84fa00497c49 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -46,7 +46,7 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
46static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 46static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
47#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 47#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
48 48
49static struct percpu_rw_semaphore dup_mmap_sem; 49DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
50 50
51/* Have a copy of original instruction */ 51/* Have a copy of original instruction */
52#define UPROBE_COPY_INSN 0 52#define UPROBE_COPY_INSN 0
@@ -2112,7 +2112,7 @@ static void handle_trampoline(struct pt_regs *regs)
2112 2112
2113 sigill: 2113 sigill:
2114 uprobe_warn(current, "handle uretprobe, sending SIGILL."); 2114 uprobe_warn(current, "handle uretprobe, sending SIGILL.");
2115 force_sig(SIGILL, current); 2115 force_sig(SIGILL);
2116 2116
2117} 2117}
2118 2118
@@ -2228,7 +2228,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
2228 2228
2229 if (unlikely(err)) { 2229 if (unlikely(err)) {
2230 uprobe_warn(current, "execute the probed insn, sending SIGILL."); 2230 uprobe_warn(current, "execute the probed insn, sending SIGILL.");
2231 force_sig(SIGILL, current); 2231 force_sig(SIGILL);
2232 } 2232 }
2233} 2233}
2234 2234
@@ -2302,7 +2302,5 @@ void __init uprobes_init(void)
2302 for (i = 0; i < UPROBES_HASH_SZ; i++) 2302 for (i = 0; i < UPROBES_HASH_SZ; i++)
2303 mutex_init(&uprobes_mmap_mutex[i]); 2303 mutex_init(&uprobes_mmap_mutex[i]);
2304 2304
2305 BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
2306
2307 BUG_ON(register_die_notifier(&uprobe_exception_nb)); 2305 BUG_ON(register_die_notifier(&uprobe_exception_nb));
2308} 2306}
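
DEFINE_STATIC_PERCPU_RWSEM() gives a compile-time initialized semaphore, which is why the percpu_init_rwsem()/BUG_ON() call in uprobes_init() can go away. The usual shape of the pattern (example_sem and the two functions are illustrative):

	#include <linux/percpu-rwsem.h>

	DEFINE_STATIC_PERCPU_RWSEM(example_sem);

	static void example_read_side(void)
	{
		percpu_down_read(&example_sem);
		/* ... fast, frequently-taken read section ... */
		percpu_up_read(&example_sem);
	}

	static void example_write_side(void)
	{
		percpu_down_write(&example_sem);
		/* ... rare, exclusive section ... */
		percpu_up_write(&example_sem);
	}
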
diff --git a/kernel/exit.c b/kernel/exit.c
index 2166c2d92ddc..a75b6a7f458a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/exit.c 3 * linux/kernel/exit.c
3 * 4 *
@@ -194,6 +195,7 @@ repeat:
194 rcu_read_unlock(); 195 rcu_read_unlock();
195 196
196 proc_flush_task(p); 197 proc_flush_task(p);
198 cgroup_release(p);
197 199
198 write_lock_irq(&tasklist_lock); 200 write_lock_irq(&tasklist_lock);
199 ptrace_release_task(p); 201 ptrace_release_task(p);
@@ -219,7 +221,6 @@ repeat:
219 } 221 }
220 222
221 write_unlock_irq(&tasklist_lock); 223 write_unlock_irq(&tasklist_lock);
222 cgroup_release(p);
223 release_thread(p); 224 release_thread(p);
224 call_rcu(&p->rcu, delayed_put_task_struct); 225 call_rcu(&p->rcu, delayed_put_task_struct);
225 226
@@ -422,7 +423,7 @@ retry:
422 * freed task structure. 423 * freed task structure.
423 */ 424 */
424 if (atomic_read(&mm->mm_users) <= 1) { 425 if (atomic_read(&mm->mm_users) <= 1) {
425 mm->owner = NULL; 426 WRITE_ONCE(mm->owner, NULL);
426 return; 427 return;
427 } 428 }
428 429
@@ -462,7 +463,7 @@ retry:
462 * most likely racing with swapoff (try_to_unuse()) or /proc or 463 * most likely racing with swapoff (try_to_unuse()) or /proc or
463 * ptrace or page migration (get_task_mm()). Mark owner as NULL. 464 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
464 */ 465 */
465 mm->owner = NULL; 466 WRITE_ONCE(mm->owner, NULL);
466 return; 467 return;
467 468
468assign_new_owner: 469assign_new_owner:
@@ -483,7 +484,7 @@ assign_new_owner:
483 put_task_struct(c); 484 put_task_struct(c);
484 goto retry; 485 goto retry;
485 } 486 }
486 mm->owner = c; 487 WRITE_ONCE(mm->owner, c);
487 task_unlock(c); 488 task_unlock(c);
488 put_task_struct(c); 489 put_task_struct(c);
489} 490}
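
Switching the mm->owner stores to WRITE_ONCE() documents that the field is read locklessly elsewhere; such readers are expected to use READ_ONCE() (or rcu_dereference() where RCU protects the pointer) so the compiler cannot tear or re-load the access. A generic sketch of the pairing, with illustrative names:

	#include <linux/compiler.h>
	#include <linux/sched.h>

	struct owner_holder {
		struct task_struct *owner;
	};

	static void owner_clear(struct owner_holder *h)
	{
		WRITE_ONCE(h->owner, NULL);		/* single, untorn store */
	}

	static bool owner_is(struct owner_holder *h, struct task_struct *p)
	{
		return READ_ONCE(h->owner) == p;	/* single, untorn load */
	}
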
diff --git a/kernel/extable.c b/kernel/extable.c
index 6a5b61ebc66c..e23cce6e6092 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -1,19 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* Rewritten by Rusty Russell, on the backs of many others... 2/* Rewritten by Rusty Russell, on the backs of many others...
2 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
3 4
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 5*/
18#include <linux/ftrace.h> 6#include <linux/ftrace.h>
19#include <linux/memory.h> 7#include <linux/memory.h>
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index feb80712b913..63b349168da7 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -152,20 +152,13 @@ static int fei_retval_get(void *data, u64 *val)
152DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, 152DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
153 "%llx\n"); 153 "%llx\n");
154 154
155static int fei_debugfs_add_attr(struct fei_attr *attr) 155static void fei_debugfs_add_attr(struct fei_attr *attr)
156{ 156{
157 struct dentry *dir; 157 struct dentry *dir;
158 158
159 dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); 159 dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
160 if (!dir)
161 return -ENOMEM;
162
163 if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
164 debugfs_remove_recursive(dir);
165 return -ENOMEM;
166 }
167 160
168 return 0; 161 debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops);
169} 162}
170 163
171static void fei_debugfs_remove_attr(struct fei_attr *attr) 164static void fei_debugfs_remove_attr(struct fei_attr *attr)
@@ -306,7 +299,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
306 299
307 ret = register_kprobe(&attr->kp); 300 ret = register_kprobe(&attr->kp);
308 if (!ret) 301 if (!ret)
309 ret = fei_debugfs_add_attr(attr); 302 fei_debugfs_add_attr(attr);
310 if (ret < 0) 303 if (ret < 0)
311 fei_attr_remove(attr); 304 fei_attr_remove(attr);
312 else { 305 else {
@@ -337,19 +330,13 @@ static int __init fei_debugfs_init(void)
337 return PTR_ERR(dir); 330 return PTR_ERR(dir);
338 331
339 /* injectable attribute is just a symlink of error_inject/list */ 332 /* injectable attribute is just a symlink of error_inject/list */
340 if (!debugfs_create_symlink("injectable", dir, 333 debugfs_create_symlink("injectable", dir, "../error_injection/list");
341 "../error_injection/list"))
342 goto error;
343 334
344 if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) 335 debugfs_create_file("inject", 0600, dir, NULL, &fei_ops);
345 goto error;
346 336
347 fei_debugfs_dir = dir; 337 fei_debugfs_dir = dir;
348 338
349 return 0; 339 return 0;
350error:
351 debugfs_remove_recursive(dir);
352 return -ENOMEM;
353} 340}
354 341
355late_initcall(fei_debugfs_init); 342late_initcall(fei_debugfs_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index 737db1828437..8f3e2d97d771 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/fork.c 3 * linux/kernel/fork.c
3 * 4 *
@@ -122,7 +123,7 @@
122unsigned long total_forks; /* Handle normal Linux uptimes. */ 123unsigned long total_forks; /* Handle normal Linux uptimes. */
123int nr_threads; /* The idle threads do not count.. */ 124int nr_threads; /* The idle threads do not count.. */
124 125
125int max_threads; /* tunable limit on nr_threads */ 126static int max_threads; /* tunable limit on nr_threads */
126 127
127DEFINE_PER_CPU(unsigned long, process_counts) = 0; 128DEFINE_PER_CPU(unsigned long, process_counts) = 0;
128 129
@@ -247,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
247 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 248 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
248 THREAD_SIZE_ORDER); 249 THREAD_SIZE_ORDER);
249 250
250 return page ? page_address(page) : NULL; 251 if (likely(page)) {
252 tsk->stack = page_address(page);
253 return tsk->stack;
254 }
255 return NULL;
251#endif 256#endif
252} 257}
253 258
@@ -893,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
893#ifdef CONFIG_STACKPROTECTOR 898#ifdef CONFIG_STACKPROTECTOR
894 tsk->stack_canary = get_random_canary(); 899 tsk->stack_canary = get_random_canary();
895#endif 900#endif
901 if (orig->cpus_ptr == &orig->cpus_mask)
902 tsk->cpus_ptr = &tsk->cpus_mask;
896 903
897 /* 904 /*
898 * One for us, one for whoever does the "release_task()" (usually 905 * One for us, one for whoever does the "release_task()" (usually
@@ -955,6 +962,15 @@ static void mm_init_aio(struct mm_struct *mm)
955#endif 962#endif
956} 963}
957 964
965static __always_inline void mm_clear_owner(struct mm_struct *mm,
966 struct task_struct *p)
967{
968#ifdef CONFIG_MEMCG
969 if (mm->owner == p)
970 WRITE_ONCE(mm->owner, NULL);
971#endif
972}
973
958static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 974static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
959{ 975{
960#ifdef CONFIG_MEMCG 976#ifdef CONFIG_MEMCG
@@ -1343,6 +1359,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
1343free_pt: 1359free_pt:
1344 /* don't put binfmt in mmput, we haven't got module yet */ 1360 /* don't put binfmt in mmput, we haven't got module yet */
1345 mm->binfmt = NULL; 1361 mm->binfmt = NULL;
1362 mm_init_owner(mm, NULL);
1346 mmput(mm); 1363 mmput(mm);
1347 1364
1348fail_nomem: 1365fail_nomem:
@@ -1694,36 +1711,52 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1694} 1711}
1695#endif 1712#endif
1696 1713
1714/*
1715 * Poll support for process exit notification.
1716 */
1717static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
1718{
1719 struct task_struct *task;
1720 struct pid *pid = file->private_data;
1721 int poll_flags = 0;
1722
1723 poll_wait(file, &pid->wait_pidfd, pts);
1724
1725 rcu_read_lock();
1726 task = pid_task(pid, PIDTYPE_PID);
1727 /*
1728 * Inform pollers only when the whole thread group exits.
1729 * If the thread group leader exits before all other threads in the
1730 * group, then poll(2) should block, similar to the wait(2) family.
1731 */
1732 if (!task || (task->exit_state && thread_group_empty(task)))
1733 poll_flags = POLLIN | POLLRDNORM;
1734 rcu_read_unlock();
1735
1736 return poll_flags;
1737}
1738
1697const struct file_operations pidfd_fops = { 1739const struct file_operations pidfd_fops = {
1698 .release = pidfd_release, 1740 .release = pidfd_release,
1741 .poll = pidfd_poll,
1699#ifdef CONFIG_PROC_FS 1742#ifdef CONFIG_PROC_FS
1700 .show_fdinfo = pidfd_show_fdinfo, 1743 .show_fdinfo = pidfd_show_fdinfo,
1701#endif 1744#endif
1702}; 1745};
1703 1746
1704/** 1747static void __delayed_free_task(struct rcu_head *rhp)
1705 * pidfd_create() - Create a new pid file descriptor.
1706 *
1707 * @pid: struct pid that the pidfd will reference
1708 *
1709 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
1710 *
1711 * Note, that this function can only be called after the fd table has
1712 * been unshared to avoid leaking the pidfd to the new process.
1713 *
1714 * Return: On success, a cloexec pidfd is returned.
1715 * On error, a negative errno number will be returned.
1716 */
1717static int pidfd_create(struct pid *pid)
1718{ 1748{
1719 int fd; 1749 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
1720 1750
1721 fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), 1751 free_task(tsk);
1722 O_RDWR | O_CLOEXEC); 1752}
1723 if (fd < 0)
1724 put_pid(pid);
1725 1753
1726 return fd; 1754static __always_inline void delayed_free_task(struct task_struct *tsk)
1755{
1756 if (IS_ENABLED(CONFIG_MEMCG))
1757 call_rcu(&tsk->rcu, __delayed_free_task);
1758 else
1759 free_task(tsk);
1727} 1760}
1728 1761
1729/* 1762/*
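
From userspace, a pidfd returned by clone(CLONE_PIDFD) becomes pollable: POLLIN is reported once the whole thread group has exited. A hedged consumer-side sketch, assuming a kernel with this series applied:

	#include <poll.h>
	#include <stdio.h>

	/* Block until the thread group behind @pidfd has exited. */
	int pidfd_wait_for_exit(int pidfd)
	{
		struct pollfd pfd = {
			.fd = pidfd,
			.events = POLLIN,
		};

		if (poll(&pfd, 1, -1) < 0) {
			perror("poll");
			return -1;
		}
		return (pfd.revents & POLLIN) ? 0 : -1;
	}
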
@@ -1735,19 +1768,16 @@ static int pidfd_create(struct pid *pid)
1735 * flags). The actual kick-off is left to the caller. 1768 * flags). The actual kick-off is left to the caller.
1736 */ 1769 */
1737static __latent_entropy struct task_struct *copy_process( 1770static __latent_entropy struct task_struct *copy_process(
1738 unsigned long clone_flags,
1739 unsigned long stack_start,
1740 unsigned long stack_size,
1741 int __user *parent_tidptr,
1742 int __user *child_tidptr,
1743 struct pid *pid, 1771 struct pid *pid,
1744 int trace, 1772 int trace,
1745 unsigned long tls, 1773 int node,
1746 int node) 1774 struct kernel_clone_args *args)
1747{ 1775{
1748 int pidfd = -1, retval; 1776 int pidfd = -1, retval;
1749 struct task_struct *p; 1777 struct task_struct *p;
1750 struct multiprocess_signals delayed; 1778 struct multiprocess_signals delayed;
1779 struct file *pidfile = NULL;
1780 u64 clone_flags = args->flags;
1751 1781
1752 /* 1782 /*
1753 * Don't allow sharing the root directory with processes in a different 1783 * Don't allow sharing the root directory with processes in a different
@@ -1796,27 +1826,12 @@ static __latent_entropy struct task_struct *copy_process(
1796 } 1826 }
1797 1827
1798 if (clone_flags & CLONE_PIDFD) { 1828 if (clone_flags & CLONE_PIDFD) {
1799 int reserved;
1800
1801 /* 1829 /*
1802 * - CLONE_PARENT_SETTID is useless for pidfds and also
1803 * parent_tidptr is used to return pidfds.
1804 * - CLONE_DETACHED is blocked so that we can potentially 1830 * - CLONE_DETACHED is blocked so that we can potentially
1805 * reuse it later for CLONE_PIDFD. 1831 * reuse it later for CLONE_PIDFD.
1806 * - CLONE_THREAD is blocked until someone really needs it. 1832 * - CLONE_THREAD is blocked until someone really needs it.
1807 */ 1833 */
1808 if (clone_flags & 1834 if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
1809 (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1810 return ERR_PTR(-EINVAL);
1811
1812 /*
1813 * Verify that parent_tidptr is sane so we can potentially
1814 * reuse it later.
1815 */
1816 if (get_user(reserved, parent_tidptr))
1817 return ERR_PTR(-EFAULT);
1818
1819 if (reserved != 0)
1820 return ERR_PTR(-EINVAL); 1835 return ERR_PTR(-EINVAL);
1821 } 1836 }
1822 1837
@@ -1849,11 +1864,11 @@ static __latent_entropy struct task_struct *copy_process(
1849 * p->set_child_tid which is (ab)used as a kthread's data pointer for 1864 * p->set_child_tid which is (ab)used as a kthread's data pointer for
1850 * kernel threads (PF_KTHREAD). 1865 * kernel threads (PF_KTHREAD).
1851 */ 1866 */
1852 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1867 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
1853 /* 1868 /*
1854 * Clear TID on mm_release()? 1869 * Clear TID on mm_release()?
1855 */ 1870 */
1856 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; 1871 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
1857 1872
1858 ftrace_graph_init_task(p); 1873 ftrace_graph_init_task(p);
1859 1874
@@ -1958,9 +1973,6 @@ static __latent_entropy struct task_struct *copy_process(
1958 p->pagefault_disabled = 0; 1973 p->pagefault_disabled = 0;
1959 1974
1960#ifdef CONFIG_LOCKDEP 1975#ifdef CONFIG_LOCKDEP
1961 p->lockdep_depth = 0; /* no locks held yet */
1962 p->curr_chain_key = 0;
1963 p->lockdep_recursion = 0;
1964 lockdep_init_task(p); 1976 lockdep_init_task(p);
1965#endif 1977#endif
1966 1978
@@ -2012,7 +2024,8 @@ static __latent_entropy struct task_struct *copy_process(
2012 retval = copy_io(clone_flags, p); 2024 retval = copy_io(clone_flags, p);
2013 if (retval) 2025 if (retval)
2014 goto bad_fork_cleanup_namespaces; 2026 goto bad_fork_cleanup_namespaces;
2015 retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); 2027 retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
2028 args->tls);
2016 if (retval) 2029 if (retval)
2017 goto bad_fork_cleanup_io; 2030 goto bad_fork_cleanup_io;
2018 2031
@@ -2032,12 +2045,22 @@ static __latent_entropy struct task_struct *copy_process(
2032 * if the fd table isn't shared). 2045 * if the fd table isn't shared).
2033 */ 2046 */
2034 if (clone_flags & CLONE_PIDFD) { 2047 if (clone_flags & CLONE_PIDFD) {
2035 retval = pidfd_create(pid); 2048 retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2036 if (retval < 0) 2049 if (retval < 0)
2037 goto bad_fork_free_pid; 2050 goto bad_fork_free_pid;
2038 2051
2039 pidfd = retval; 2052 pidfd = retval;
2040 retval = put_user(pidfd, parent_tidptr); 2053
2054 pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
2055 O_RDWR | O_CLOEXEC);
2056 if (IS_ERR(pidfile)) {
2057 put_unused_fd(pidfd);
2058 retval = PTR_ERR(pidfile);
2059 goto bad_fork_free_pid;
2060 }
2061 get_pid(pid); /* held by pidfile now */
2062
2063 retval = put_user(pidfd, args->pidfd);
2041 if (retval) 2064 if (retval)
2042 goto bad_fork_put_pidfd; 2065 goto bad_fork_put_pidfd;
2043 } 2066 }
@@ -2068,7 +2091,7 @@ static __latent_entropy struct task_struct *copy_process(
2068#ifdef TIF_SYSCALL_EMU 2091#ifdef TIF_SYSCALL_EMU
2069 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 2092 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
2070#endif 2093#endif
2071 clear_all_latency_tracing(p); 2094 clear_tsk_latency_tracing(p);
2072 2095
2073 /* ok, now we should be set up.. */ 2096 /* ok, now we should be set up.. */
2074 p->pid = pid_nr(pid); 2097 p->pid = pid_nr(pid);
@@ -2080,7 +2103,7 @@ static __latent_entropy struct task_struct *copy_process(
2080 if (clone_flags & CLONE_PARENT) 2103 if (clone_flags & CLONE_PARENT)
2081 p->exit_signal = current->group_leader->exit_signal; 2104 p->exit_signal = current->group_leader->exit_signal;
2082 else 2105 else
2083 p->exit_signal = (clone_flags & CSIGNAL); 2106 p->exit_signal = args->exit_signal;
2084 p->group_leader = p; 2107 p->group_leader = p;
2085 p->tgid = p->pid; 2108 p->tgid = p->pid;
2086 } 2109 }
@@ -2113,7 +2136,7 @@ static __latent_entropy struct task_struct *copy_process(
2113 */ 2136 */
2114 2137
2115 p->start_time = ktime_get_ns(); 2138 p->start_time = ktime_get_ns();
2116 p->real_start_time = ktime_get_boot_ns(); 2139 p->real_start_time = ktime_get_boottime_ns();
2117 2140
2118 /* 2141 /*
2119 * Make it visible to the rest of the system, but dont wake it up yet. 2142 * Make it visible to the rest of the system, but dont wake it up yet.
@@ -2154,6 +2177,9 @@ static __latent_entropy struct task_struct *copy_process(
2154 goto bad_fork_cancel_cgroup; 2177 goto bad_fork_cancel_cgroup;
2155 } 2178 }
2156 2179
2180 /* past the last point of failure */
2181 if (pidfile)
2182 fd_install(pidfd, pidfile);
2157 2183
2158 init_task_pid_links(p); 2184 init_task_pid_links(p);
2159 if (likely(p->pid)) { 2185 if (likely(p->pid)) {
@@ -2220,8 +2246,10 @@ bad_fork_cancel_cgroup:
2220bad_fork_cgroup_threadgroup_change_end: 2246bad_fork_cgroup_threadgroup_change_end:
2221 cgroup_threadgroup_change_end(current); 2247 cgroup_threadgroup_change_end(current);
2222bad_fork_put_pidfd: 2248bad_fork_put_pidfd:
2223 if (clone_flags & CLONE_PIDFD) 2249 if (clone_flags & CLONE_PIDFD) {
2224 ksys_close(pidfd); 2250 fput(pidfile);
2251 put_unused_fd(pidfd);
2252 }
2225bad_fork_free_pid: 2253bad_fork_free_pid:
2226 if (pid != &init_struct_pid) 2254 if (pid != &init_struct_pid)
2227 free_pid(pid); 2255 free_pid(pid);
@@ -2233,8 +2261,10 @@ bad_fork_cleanup_io:
2233bad_fork_cleanup_namespaces: 2261bad_fork_cleanup_namespaces:
2234 exit_task_namespaces(p); 2262 exit_task_namespaces(p);
2235bad_fork_cleanup_mm: 2263bad_fork_cleanup_mm:
2236 if (p->mm) 2264 if (p->mm) {
2265 mm_clear_owner(p->mm, p);
2237 mmput(p->mm); 2266 mmput(p->mm);
2267 }
2238bad_fork_cleanup_signal: 2268bad_fork_cleanup_signal:
2239 if (!(clone_flags & CLONE_THREAD)) 2269 if (!(clone_flags & CLONE_THREAD))
2240 free_signal_struct(p->signal); 2270 free_signal_struct(p->signal);
@@ -2265,7 +2295,7 @@ bad_fork_cleanup_count:
2265bad_fork_free: 2295bad_fork_free:
2266 p->state = TASK_DEAD; 2296 p->state = TASK_DEAD;
2267 put_task_stack(p); 2297 put_task_stack(p);
2268 free_task(p); 2298 delayed_free_task(p);
2269fork_out: 2299fork_out:
2270 spin_lock_irq(&current->sighand->siglock); 2300 spin_lock_irq(&current->sighand->siglock);
2271 hlist_del_init(&delayed.node); 2301 hlist_del_init(&delayed.node);
@@ -2286,8 +2316,11 @@ static inline void init_idle_pids(struct task_struct *idle)
2286struct task_struct *fork_idle(int cpu) 2316struct task_struct *fork_idle(int cpu)
2287{ 2317{
2288 struct task_struct *task; 2318 struct task_struct *task;
2289 task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, 2319 struct kernel_clone_args args = {
2290 cpu_to_node(cpu)); 2320 .flags = CLONE_VM,
2321 };
2322
2323 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2291 if (!IS_ERR(task)) { 2324 if (!IS_ERR(task)) {
2292 init_idle_pids(task); 2325 init_idle_pids(task);
2293 init_idle(task, cpu); 2326 init_idle(task, cpu);
@@ -2307,13 +2340,9 @@ struct mm_struct *copy_init_mm(void)
2307 * It copies the process, and if successful kick-starts 2340 * It copies the process, and if successful kick-starts
2308 * it and waits for it to finish using the VM if required. 2341 * it and waits for it to finish using the VM if required.
2309 */ 2342 */
2310long _do_fork(unsigned long clone_flags, 2343long _do_fork(struct kernel_clone_args *args)
2311 unsigned long stack_start,
2312 unsigned long stack_size,
2313 int __user *parent_tidptr,
2314 int __user *child_tidptr,
2315 unsigned long tls)
2316{ 2344{
2345 u64 clone_flags = args->flags;
2317 struct completion vfork; 2346 struct completion vfork;
2318 struct pid *pid; 2347 struct pid *pid;
2319 struct task_struct *p; 2348 struct task_struct *p;
@@ -2329,7 +2358,7 @@ long _do_fork(unsigned long clone_flags,
2329 if (!(clone_flags & CLONE_UNTRACED)) { 2358 if (!(clone_flags & CLONE_UNTRACED)) {
2330 if (clone_flags & CLONE_VFORK) 2359 if (clone_flags & CLONE_VFORK)
2331 trace = PTRACE_EVENT_VFORK; 2360 trace = PTRACE_EVENT_VFORK;
2332 else if ((clone_flags & CSIGNAL) != SIGCHLD) 2361 else if (args->exit_signal != SIGCHLD)
2333 trace = PTRACE_EVENT_CLONE; 2362 trace = PTRACE_EVENT_CLONE;
2334 else 2363 else
2335 trace = PTRACE_EVENT_FORK; 2364 trace = PTRACE_EVENT_FORK;
@@ -2338,8 +2367,7 @@ long _do_fork(unsigned long clone_flags,
2338 trace = 0; 2367 trace = 0;
2339 } 2368 }
2340 2369
2341 p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, 2370 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2342 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2343 add_latent_entropy(); 2371 add_latent_entropy();
2344 2372
2345 if (IS_ERR(p)) 2373 if (IS_ERR(p))
@@ -2355,7 +2383,7 @@ long _do_fork(unsigned long clone_flags,
2355 nr = pid_vnr(pid); 2383 nr = pid_vnr(pid);
2356 2384
2357 if (clone_flags & CLONE_PARENT_SETTID) 2385 if (clone_flags & CLONE_PARENT_SETTID)
2358 put_user(nr, parent_tidptr); 2386 put_user(nr, args->parent_tid);
2359 2387
2360 if (clone_flags & CLONE_VFORK) { 2388 if (clone_flags & CLONE_VFORK) {
2361 p->vfork_done = &vfork; 2389 p->vfork_done = &vfork;
@@ -2387,8 +2415,16 @@ long do_fork(unsigned long clone_flags,
2387 int __user *parent_tidptr, 2415 int __user *parent_tidptr,
2388 int __user *child_tidptr) 2416 int __user *child_tidptr)
2389{ 2417{
2390 return _do_fork(clone_flags, stack_start, stack_size, 2418 struct kernel_clone_args args = {
2391 parent_tidptr, child_tidptr, 0); 2419 .flags = (clone_flags & ~CSIGNAL),
2420 .child_tid = child_tidptr,
2421 .parent_tid = parent_tidptr,
2422 .exit_signal = (clone_flags & CSIGNAL),
2423 .stack = stack_start,
2424 .stack_size = stack_size,
2425 };
2426
2427 return _do_fork(&args);
2392} 2428}
2393#endif 2429#endif
2394 2430
@@ -2397,15 +2433,25 @@ long do_fork(unsigned long clone_flags,
2397 */ 2433 */
2398pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) 2434pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2399{ 2435{
2400 return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, 2436 struct kernel_clone_args args = {
2401 (unsigned long)arg, NULL, NULL, 0); 2437 .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2438 .exit_signal = (flags & CSIGNAL),
2439 .stack = (unsigned long)fn,
2440 .stack_size = (unsigned long)arg,
2441 };
2442
2443 return _do_fork(&args);
2402} 2444}
2403 2445
2404#ifdef __ARCH_WANT_SYS_FORK 2446#ifdef __ARCH_WANT_SYS_FORK
2405SYSCALL_DEFINE0(fork) 2447SYSCALL_DEFINE0(fork)
2406{ 2448{
2407#ifdef CONFIG_MMU 2449#ifdef CONFIG_MMU
2408 return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); 2450 struct kernel_clone_args args = {
2451 .exit_signal = SIGCHLD,
2452 };
2453
2454 return _do_fork(&args);
2409#else 2455#else
2410 /* can not support in nommu mode */ 2456 /* can not support in nommu mode */
2411 return -EINVAL; 2457 return -EINVAL;
@@ -2416,8 +2462,12 @@ SYSCALL_DEFINE0(fork)
2416#ifdef __ARCH_WANT_SYS_VFORK 2462#ifdef __ARCH_WANT_SYS_VFORK
2417SYSCALL_DEFINE0(vfork) 2463SYSCALL_DEFINE0(vfork)
2418{ 2464{
2419 return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 2465 struct kernel_clone_args args = {
2420 0, NULL, NULL, 0); 2466 .flags = CLONE_VFORK | CLONE_VM,
2467 .exit_signal = SIGCHLD,
2468 };
2469
2470 return _do_fork(&args);
2421} 2471}
2422#endif 2472#endif
2423 2473
@@ -2445,7 +2495,112 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2445 unsigned long, tls) 2495 unsigned long, tls)
2446#endif 2496#endif
2447{ 2497{
2448 return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); 2498 struct kernel_clone_args args = {
2499 .flags = (clone_flags & ~CSIGNAL),
2500 .pidfd = parent_tidptr,
2501 .child_tid = child_tidptr,
2502 .parent_tid = parent_tidptr,
2503 .exit_signal = (clone_flags & CSIGNAL),
2504 .stack = newsp,
2505 .tls = tls,
2506 };
2507
2508 /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
2509 if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
2510 return -EINVAL;
2511
2512 return _do_fork(&args);
2513}
2514#endif
2515
2516#ifdef __ARCH_WANT_SYS_CLONE3
2517noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2518 struct clone_args __user *uargs,
2519 size_t size)
2520{
2521 struct clone_args args;
2522
2523 if (unlikely(size > PAGE_SIZE))
2524 return -E2BIG;
2525
2526 if (unlikely(size < sizeof(struct clone_args)))
2527 return -EINVAL;
2528
2529 if (unlikely(!access_ok(uargs, size)))
2530 return -EFAULT;
2531
2532 if (size > sizeof(struct clone_args)) {
2533 unsigned char __user *addr;
2534 unsigned char __user *end;
2535 unsigned char val;
2536
2537 addr = (void __user *)uargs + sizeof(struct clone_args);
2538 end = (void __user *)uargs + size;
2539
2540 for (; addr < end; addr++) {
2541 if (get_user(val, addr))
2542 return -EFAULT;
2543 if (val)
2544 return -E2BIG;
2545 }
2546
2547 size = sizeof(struct clone_args);
2548 }
2549
2550 if (copy_from_user(&args, uargs, size))
2551 return -EFAULT;
2552
2553 *kargs = (struct kernel_clone_args){
2554 .flags = args.flags,
2555 .pidfd = u64_to_user_ptr(args.pidfd),
2556 .child_tid = u64_to_user_ptr(args.child_tid),
2557 .parent_tid = u64_to_user_ptr(args.parent_tid),
2558 .exit_signal = args.exit_signal,
2559 .stack = args.stack,
2560 .stack_size = args.stack_size,
2561 .tls = args.tls,
2562 };
2563
2564 return 0;
2565}
2566
2567static bool clone3_args_valid(const struct kernel_clone_args *kargs)
2568{
2569 /*
2570 * All lower bits of the flag word are taken.
2571 * Verify that no other unknown flags are passed along.
2572 */
2573 if (kargs->flags & ~CLONE_LEGACY_FLAGS)
2574 return false;
2575
2576 /*
2577 * - make the CLONE_DETACHED bit reuseable for clone3
2578 * - make the CSIGNAL bits reuseable for clone3
2579 */
2580 if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2581 return false;
2582
2583 if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2584 kargs->exit_signal)
2585 return false;
2586
2587 return true;
2588}
2589
2590SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2591{
2592 int err;
2593
2594 struct kernel_clone_args kargs;
2595
2596 err = copy_clone_args_from_user(&kargs, uargs, size);
2597 if (err)
2598 return err;
2599
2600 if (!clone3_args_valid(&kargs))
2601 return -EINVAL;
2602
2603 return _do_fork(&kargs);
2449} 2604}
2450#endif 2605#endif
2451 2606
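
For reference, a userspace sketch of the new syscall. struct clone_args is declared locally because contemporary libc headers do not ship it yet, and __NR_clone3 (435 on most architectures) is assumed to be available; the layout mirrors the 64-byte structure copied in by copy_clone_args_from_user() above.

	#define _GNU_SOURCE
	#include <signal.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#ifndef __NR_clone3
	#define __NR_clone3 435
	#endif

	struct clone_args {
		uint64_t flags;
		uint64_t pidfd;
		uint64_t child_tid;
		uint64_t parent_tid;
		uint64_t exit_signal;
		uint64_t stack;
		uint64_t stack_size;
		uint64_t tls;
	};

	int main(void)
	{
		struct clone_args args = {
			.exit_signal = SIGCHLD,	/* fork()-like child, no extra flags */
		};
		long pid = syscall(__NR_clone3, &args, sizeof(args));

		if (pid < 0) {
			perror("clone3");
			return 1;
		}
		if (pid == 0)		/* child */
			_exit(0);

		waitpid(pid, NULL, 0);
		printf("clone3 created pid %ld\n", pid);
		return 0;
	}
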
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b162b74611e4..c0738424bb43 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/freezer.c - Function to freeze a process 3 * kernel/freezer.c - Function to freeze a process
3 * 4 *
diff --git a/kernel/futex.c b/kernel/futex.c
index 2268b97d5439..6d50728ef2e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Fast Userspace Mutexes (which I call "Futexes!"). 3 * Fast Userspace Mutexes (which I call "Futexes!").
3 * (C) Rusty Russell, IBM 2002 4 * (C) Rusty Russell, IBM 2002
@@ -29,20 +30,6 @@
29 * 30 *
30 * "The futexes are also cursed." 31 * "The futexes are also cursed."
31 * "But they come in a choice of three flavours!" 32 * "But they come in a choice of three flavours!"
32 *
33 * This program is free software; you can redistribute it and/or modify
34 * it under the terms of the GNU General Public License as published by
35 * the Free Software Foundation; either version 2 of the License, or
36 * (at your option) any later version.
37 *
38 * This program is distributed in the hope that it will be useful,
39 * but WITHOUT ANY WARRANTY; without even the implied warranty of
40 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
41 * GNU General Public License for more details.
42 *
43 * You should have received a copy of the GNU General Public License
44 * along with this program; if not, write to the Free Software
45 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46 */ 33 */
47#include <linux/compat.h> 34#include <linux/compat.h>
48#include <linux/slab.h> 35#include <linux/slab.h>
@@ -484,6 +471,37 @@ enum futex_access {
484}; 471};
485 472
486/** 473/**
474 * futex_setup_timer - set up the sleeping hrtimer.
475 * @time: ptr to the given timeout value
476 * @timeout: the hrtimer_sleeper structure to be set up
477 * @flags: futex flags
478 * @range_ns: optional range in ns
479 *
480 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
481 * value given
482 */
483static inline struct hrtimer_sleeper *
484futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
485 int flags, u64 range_ns)
486{
487 if (!time)
488 return NULL;
489
490 hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ?
491 CLOCK_REALTIME : CLOCK_MONOTONIC,
492 HRTIMER_MODE_ABS);
493 hrtimer_init_sleeper(timeout, current);
494
495 /*
496 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
497 * effectively the same as calling hrtimer_set_expires().
498 */
499 hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
500
501 return timeout;
502}
503
504/**
487 * get_futex_key() - Get parameters which are the keys for a futex 505 * get_futex_key() - Get parameters which are the keys for a futex
488 * @uaddr: virtual address of the futex 506 * @uaddr: virtual address of the futex
489 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 507 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
@@ -2692,7 +2710,7 @@ out:
2692static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, 2710static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2693 ktime_t *abs_time, u32 bitset) 2711 ktime_t *abs_time, u32 bitset)
2694{ 2712{
2695 struct hrtimer_sleeper timeout, *to = NULL; 2713 struct hrtimer_sleeper timeout, *to;
2696 struct restart_block *restart; 2714 struct restart_block *restart;
2697 struct futex_hash_bucket *hb; 2715 struct futex_hash_bucket *hb;
2698 struct futex_q q = futex_q_init; 2716 struct futex_q q = futex_q_init;
@@ -2702,17 +2720,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2702 return -EINVAL; 2720 return -EINVAL;
2703 q.bitset = bitset; 2721 q.bitset = bitset;
2704 2722
2705 if (abs_time) { 2723 to = futex_setup_timer(abs_time, &timeout, flags,
2706 to = &timeout; 2724 current->timer_slack_ns);
2707
2708 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2709 CLOCK_REALTIME : CLOCK_MONOTONIC,
2710 HRTIMER_MODE_ABS);
2711 hrtimer_init_sleeper(to, current);
2712 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2713 current->timer_slack_ns);
2714 }
2715
2716retry: 2725retry:
2717 /* 2726 /*
2718 * Prepare to wait on uaddr. On success, holds hb lock and increments 2727 * Prepare to wait on uaddr. On success, holds hb lock and increments
@@ -2792,7 +2801,7 @@ static long futex_wait_restart(struct restart_block *restart)
2792static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, 2801static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2793 ktime_t *time, int trylock) 2802 ktime_t *time, int trylock)
2794{ 2803{
2795 struct hrtimer_sleeper timeout, *to = NULL; 2804 struct hrtimer_sleeper timeout, *to;
2796 struct futex_pi_state *pi_state = NULL; 2805 struct futex_pi_state *pi_state = NULL;
2797 struct rt_mutex_waiter rt_waiter; 2806 struct rt_mutex_waiter rt_waiter;
2798 struct futex_hash_bucket *hb; 2807 struct futex_hash_bucket *hb;
@@ -2805,13 +2814,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2805 if (refill_pi_state_cache()) 2814 if (refill_pi_state_cache())
2806 return -ENOMEM; 2815 return -ENOMEM;
2807 2816
2808 if (time) { 2817 to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
2809 to = &timeout;
2810 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2811 HRTIMER_MODE_ABS);
2812 hrtimer_init_sleeper(to, current);
2813 hrtimer_set_expires(&to->timer, *time);
2814 }
2815 2818
2816retry: 2819retry:
2817 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); 2820 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
@@ -3208,7 +3211,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3208 u32 val, ktime_t *abs_time, u32 bitset, 3211 u32 val, ktime_t *abs_time, u32 bitset,
3209 u32 __user *uaddr2) 3212 u32 __user *uaddr2)
3210{ 3213{
3211 struct hrtimer_sleeper timeout, *to = NULL; 3214 struct hrtimer_sleeper timeout, *to;
3212 struct futex_pi_state *pi_state = NULL; 3215 struct futex_pi_state *pi_state = NULL;
3213 struct rt_mutex_waiter rt_waiter; 3216 struct rt_mutex_waiter rt_waiter;
3214 struct futex_hash_bucket *hb; 3217 struct futex_hash_bucket *hb;
@@ -3225,15 +3228,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3225 if (!bitset) 3228 if (!bitset)
3226 return -EINVAL; 3229 return -EINVAL;
3227 3230
3228 if (abs_time) { 3231 to = futex_setup_timer(abs_time, &timeout, flags,
3229 to = &timeout; 3232 current->timer_slack_ns);
3230 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
3231 CLOCK_REALTIME : CLOCK_MONOTONIC,
3232 HRTIMER_MODE_ABS);
3233 hrtimer_init_sleeper(to, current);
3234 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
3235 current->timer_slack_ns);
3236 }
3237 3233
3238 /* 3234 /*
3239 * The waiter is allocated on our stack, manipulated by the requeue 3235 * The waiter is allocated on our stack, manipulated by the requeue
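
futex_setup_timer() only factors out the setup half; each call site still cancels and destroys the on-stack timer on its way out. A sketch of how a converted wait path pairs the two (paraphrasing the existing futex_wait() exit code; example_wait itself is illustrative):

	static int example_wait(ktime_t *abs_time, unsigned int flags)
	{
		struct hrtimer_sleeper timeout, *to;
		int ret = 0;

		to = futex_setup_timer(abs_time, &timeout, flags,
				       current->timer_slack_ns);

		/* ... queue the waiter, start the timer, schedule() ... */

		if (to) {
			hrtimer_cancel(&to->timer);
			destroy_hrtimer_on_stack(&to->timer);
		}
		return ret;
	}
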
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 1e3823fa799b..3941a9c48f83 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1menu "GCOV-based kernel profiling" 2menu "GCOV-based kernel profiling"
2 3
3config GCOV_KERNEL 4config GCOV_KERNEL
@@ -53,6 +54,7 @@ config GCOV_PROFILE_ALL
53choice 54choice
54 prompt "Specify GCOV format" 55 prompt "Specify GCOV format"
55 depends on GCOV_KERNEL 56 depends on GCOV_KERNEL
57 depends on CC_IS_GCC
56 ---help--- 58 ---help---
57 The gcov format is usually determined by the GCC version, and the 59 The gcov format is usually determined by the GCC version, and the
58 default is chosen according to your GCC version. However, there are 60 default is chosen according to your GCC version. However, there are
@@ -62,7 +64,7 @@ choice
62 64
63config GCOV_FORMAT_3_4 65config GCOV_FORMAT_3_4
64 bool "GCC 3.4 format" 66 bool "GCC 3.4 format"
65 depends on CC_IS_GCC && GCC_VERSION < 40700 67 depends on GCC_VERSION < 40700
66 ---help--- 68 ---help---
67 Select this option to use the format defined by GCC 3.4. 69 Select this option to use the format defined by GCC 3.4.
68 70
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index ff06d64df397..d66a74b0f100 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -2,5 +2,6 @@
2ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 2ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
3 3
4obj-y := base.o fs.o 4obj-y := base.o fs.o
5obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o 5obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
6obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o 6obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
7obj-$(CONFIG_CC_IS_CLANG) += clang.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9c7c8d5c18f2..0ffe9f194080 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -22,88 +22,8 @@
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include "gcov.h" 23#include "gcov.h"
24 24
25static int gcov_events_enabled; 25int gcov_events_enabled;
26static DEFINE_MUTEX(gcov_lock); 26DEFINE_MUTEX(gcov_lock);
27
28/*
29 * __gcov_init is called by gcc-generated constructor code for each object
30 * file compiled with -fprofile-arcs.
31 */
32void __gcov_init(struct gcov_info *info)
33{
34 static unsigned int gcov_version;
35
36 mutex_lock(&gcov_lock);
37 if (gcov_version == 0) {
38 gcov_version = gcov_info_version(info);
39 /*
40 * Printing gcc's version magic may prove useful for debugging
41 * incompatibility reports.
42 */
43 pr_info("version magic: 0x%x\n", gcov_version);
44 }
45 /*
46 * Add new profiling data structure to list and inform event
47 * listener.
48 */
49 gcov_info_link(info);
50 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock);
53}
54EXPORT_SYMBOL(__gcov_init);
55
56/*
57 * These functions may be referenced by gcc-generated profiling code but serve
58 * no function for kernel profiling.
59 */
60void __gcov_flush(void)
61{
62 /* Unused. */
63}
64EXPORT_SYMBOL(__gcov_flush);
65
66void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
67{
68 /* Unused. */
69}
70EXPORT_SYMBOL(__gcov_merge_add);
71
72void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
73{
74 /* Unused. */
75}
76EXPORT_SYMBOL(__gcov_merge_single);
77
78void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
79{
80 /* Unused. */
81}
82EXPORT_SYMBOL(__gcov_merge_delta);
83
84void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
85{
86 /* Unused. */
87}
88EXPORT_SYMBOL(__gcov_merge_ior);
89
90void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
91{
92 /* Unused. */
93}
94EXPORT_SYMBOL(__gcov_merge_time_profile);
95
96void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
97{
98 /* Unused. */
99}
100EXPORT_SYMBOL(__gcov_merge_icall_topn);
101
102void __gcov_exit(void)
103{
104 /* Unused. */
105}
106EXPORT_SYMBOL(__gcov_exit);
107 27
108/** 28/**
109 * gcov_enable_events - enable event reporting through gcov_event() 29 * gcov_enable_events - enable event reporting through gcov_event()
@@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
144 64
145 /* Remove entries located in module from linked list. */ 65 /* Remove entries located in module from linked list. */
146 while ((info = gcov_info_next(info))) { 66 while ((info = gcov_info_next(info))) {
147 if (within_module((unsigned long)info, mod)) { 67 if (gcov_info_within_module(info, mod)) {
148 gcov_info_unlink(prev, info); 68 gcov_info_unlink(prev, info);
149 if (gcov_events_enabled) 69 if (gcov_events_enabled)
150 gcov_event(GCOV_REMOVE, info); 70 gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
new file mode 100644
index 000000000000..c94b820a1b62
--- /dev/null
+++ b/kernel/gcov/clang.c
@@ -0,0 +1,581 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2019 Google, Inc.
4 * modified from kernel/gcov/gcc_4_7.c
5 *
6 * This software is licensed under the terms of the GNU General Public
7 * License version 2, as published by the Free Software Foundation, and
8 * may be copied, distributed, and modified under those terms.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 *
16 * LLVM uses profiling data that's deliberately similar to GCC, but has a
17 * very different way of exporting that data. LLVM calls llvm_gcov_init() once
18 * per module, and provides a couple of callbacks that we can use to ask for
19 * more data.
20 *
21 * We care about the "writeout" callback, which in turn calls back into
22 * compiler-rt/this module to dump all the gathered coverage data to disk:
23 *
24 * llvm_gcda_start_file()
25 * llvm_gcda_emit_function()
26 * llvm_gcda_emit_arcs()
27 * llvm_gcda_emit_function()
28 * llvm_gcda_emit_arcs()
29 * [... repeats for each function ...]
30 * llvm_gcda_summary_info()
31 * llvm_gcda_end_file()
32 *
33 * This design is much more stateless and unstructured than gcc's, and is
34 * intended to run at process exit. This forces us to keep some local state
35 * about which module we're dealing with at the moment. On the other hand, it
36 * also means we don't depend as much on how LLVM represents profiling data
37 * internally.
38 *
39 * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more
40 * details on how this works, particularly GCOVProfiler::emitProfileArcs(),
41 * GCOVProfiler::insertCounterWriteout(), and
42 * GCOVProfiler::insertFlush().
43 */
44
45#define pr_fmt(fmt) "gcov: " fmt
46
47#include <linux/kernel.h>
48#include <linux/list.h>
49#include <linux/printk.h>
50#include <linux/ratelimit.h>
51#include <linux/seq_file.h>
52#include <linux/slab.h>
53#include <linux/vmalloc.h>
54#include "gcov.h"
55
56typedef void (*llvm_gcov_callback)(void);
57
58struct gcov_info {
59 struct list_head head;
60
61 const char *filename;
62 unsigned int version;
63 u32 checksum;
64
65 struct list_head functions;
66};
67
68struct gcov_fn_info {
69 struct list_head head;
70
71 u32 ident;
72 u32 checksum;
73 u8 use_extra_checksum;
74 u32 cfg_checksum;
75
76 u32 num_counters;
77 u64 *counters;
78 const char *function_name;
79};
80
81static struct gcov_info *current_info;
82
83static LIST_HEAD(clang_gcov_list);
84
85void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
86{
87 struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
88
89 if (!info)
90 return;
91
92 INIT_LIST_HEAD(&info->head);
93 INIT_LIST_HEAD(&info->functions);
94
95 mutex_lock(&gcov_lock);
96
97 list_add_tail(&info->head, &clang_gcov_list);
98 current_info = info;
99 writeout();
100 current_info = NULL;
101 if (gcov_events_enabled)
102 gcov_event(GCOV_ADD, info);
103
104 mutex_unlock(&gcov_lock);
105}
106EXPORT_SYMBOL(llvm_gcov_init);
107
108void llvm_gcda_start_file(const char *orig_filename, const char version[4],
109 u32 checksum)
110{
111 current_info->filename = orig_filename;
112 memcpy(&current_info->version, version, sizeof(current_info->version));
113 current_info->checksum = checksum;
114}
115EXPORT_SYMBOL(llvm_gcda_start_file);
116
117void llvm_gcda_emit_function(u32 ident, const char *function_name,
118 u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
119{
120 struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
121
122 if (!info)
123 return;
124
125 INIT_LIST_HEAD(&info->head);
126 info->ident = ident;
127 info->checksum = func_checksum;
128 info->use_extra_checksum = use_extra_checksum;
129 info->cfg_checksum = cfg_checksum;
130 if (function_name)
131 info->function_name = kstrdup(function_name, GFP_KERNEL);
132
133 list_add_tail(&info->head, &current_info->functions);
134}
135EXPORT_SYMBOL(llvm_gcda_emit_function);
136
137void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
138{
139 struct gcov_fn_info *info = list_last_entry(&current_info->functions,
140 struct gcov_fn_info, head);
141
142 info->num_counters = num_counters;
143 info->counters = counters;
144}
145EXPORT_SYMBOL(llvm_gcda_emit_arcs);
146
147void llvm_gcda_summary_info(void)
148{
149}
150EXPORT_SYMBOL(llvm_gcda_summary_info);
151
152void llvm_gcda_end_file(void)
153{
154}
155EXPORT_SYMBOL(llvm_gcda_end_file);
156
157/**
158 * gcov_info_filename - return info filename
159 * @info: profiling data set
160 */
161const char *gcov_info_filename(struct gcov_info *info)
162{
163 return info->filename;
164}
165
166/**
167 * gcov_info_version - return info version
168 * @info: profiling data set
169 */
170unsigned int gcov_info_version(struct gcov_info *info)
171{
172 return info->version;
173}
174
175/**
176 * gcov_info_next - return next profiling data set
177 * @info: profiling data set
178 *
179 * Returns next gcov_info following @info or first gcov_info in the chain if
180 * @info is %NULL.
181 */
182struct gcov_info *gcov_info_next(struct gcov_info *info)
183{
184 if (!info)
185 return list_first_entry_or_null(&clang_gcov_list,
186 struct gcov_info, head);
187 if (list_is_last(&info->head, &clang_gcov_list))
188 return NULL;
189 return list_next_entry(info, head);
190}
191
192/**
193 * gcov_info_link - link/add profiling data set to the list
194 * @info: profiling data set
195 */
196void gcov_info_link(struct gcov_info *info)
197{
198 list_add_tail(&info->head, &clang_gcov_list);
199}
200
201/**
202 * gcov_info_unlink - unlink/remove profiling data set from the list
203 * @prev: previous profiling data set
204 * @info: profiling data set
205 */
206void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
207{
208 /* Generic code unlinks while iterating. */
209 __list_del_entry(&info->head);
210}
211
212/**
213 * gcov_info_within_module - check if a profiling data set belongs to a module
214 * @info: profiling data set
215 * @mod: module
216 *
 217 * Returns true if the profiling data belongs to a module, false otherwise.
218 */
219bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
220{
221 return within_module((unsigned long)info->filename, mod);
222}
223
224/* Symbolic links to be created for each profiling data file. */
225const struct gcov_link gcov_link[] = {
226 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
227 { 0, NULL},
228};
229
230/**
231 * gcov_info_reset - reset profiling data to zero
232 * @info: profiling data set
233 */
234void gcov_info_reset(struct gcov_info *info)
235{
236 struct gcov_fn_info *fn;
237
238 list_for_each_entry(fn, &info->functions, head)
239 memset(fn->counters, 0,
240 sizeof(fn->counters[0]) * fn->num_counters);
241}
242
243/**
244 * gcov_info_is_compatible - check if profiling data can be added
245 * @info1: first profiling data set
246 * @info2: second profiling data set
247 *
248 * Returns non-zero if profiling data can be added, zero otherwise.
249 */
250int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
251{
252 struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null(
253 &info1->functions, struct gcov_fn_info, head);
254 struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null(
255 &info2->functions, struct gcov_fn_info, head);
256
257 if (info1->checksum != info2->checksum)
258 return false;
259 if (!fn_ptr1)
260 return fn_ptr1 == fn_ptr2;
261 while (!list_is_last(&fn_ptr1->head, &info1->functions) &&
262 !list_is_last(&fn_ptr2->head, &info2->functions)) {
263 if (fn_ptr1->checksum != fn_ptr2->checksum)
264 return false;
265 if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
266 return false;
267 if (fn_ptr1->use_extra_checksum &&
268 fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
269 return false;
270 fn_ptr1 = list_next_entry(fn_ptr1, head);
271 fn_ptr2 = list_next_entry(fn_ptr2, head);
272 }
273 return list_is_last(&fn_ptr1->head, &info1->functions) &&
274 list_is_last(&fn_ptr2->head, &info2->functions);
275}
276
277/**
278 * gcov_info_add - add up profiling data
 279 * @dst: profiling data set to which data is added
 280 * @src: profiling data set which is added
 281 *
 282 * Adds profiling counts of @src to @dst.
283 */
284void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
285{
286 struct gcov_fn_info *dfn_ptr;
287 struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions,
288 struct gcov_fn_info, head);
289
290 list_for_each_entry(dfn_ptr, &dst->functions, head) {
291 u32 i;
292
293 for (i = 0; i < sfn_ptr->num_counters; i++)
294 dfn_ptr->counters[i] += sfn_ptr->counters[i];
295 }
296}
297
298static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
299{
300 size_t cv_size; /* counter values size */
301 struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
302 GFP_KERNEL);
303 if (!fn_dup)
304 return NULL;
305 INIT_LIST_HEAD(&fn_dup->head);
306
307 fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
308 if (!fn_dup->function_name)
309 goto err_name;
310
311 cv_size = fn->num_counters * sizeof(fn->counters[0]);
312 fn_dup->counters = vmalloc(cv_size);
313 if (!fn_dup->counters)
314 goto err_counters;
315 memcpy(fn_dup->counters, fn->counters, cv_size);
316
317 return fn_dup;
318
319err_counters:
320 kfree(fn_dup->function_name);
321err_name:
322 kfree(fn_dup);
323 return NULL;
324}
325
326/**
327 * gcov_info_dup - duplicate profiling data set
328 * @info: profiling data set to duplicate
329 *
330 * Return newly allocated duplicate on success, %NULL on error.
331 */
332struct gcov_info *gcov_info_dup(struct gcov_info *info)
333{
334 struct gcov_info *dup;
335 struct gcov_fn_info *fn;
336
337 dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
338 if (!dup)
339 return NULL;
340 INIT_LIST_HEAD(&dup->head);
341 INIT_LIST_HEAD(&dup->functions);
342 dup->filename = kstrdup(info->filename, GFP_KERNEL);
343 if (!dup->filename)
344 goto err;
345
346 list_for_each_entry(fn, &info->functions, head) {
347 struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn);
348
349 if (!fn_dup)
350 goto err;
351 list_add_tail(&fn_dup->head, &dup->functions);
352 }
353
354 return dup;
355
356err:
357 gcov_info_free(dup);
358 return NULL;
359}
360
361/**
362 * gcov_info_free - release memory for profiling data set duplicate
363 * @info: profiling data set duplicate to free
364 */
365void gcov_info_free(struct gcov_info *info)
366{
367 struct gcov_fn_info *fn, *tmp;
368
369 list_for_each_entry_safe(fn, tmp, &info->functions, head) {
370 kfree(fn->function_name);
371 vfree(fn->counters);
372 list_del(&fn->head);
373 kfree(fn);
374 }
375 kfree(info->filename);
376 kfree(info);
377}
378
379#define ITER_STRIDE PAGE_SIZE
380
381/**
382 * struct gcov_iterator - specifies current file position in logical records
383 * @info: associated profiling data
384 * @buffer: buffer containing file data
385 * @size: size of buffer
386 * @pos: current position in file
387 */
388struct gcov_iterator {
389 struct gcov_info *info;
390 void *buffer;
391 size_t size;
392 loff_t pos;
393};
394
395/**
396 * store_gcov_u32 - store 32 bit number in gcov format to buffer
397 * @buffer: target buffer or NULL
398 * @off: offset into the buffer
399 * @v: value to be stored
400 *
401 * Number format defined by gcc: numbers are recorded in the 32 bit
402 * unsigned binary form of the endianness of the machine generating the
403 * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
404 * store anything.
405 */
406static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
407{
408 u32 *data;
409
410 if (buffer) {
411 data = buffer + off;
412 *data = v;
413 }
414
415 return sizeof(*data);
416}
417
418/**
419 * store_gcov_u64 - store 64 bit number in gcov format to buffer
420 * @buffer: target buffer or NULL
421 * @off: offset into the buffer
422 * @v: value to be stored
423 *
424 * Number format defined by gcc: numbers are recorded in the 32 bit
425 * unsigned binary form of the endianness of the machine generating the
426 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
427 * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
428 * anything.
429 */
430static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
431{
432 u32 *data;
433
434 if (buffer) {
435 data = buffer + off;
436
437 data[0] = (v & 0xffffffffUL);
438 data[1] = (v >> 32);
439 }
440
441 return sizeof(*data) * 2;
442}
443
444/**
445 * convert_to_gcda - convert profiling data set to gcda file format
446 * @buffer: the buffer to store file data or %NULL if no data should be stored
447 * @info: profiling data set to be converted
448 *
449 * Returns the number of bytes that were/would have been stored into the buffer.
450 */
451static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
452{
453 struct gcov_fn_info *fi_ptr;
454 size_t pos = 0;
455
456 /* File header. */
457 pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
458 pos += store_gcov_u32(buffer, pos, info->version);
459 pos += store_gcov_u32(buffer, pos, info->checksum);
460
461 list_for_each_entry(fi_ptr, &info->functions, head) {
462 u32 i;
463 u32 len = 2;
464
465 if (fi_ptr->use_extra_checksum)
466 len++;
467
468 pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
469 pos += store_gcov_u32(buffer, pos, len);
470 pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
471 pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
472 if (fi_ptr->use_extra_checksum)
473 pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
474
475 pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
476 pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
477 for (i = 0; i < fi_ptr->num_counters; i++)
478 pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]);
479 }
480
481 return pos;
482}
483
484/**
485 * gcov_iter_new - allocate and initialize profiling data iterator
486 * @info: profiling data set to be iterated
487 *
488 * Return file iterator on success, %NULL otherwise.
489 */
490struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
491{
492 struct gcov_iterator *iter;
493
494 iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
495 if (!iter)
496 goto err_free;
497
498 iter->info = info;
499 /* Dry-run to get the actual buffer size. */
500 iter->size = convert_to_gcda(NULL, info);
501 iter->buffer = vmalloc(iter->size);
502 if (!iter->buffer)
503 goto err_free;
504
505 convert_to_gcda(iter->buffer, info);
506
507 return iter;
508
509err_free:
510 kfree(iter);
511 return NULL;
512}
513
514
515/**
 516 * gcov_iter_free - free the iterator and its backing buffer
517 * @iter: file iterator
518 */
519void gcov_iter_free(struct gcov_iterator *iter)
520{
521 vfree(iter->buffer);
522 kfree(iter);
523}
524
525/**
526 * gcov_iter_get_info - return profiling data set for given file iterator
527 * @iter: file iterator
528 */
529struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
530{
531 return iter->info;
532}
533
534/**
535 * gcov_iter_start - reset file iterator to starting position
536 * @iter: file iterator
537 */
538void gcov_iter_start(struct gcov_iterator *iter)
539{
540 iter->pos = 0;
541}
542
543/**
544 * gcov_iter_next - advance file iterator to next logical record
545 * @iter: file iterator
546 *
547 * Return zero if new position is valid, non-zero if iterator has reached end.
548 */
549int gcov_iter_next(struct gcov_iterator *iter)
550{
551 if (iter->pos < iter->size)
552 iter->pos += ITER_STRIDE;
553
554 if (iter->pos >= iter->size)
555 return -EINVAL;
556
557 return 0;
558}
559
560/**
561 * gcov_iter_write - write data for current pos to seq_file
562 * @iter: file iterator
563 * @seq: seq_file handle
564 *
565 * Return zero on success, non-zero otherwise.
566 */
567int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
568{
569 size_t len;
570
571 if (iter->pos >= iter->size)
572 return -EINVAL;
573
574 len = ITER_STRIDE;
575 if (iter->pos + len > iter->size)
576 len = iter->size - iter->pos;
577
578 seq_write(seq, iter->buffer + iter->pos, len);
579
580 return 0;
581}
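The writeout path above ends in convert_to_gcda(). A stand-alone user-space sketch of the same record layout follows: a small file header, then per function a function-announcement record (two or three words depending on use_extra_checksum) and a counter record holding the 64-bit arc counters, low word first. The magic and tag constants are placeholders here; the kernel takes the real GCOV_* values from gcov.h.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SK_DATA_MAGIC	0x67636461u	/* placeholder for GCOV_DATA_MAGIC */
#define SK_TAG_FUNCTION	0x01000000u	/* placeholder for GCOV_TAG_FUNCTION */
#define SK_TAG_COUNTERS	0x01a10000u	/* placeholder for GCOV_TAG_COUNTER_BASE */

struct fn_sketch {
	uint32_t ident;
	uint32_t checksum;
	uint32_t cfg_checksum;
	int use_extra_checksum;
	uint32_t num_counters;
	const uint64_t *counters;
};

/* Mirrors store_gcov_u32(): write nothing on a NULL buffer, report the size. */
static size_t store_u32(void *buffer, size_t off, uint32_t v)
{
	if (buffer)
		memcpy((char *)buffer + off, &v, sizeof(v));
	return sizeof(v);
}

/* Mirrors store_gcov_u64(): two 32-bit words, low part first. */
static size_t store_u64(void *buffer, size_t off, uint64_t v)
{
	size_t pos = 0;

	pos += store_u32(buffer, off + pos, (uint32_t)(v & 0xffffffffu));
	pos += store_u32(buffer, off + pos, (uint32_t)(v >> 32));
	return pos;
}

static size_t convert_sketch(char *buffer, uint32_t version, uint32_t checksum,
			     const struct fn_sketch *fn)
{
	uint32_t i, len = fn->use_extra_checksum ? 3 : 2;
	size_t pos = 0;

	/* File header: magic, version, stamp/checksum. */
	pos += store_u32(buffer, pos, SK_DATA_MAGIC);
	pos += store_u32(buffer, pos, version);
	pos += store_u32(buffer, pos, checksum);

	/* Function announcement record. */
	pos += store_u32(buffer, pos, SK_TAG_FUNCTION);
	pos += store_u32(buffer, pos, len);
	pos += store_u32(buffer, pos, fn->ident);
	pos += store_u32(buffer, pos, fn->checksum);
	if (fn->use_extra_checksum)
		pos += store_u32(buffer, pos, fn->cfg_checksum);

	/* Arc counters: record length is counted in 32-bit words, hence "* 2". */
	pos += store_u32(buffer, pos, SK_TAG_COUNTERS);
	pos += store_u32(buffer, pos, fn->num_counters * 2);
	for (i = 0; i < fn->num_counters; i++)
		pos += store_u64(buffer, pos, fn->counters[i]);

	return pos;
}

int main(void)
{
	const uint64_t counters[] = { 3, 0, 7 };
	const struct fn_sketch fn = {
		.ident = 1, .checksum = 0xabcd,
		.num_counters = 3, .counters = counters,
	};
	char buf[128];
	/* Dry run for the size, then the real conversion: the same two-pass
	 * scheme gcov_iter_new() uses. */
	size_t need = convert_sketch(NULL, 0x3430372au, 0x1234, &fn);

	if (need <= sizeof(buf))
		need = convert_sketch(buf, 0x3430372au, 0x1234, &fn);
	printf("gcda-like record: %zu bytes\n", need);
	return 0;
}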
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 6e40ff6be083..e5eb5ea7ea59 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -64,7 +64,6 @@ struct gcov_node {
64static const char objtree[] = OBJTREE; 64static const char objtree[] = OBJTREE;
65static const char srctree[] = SRCTREE; 65static const char srctree[] = SRCTREE;
66static struct gcov_node root_node; 66static struct gcov_node root_node;
67static struct dentry *reset_dentry;
68static LIST_HEAD(all_head); 67static LIST_HEAD(all_head);
69static DEFINE_MUTEX(node_lock); 68static DEFINE_MUTEX(node_lock);
70 69
@@ -387,8 +386,6 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
387 goto out_err; 386 goto out_err;
388 node->links[i] = debugfs_create_symlink(deskew(basename), 387 node->links[i] = debugfs_create_symlink(deskew(basename),
389 parent, target); 388 parent, target);
390 if (!node->links[i])
391 goto out_err;
392 kfree(target); 389 kfree(target);
393 } 390 }
394 391
@@ -450,11 +447,6 @@ static struct gcov_node *new_node(struct gcov_node *parent,
450 parent->dentry, node, &gcov_data_fops); 447 parent->dentry, node, &gcov_data_fops);
451 } else 448 } else
452 node->dentry = debugfs_create_dir(node->name, parent->dentry); 449 node->dentry = debugfs_create_dir(node->name, parent->dentry);
453 if (!node->dentry) {
454 pr_warn("could not create file\n");
455 kfree(node);
456 return NULL;
457 }
458 if (info) 450 if (info)
459 add_links(node, parent->dentry); 451 add_links(node, parent->dentry);
460 list_add(&node->list, &parent->children); 452 list_add(&node->list, &parent->children);
@@ -761,32 +753,20 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
761/* Create debugfs entries. */ 753/* Create debugfs entries. */
762static __init int gcov_fs_init(void) 754static __init int gcov_fs_init(void)
763{ 755{
764 int rc = -EIO;
765
766 init_node(&root_node, NULL, NULL, NULL); 756 init_node(&root_node, NULL, NULL, NULL);
767 /* 757 /*
768 * /sys/kernel/debug/gcov will be parent for the reset control file 758 * /sys/kernel/debug/gcov will be parent for the reset control file
769 * and all profiling files. 759 * and all profiling files.
770 */ 760 */
771 root_node.dentry = debugfs_create_dir("gcov", NULL); 761 root_node.dentry = debugfs_create_dir("gcov", NULL);
772 if (!root_node.dentry)
773 goto err_remove;
774 /* 762 /*
775 * Create reset file which resets all profiling counts when written 763 * Create reset file which resets all profiling counts when written
776 * to. 764 * to.
777 */ 765 */
778 reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry, 766 debugfs_create_file("reset", 0600, root_node.dentry, NULL,
779 NULL, &gcov_reset_fops); 767 &gcov_reset_fops);
780 if (!reset_dentry)
781 goto err_remove;
782 /* Replay previous events to get our fs hierarchy up-to-date. */ 768 /* Replay previous events to get our fs hierarchy up-to-date. */
783 gcov_enable_events(); 769 gcov_enable_events();
784 return 0; 770 return 0;
785
786err_remove:
787 pr_err("init failed\n");
788 debugfs_remove(root_node.dentry);
789
790 return rc;
791} 771}
792device_initcall(gcov_fs_init); 772device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 2dddecbdbe6e..801ee4b0b969 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
137 gcov_info_head = info->next; 137 gcov_info_head = info->next;
138} 138}
139 139
140/**
141 * gcov_info_within_module - check if a profiling data set belongs to a module
142 * @info: profiling data set
143 * @mod: module
144 *
 145 * Returns true if the profiling data belongs to a module, false otherwise.
146 */
147bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
148{
149 return within_module((unsigned long)info, mod);
150}
151
140/* Symbolic links to be created for each profiling data file. */ 152/* Symbolic links to be created for each profiling data file. */
141const struct gcov_link gcov_link[] = { 153const struct gcov_link gcov_link[] = {
142 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ 154 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index ca5e5c0ef853..ec37563674d6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
150 gcov_info_head = info->next; 150 gcov_info_head = info->next;
151} 151}
152 152
153/**
154 * gcov_info_within_module - check if a profiling data set belongs to a module
155 * @info: profiling data set
156 * @mod: module
157 *
 158 * Returns true if the profiling data belongs to a module, false otherwise.
159 */
160bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
161{
162 return within_module((unsigned long)info, mod);
163}
164
153/* Symbolic links to be created for each profiling data file. */ 165/* Symbolic links to be created for each profiling data file. */
154const struct gcov_link gcov_link[] = { 166const struct gcov_link gcov_link[] = {
155 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ 167 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c
new file mode 100644
index 000000000000..3cf736b9f880
--- /dev/null
+++ b/kernel/gcov/gcc_base.c
@@ -0,0 +1,86 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/export.h>
4#include <linux/kernel.h>
5#include <linux/mutex.h>
6#include "gcov.h"
7
8/*
9 * __gcov_init is called by gcc-generated constructor code for each object
10 * file compiled with -fprofile-arcs.
11 */
12void __gcov_init(struct gcov_info *info)
13{
14 static unsigned int gcov_version;
15
16 mutex_lock(&gcov_lock);
17 if (gcov_version == 0) {
18 gcov_version = gcov_info_version(info);
19 /*
20 * Printing gcc's version magic may prove useful for debugging
21 * incompatibility reports.
22 */
23 pr_info("version magic: 0x%x\n", gcov_version);
24 }
25 /*
26 * Add new profiling data structure to list and inform event
27 * listener.
28 */
29 gcov_info_link(info);
30 if (gcov_events_enabled)
31 gcov_event(GCOV_ADD, info);
32 mutex_unlock(&gcov_lock);
33}
34EXPORT_SYMBOL(__gcov_init);
35
36/*
37 * These functions may be referenced by gcc-generated profiling code but serve
38 * no function for kernel profiling.
39 */
40void __gcov_flush(void)
41{
42 /* Unused. */
43}
44EXPORT_SYMBOL(__gcov_flush);
45
46void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
47{
48 /* Unused. */
49}
50EXPORT_SYMBOL(__gcov_merge_add);
51
52void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
53{
54 /* Unused. */
55}
56EXPORT_SYMBOL(__gcov_merge_single);
57
58void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
59{
60 /* Unused. */
61}
62EXPORT_SYMBOL(__gcov_merge_delta);
63
64void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
65{
66 /* Unused. */
67}
68EXPORT_SYMBOL(__gcov_merge_ior);
69
70void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
71{
72 /* Unused. */
73}
74EXPORT_SYMBOL(__gcov_merge_time_profile);
75
76void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
77{
78 /* Unused. */
79}
80EXPORT_SYMBOL(__gcov_merge_icall_topn);
81
82void __gcov_exit(void)
83{
84 /* Unused. */
85}
86EXPORT_SYMBOL(__gcov_exit);
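The __gcov_init() path above is driven by constructors that gcc emits for every object built with -fprofile-arcs. A user-space sketch of that registration flow follows; the gcov_info layout is a dummy, only the constructor-plus-locked-list pattern is the point.

#include <pthread.h>
#include <stdio.h>

struct info_sketch {
	unsigned int version;
	const char *filename;
	struct info_sketch *next;
};

static struct info_sketch *info_head;
static pthread_mutex_t info_lock = PTHREAD_MUTEX_INITIALIZER;

static void register_info(struct info_sketch *info)
{
	static unsigned int seen_version;

	pthread_mutex_lock(&info_lock);
	if (seen_version == 0) {
		seen_version = info->version;
		printf("version magic: 0x%x\n", seen_version);
	}
	info->next = info_head;		/* link new profiling data into the list */
	info_head = info;
	pthread_mutex_unlock(&info_lock);
}

static struct info_sketch a_o = { .version = 0xB02, .filename = "a.gcda" };

__attribute__((constructor))
static void a_o_ctor(void)		/* what -fprofile-arcs emits per object */
{
	register_info(&a_o);
}

int main(void)
{
	const struct info_sketch *info;

	for (info = info_head; info; info = info->next)
		printf("registered: %s\n", info->filename);
	return 0;
}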
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index de118ad4a024..6ab2c1808c9d 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -15,6 +15,7 @@
15#ifndef GCOV_H 15#ifndef GCOV_H
16#define GCOV_H GCOV_H 16#define GCOV_H GCOV_H
17 17
18#include <linux/module.h>
18#include <linux/types.h> 19#include <linux/types.h>
19 20
20/* 21/*
@@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info);
46struct gcov_info *gcov_info_next(struct gcov_info *info); 47struct gcov_info *gcov_info_next(struct gcov_info *info);
47void gcov_info_link(struct gcov_info *info); 48void gcov_info_link(struct gcov_info *info);
48void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); 49void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
50bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
49 51
50/* Base interface. */ 52/* Base interface. */
51enum gcov_action { 53enum gcov_action {
@@ -83,4 +85,7 @@ struct gcov_link {
83}; 85};
84extern const struct gcov_link gcov_link[]; 86extern const struct gcov_link gcov_link[];
85 87
88extern int gcov_events_enabled;
89extern struct mutex gcov_lock;
90
86#endif /* GCOV_H */ 91#endif /* GCOV_H */
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_kheaders.sh
index 591a94f7b387..9ff449888d9c 100755
--- a/kernel/gen_ikh_data.sh
+++ b/kernel/gen_kheaders.sh
@@ -2,26 +2,14 @@
2# SPDX-License-Identifier: GPL-2.0 2# SPDX-License-Identifier: GPL-2.0
3 3
4# This script generates an archive consisting of kernel headers 4# This script generates an archive consisting of kernel headers
5# for CONFIG_IKHEADERS_PROC. 5# for CONFIG_IKHEADERS.
6set -e 6set -e
7spath="$(dirname "$(readlink -f "$0")")" 7sfile="$(readlink -f "$0")"
8kroot="$spath/.."
9outdir="$(pwd)" 8outdir="$(pwd)"
10tarfile=$1 9tarfile=$1
11cpio_dir=$outdir/$tarfile.tmp 10cpio_dir=$outdir/$tarfile.tmp
12 11
13# Script filename relative to the kernel source root 12dir_list="
14# We add it to the archive because it is small and any changes
15# to this script will also cause a rebuild of the archive.
16sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")"
17
18src_file_list="
19include/
20arch/$SRCARCH/include/
21$sfile
22"
23
24obj_file_list="
25include/ 13include/
26arch/$SRCARCH/include/ 14arch/$SRCARCH/include/
27" 15"
@@ -31,28 +19,31 @@ arch/$SRCARCH/include/
31 19
32# This block is useful for debugging the incremental builds. 20# This block is useful for debugging the incremental builds.
33# Uncomment it for debugging. 21# Uncomment it for debugging.
34# iter=1 22# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter;
35# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; 23# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi
36# else; iter=$(($(cat /tmp/iter) + 1)); fi 24# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter
37# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter 25# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter
38# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter
39 26
40# include/generated/compile.h is ignored because it is touched even when none 27# include/generated/compile.h is ignored because it is touched even when none
41# of the source files changed. This causes pointless regeneration, so let us 28# of the source files changed. This causes pointless regeneration, so let us
42# ignore them for md5 calculation. 29# ignore them for md5 calculation.
43pushd $kroot > /dev/null 30pushd $srctree > /dev/null
44src_files_md5="$(find $src_file_list -type f | 31src_files_md5="$(find $dir_list -name "*.h" |
45 grep -v "include/generated/compile.h" | 32 grep -v "include/generated/compile.h" |
46 xargs ls -lR | md5sum | cut -d ' ' -f1)" 33 grep -v "include/generated/autoconf.h" |
34 xargs ls -l | md5sum | cut -d ' ' -f1)"
47popd > /dev/null 35popd > /dev/null
48obj_files_md5="$(find $obj_file_list -type f | 36obj_files_md5="$(find $dir_list -name "*.h" |
49 grep -v "include/generated/compile.h" | 37 grep -v "include/generated/compile.h" |
50 xargs ls -lR | md5sum | cut -d ' ' -f1)" 38 grep -v "include/generated/autoconf.h" |
51 39 xargs ls -l | md5sum | cut -d ' ' -f1)"
40# Any changes to this script will also cause a rebuild of the archive.
41this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)"
52if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi 42if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
53if [ -f kernel/kheaders.md5 ] && 43if [ -f kernel/kheaders.md5 ] &&
54 [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && 44 [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
55 [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && 45 [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
46 [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] &&
56 [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then 47 [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
57 exit 48 exit
58fi 49fi
@@ -64,16 +55,16 @@ fi
64rm -rf $cpio_dir 55rm -rf $cpio_dir
65mkdir $cpio_dir 56mkdir $cpio_dir
66 57
67pushd $kroot > /dev/null 58pushd $srctree > /dev/null
68for f in $src_file_list; 59for f in $dir_list;
69 do find "$f" ! -name "*.cmd" ! -name ".*"; 60 do find "$f" -name "*.h";
70done | cpio --quiet -pd $cpio_dir 61done | cpio --quiet -pd $cpio_dir
71popd > /dev/null 62popd > /dev/null
72 63
73# The second CPIO can complain if files already exist which can 64# The second CPIO can complain if files already exist which can
74# happen with out of tree builds. Just silence CPIO for now. 65# happen with out of tree builds. Just silence CPIO for now.
75for f in $obj_file_list; 66for f in $dir_list;
76 do find "$f" ! -name "*.cmd" ! -name ".*"; 67 do find "$f" -name "*.h";
77done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 68done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
78 69
79# Remove comments except SPDX lines 70# Remove comments except SPDX lines
@@ -82,8 +73,9 @@ find $cpio_dir -type f -print0 |
82 73
83tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null 74tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
84 75
85echo "$src_files_md5" > kernel/kheaders.md5 76echo "$src_files_md5" > kernel/kheaders.md5
86echo "$obj_files_md5" >> kernel/kheaders.md5 77echo "$obj_files_md5" >> kernel/kheaders.md5
78echo "$this_file_md5" >> kernel/kheaders.md5
87echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 79echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
88 80
89rm -rf $cpio_dir 81rm -rf $cpio_dir
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f108a95882c6..14a625c16cb3 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Detect Hung Task 3 * Detect Hung Task
3 * 4 *
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 93c264444510..62c92e43aa0d 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap);
121 121
122void memunmap(void *addr) 122void memunmap(void *addr)
123{ 123{
124 if (is_vmalloc_addr(addr)) 124 if (is_ioremap_addr(addr))
125 iounmap((void __iomem *) addr); 125 iounmap((void __iomem *) addr);
126} 126}
127EXPORT_SYMBOL(memunmap); 127EXPORT_SYMBOL(memunmap);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5f3e2baefca9..f92d9a687372 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1menu "IRQ subsystem" 2menu "IRQ subsystem"
2# Options selectable by the architecture code 3# Options selectable by the architecture code
3 4
@@ -91,6 +92,9 @@ config GENERIC_MSI_IRQ_DOMAIN
91 select IRQ_DOMAIN_HIERARCHY 92 select IRQ_DOMAIN_HIERARCHY
92 select GENERIC_MSI_IRQ 93 select GENERIC_MSI_IRQ
93 94
95config IRQ_MSI_IOMMU
96 bool
97
94config HANDLE_DOMAIN_IRQ 98config HANDLE_DOMAIN_IRQ
95 bool 99 bool
96 100
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index ff6e352e3a6c..b4f53717d143 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,6 +2,9 @@
2 2
3obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 3obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
4obj-$(CONFIG_IRQ_TIMINGS) += timings.o 4obj-$(CONFIG_IRQ_TIMINGS) += timings.o
5ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y)
6 CFLAGS_timings.o += -DDEBUG
7endif
5obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o 8obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
6obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 9obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
7obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o 10obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f18cd5aa33e8..4352b08ae48d 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,8 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
94 return nodes; 94 return nodes;
95} 95}
96 96
97static int __irq_build_affinity_masks(const struct irq_affinity *affd, 97static int __irq_build_affinity_masks(unsigned int startvec,
98 unsigned int startvec,
99 unsigned int numvecs, 98 unsigned int numvecs,
100 unsigned int firstvec, 99 unsigned int firstvec,
101 cpumask_var_t *node_to_cpumask, 100 cpumask_var_t *node_to_cpumask,
@@ -171,8 +170,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
171 * 1) spread present CPU on these vectors 170 * 1) spread present CPU on these vectors
172 * 2) spread other possible CPUs on these vectors 171 * 2) spread other possible CPUs on these vectors
173 */ 172 */
174static int irq_build_affinity_masks(const struct irq_affinity *affd, 173static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
175 unsigned int startvec, unsigned int numvecs,
176 unsigned int firstvec, 174 unsigned int firstvec,
177 struct irq_affinity_desc *masks) 175 struct irq_affinity_desc *masks)
178{ 176{
@@ -197,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
197 build_node_to_cpumask(node_to_cpumask); 195 build_node_to_cpumask(node_to_cpumask);
198 196
199 /* Spread on present CPUs starting from affd->pre_vectors */ 197 /* Spread on present CPUs starting from affd->pre_vectors */
200 nr_present = __irq_build_affinity_masks(affd, curvec, numvecs, 198 nr_present = __irq_build_affinity_masks(curvec, numvecs,
201 firstvec, node_to_cpumask, 199 firstvec, node_to_cpumask,
202 cpu_present_mask, nmsk, masks); 200 cpu_present_mask, nmsk, masks);
203 201
@@ -212,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
212 else 210 else
213 curvec = firstvec + nr_present; 211 curvec = firstvec + nr_present;
214 cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); 212 cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
215 nr_others = __irq_build_affinity_masks(affd, curvec, numvecs, 213 nr_others = __irq_build_affinity_masks(curvec, numvecs,
216 firstvec, node_to_cpumask, 214 firstvec, node_to_cpumask,
217 npresmsk, nmsk, masks); 215 npresmsk, nmsk, masks);
218 put_online_cpus(); 216 put_online_cpus();
@@ -295,7 +293,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
295 unsigned int this_vecs = affd->set_size[i]; 293 unsigned int this_vecs = affd->set_size[i];
296 int ret; 294 int ret;
297 295
298 ret = irq_build_affinity_masks(affd, curvec, this_vecs, 296 ret = irq_build_affinity_masks(curvec, this_vecs,
299 curvec, masks); 297 curvec, masks);
300 if (ret) { 298 if (ret) {
301 kfree(masks); 299 kfree(masks);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 16cbf6beb276..ae60cae24e9a 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -90,7 +90,7 @@ unsigned long probe_irq_on(void)
90 /* It triggered already - consider it spurious. */ 90 /* It triggered already - consider it spurious. */
91 if (!(desc->istate & IRQS_WAITING)) { 91 if (!(desc->istate & IRQS_WAITING)) {
92 desc->istate &= ~IRQS_AUTODETECT; 92 desc->istate &= ~IRQS_AUTODETECT;
93 irq_shutdown(desc); 93 irq_shutdown_and_deactivate(desc);
94 } else 94 } else
95 if (i < 32) 95 if (i < 32)
96 mask |= 1 << i; 96 mask |= 1 << i;
@@ -127,7 +127,7 @@ unsigned int probe_irq_mask(unsigned long val)
127 mask |= 1 << i; 127 mask |= 1 << i;
128 128
129 desc->istate &= ~IRQS_AUTODETECT; 129 desc->istate &= ~IRQS_AUTODETECT;
130 irq_shutdown(desc); 130 irq_shutdown_and_deactivate(desc);
131 } 131 }
132 raw_spin_unlock_irq(&desc->lock); 132 raw_spin_unlock_irq(&desc->lock);
133 } 133 }
@@ -169,7 +169,7 @@ int probe_irq_off(unsigned long val)
169 nr_of_irqs++; 169 nr_of_irqs++;
170 } 170 }
171 desc->istate &= ~IRQS_AUTODETECT; 171 desc->istate &= ~IRQS_AUTODETECT;
172 irq_shutdown(desc); 172 irq_shutdown_and_deactivate(desc);
173 } 173 }
174 raw_spin_unlock_irq(&desc->lock); 174 raw_spin_unlock_irq(&desc->lock);
175 } 175 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 51128bea3846..b76703b2c0af 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -314,6 +314,12 @@ void irq_shutdown(struct irq_desc *desc)
314 } 314 }
315 irq_state_clr_started(desc); 315 irq_state_clr_started(desc);
316 } 316 }
317}
318
319
320void irq_shutdown_and_deactivate(struct irq_desc *desc)
321{
322 irq_shutdown(desc);
317 /* 323 /*
318 * This must be called even if the interrupt was never started up, 324 * This must be called even if the interrupt was never started up,
319 * because the activation can happen before the interrupt is 325 * because the activation can happen before the interrupt is
@@ -748,6 +754,8 @@ void handle_fasteoi_nmi(struct irq_desc *desc)
748 unsigned int irq = irq_desc_get_irq(desc); 754 unsigned int irq = irq_desc_get_irq(desc);
749 irqreturn_t res; 755 irqreturn_t res;
750 756
757 __kstat_incr_irqs_this_cpu(desc);
758
751 trace_irq_handler_entry(irq, action); 759 trace_irq_handler_entry(irq, action);
752 /* 760 /*
753 * NMIs cannot be shared, there is only one action. 761 * NMIs cannot be shared, there is only one action.
@@ -962,6 +970,8 @@ void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
962 unsigned int irq = irq_desc_get_irq(desc); 970 unsigned int irq = irq_desc_get_irq(desc);
963 irqreturn_t res; 971 irqreturn_t res;
964 972
973 __kstat_incr_irqs_this_cpu(desc);
974
965 trace_irq_handler_entry(irq, action); 975 trace_irq_handler_entry(irq, action);
966 res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); 976 res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
967 trace_irq_handler_exit(irq, action, res); 977 trace_irq_handler_exit(irq, action, res);
@@ -1459,6 +1469,33 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
1459 return -ENOSYS; 1469 return -ENOSYS;
1460} 1470}
1461EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); 1471EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent);
1472
1473/**
1474 * irq_chip_request_resources_parent - Request resources on the parent interrupt
1475 * @data: Pointer to interrupt specific data
1476 */
1477int irq_chip_request_resources_parent(struct irq_data *data)
1478{
1479 data = data->parent_data;
1480
1481 if (data->chip->irq_request_resources)
1482 return data->chip->irq_request_resources(data);
1483
1484 return -ENOSYS;
1485}
1486EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent);
1487
1488/**
1489 * irq_chip_release_resources_parent - Release resources on the parent interrupt
1490 * @data: Pointer to interrupt specific data
1491 */
1492void irq_chip_release_resources_parent(struct irq_data *data)
1493{
1494 data = data->parent_data;
1495 if (data->chip->irq_release_resources)
1496 data->chip->irq_release_resources(data);
1497}
1498EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
1462#endif 1499#endif
1463 1500
1464/** 1501/**
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 5b1072e394b2..6c7ca2e983a5 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -116,7 +116,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
116 */ 116 */
117 if (irqd_affinity_is_managed(d)) { 117 if (irqd_affinity_is_managed(d)) {
118 irqd_set_managed_shutdown(d); 118 irqd_set_managed_shutdown(d);
119 irq_shutdown(desc); 119 irq_shutdown_and_deactivate(desc);
120 return false; 120 return false;
121 } 121 }
122 affinity = cpu_online_mask; 122 affinity = cpu_online_mask;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 70c3053bc1f6..3924fbe829d4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -82,6 +82,7 @@ extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
82extern int irq_startup(struct irq_desc *desc, bool resend, bool force); 82extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
83 83
84extern void irq_shutdown(struct irq_desc *desc); 84extern void irq_shutdown(struct irq_desc *desc);
85extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
85extern void irq_enable(struct irq_desc *desc); 86extern void irq_enable(struct irq_desc *desc);
86extern void irq_disable(struct irq_desc *desc); 87extern void irq_disable(struct irq_desc *desc);
87extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); 88extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
@@ -96,6 +97,10 @@ static inline void irq_mark_irq(unsigned int irq) { }
96extern void irq_mark_irq(unsigned int irq); 97extern void irq_mark_irq(unsigned int irq);
97#endif 98#endif
98 99
100extern int __irq_get_irqchip_state(struct irq_data *data,
101 enum irqchip_irq_state which,
102 bool *state);
103
99extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 104extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
100 105
101irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); 106irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
@@ -354,6 +359,16 @@ static inline int irq_timing_decode(u64 value, u64 *timestamp)
354 return value & U16_MAX; 359 return value & U16_MAX;
355} 360}
356 361
362static __always_inline void irq_timings_push(u64 ts, int irq)
363{
364 struct irq_timings *timings = this_cpu_ptr(&irq_timings);
365
366 timings->values[timings->count & IRQ_TIMINGS_MASK] =
367 irq_timing_encode(ts, irq);
368
369 timings->count++;
370}
371
357/* 372/*
358 * The function record_irq_time is only called in one place in the 373 * The function record_irq_time is only called in one place in the
359 * interrupts handler. We want this function always inline so the code 374 * interrupts handler. We want this function always inline so the code
@@ -367,15 +382,8 @@ static __always_inline void record_irq_time(struct irq_desc *desc)
367 if (!static_branch_likely(&irq_timing_enabled)) 382 if (!static_branch_likely(&irq_timing_enabled))
368 return; 383 return;
369 384
370 if (desc->istate & IRQS_TIMINGS) { 385 if (desc->istate & IRQS_TIMINGS)
371 struct irq_timings *timings = this_cpu_ptr(&irq_timings); 386 irq_timings_push(local_clock(), irq_desc_get_irq(desc));
372
373 timings->values[timings->count & IRQ_TIMINGS_MASK] =
374 irq_timing_encode(local_clock(),
375 irq_desc_get_irq(desc));
376
377 timings->count++;
378 }
379} 387}
380#else 388#else
381static inline void irq_remove_timings(struct irq_desc *desc) {} 389static inline void irq_remove_timings(struct irq_desc *desc) {}
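irq_timings_push() above fills a per-CPU ring with one packed word per interrupt. A user-space sketch of that encode/push pattern follows, assuming the "timestamp in the high bits, irq number in the low 16 bits" packing suggested by irq_timing_decode(); the exact kernel encoding lives next to it in this header.

#include <stdint.h>
#include <stdio.h>

#define TIMINGS_SIZE	32			/* must stay a power of two */
#define TIMINGS_MASK	(TIMINGS_SIZE - 1)

struct timings_sketch {
	uint64_t values[TIMINGS_SIZE];
	unsigned int count;			/* free running */
};

static uint64_t timing_encode(uint64_t ts, unsigned int irq)
{
	return (ts << 16) | (irq & 0xffff);	/* irq kept in the low 16 bits */
}

static unsigned int timing_decode(uint64_t value, uint64_t *ts)
{
	*ts = value >> 16;
	return value & 0xffff;
}

static void timings_push(struct timings_sketch *t, uint64_t ts, unsigned int irq)
{
	t->values[t->count & TIMINGS_MASK] = timing_encode(ts, irq);
	t->count++;				/* index derives from count, so no explicit wrap */
}

int main(void)
{
	struct timings_sketch t = { .count = 0 };
	uint64_t ts;
	unsigned int irq;

	timings_push(&t, 123456789, 42);
	irq = timing_decode(t.values[0], &ts);
	printf("irq %u at %llu ns\n", irq, (unsigned long long)ts);
	return 0;
}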
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index c52b737ab8e3..9484e88dabc2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -680,6 +680,8 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
680 * @hwirq: The HW irq number to convert to a logical one 680 * @hwirq: The HW irq number to convert to a logical one
681 * @regs: Register file coming from the low-level handling code 681 * @regs: Register file coming from the low-level handling code
682 * 682 *
683 * This function must be called from an NMI context.
684 *
683 * Returns: 0 on success, or -EINVAL if conversion has failed 685 * Returns: 0 on success, or -EINVAL if conversion has failed
684 */ 686 */
685int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, 687int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
@@ -689,7 +691,10 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
689 unsigned int irq; 691 unsigned int irq;
690 int ret = 0; 692 int ret = 0;
691 693
692 nmi_enter(); 694 /*
695 * NMI context needs to be setup earlier in order to deal with tracing.
696 */
697 WARN_ON(!in_nmi());
693 698
694 irq = irq_find_mapping(domain, hwirq); 699 irq = irq_find_mapping(domain, hwirq);
695 700
@@ -702,7 +707,6 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
702 else 707 else
703 ret = -EINVAL; 708 ret = -EINVAL;
704 709
705 nmi_exit();
706 set_irq_regs(old_regs); 710 set_irq_regs(old_regs);
707 return ret; 711 return ret;
708} 712}
@@ -946,6 +950,11 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
946 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 950 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
947} 951}
948 952
953static bool irq_is_nmi(struct irq_desc *desc)
954{
955 return desc->istate & IRQS_NMI;
956}
957
949/** 958/**
950 * kstat_irqs - Get the statistics for an interrupt 959 * kstat_irqs - Get the statistics for an interrupt
951 * @irq: The interrupt number 960 * @irq: The interrupt number
@@ -963,7 +972,8 @@ unsigned int kstat_irqs(unsigned int irq)
963 if (!desc || !desc->kstat_irqs) 972 if (!desc || !desc->kstat_irqs)
964 return 0; 973 return 0;
965 if (!irq_settings_is_per_cpu_devid(desc) && 974 if (!irq_settings_is_per_cpu_devid(desc) &&
966 !irq_settings_is_per_cpu(desc)) 975 !irq_settings_is_per_cpu(desc) &&
976 !irq_is_nmi(desc))
967 return desc->tot_count; 977 return desc->tot_count;
968 978
969 for_each_possible_cpu(cpu) 979 for_each_possible_cpu(cpu)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 9ed29e4a7dbf..3078d0e48bba 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
123 * @ops: domain callbacks 123 * @ops: domain callbacks
124 * @host_data: Controller private data pointer 124 * @host_data: Controller private data pointer
125 * 125 *
126 * Allocates and initialize and irq_domain structure. 126 * Allocates and initializes an irq_domain structure.
127 * Returns pointer to IRQ domain, or NULL on failure. 127 * Returns pointer to IRQ domain, or NULL on failure.
128 */ 128 */
129struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, 129struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
@@ -139,7 +139,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
139 139
140 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), 140 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
141 GFP_KERNEL, of_node_to_nid(of_node)); 141 GFP_KERNEL, of_node_to_nid(of_node));
142 if (WARN_ON(!domain)) 142 if (!domain)
143 return NULL; 143 return NULL;
144 144
145 if (fwnode && is_fwnode_irqchip(fwnode)) { 145 if (fwnode && is_fwnode_irqchip(fwnode)) {
@@ -1297,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
1297/** 1297/**
1298 * __irq_domain_alloc_irqs - Allocate IRQs from domain 1298 * __irq_domain_alloc_irqs - Allocate IRQs from domain
1299 * @domain: domain to allocate from 1299 * @domain: domain to allocate from
1300 * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 1300 * @irq_base: allocate specified IRQ number if irq_base >= 0
1301 * @nr_irqs: number of IRQs to allocate 1301 * @nr_irqs: number of IRQs to allocate
1302 * @node: NUMA node id for memory allocation 1302 * @node: NUMA node id for memory allocation
1303 * @arg: domain specific argument 1303 * @arg: domain specific argument
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 78f3ddeb7fe4..e8f7f179bf77 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/irqdomain.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/sched/rt.h> 19#include <linux/sched/rt.h>
@@ -34,8 +35,9 @@ static int __init setup_forced_irqthreads(char *arg)
34early_param("threadirqs", setup_forced_irqthreads); 35early_param("threadirqs", setup_forced_irqthreads);
35#endif 36#endif
36 37
37static void __synchronize_hardirq(struct irq_desc *desc) 38static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
38{ 39{
40 struct irq_data *irqd = irq_desc_get_irq_data(desc);
39 bool inprogress; 41 bool inprogress;
40 42
41 do { 43 do {
@@ -51,6 +53,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
51 /* Ok, that indicated we're done: double-check carefully. */ 53 /* Ok, that indicated we're done: double-check carefully. */
52 raw_spin_lock_irqsave(&desc->lock, flags); 54 raw_spin_lock_irqsave(&desc->lock, flags);
53 inprogress = irqd_irq_inprogress(&desc->irq_data); 55 inprogress = irqd_irq_inprogress(&desc->irq_data);
56
57 /*
58 * If requested and supported, check at the chip whether it
59 * is in flight at the hardware level, i.e. already pending
60 * in a CPU and waiting for service and acknowledge.
61 */
62 if (!inprogress && sync_chip) {
63 /*
64 * Ignore the return code. inprogress is only updated
65 * when the chip supports it.
66 */
67 __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
68 &inprogress);
69 }
54 raw_spin_unlock_irqrestore(&desc->lock, flags); 70 raw_spin_unlock_irqrestore(&desc->lock, flags);
55 71
56 /* Oops, that failed? */ 72 /* Oops, that failed? */
@@ -73,13 +89,18 @@ static void __synchronize_hardirq(struct irq_desc *desc)
73 * Returns: false if a threaded handler is active. 89 * Returns: false if a threaded handler is active.
74 * 90 *
75 * This function may be called - with care - from IRQ context. 91 * This function may be called - with care - from IRQ context.
92 *
93 * It does not check whether there is an interrupt in flight at the
94 * hardware level, but not serviced yet, as this might deadlock when
95 * called with interrupts disabled and the target CPU of the interrupt
96 * is the current CPU.
76 */ 97 */
77bool synchronize_hardirq(unsigned int irq) 98bool synchronize_hardirq(unsigned int irq)
78{ 99{
79 struct irq_desc *desc = irq_to_desc(irq); 100 struct irq_desc *desc = irq_to_desc(irq);
80 101
81 if (desc) { 102 if (desc) {
82 __synchronize_hardirq(desc); 103 __synchronize_hardirq(desc, false);
83 return !atomic_read(&desc->threads_active); 104 return !atomic_read(&desc->threads_active);
84 } 105 }
85 106
@@ -95,14 +116,19 @@ EXPORT_SYMBOL(synchronize_hardirq);
95 * to complete before returning. If you use this function while 116 * to complete before returning. If you use this function while
96 * holding a resource the IRQ handler may need you will deadlock. 117 * holding a resource the IRQ handler may need you will deadlock.
97 * 118 *
98 * This function may be called - with care - from IRQ context. 119 * Can only be called from preemptible code as it might sleep when
120 * an interrupt thread is associated to @irq.
121 *
122 * It optionally makes sure (when the irq chip supports that method)
123 * that the interrupt is not pending in any CPU and waiting for
124 * service.
99 */ 125 */
100void synchronize_irq(unsigned int irq) 126void synchronize_irq(unsigned int irq)
101{ 127{
102 struct irq_desc *desc = irq_to_desc(irq); 128 struct irq_desc *desc = irq_to_desc(irq);
103 129
104 if (desc) { 130 if (desc) {
105 __synchronize_hardirq(desc); 131 __synchronize_hardirq(desc, true);
106 /* 132 /*
107 * We made sure that no hardirq handler is 133 * We made sure that no hardirq handler is
108 * running. Now verify that no threaded handlers are 134 * running. Now verify that no threaded handlers are
@@ -1699,6 +1725,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
1699 /* If this was the last handler, shut down the IRQ line: */ 1725 /* If this was the last handler, shut down the IRQ line: */
1700 if (!desc->action) { 1726 if (!desc->action) {
1701 irq_settings_clr_disable_unlazy(desc); 1727 irq_settings_clr_disable_unlazy(desc);
1728 /* Only shutdown. Deactivate after synchronize_hardirq() */
1702 irq_shutdown(desc); 1729 irq_shutdown(desc);
1703 } 1730 }
1704 1731
@@ -1727,8 +1754,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
1727 1754
1728 unregister_handler_proc(irq, action); 1755 unregister_handler_proc(irq, action);
1729 1756
1730 /* Make sure it's not being used on another CPU: */ 1757 /*
1731 synchronize_hardirq(irq); 1758 * Make sure it's not being used on another CPU and if the chip
1759 * supports it also make sure that there is no (not yet serviced)
1760 * interrupt in flight at the hardware level.
1761 */
1762 __synchronize_hardirq(desc, true);
1732 1763
1733#ifdef CONFIG_DEBUG_SHIRQ 1764#ifdef CONFIG_DEBUG_SHIRQ
1734 /* 1765 /*
@@ -1768,6 +1799,14 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
1768 * require it to deallocate resources over the slow bus. 1799 * require it to deallocate resources over the slow bus.
1769 */ 1800 */
1770 chip_bus_lock(desc); 1801 chip_bus_lock(desc);
1802 /*
1803 * There is no interrupt on the fly anymore. Deactivate it
1804 * completely.
1805 */
1806 raw_spin_lock_irqsave(&desc->lock, flags);
1807 irq_domain_deactivate_irq(&desc->irq_data);
1808 raw_spin_unlock_irqrestore(&desc->lock, flags);
1809
1771 irq_release_resources(desc); 1810 irq_release_resources(desc);
1772 chip_bus_sync_unlock(desc); 1811 chip_bus_sync_unlock(desc);
1773 irq_remove_timings(desc); 1812 irq_remove_timings(desc);
@@ -1855,7 +1894,7 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
1855 } 1894 }
1856 1895
1857 irq_settings_clr_disable_unlazy(desc); 1896 irq_settings_clr_disable_unlazy(desc);
1858 irq_shutdown(desc); 1897 irq_shutdown_and_deactivate(desc);
1859 1898
1860 irq_release_resources(desc); 1899 irq_release_resources(desc);
1861 1900
@@ -2578,6 +2617,28 @@ out:
2578 irq_put_desc_unlock(desc, flags); 2617 irq_put_desc_unlock(desc, flags);
2579} 2618}
2580 2619
2620int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
2621 bool *state)
2622{
2623 struct irq_chip *chip;
2624 int err = -EINVAL;
2625
2626 do {
2627 chip = irq_data_get_irq_chip(data);
2628 if (chip->irq_get_irqchip_state)
2629 break;
2630#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
2631 data = data->parent_data;
2632#else
2633 data = NULL;
2634#endif
2635 } while (data);
2636
2637 if (data)
2638 err = chip->irq_get_irqchip_state(data, which, state);
2639 return err;
2640}
2641
2581/** 2642/**
2582 * irq_get_irqchip_state - returns the irqchip state of an interrupt. 2643 * irq_get_irqchip_state - returns the irqchip state of an interrupt.
2583 * @irq: Interrupt line that is forwarded to a VM 2644 * @irq: Interrupt line that is forwarded to a VM
@@ -2596,7 +2657,6 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
2596{ 2657{
2597 struct irq_desc *desc; 2658 struct irq_desc *desc;
2598 struct irq_data *data; 2659 struct irq_data *data;
2599 struct irq_chip *chip;
2600 unsigned long flags; 2660 unsigned long flags;
2601 int err = -EINVAL; 2661 int err = -EINVAL;
2602 2662
@@ -2606,19 +2666,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
2606 2666
2607 data = irq_desc_get_irq_data(desc); 2667 data = irq_desc_get_irq_data(desc);
2608 2668
2609 do { 2669 err = __irq_get_irqchip_state(data, which, state);
2610 chip = irq_data_get_irq_chip(data);
2611 if (chip->irq_get_irqchip_state)
2612 break;
2613#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
2614 data = data->parent_data;
2615#else
2616 data = NULL;
2617#endif
2618 } while (data);
2619
2620 if (data)
2621 err = chip->irq_get_irqchip_state(data, which, state);
2622 2670
2623 irq_put_desc_busunlock(desc, flags); 2671 irq_put_desc_busunlock(desc, flags);
2624 return err; 2672 return err;
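The hunk above only moves the chip-hierarchy walk into __irq_get_irqchip_state() so it can be reused; the public entry point keeps its signature. A small, hypothetical caller, just to show the calling convention (error handling simplified):

#include <linux/interrupt.h>

static bool example_irq_pending(unsigned int irq)
{
	bool pending = false;

	/*
	 * Returns -EINVAL when no chip in the hierarchy implements
	 * irq_get_irqchip_state(); treat that as "not pending" here.
	 */
	if (irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending))
		return false;

	return pending;
}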
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 90c735da15d0..e960d7ce7bcc 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -1,10 +1,12 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> 2// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
3#define pr_fmt(fmt) "irq_timings: " fmt
3 4
4#include <linux/kernel.h> 5#include <linux/kernel.h>
5#include <linux/percpu.h> 6#include <linux/percpu.h>
6#include <linux/slab.h> 7#include <linux/slab.h>
7#include <linux/static_key.h> 8#include <linux/static_key.h>
9#include <linux/init.h>
8#include <linux/interrupt.h> 10#include <linux/interrupt.h>
9#include <linux/idr.h> 11#include <linux/idr.h>
10#include <linux/irq.h> 12#include <linux/irq.h>
@@ -261,12 +263,29 @@ void irq_timings_disable(void)
261#define EMA_ALPHA_VAL 64 263#define EMA_ALPHA_VAL 64
262#define EMA_ALPHA_SHIFT 7 264#define EMA_ALPHA_SHIFT 7
263 265
264#define PREDICTION_PERIOD_MIN 2 266#define PREDICTION_PERIOD_MIN 3
265#define PREDICTION_PERIOD_MAX 5 267#define PREDICTION_PERIOD_MAX 5
266#define PREDICTION_FACTOR 4 268#define PREDICTION_FACTOR 4
267#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ 269#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
268#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ 270#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
269 271
272/*
273 * Number of elements in the circular buffer: if it happened to be
274 * flushed before, then the number of elements could be smaller than
275 * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
276 * used as we wrapped. The index begins from zero when we did not
277 * wrap. That could be done in a nicer way with the proper circular
278 * array structure type but with the cost of extra computation in the
279 * interrupt handler hot path. We choose efficiency.
280 */
281#define for_each_irqts(i, irqts) \
282 for (i = irqts->count < IRQ_TIMINGS_SIZE ? \
283 0 : irqts->count & IRQ_TIMINGS_MASK, \
284 irqts->count = min(IRQ_TIMINGS_SIZE, \
285 irqts->count); \
286 irqts->count > 0; irqts->count--, \
287 i = (i + 1) & IRQ_TIMINGS_MASK)
288
270struct irqt_stat { 289struct irqt_stat {
271 u64 last_ts; 290 u64 last_ts;
272 u64 ema_time[PREDICTION_BUFFER_SIZE]; 291 u64 ema_time[PREDICTION_BUFFER_SIZE];
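The for_each_irqts() macro introduced above consumes the buffer while walking it: it starts at the oldest slot when the ring has wrapped, clamps the element count to the array size, and decrements the count down to zero. A standalone userspace sketch of the same behaviour (simplified types; SIZE stands in for IRQ_TIMINGS_SIZE and must stay a power of two):

#include <stdio.h>

#define SIZE	8		/* stands in for IRQ_TIMINGS_SIZE */
#define MASK	(SIZE - 1)

struct timings {
	unsigned int count;	/* total number of pushes, may exceed SIZE */
	int values[SIZE];
};

static void consume_all(struct timings *t)
{
	/* Oldest slot: zero if we did not wrap, count & MASK otherwise. */
	unsigned int i = t->count < SIZE ? 0 : t->count & MASK;

	if (t->count > SIZE)
		t->count = SIZE;	/* only SIZE entries are still valid */

	for (; t->count > 0; t->count--, i = (i + 1) & MASK)
		printf("%d ", t->values[i]);
	printf("\n");
}

int main(void)
{
	struct timings t = { .count = 0 };

	/* Push 11 values into an 8-slot ring: the oldest three are lost. */
	for (int v = 0; v < 11; v++)
		t.values[t.count++ & MASK] = v;

	consume_all(&t);	/* prints 3 4 5 6 7 8 9 10 */
	return 0;
}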
@@ -297,7 +316,16 @@ static u64 irq_timings_ema_new(u64 value, u64 ema_old)
297 316
298static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) 317static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
299{ 318{
300 int i; 319 int period;
320
321 /*
322 * Move the beginning pointer to the end minus the max period x 3.
323 * We are then at the point where we can begin searching for the pattern.
324 */
325 buffer = &buffer[len - (period_max * 3)];
326
327 /* Adjust the length to the maximum allowed period x 3 */
328 len = period_max * 3;
301 329
302 /* 330 /*
303 * The buffer contains the suite of intervals, in a ilog2 331 * The buffer contains the suite of intervals, in a ilog2
@@ -306,21 +334,45 @@ static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
306 * period beginning at the end of the buffer. We do that for 334 * period beginning at the end of the buffer. We do that for
307 * each suffix. 335 * each suffix.
308 */ 336 */
309 for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { 337 for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
310 338
311 int *begin = &buffer[len - (i * 3)]; 339 /*
312 int *ptr = begin; 340 * The first comparison always succeeds because the
341 * suffix is deduced from the first n-period bytes of
342 * the buffer and we compare the initial suffix with
343 * itself, so we can skip the first iteration.
344 */
345 int idx = period;
346 size_t size = period;
313 347
314 /* 348 /*
315 * We look if the suite with period 'i' repeat 349 * We look if the suite with period 'i' repeat
316 * itself. If it is truncated at the end, as it 350 * itself. If it is truncated at the end, as it
317 * repeats we can use the period to find out the next 351 * repeats we can use the period to find out the next
318 * element. 352 * element with the modulo.
319 */ 353 */
320 while (!memcmp(ptr, begin, i * sizeof(*ptr))) { 354 while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
321 ptr += i; 355
322 if (ptr >= &buffer[len]) 356 /*
323 return begin[((i * 3) % i)]; 357 * Move the index in a period basis
358 */
359 idx += size;
360
361 /*
362 * If this condition is reached, all previous
363 * memcmp were successful, so the period is
364 * found.
365 */
366 if (idx == len)
367 return buffer[len % period];
368
369 /*
370 * If the remaining elements to compare are
371 * smaller than the period, readjust the size
372 * of the comparison for the last iteration.
373 */
374 if (len - idx < period)
375 size = len - idx;
324 } 376 }
325 } 377 }
326 378
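A standalone sketch of the suffix search above, with hypothetical values: the stored ilog2 indexes repeat with period 3 ({4, 5, 6}), so after the trailing 4, 5 the predicted next index is 6 (PREDICTION_PERIOD_MIN is taken as 3, matching the hunk):

#include <stdio.h>
#include <string.h>

#define PERIOD_MIN	3	/* mirrors PREDICTION_PERIOD_MIN above */

static int next_event_index(int *buffer, size_t len, int period_max)
{
	int period;

	/* Only the last period_max * 3 elements are considered. */
	buffer = &buffer[len - (period_max * 3)];
	len = period_max * 3;

	for (period = period_max; period >= PERIOD_MIN; period--) {
		int idx = period;
		size_t size = period;

		while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
			idx += size;
			if (idx == len)
				return buffer[len % period];
			if (len - idx < period)
				size = len - idx;
		}
	}
	return -1;	/* no repeating pattern found */
}

int main(void)
{
	/* ilog2 indexes of a repeating 3-interval pattern */
	int buf[] = { 4, 5, 6, 4, 5, 6, 4, 5, 6, 4, 5 };
	size_t len = sizeof(buf) / sizeof(buf[0]);
	int period_max = len / 3;	/* = 3, as the selftest computes it */

	printf("next index: %d\n", next_event_index(buf, len, period_max));
	return 0;	/* prints "next index: 6" */
}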
@@ -380,11 +432,43 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
380 return irqs->last_ts + irqs->ema_time[index]; 432 return irqs->last_ts + irqs->ema_time[index];
381} 433}
382 434
435static __always_inline int irq_timings_interval_index(u64 interval)
436{
437 /*
438 * The PREDICTION_FACTOR increase the interval size for the
439 * array of exponential average.
440 */
441 u64 interval_us = (interval >> 10) / PREDICTION_FACTOR;
442
443 return likely(interval_us) ? ilog2(interval_us) : 0;
444}
445
446static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
447 u64 interval)
448{
449 int index;
450
451 /*
452 * Get the index in the ema table for this interrupt.
453 */
454 index = irq_timings_interval_index(interval);
455
456 /*
457 * Store the index as an element of the pattern in another
458 * circular array.
459 */
460 irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
461
462 irqs->ema_time[index] = irq_timings_ema_new(interval,
463 irqs->ema_time[index]);
464
465 irqs->count++;
466}
467
383static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) 468static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
384{ 469{
385 u64 old_ts = irqs->last_ts; 470 u64 old_ts = irqs->last_ts;
386 u64 interval; 471 u64 interval;
387 int index;
388 472
389 /* 473 /*
390 * The timestamps are absolute time values, we need to compute 474 * The timestamps are absolute time values, we need to compute
@@ -415,24 +499,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
415 return; 499 return;
416 } 500 }
417 501
418 /* 502 __irq_timings_store(irq, irqs, interval);
419 * Get the index in the ema table for this interrupt. The
420 * PREDICTION_FACTOR increase the interval size for the array
421 * of exponential average.
422 */
423 index = likely(interval) ?
424 ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
425
426 /*
427 * Store the index as an element of the pattern in another
428 * circular array.
429 */
430 irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
431
432 irqs->ema_time[index] = irq_timings_ema_new(interval,
433 irqs->ema_time[index]);
434
435 irqs->count++;
436} 503}
437 504
438/** 505/**
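The interval classification factored out above maps a nanosecond interval to a slot of the ema_time[] array: shift right by 10 to approximate a microsecond conversion, divide by PREDICTION_FACTOR to widen the buckets, then take ilog2. A standalone userspace sketch with a naive ilog2, fed sample values taken from the selftest intervals further down:

#include <stdio.h>

#define PREDICTION_FACTOR	4

static int ilog2_u64(unsigned long long v)
{
	int i = -1;

	while (v) {
		v >>= 1;
		i++;
	}
	return i;
}

static int interval_index(unsigned long long interval_ns)
{
	/* ns -> ~us, then widen the buckets by PREDICTION_FACTOR */
	unsigned long long us = (interval_ns >> 10) / PREDICTION_FACTOR;

	return us ? ilog2_u64(us) : 0;
}

int main(void)
{
	unsigned long long samples[] = { 10000, 50000, 200000, 500000 };

	for (int i = 0; i < 4; i++)
		printf("%llu ns -> index %d\n",
		       samples[i], interval_index(samples[i]));
	return 0;	/* prints indexes 1, 3, 5 and 6 */
}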
@@ -493,11 +560,7 @@ u64 irq_timings_next_event(u64 now)
493 * model while decrementing the counter because we consume the 560 * model while decrementing the counter because we consume the
494 * data from our circular buffer. 561 * data from our circular buffer.
495 */ 562 */
496 563 for_each_irqts(i, irqts) {
497 i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
498 irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
499
500 for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
501 irq = irq_timing_decode(irqts->values[i], &ts); 564 irq = irq_timing_decode(irqts->values[i], &ts);
502 s = idr_find(&irqt_stats, irq); 565 s = idr_find(&irqt_stats, irq);
503 if (s) 566 if (s)
@@ -564,3 +627,325 @@ int irq_timings_alloc(int irq)
564 627
565 return 0; 628 return 0;
566} 629}
630
631#ifdef CONFIG_TEST_IRQ_TIMINGS
632struct timings_intervals {
633 u64 *intervals;
634 size_t count;
635};
636
637/*
638 * Intervals are given in nanosecond base
639 */
640static u64 intervals0[] __initdata = {
641 10000, 50000, 200000, 500000,
642 10000, 50000, 200000, 500000,
643 10000, 50000, 200000, 500000,
644 10000, 50000, 200000, 500000,
645 10000, 50000, 200000, 500000,
646 10000, 50000, 200000, 500000,
647 10000, 50000, 200000, 500000,
648 10000, 50000, 200000, 500000,
649 10000, 50000, 200000,
650};
651
652static u64 intervals1[] __initdata = {
653 223947000, 1240000, 1384000, 1386000, 1386000,
654 217416000, 1236000, 1384000, 1386000, 1387000,
655 214719000, 1241000, 1386000, 1387000, 1384000,
656 213696000, 1234000, 1384000, 1386000, 1388000,
657 219904000, 1240000, 1385000, 1389000, 1385000,
658 212240000, 1240000, 1386000, 1386000, 1386000,
659 214415000, 1236000, 1384000, 1386000, 1387000,
660 214276000, 1234000,
661};
662
663static u64 intervals2[] __initdata = {
664 4000, 3000, 5000, 100000,
665 3000, 3000, 5000, 117000,
666 4000, 4000, 5000, 112000,
667 4000, 3000, 4000, 110000,
668 3000, 5000, 3000, 117000,
669 4000, 4000, 5000, 112000,
670 4000, 3000, 4000, 110000,
671 3000, 4000, 5000, 112000,
672 4000,
673};
674
675static u64 intervals3[] __initdata = {
676 1385000, 212240000, 1240000,
677 1386000, 214415000, 1236000,
678 1384000, 214276000, 1234000,
679 1386000, 214415000, 1236000,
680 1385000, 212240000, 1240000,
681 1386000, 214415000, 1236000,
682 1384000, 214276000, 1234000,
683 1386000, 214415000, 1236000,
684 1385000, 212240000, 1240000,
685};
686
687static u64 intervals4[] __initdata = {
688 10000, 50000, 10000, 50000,
689 10000, 50000, 10000, 50000,
690 10000, 50000, 10000, 50000,
691 10000, 50000, 10000, 50000,
692 10000, 50000, 10000, 50000,
693 10000, 50000, 10000, 50000,
694 10000, 50000, 10000, 50000,
695 10000, 50000, 10000, 50000,
696 10000,
697};
698
699static struct timings_intervals tis[] __initdata = {
700 { intervals0, ARRAY_SIZE(intervals0) },
701 { intervals1, ARRAY_SIZE(intervals1) },
702 { intervals2, ARRAY_SIZE(intervals2) },
703 { intervals3, ARRAY_SIZE(intervals3) },
704 { intervals4, ARRAY_SIZE(intervals4) },
705};
706
707static int __init irq_timings_test_next_index(struct timings_intervals *ti)
708{
709 int _buffer[IRQ_TIMINGS_SIZE];
710 int buffer[IRQ_TIMINGS_SIZE];
711 int index, start, i, count, period_max;
712
713 count = ti->count - 1;
714
715 period_max = count > (3 * PREDICTION_PERIOD_MAX) ?
716 PREDICTION_PERIOD_MAX : count / 3;
717
718 /*
719 * Inject all values except the last one which will be used
720 * to compare with the next index result.
721 */
722 pr_debug("index suite: ");
723
724 for (i = 0; i < count; i++) {
725 index = irq_timings_interval_index(ti->intervals[i]);
726 _buffer[i & IRQ_TIMINGS_MASK] = index;
727 pr_cont("%d ", index);
728 }
729
730 start = count < IRQ_TIMINGS_SIZE ? 0 :
731 count & IRQ_TIMINGS_MASK;
732
733 count = min_t(int, count, IRQ_TIMINGS_SIZE);
734
735 for (i = 0; i < count; i++) {
736 int index = (start + i) & IRQ_TIMINGS_MASK;
737 buffer[i] = _buffer[index];
738 }
739
740 index = irq_timings_next_event_index(buffer, count, period_max);
741 i = irq_timings_interval_index(ti->intervals[ti->count - 1]);
742
743 if (index != i) {
744 pr_err("Expected (%d) and computed (%d) next indexes differ\n",
745 i, index);
746 return -EINVAL;
747 }
748
749 return 0;
750}
751
752static int __init irq_timings_next_index_selftest(void)
753{
754 int i, ret;
755
756 for (i = 0; i < ARRAY_SIZE(tis); i++) {
757
758 pr_info("---> Injecting intervals number #%d (count=%zd)\n",
759 i, tis[i].count);
760
761 ret = irq_timings_test_next_index(&tis[i]);
762 if (ret)
763 break;
764 }
765
766 return ret;
767}
768
769static int __init irq_timings_test_irqs(struct timings_intervals *ti)
770{
771 struct irqt_stat __percpu *s;
772 struct irqt_stat *irqs;
773 int i, index, ret, irq = 0xACE5;
774
775 ret = irq_timings_alloc(irq);
776 if (ret) {
777 pr_err("Failed to allocate irq timings\n");
778 return ret;
779 }
780
781 s = idr_find(&irqt_stats, irq);
782 if (!s) {
783 ret = -EIDRM;
784 goto out;
785 }
786
787 irqs = this_cpu_ptr(s);
788
789 for (i = 0; i < ti->count; i++) {
790
791 index = irq_timings_interval_index(ti->intervals[i]);
792 pr_debug("%d: interval=%llu ema_index=%d\n",
793 i, ti->intervals[i], index);
794
795 __irq_timings_store(irq, irqs, ti->intervals[i]);
796 if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
797 pr_err("Failed to store in the circular buffer\n");
798 goto out;
799 }
800 }
801
802 if (irqs->count != ti->count) {
803 pr_err("Count differs\n");
804 goto out;
805 }
806
807 ret = 0;
808out:
809 irq_timings_free(irq);
810
811 return ret;
812}
813
814static int __init irq_timings_irqs_selftest(void)
815{
816 int i, ret;
817
818 for (i = 0; i < ARRAY_SIZE(tis); i++) {
819 pr_info("---> Injecting intervals number #%d (count=%zd)\n",
820 i, tis[i].count);
821 ret = irq_timings_test_irqs(&tis[i]);
822 if (ret)
823 break;
824 }
825
826 return ret;
827}
828
829static int __init irq_timings_test_irqts(struct irq_timings *irqts,
830 unsigned count)
831{
832 int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0;
833 int i, irq, oirq = 0xBEEF;
834 u64 ots = 0xDEAD, ts;
835
836 /*
837 * Fill the circular buffer by using the dedicated function.
838 */
839 for (i = 0; i < count; i++) {
840 pr_debug("%d: index=%d, ts=%llX irq=%X\n",
841 i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i);
842
843 irq_timings_push(ots + i, oirq + i);
844 }
845
846 /*
847 * Compute the expected values of the first elements after the
848 * index wrapped around, or not.
849 */
850 ots += start;
851 oirq += start;
852
853 /*
854 * Test the circular buffer count is correct.
855 */
856 pr_debug("---> Checking timings array count (%d) is right\n", count);
857 if (WARN_ON(irqts->count != count))
858 return -EINVAL;
859
860 /*
861 * Test the macro allowing to browse all the irqts.
862 */
863 pr_debug("---> Checking the for_each_irqts() macro\n");
864 for_each_irqts(i, irqts) {
865
866 irq = irq_timing_decode(irqts->values[i], &ts);
867
868 pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n",
869 i, ts, ots, irq, oirq);
870
871 if (WARN_ON(ts != ots || irq != oirq))
872 return -EINVAL;
873
874 ots++; oirq++;
875 }
876
877 /*
878 * The circular buffer should have be flushed when browsed
879 * with for_each_irqts
880 */
881 pr_debug("---> Checking timings array is empty after browsing it\n");
882 if (WARN_ON(irqts->count))
883 return -EINVAL;
884
885 return 0;
886}
887
888static int __init irq_timings_irqts_selftest(void)
889{
890 struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
891 int i, ret;
892
893 /*
894 * Test the circular buffer with different numbers of
895 * elements. The purpose is to test at the limits (empty, half
896 * full, full, wrapped with the cursor at the boundaries,
897 * wrapped several times, etc.).
898 */
899 int count[] = { 0,
900 IRQ_TIMINGS_SIZE >> 1,
901 IRQ_TIMINGS_SIZE,
902 IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1),
903 2 * IRQ_TIMINGS_SIZE,
904 (2 * IRQ_TIMINGS_SIZE) + 3,
905 };
906
907 for (i = 0; i < ARRAY_SIZE(count); i++) {
908
909 pr_info("---> Checking the timings with %d/%d values\n",
910 count[i], IRQ_TIMINGS_SIZE);
911
912 ret = irq_timings_test_irqts(irqts, count[i]);
913 if (ret)
914 break;
915 }
916
917 return ret;
918}
919
920static int __init irq_timings_selftest(void)
921{
922 int ret;
923
924 pr_info("------------------- selftest start -----------------\n");
925
926 /*
927 * At this point, we don't expect any subsystem to use the irq
928 * timings but us, so it should not be enabled.
929 */
930 if (static_branch_unlikely(&irq_timing_enabled)) {
931 pr_warn("irq timings already initialized, skipping selftest\n");
932 return 0;
933 }
934
935 ret = irq_timings_irqts_selftest();
936 if (ret)
937 goto out;
938
939 ret = irq_timings_irqs_selftest();
940 if (ret)
941 goto out;
942
943 ret = irq_timings_next_index_selftest();
944out:
945 pr_info("---------- selftest end with %s -----------\n",
946 ret ? "failure" : "success");
947
948 return ret;
949}
950early_initcall(irq_timings_selftest);
951#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 73288914ed5e..d42acaf81886 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra 3 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
3 * 4 *
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index de6efdecc70d..df3008419a1d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * jump label support 3 * jump label support
3 * 4 *
@@ -36,12 +37,26 @@ static int jump_label_cmp(const void *a, const void *b)
36 const struct jump_entry *jea = a; 37 const struct jump_entry *jea = a;
37 const struct jump_entry *jeb = b; 38 const struct jump_entry *jeb = b;
38 39
40 /*
41 * Entries are sorted by key.
42 */
39 if (jump_entry_key(jea) < jump_entry_key(jeb)) 43 if (jump_entry_key(jea) < jump_entry_key(jeb))
40 return -1; 44 return -1;
41 45
42 if (jump_entry_key(jea) > jump_entry_key(jeb)) 46 if (jump_entry_key(jea) > jump_entry_key(jeb))
43 return 1; 47 return 1;
44 48
49 /*
50 * In the batching mode, entries should also be sorted by the code
51 * inside the already sorted list of entries, enabling a bsearch in
52 * the vector.
53 */
54 if (jump_entry_code(jea) < jump_entry_code(jeb))
55 return -1;
56
57 if (jump_entry_code(jea) > jump_entry_code(jeb))
58 return 1;
59
45 return 0; 60 return 0;
46} 61}
47 62
@@ -383,25 +398,55 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry)
383 return enabled ^ branch; 398 return enabled ^ branch;
384} 399}
385 400
401static bool jump_label_can_update(struct jump_entry *entry, bool init)
402{
403 /*
404 * Cannot update code that was in an init text area.
405 */
406 if (!init && jump_entry_is_init(entry))
407 return false;
408
409 if (!kernel_text_address(jump_entry_code(entry))) {
410 WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry));
411 return false;
412 }
413
414 return true;
415}
416
417#ifndef HAVE_JUMP_LABEL_BATCH
386static void __jump_label_update(struct static_key *key, 418static void __jump_label_update(struct static_key *key,
387 struct jump_entry *entry, 419 struct jump_entry *entry,
388 struct jump_entry *stop, 420 struct jump_entry *stop,
389 bool init) 421 bool init)
390{ 422{
391 for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { 423 for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
392 /* 424 if (jump_label_can_update(entry, init))
393 * An entry->code of 0 indicates an entry which has been 425 arch_jump_label_transform(entry, jump_label_type(entry));
394 * disabled because it was in an init text area. 426 }
395 */ 427}
396 if (init || !jump_entry_is_init(entry)) { 428#else
397 if (kernel_text_address(jump_entry_code(entry))) 429static void __jump_label_update(struct static_key *key,
398 arch_jump_label_transform(entry, jump_label_type(entry)); 430 struct jump_entry *entry,
399 else 431 struct jump_entry *stop,
400 WARN_ONCE(1, "can't patch jump_label at %pS", 432 bool init)
401 (void *)jump_entry_code(entry)); 433{
434 for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
435
436 if (!jump_label_can_update(entry, init))
437 continue;
438
439 if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) {
440 /*
441 * Queue is full: Apply the current queue and try again.
442 */
443 arch_jump_label_transform_apply();
444 BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry)));
402 } 445 }
403 } 446 }
447 arch_jump_label_transform_apply();
404} 448}
449#endif
405 450
406void __init jump_label_init(void) 451void __init jump_label_init(void)
407{ 452{
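When HAVE_JUMP_LABEL_BATCH is available, the update loop above queues each transformation and pays the expensive synchronization once per flush instead of once per entry. A generic userspace sketch of that queue-then-apply pattern (the arch_* interfaces are only mimicked, not used):

#include <stdbool.h>
#include <stdio.h>

#define BATCH_MAX	4

static int queue[BATCH_MAX];
static int queued;

static bool batch_queue(int entry)
{
	if (queued == BATCH_MAX)
		return false;		/* caller must flush and retry */
	queue[queued++] = entry;
	return true;
}

static void batch_apply(void)
{
	for (int i = 0; i < queued; i++)
		printf("patch entry %d\n", queue[i]);
	queued = 0;			/* one expensive sync per batch */
}

int main(void)
{
	for (int entry = 0; entry < 10; entry++) {
		if (!batch_queue(entry)) {
			batch_apply();
			batch_queue(entry);	/* cannot fail on an empty queue */
		}
	}
	batch_apply();			/* flush the tail of the batch */
	return 0;
}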
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 14934afa9e68..95a260f9214b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. 3 * kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
3 * 4 *
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 68559808fdfa..1b018f1a6e0d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,9 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kexec.c - kexec_load system call 3 * kexec.c - kexec_load system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */ 5 */
8 6
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index fd5c95ff9251..d5870723b8ad 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1,9 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kexec.c - kexec system call core code. 3 * kexec.c - kexec system call core code.
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */ 5 */
8 6
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 072b6ee55e3f..b8cc032d5620 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1,12 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kexec: kexec_file_load system call 3 * kexec: kexec_file_load system call
3 * 4 *
4 * Copyright (C) 2014 Red Hat Inc. 5 * Copyright (C) 2014 Red Hat Inc.
5 * Authors: 6 * Authors:
6 * Vivek Goyal <vgoyal@redhat.com> 7 * Vivek Goyal <vgoyal@redhat.com>
7 *
8 * This source code is licensed under the GNU General Public License,
9 * Version 2. See the file COPYING for more details.
10 */ 8 */
11 9
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -198,9 +196,6 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
198 return ret; 196 return ret;
199 image->kernel_buf_len = size; 197 image->kernel_buf_len = size;
200 198
201 /* IMA needs to pass the measurement list to the next kernel. */
202 ima_add_kexec_buffer(image);
203
204 /* Call arch image probe handlers */ 199 /* Call arch image probe handlers */
205 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, 200 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
206 image->kernel_buf_len); 201 image->kernel_buf_len);
@@ -241,8 +236,14 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
241 ret = -EINVAL; 236 ret = -EINVAL;
242 goto out; 237 goto out;
243 } 238 }
239
240 ima_kexec_cmdline(image->cmdline_buf,
241 image->cmdline_buf_len - 1);
244 } 242 }
245 243
244 /* IMA needs to pass the measurement list to the next kernel. */
245 ima_add_kexec_buffer(image);
246
246 /* Call arch image load handlers */ 247 /* Call arch image load handlers */
247 ldata = arch_kexec_kernel_image_load(image); 248 ldata = arch_kexec_kernel_image_load(image);
248 249
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 70ae6052920d..8f69772af77b 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -8,9 +8,8 @@
8 8
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/proc_fs.h> 11#include <linux/kobject.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/uaccess.h>
14 13
15/* 14/*
16 * Define kernel_headers_data and kernel_headers_data_end, within which the 15 * Define kernel_headers_data and kernel_headers_data_end, within which the
@@ -31,39 +30,32 @@ extern char kernel_headers_data;
31extern char kernel_headers_data_end; 30extern char kernel_headers_data_end;
32 31
33static ssize_t 32static ssize_t
34ikheaders_read_current(struct file *file, char __user *buf, 33ikheaders_read(struct file *file, struct kobject *kobj,
35 size_t len, loff_t *offset) 34 struct bin_attribute *bin_attr,
35 char *buf, loff_t off, size_t len)
36{ 36{
37 return simple_read_from_buffer(buf, len, offset, 37 memcpy(buf, &kernel_headers_data + off, len);
38 &kernel_headers_data, 38 return len;
39 &kernel_headers_data_end -
40 &kernel_headers_data);
41} 39}
42 40
43static const struct file_operations ikheaders_file_ops = { 41static struct bin_attribute kheaders_attr __ro_after_init = {
44 .read = ikheaders_read_current, 42 .attr = {
45 .llseek = default_llseek, 43 .name = "kheaders.tar.xz",
44 .mode = 0444,
45 },
46 .read = &ikheaders_read,
46}; 47};
47 48
48static int __init ikheaders_init(void) 49static int __init ikheaders_init(void)
49{ 50{
50 struct proc_dir_entry *entry; 51 kheaders_attr.size = (&kernel_headers_data_end -
51 52 &kernel_headers_data);
52 /* create the current headers file */ 53 return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
53 entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL,
54 &ikheaders_file_ops);
55 if (!entry)
56 return -ENOMEM;
57
58 proc_set_size(entry,
59 &kernel_headers_data_end -
60 &kernel_headers_data);
61 return 0;
62} 54}
63 55
64static void __exit ikheaders_cleanup(void) 56static void __exit ikheaders_cleanup(void)
65{ 57{
66 remove_proc_entry("kheaders.tar.xz", NULL); 58 sysfs_remove_bin_file(kernel_kobj, &kheaders_attr);
67} 59}
68 60
69module_init(ikheaders_init); 61module_init(ikheaders_init);
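After this conversion the archive is exposed as a sysfs binary attribute at /sys/kernel/kheaders.tar.xz instead of a procfs file. A minimal userspace read, assuming kheaders support is built in (or the module loaded) and sysfs is mounted at /sys; a real consumer would feed the stream into a .tar.xz extractor:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/kheaders.tar.xz", "rb");
	char buf[4096];
	size_t n, total = 0;

	if (!f) {
		perror("kheaders.tar.xz");
		return 1;
	}

	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		total += n;	/* count the compressed bytes only */

	fclose(f);
	printf("read %zu bytes of compressed headers\n", total);
	return 0;
}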
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b1ea30a5540e..9f5433a52488 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1,21 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Kernel Probes (KProbes) 3 * Kernel Probes (KProbes)
3 * kernel/kprobes.c 4 * kernel/kprobes.c
4 * 5 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004 6 * Copyright (C) IBM Corporation, 2002, 2004
20 * 7 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel 8 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
@@ -2583,33 +2570,20 @@ static const struct file_operations fops_kp = {
2583 2570
2584static int __init debugfs_kprobe_init(void) 2571static int __init debugfs_kprobe_init(void)
2585{ 2572{
2586 struct dentry *dir, *file; 2573 struct dentry *dir;
2587 unsigned int value = 1; 2574 unsigned int value = 1;
2588 2575
2589 dir = debugfs_create_dir("kprobes", NULL); 2576 dir = debugfs_create_dir("kprobes", NULL);
2590 if (!dir)
2591 return -ENOMEM;
2592 2577
2593 file = debugfs_create_file("list", 0400, dir, NULL, 2578 debugfs_create_file("list", 0400, dir, NULL,
2594 &debugfs_kprobes_operations); 2579 &debugfs_kprobes_operations);
2595 if (!file)
2596 goto error;
2597 2580
2598 file = debugfs_create_file("enabled", 0600, dir, 2581 debugfs_create_file("enabled", 0600, dir, &value, &fops_kp);
2599 &value, &fops_kp);
2600 if (!file)
2601 goto error;
2602 2582
2603 file = debugfs_create_file("blacklist", 0400, dir, NULL, 2583 debugfs_create_file("blacklist", 0400, dir, NULL,
2604 &debugfs_kprobe_blacklist_ops); 2584 &debugfs_kprobe_blacklist_ops);
2605 if (!file)
2606 goto error;
2607 2585
2608 return 0; 2586 return 0;
2609
2610error:
2611 debugfs_remove(dir);
2612 return -ENOMEM;
2613} 2587}
2614 2588
2615late_initcall(debugfs_kprobe_init); 2589late_initcall(debugfs_kprobe_init);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 46ba853656f6..35859da8bd4f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -1,11 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which 3 * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
3 * are not related to any other subsystem 4 * are not related to any other subsystem
4 * 5 *
5 * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> 6 * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
6 *
7 * This file is release under the GPLv2
8 *
9 */ 7 */
10 8
11#include <linux/kobject.h> 9#include <linux/kobject.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5942eeafb9ac..621467c33fef 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Kernel thread helper functions. 2/* Kernel thread helper functions.
2 * Copyright (C) 2004 IBM Corporation, Rusty Russell. 3 * Copyright (C) 2004 IBM Corporation, Rusty Russell.
3 * 4 *
@@ -11,6 +12,7 @@
11#include <linux/kthread.h> 12#include <linux/kthread.h>
12#include <linux/completion.h> 13#include <linux/completion.h>
13#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/cgroup.h>
14#include <linux/cpuset.h> 16#include <linux/cpuset.h>
15#include <linux/unistd.h> 17#include <linux/unistd.h>
16#include <linux/file.h> 18#include <linux/file.h>
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 99a5b5f46dc5..e3acead004e6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -1,13 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * latencytop.c: Latency display infrastructure 3 * latencytop.c: Latency display infrastructure
3 * 4 *
4 * (C) Copyright 2008 Intel Corporation 5 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com> 6 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */ 7 */
12 8
13/* 9/*
@@ -67,13 +63,10 @@ static struct latency_record latency_record[MAXLR];
67 63
68int latencytop_enabled; 64int latencytop_enabled;
69 65
70void clear_all_latency_tracing(struct task_struct *p) 66void clear_tsk_latency_tracing(struct task_struct *p)
71{ 67{
72 unsigned long flags; 68 unsigned long flags;
73 69
74 if (!latencytop_enabled)
75 return;
76
77 raw_spin_lock_irqsave(&latency_lock, flags); 70 raw_spin_lock_irqsave(&latency_lock, flags);
78 memset(&p->latency_record, 0, sizeof(p->latency_record)); 71 memset(&p->latency_record, 0, sizeof(p->latency_record));
79 p->latency_record_count = 0; 72 p->latency_record_count = 0;
@@ -96,9 +89,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
96 int firstnonnull = MAXLR + 1; 89 int firstnonnull = MAXLR + 1;
97 int i; 90 int i;
98 91
99 if (!latencytop_enabled)
100 return;
101
102 /* skip kernel threads for now */ 92 /* skip kernel threads for now */
103 if (!tsk->mm) 93 if (!tsk->mm)
104 return; 94 return;
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index ec4565122e65..54102deb50ba 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1config HAVE_LIVEPATCH 2config HAVE_LIVEPATCH
2 bool 3 bool
3 help 4 help
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index b36ceda6488e..cf9b5bcdb952 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1obj-$(CONFIG_LIVEPATCH) += livepatch.o 2obj-$(CONFIG_LIVEPATCH) += livepatch.o
2 3
3livepatch-objs := core.o patch.o shadow.o transition.o 4livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index f6fbaff10e71..c4ce08f43bd6 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -1,21 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * core.c - Kernel Live Patching Core 3 * core.c - Kernel Live Patching Core
3 * 4 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> 5 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE 6 * Copyright (C) 2014 SUSE
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */ 7 */
20 8
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -30,6 +18,7 @@
30#include <linux/elf.h> 18#include <linux/elf.h>
31#include <linux/moduleloader.h> 19#include <linux/moduleloader.h>
32#include <linux/completion.h> 20#include <linux/completion.h>
21#include <linux/memory.h>
33#include <asm/cacheflush.h> 22#include <asm/cacheflush.h>
34#include "core.h" 23#include "core.h"
35#include "patch.h" 24#include "patch.h"
@@ -730,16 +719,21 @@ static int klp_init_object_loaded(struct klp_patch *patch,
730 struct klp_func *func; 719 struct klp_func *func;
731 int ret; 720 int ret;
732 721
722 mutex_lock(&text_mutex);
723
733 module_disable_ro(patch->mod); 724 module_disable_ro(patch->mod);
734 ret = klp_write_object_relocations(patch->mod, obj); 725 ret = klp_write_object_relocations(patch->mod, obj);
735 if (ret) { 726 if (ret) {
736 module_enable_ro(patch->mod, true); 727 module_enable_ro(patch->mod, true);
728 mutex_unlock(&text_mutex);
737 return ret; 729 return ret;
738 } 730 }
739 731
740 arch_klp_init_object_loaded(patch, obj); 732 arch_klp_init_object_loaded(patch, obj);
741 module_enable_ro(patch->mod, true); 733 module_enable_ro(patch->mod, true);
742 734
735 mutex_unlock(&text_mutex);
736
743 klp_for_each_func(obj, func) { 737 klp_for_each_func(obj, func) {
744 ret = klp_find_object_symbol(obj->name, func->old_name, 738 ret = klp_find_object_symbol(obj->name, func->old_name,
745 func->old_sympos, 739 func->old_sympos,
@@ -1208,14 +1202,6 @@ void klp_module_going(struct module *mod)
1208 1202
1209static int __init klp_init(void) 1203static int __init klp_init(void)
1210{ 1204{
1211 int ret;
1212
1213 ret = klp_check_compiler_support();
1214 if (ret) {
1215 pr_info("Your compiler is too old; turning off.\n");
1216 return -EINVAL;
1217 }
1218
1219 klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); 1205 klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
1220 if (!klp_root_kobj) 1206 if (!klp_root_kobj)
1221 return -ENOMEM; 1207 return -ENOMEM;
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 99cb3ad05eb4..bd43537702bd 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -1,22 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * patch.c - livepatch patching functions 3 * patch.c - livepatch patching functions
3 * 4 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> 5 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE 6 * Copyright (C) 2014 SUSE
6 * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com> 7 * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 2
11 * of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */ 8 */
21 9
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index 83958c814439..e5c9fb295ba9 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -1,22 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * shadow.c - Shadow Variables 3 * shadow.c - Shadow Variables
3 * 4 *
4 * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com> 5 * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
5 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> 6 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
6 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> 7 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 2
11 * of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */ 8 */
21 9
22/** 10/**
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index c53370d596be..cdf318d86dd6 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -1,20 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * transition.c - Kernel Live Patching transition functions 3 * transition.c - Kernel Live Patching transition functions
3 * 4 *
4 * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com> 5 * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
18 */ 6 */
19 7
20#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -259,7 +247,6 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
259 int ret, nr_entries; 247 int ret, nr_entries;
260 248
261 ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); 249 ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
262 WARN_ON_ONCE(ret == -ENOSYS);
263 if (ret < 0) { 250 if (ret < 0) {
264 snprintf(err_buf, STACK_ERR_BUF_SIZE, 251 snprintf(err_buf, STACK_ERR_BUF_SIZE,
265 "%s: %s:%d has an unreliable stack\n", 252 "%s: %s:%d has an unreliable stack\n",
@@ -293,11 +280,11 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
293 */ 280 */
294static bool klp_try_switch_task(struct task_struct *task) 281static bool klp_try_switch_task(struct task_struct *task)
295{ 282{
283 static char err_buf[STACK_ERR_BUF_SIZE];
296 struct rq *rq; 284 struct rq *rq;
297 struct rq_flags flags; 285 struct rq_flags flags;
298 int ret; 286 int ret;
299 bool success = false; 287 bool success = false;
300 char err_buf[STACK_ERR_BUF_SIZE];
301 288
302 err_buf[0] = '\0'; 289 err_buf[0] = '\0';
303 290
@@ -306,6 +293,13 @@ static bool klp_try_switch_task(struct task_struct *task)
306 return true; 293 return true;
307 294
308 /* 295 /*
296 * For arches which don't have reliable stack traces, we have to rely
297 * on other methods (e.g., switching tasks at kernel exit).
298 */
299 if (!klp_have_reliable_stack())
300 return false;
301
302 /*
309 * Now try to check the stack for any to-be-patched or to-be-unpatched 303 * Now try to check the stack for any to-be-patched or to-be-unpatched
310 * functions. If all goes well, switch the task to the target patch 304 * functions. If all goes well, switch the task to the target patch
311 * state. 305 * state.
@@ -340,7 +334,6 @@ done:
340 pr_debug("%s", err_buf); 334 pr_debug("%s", err_buf);
341 335
342 return success; 336 return success;
343
344} 337}
345 338
346/* 339/*
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6fe2f333aecb..45452facff3b 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
3# and is generally not a function of system call inputs. 3# and is generally not a function of system call inputs.
4KCOV_INSTRUMENT := n 4KCOV_INSTRUMENT := n
5 5
6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o 6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
7 7
8ifdef CONFIG_FUNCTION_TRACER 8ifdef CONFIG_FUNCTION_TRACER
9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) 9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index feb1acc54611..8c7e7d25f09c 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -31,12 +31,13 @@ enum lock_events {
31DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); 31DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
32 32
33/* 33/*
34 * Increment the PV qspinlock statistical counters 34 * Increment the statistical counters. Use raw_cpu_inc() because of lower
35 * overhead and we don't care if we lose the occasional update.
35 */ 36 */
36static inline void __lockevent_inc(enum lock_events event, bool cond) 37static inline void __lockevent_inc(enum lock_events event, bool cond)
37{ 38{
38 if (cond) 39 if (cond)
39 __this_cpu_inc(lockevents[event]); 40 raw_cpu_inc(lockevents[event]);
40} 41}
41 42
42#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) 43#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
@@ -44,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond)
44 45
45static inline void __lockevent_add(enum lock_events event, int inc) 46static inline void __lockevent_add(enum lock_events event, int inc)
46{ 47{
47 __this_cpu_add(lockevents[event], inc); 48 raw_cpu_add(lockevents[event], inc);
48} 49}
49 50
50#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) 51#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index ad7668cfc9da..239039d0ce21 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -56,12 +56,16 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
56LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ 56LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
57LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ 57LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
58LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ 58LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
59LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ 59LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */
60LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ 60LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */
61LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */
62LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */
63LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */
64LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */
61LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ 65LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
62LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ 66LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
63LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ 67LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
64LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ 68LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
65LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ 69LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
66LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ 70LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
67LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ 71LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d06190fa5082..341f52117f88 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/lockdep.c 3 * kernel/lockdep.c
3 * 4 *
@@ -150,17 +151,28 @@ unsigned long nr_lock_classes;
150static 151static
151#endif 152#endif
152struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; 153struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
154static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
153 155
154static inline struct lock_class *hlock_class(struct held_lock *hlock) 156static inline struct lock_class *hlock_class(struct held_lock *hlock)
155{ 157{
156 if (!hlock->class_idx) { 158 unsigned int class_idx = hlock->class_idx;
159
160 /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */
161 barrier();
162
163 if (!test_bit(class_idx, lock_classes_in_use)) {
157 /* 164 /*
158 * Someone passed in garbage, we give up. 165 * Someone passed in garbage, we give up.
159 */ 166 */
160 DEBUG_LOCKS_WARN_ON(1); 167 DEBUG_LOCKS_WARN_ON(1);
161 return NULL; 168 return NULL;
162 } 169 }
163 return lock_classes + hlock->class_idx - 1; 170
171 /*
172 * At this point, if the passed hlock->class_idx is still garbage,
173 * we just have to live with it
174 */
175 return lock_classes + class_idx;
164} 176}
165 177
166#ifdef CONFIG_LOCK_STAT 178#ifdef CONFIG_LOCK_STAT
@@ -358,6 +370,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx)
358 return k0 | (u64)k1 << 32; 370 return k0 | (u64)k1 << 32;
359} 371}
360 372
373void lockdep_init_task(struct task_struct *task)
374{
375 task->lockdep_depth = 0; /* no locks held yet */
376 task->curr_chain_key = INITIAL_CHAIN_KEY;
377 task->lockdep_recursion = 0;
378}
379
361void lockdep_off(void) 380void lockdep_off(void)
362{ 381{
363 current->lockdep_recursion++; 382 current->lockdep_recursion++;
@@ -418,13 +437,6 @@ static int verbose(struct lock_class *class)
418 return 0; 437 return 0;
419} 438}
420 439
421/*
422 * Stack-trace: tightly packed array of stack backtrace
423 * addresses. Protected by the graph_lock.
424 */
425unsigned long nr_stack_trace_entries;
426static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
427
428static void print_lockdep_off(const char *bug_msg) 440static void print_lockdep_off(const char *bug_msg)
429{ 441{
430 printk(KERN_DEBUG "%s\n", bug_msg); 442 printk(KERN_DEBUG "%s\n", bug_msg);
@@ -434,6 +446,15 @@ static void print_lockdep_off(const char *bug_msg)
434#endif 446#endif
435} 447}
436 448
449unsigned long nr_stack_trace_entries;
450
451#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
452/*
453 * Stack-trace: tightly packed array of stack backtrace
454 * addresses. Protected by the graph_lock.
455 */
456static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
457
437static int save_trace(struct lock_trace *trace) 458static int save_trace(struct lock_trace *trace)
438{ 459{
439 unsigned long *entries = stack_trace + nr_stack_trace_entries; 460 unsigned long *entries = stack_trace + nr_stack_trace_entries;
@@ -456,6 +477,7 @@ static int save_trace(struct lock_trace *trace)
456 477
457 return 1; 478 return 1;
458} 479}
480#endif
459 481
460unsigned int nr_hardirq_chains; 482unsigned int nr_hardirq_chains;
461unsigned int nr_softirq_chains; 483unsigned int nr_softirq_chains;
@@ -469,6 +491,7 @@ unsigned int max_lockdep_depth;
469DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); 491DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
470#endif 492#endif
471 493
494#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
472/* 495/*
473 * Locking printouts: 496 * Locking printouts:
474 */ 497 */
@@ -486,6 +509,7 @@ static const char *usage_str[] =
486#undef LOCKDEP_STATE 509#undef LOCKDEP_STATE
487 [LOCK_USED] = "INITIAL USE", 510 [LOCK_USED] = "INITIAL USE",
488}; 511};
512#endif
489 513
490const char * __get_key_name(struct lockdep_subclass_key *key, char *str) 514const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
491{ 515{
@@ -499,15 +523,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit)
499 523
500static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) 524static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
501{ 525{
526 /*
527 * The usage character defaults to '.' (i.e., irqs disabled and not in
528 * irq context), which is the safest usage category.
529 */
502 char c = '.'; 530 char c = '.';
503 531
504 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) 532 /*
533 * The order of the following usage checks matters, which will
534 * result in the outcome character as follows:
535 *
536 * - '+': irq is enabled and not in irq context
537 * - '-': in irq context and irq is disabled
538 * - '?': in irq context and irq is enabled
539 */
540 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) {
505 c = '+'; 541 c = '+';
506 if (class->usage_mask & lock_flag(bit)) { 542 if (class->usage_mask & lock_flag(bit))
507 c = '-';
508 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
509 c = '?'; 543 c = '?';
510 } 544 } else if (class->usage_mask & lock_flag(bit))
545 c = '-';
511 546
512 return c; 547 return c;
513} 548}
@@ -571,19 +606,22 @@ static void print_lock(struct held_lock *hlock)
571 /* 606 /*
572 * We can be called locklessly through debug_show_all_locks() so be 607 * We can be called locklessly through debug_show_all_locks() so be
573 * extra careful, the hlock might have been released and cleared. 608 * extra careful, the hlock might have been released and cleared.
609 *
610 * If this indeed happens, lets pretend it does not hurt to continue
611 * to print the lock unless the hlock class_idx does not point to a
612 * registered class. The rationale here is: since we don't attempt
613 * to distinguish whether we are in this situation, if it just
614 * happened we can't count on class_idx to tell either.
574 */ 615 */
575 unsigned int class_idx = hlock->class_idx; 616 struct lock_class *lock = hlock_class(hlock);
576
577 /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
578 barrier();
579 617
580 if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { 618 if (!lock) {
581 printk(KERN_CONT "<RELEASED>\n"); 619 printk(KERN_CONT "<RELEASED>\n");
582 return; 620 return;
583 } 621 }
584 622
585 printk(KERN_CONT "%p", hlock->instance); 623 printk(KERN_CONT "%p", hlock->instance);
586 print_lock_name(lock_classes + class_idx - 1); 624 print_lock_name(lock);
587 printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); 625 printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
588} 626}
589 627
@@ -731,7 +769,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
731 * Huh! same key, different name? Did someone trample 769 * Huh! same key, different name? Did someone trample
732 * on some memory? We're most confused. 770 * on some memory? We're most confused.
733 */ 771 */
734 WARN_ON_ONCE(class->name != lock->name); 772 WARN_ON_ONCE(class->name != lock->name &&
773 lock->key != &__lockdep_no_validate__);
735 return class; 774 return class;
736 } 775 }
737 } 776 }
@@ -837,11 +876,11 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
837static bool check_lock_chain_key(struct lock_chain *chain) 876static bool check_lock_chain_key(struct lock_chain *chain)
838{ 877{
839#ifdef CONFIG_PROVE_LOCKING 878#ifdef CONFIG_PROVE_LOCKING
840 u64 chain_key = 0; 879 u64 chain_key = INITIAL_CHAIN_KEY;
841 int i; 880 int i;
842 881
843 for (i = chain->base; i < chain->base + chain->depth; i++) 882 for (i = chain->base; i < chain->base + chain->depth; i++)
844 chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); 883 chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
845 /* 884 /*
846 * The 'unsigned long long' casts avoid that a compiler warning 885 * The 'unsigned long long' casts avoid that a compiler warning
847 * is reported when building tools/lib/lockdep. 886 * is reported when building tools/lib/lockdep.
@@ -1116,6 +1155,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1116 return NULL; 1155 return NULL;
1117 } 1156 }
1118 nr_lock_classes++; 1157 nr_lock_classes++;
1158 __set_bit(class - lock_classes, lock_classes_in_use);
1119 debug_atomic_inc(nr_unused_locks); 1159 debug_atomic_inc(nr_unused_locks);
1120 class->key = key; 1160 class->key = key;
1121 class->name = lock->name; 1161 class->name = lock->name;
@@ -1227,13 +1267,17 @@ static int add_lock_to_list(struct lock_class *this,
1227#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) 1267#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
1228 1268
1229/* 1269/*
1230 * The circular_queue and helpers is used to implement the 1270 * The circular_queue and helpers are used to implement a graph
1231 * breadth-first search(BFS)algorithem, by which we can build 1271 * breadth-first search (BFS) algorithm, by which we can determine
1232 * the shortest path from the next lock to be acquired to the 1272 * whether there is a path from a lock to another. In deadlock checks,
1233 * previous held lock if there is a circular between them. 1273 * a path from the next lock to be acquired to a previous held lock
1274 * indicates that adding the <prev> -> <next> lock dependency will
1275 * produce a circle in the graph. Breadth-first search instead of
1276 * depth-first search is used in order to find the shortest (circular)
1277 * path.
1234 */ 1278 */
1235struct circular_queue { 1279struct circular_queue {
1236 unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; 1280 struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE];
1237 unsigned int front, rear; 1281 unsigned int front, rear;
1238}; 1282};
1239 1283
@@ -1259,7 +1303,7 @@ static inline int __cq_full(struct circular_queue *cq)
1259 return ((cq->rear + 1) & CQ_MASK) == cq->front; 1303 return ((cq->rear + 1) & CQ_MASK) == cq->front;
1260} 1304}
1261 1305
1262static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) 1306static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem)
1263{ 1307{
1264 if (__cq_full(cq)) 1308 if (__cq_full(cq))
1265 return -1; 1309 return -1;
@@ -1269,14 +1313,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
1269 return 0; 1313 return 0;
1270} 1314}
1271 1315
1272static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) 1316/*
1317 * Dequeue an element from the circular_queue, return a lock_list if
1318 * the queue is not empty, or NULL if otherwise.
1319 */
1320static inline struct lock_list * __cq_dequeue(struct circular_queue *cq)
1273{ 1321{
1322 struct lock_list * lock;
1323
1274 if (__cq_empty(cq)) 1324 if (__cq_empty(cq))
1275 return -1; 1325 return NULL;
1276 1326
1277 *elem = cq->element[cq->front]; 1327 lock = cq->element[cq->front];
1278 cq->front = (cq->front + 1) & CQ_MASK; 1328 cq->front = (cq->front + 1) & CQ_MASK;
1279 return 0; 1329
1330 return lock;
1280} 1331}
1281 1332
1282static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) 1333static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
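For readers new to this structure, the sketch below shows the power-of-two ring buffer behind struct circular_queue in stand-alone user-space C: the capacity is a power of two, so wrap-around is a single AND with the mask, and dequeue now returns a pointer (NULL when empty) instead of an error code. Names and the payload type are illustrative, not the kernel API.

/*
 * Minimal ring-buffer sketch in the spirit of struct circular_queue and
 * the reworked __cq_dequeue() above.
 */
#include <stdio.h>
#include <stddef.h>

#define CQ_SIZE 8			/* must be a power of two */
#define CQ_MASK (CQ_SIZE - 1)

struct node { int id; };

struct cq {
	struct node *elem[CQ_SIZE];
	unsigned int front, rear;
};

static int cq_empty(struct cq *q) { return q->front == q->rear; }
static int cq_full(struct cq *q)  { return ((q->rear + 1) & CQ_MASK) == q->front; }

static int cq_enqueue(struct cq *q, struct node *n)
{
	if (cq_full(q))
		return -1;
	q->elem[q->rear] = n;
	q->rear = (q->rear + 1) & CQ_MASK;
	return 0;
}

static struct node *cq_dequeue(struct cq *q)
{
	struct node *n;

	if (cq_empty(q))
		return NULL;		/* empty, like the new __cq_dequeue() */
	n = q->elem[q->front];
	q->front = (q->front + 1) & CQ_MASK;
	return n;
}

int main(void)
{
	struct cq q = { .front = 0, .rear = 0 };
	struct node a = { 1 }, b = { 2 };
	struct node *n;

	cq_enqueue(&q, &a);
	cq_enqueue(&q, &b);
	while ((n = cq_dequeue(&q)))
		printf("dequeued %d\n", n->id);
	return 0;
}

Keeping one slot unused (the cq_full() test) is what lets front == rear unambiguously mean "empty" in this scheme.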
@@ -1321,13 +1372,32 @@ static inline int get_lock_depth(struct lock_list *child)
1321 return depth; 1372 return depth;
1322} 1373}
1323 1374
1375/*
1376 * Return the forward or backward dependency list.
1377 *
1378 * @lock: the lock_list to get its class's dependency list
1379 * @offset: the offset to struct lock_class to determine whether it is
1380 * locks_after or locks_before
1381 */
1382static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
1383{
1384 void *lock_class = lock->class;
1385
1386 return lock_class + offset;
1387}
1388
1389/*
1390 * Forward- or backward-dependency search, used for both circular dependency
1391 * checking and hardirq-unsafe/softirq-unsafe checking.
1392 */
1324static int __bfs(struct lock_list *source_entry, 1393static int __bfs(struct lock_list *source_entry,
1325 void *data, 1394 void *data,
1326 int (*match)(struct lock_list *entry, void *data), 1395 int (*match)(struct lock_list *entry, void *data),
1327 struct lock_list **target_entry, 1396 struct lock_list **target_entry,
1328 int forward) 1397 int offset)
1329{ 1398{
1330 struct lock_list *entry; 1399 struct lock_list *entry;
1400 struct lock_list *lock;
1331 struct list_head *head; 1401 struct list_head *head;
1332 struct circular_queue *cq = &lock_cq; 1402 struct circular_queue *cq = &lock_cq;
1333 int ret = 1; 1403 int ret = 1;
@@ -1338,31 +1408,21 @@ static int __bfs(struct lock_list *source_entry,
1338 goto exit; 1408 goto exit;
1339 } 1409 }
1340 1410
1341 if (forward) 1411 head = get_dep_list(source_entry, offset);
1342 head = &source_entry->class->locks_after;
1343 else
1344 head = &source_entry->class->locks_before;
1345
1346 if (list_empty(head)) 1412 if (list_empty(head))
1347 goto exit; 1413 goto exit;
1348 1414
1349 __cq_init(cq); 1415 __cq_init(cq);
1350 __cq_enqueue(cq, (unsigned long)source_entry); 1416 __cq_enqueue(cq, source_entry);
1351 1417
1352 while (!__cq_empty(cq)) { 1418 while ((lock = __cq_dequeue(cq))) {
1353 struct lock_list *lock;
1354
1355 __cq_dequeue(cq, (unsigned long *)&lock);
1356 1419
1357 if (!lock->class) { 1420 if (!lock->class) {
1358 ret = -2; 1421 ret = -2;
1359 goto exit; 1422 goto exit;
1360 } 1423 }
1361 1424
1362 if (forward) 1425 head = get_dep_list(lock, offset);
1363 head = &lock->class->locks_after;
1364 else
1365 head = &lock->class->locks_before;
1366 1426
1367 DEBUG_LOCKS_WARN_ON(!irqs_disabled()); 1427 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
1368 1428
@@ -1376,7 +1436,7 @@ static int __bfs(struct lock_list *source_entry,
1376 goto exit; 1436 goto exit;
1377 } 1437 }
1378 1438
1379 if (__cq_enqueue(cq, (unsigned long)entry)) { 1439 if (__cq_enqueue(cq, entry)) {
1380 ret = -1; 1440 ret = -1;
1381 goto exit; 1441 goto exit;
1382 } 1442 }
@@ -1395,7 +1455,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry,
1395 int (*match)(struct lock_list *entry, void *data), 1455 int (*match)(struct lock_list *entry, void *data),
1396 struct lock_list **target_entry) 1456 struct lock_list **target_entry)
1397{ 1457{
1398 return __bfs(src_entry, data, match, target_entry, 1); 1458 return __bfs(src_entry, data, match, target_entry,
1459 offsetof(struct lock_class, locks_after));
1399 1460
1400} 1461}
1401 1462
@@ -1404,16 +1465,11 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
1404 int (*match)(struct lock_list *entry, void *data), 1465 int (*match)(struct lock_list *entry, void *data),
1405 struct lock_list **target_entry) 1466 struct lock_list **target_entry)
1406{ 1467{
1407 return __bfs(src_entry, data, match, target_entry, 0); 1468 return __bfs(src_entry, data, match, target_entry,
1469 offsetof(struct lock_class, locks_before));
1408 1470
1409} 1471}
1410 1472
1411/*
1412 * Recursive, forwards-direction lock-dependency checking, used for
1413 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
1414 * checking.
1415 */
1416
1417static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) 1473static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
1418{ 1474{
1419 unsigned long *entries = stack_trace + trace->offset; 1475 unsigned long *entries = stack_trace + trace->offset;
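The new get_dep_list() above replaces the forward/backward branch in __bfs() with a byte offset into struct lock_class. Below is a minimal sketch of that offsetof() technique with simplified stand-in types; the kernel helper does the same arithmetic on a void * (a GCC extension), while the sketch casts through char * to stay within standard C.

/*
 * Select one of two embedded list heads by member offset instead of a
 * direction flag, as get_dep_list() does for locks_after/locks_before.
 */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

struct klass {				/* simplified stand-in for struct lock_class */
	const char *name;
	struct list_head locks_after;
	struct list_head locks_before;
};

static struct list_head *get_dep_list(struct klass *k, size_t offset)
{
	return (struct list_head *)((char *)k + offset);
}

int main(void)
{
	struct klass k = { .name = "demo" };

	k.locks_after.next  = k.locks_after.prev  = &k.locks_after;
	k.locks_before.next = k.locks_before.prev = &k.locks_before;

	/* the same call site serves both search directions */
	printf("forwards  list: %p\n",
	       (void *)get_dep_list(&k, offsetof(struct klass, locks_after)));
	printf("backwards list: %p\n",
	       (void *)get_dep_list(&k, offsetof(struct klass, locks_before)));
	return 0;
}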
@@ -1425,16 +1481,15 @@ static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
1425 * Print a dependency chain entry (this is only done when a deadlock 1481 * Print a dependency chain entry (this is only done when a deadlock
1426 * has been detected): 1482 * has been detected):
1427 */ 1483 */
1428static noinline int 1484static noinline void
1429print_circular_bug_entry(struct lock_list *target, int depth) 1485print_circular_bug_entry(struct lock_list *target, int depth)
1430{ 1486{
1431 if (debug_locks_silent) 1487 if (debug_locks_silent)
1432 return 0; 1488 return;
1433 printk("\n-> #%u", depth); 1489 printk("\n-> #%u", depth);
1434 print_lock_name(target->class); 1490 print_lock_name(target->class);
1435 printk(KERN_CONT ":\n"); 1491 printk(KERN_CONT ":\n");
1436 print_lock_trace(&target->trace, 6); 1492 print_lock_trace(&target->trace, 6);
1437 return 0;
1438} 1493}
1439 1494
1440static void 1495static void
@@ -1491,7 +1546,7 @@ print_circular_lock_scenario(struct held_lock *src,
1491 * When a circular dependency is detected, print the 1546 * When a circular dependency is detected, print the
1492 * header first: 1547 * header first:
1493 */ 1548 */
1494static noinline int 1549static noinline void
1495print_circular_bug_header(struct lock_list *entry, unsigned int depth, 1550print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1496 struct held_lock *check_src, 1551 struct held_lock *check_src,
1497 struct held_lock *check_tgt) 1552 struct held_lock *check_tgt)
@@ -1499,7 +1554,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1499 struct task_struct *curr = current; 1554 struct task_struct *curr = current;
1500 1555
1501 if (debug_locks_silent) 1556 if (debug_locks_silent)
1502 return 0; 1557 return;
1503 1558
1504 pr_warn("\n"); 1559 pr_warn("\n");
1505 pr_warn("======================================================\n"); 1560 pr_warn("======================================================\n");
@@ -1517,8 +1572,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1517 pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); 1572 pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
1518 1573
1519 print_circular_bug_entry(entry, depth); 1574 print_circular_bug_entry(entry, depth);
1520
1521 return 0;
1522} 1575}
1523 1576
1524static inline int class_equal(struct lock_list *entry, void *data) 1577static inline int class_equal(struct lock_list *entry, void *data)
@@ -1526,10 +1579,10 @@ static inline int class_equal(struct lock_list *entry, void *data)
1526 return entry->class == data; 1579 return entry->class == data;
1527} 1580}
1528 1581
1529static noinline int print_circular_bug(struct lock_list *this, 1582static noinline void print_circular_bug(struct lock_list *this,
1530 struct lock_list *target, 1583 struct lock_list *target,
1531 struct held_lock *check_src, 1584 struct held_lock *check_src,
1532 struct held_lock *check_tgt) 1585 struct held_lock *check_tgt)
1533{ 1586{
1534 struct task_struct *curr = current; 1587 struct task_struct *curr = current;
1535 struct lock_list *parent; 1588 struct lock_list *parent;
@@ -1537,10 +1590,10 @@ static noinline int print_circular_bug(struct lock_list *this,
1537 int depth; 1590 int depth;
1538 1591
1539 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1592 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1540 return 0; 1593 return;
1541 1594
1542 if (!save_trace(&this->trace)) 1595 if (!save_trace(&this->trace))
1543 return 0; 1596 return;
1544 1597
1545 depth = get_lock_depth(target); 1598 depth = get_lock_depth(target);
1546 1599
@@ -1562,21 +1615,17 @@ static noinline int print_circular_bug(struct lock_list *this,
1562 1615
1563 printk("\nstack backtrace:\n"); 1616 printk("\nstack backtrace:\n");
1564 dump_stack(); 1617 dump_stack();
1565
1566 return 0;
1567} 1618}
1568 1619
1569static noinline int print_bfs_bug(int ret) 1620static noinline void print_bfs_bug(int ret)
1570{ 1621{
1571 if (!debug_locks_off_graph_unlock()) 1622 if (!debug_locks_off_graph_unlock())
1572 return 0; 1623 return;
1573 1624
1574 /* 1625 /*
1575 * Breadth-first-search failed, graph got corrupted? 1626 * Breadth-first-search failed, graph got corrupted?
1576 */ 1627 */
1577 WARN(1, "lockdep bfs error:%d\n", ret); 1628 WARN(1, "lockdep bfs error:%d\n", ret);
1578
1579 return 0;
1580} 1629}
1581 1630
1582static int noop_count(struct lock_list *entry, void *data) 1631static int noop_count(struct lock_list *entry, void *data)
@@ -1639,36 +1688,95 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1639} 1688}
1640 1689
1641/* 1690/*
1642 * Prove that the dependency graph starting at <entry> can not 1691 * Check that the dependency graph starting at <src> can lead to
1643 * lead to <target>. Print an error and return 0 if it does. 1692 * <target> or not. Print an error and return 0 if it does.
1644 */ 1693 */
1645static noinline int 1694static noinline int
1646check_noncircular(struct lock_list *root, struct lock_class *target, 1695check_path(struct lock_class *target, struct lock_list *src_entry,
1647 struct lock_list **target_entry) 1696 struct lock_list **target_entry)
1648{ 1697{
1649 int result; 1698 int ret;
1699
1700 ret = __bfs_forwards(src_entry, (void *)target, class_equal,
1701 target_entry);
1702
1703 if (unlikely(ret < 0))
1704 print_bfs_bug(ret);
1705
1706 return ret;
1707}
1708
1709/*
1710 * Prove that the dependency graph starting at <src> can not
1711 * lead to <target>. If it can, there is a circle when adding
1712 * <target> -> <src> dependency.
1713 *
1714 * Print an error and return 0 if it does.
1715 */
1716static noinline int
1717check_noncircular(struct held_lock *src, struct held_lock *target,
1718 struct lock_trace *trace)
1719{
1720 int ret;
1721 struct lock_list *uninitialized_var(target_entry);
1722 struct lock_list src_entry = {
1723 .class = hlock_class(src),
1724 .parent = NULL,
1725 };
1650 1726
1651 debug_atomic_inc(nr_cyclic_checks); 1727 debug_atomic_inc(nr_cyclic_checks);
1652 1728
1653 result = __bfs_forwards(root, target, class_equal, target_entry); 1729 ret = check_path(hlock_class(target), &src_entry, &target_entry);
1654 1730
1655 return result; 1731 if (unlikely(!ret)) {
1732 if (!trace->nr_entries) {
1733 /*
1734 * If save_trace fails here, the printing might
1735 * trigger a WARN but because of the !nr_entries it
1736 * should not do bad things.
1737 */
1738 save_trace(trace);
1739 }
1740
1741 print_circular_bug(&src_entry, target_entry, src, target);
1742 }
1743
1744 return ret;
1656} 1745}
1657 1746
1747#ifdef CONFIG_LOCKDEP_SMALL
1748/*
1749 * Check that the dependency graph starting at <src> can lead to
1750 * <target> or not. If it can, <src> -> <target> dependency is already
1751 * in the graph.
1752 *
1753 * Print an error and return 2 if it does or 1 if it does not.
1754 */
1658static noinline int 1755static noinline int
1659check_redundant(struct lock_list *root, struct lock_class *target, 1756check_redundant(struct held_lock *src, struct held_lock *target)
1660 struct lock_list **target_entry)
1661{ 1757{
1662 int result; 1758 int ret;
1759 struct lock_list *uninitialized_var(target_entry);
1760 struct lock_list src_entry = {
1761 .class = hlock_class(src),
1762 .parent = NULL,
1763 };
1663 1764
1664 debug_atomic_inc(nr_redundant_checks); 1765 debug_atomic_inc(nr_redundant_checks);
1665 1766
1666 result = __bfs_forwards(root, target, class_equal, target_entry); 1767 ret = check_path(hlock_class(target), &src_entry, &target_entry);
1667 1768
1668 return result; 1769 if (!ret) {
1770 debug_atomic_inc(nr_redundant);
1771 ret = 2;
1772 } else if (ret < 0)
1773 ret = 0;
1774
1775 return ret;
1669} 1776}
1777#endif
1670 1778
1671#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1779#ifdef CONFIG_TRACE_IRQFLAGS
1672 1780
1673static inline int usage_accumulate(struct lock_list *entry, void *mask) 1781static inline int usage_accumulate(struct lock_list *entry, void *mask)
1674{ 1782{
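Taken together, check_path() and check_noncircular() reduce circular-deadlock detection to a reachability query: before the <prev> -> <next> dependency is recorded, search forwards from <next> and reject the edge if <prev> can already be reached. The stand-alone sketch below shows that idea on a tiny adjacency matrix; it illustrates the principle only, not the lockdep data structures or the bounded queue.

/*
 * Refuse to add an edge that would close a cycle: BFS from the new
 * lock and check whether the previously held lock is reachable.
 */
#include <stdio.h>
#include <stdbool.h>

#define NCLASS 4

static bool edge[NCLASS][NCLASS];

/* BFS from src; return true if dst is reachable */
static bool reachable(int src, int dst)
{
	int queue[NCLASS], head = 0, tail = 0;
	bool seen[NCLASS] = { false };

	queue[tail++] = src;
	seen[src] = true;

	while (head < tail) {
		int cur = queue[head++];

		if (cur == dst)
			return true;
		for (int i = 0; i < NCLASS; i++) {
			if (edge[cur][i] && !seen[i]) {
				seen[i] = true;
				queue[tail++] = i;
			}
		}
	}
	return false;
}

/* Add prev -> next unless it would create a circle */
static bool add_dependency(int prev, int next)
{
	if (reachable(next, prev)) {
		printf("refusing %d -> %d: would create a cycle\n", prev, next);
		return false;
	}
	edge[prev][next] = true;
	return true;
}

int main(void)
{
	add_dependency(0, 1);	/* A -> B */
	add_dependency(1, 2);	/* B -> C */
	add_dependency(2, 0);	/* C -> A: rejected, circular */
	return 0;
}

check_redundant() is the mirror image of the same query: a search from <prev> that already reaches <next> means the new edge adds no information.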
@@ -1765,7 +1873,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
1765 */ 1873 */
1766static void __used 1874static void __used
1767print_shortest_lock_dependencies(struct lock_list *leaf, 1875print_shortest_lock_dependencies(struct lock_list *leaf,
1768 struct lock_list *root) 1876 struct lock_list *root)
1769{ 1877{
1770 struct lock_list *entry = leaf; 1878 struct lock_list *entry = leaf;
1771 int depth; 1879 int depth;
@@ -1787,8 +1895,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1787 entry = get_lock_parent(entry); 1895 entry = get_lock_parent(entry);
1788 depth--; 1896 depth--;
1789 } while (entry && (depth >= 0)); 1897 } while (entry && (depth >= 0));
1790
1791 return;
1792} 1898}
1793 1899
1794static void 1900static void
@@ -1847,7 +1953,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
1847 printk("\n *** DEADLOCK ***\n\n"); 1953 printk("\n *** DEADLOCK ***\n\n");
1848} 1954}
1849 1955
1850static int 1956static void
1851print_bad_irq_dependency(struct task_struct *curr, 1957print_bad_irq_dependency(struct task_struct *curr,
1852 struct lock_list *prev_root, 1958 struct lock_list *prev_root,
1853 struct lock_list *next_root, 1959 struct lock_list *next_root,
@@ -1860,7 +1966,7 @@ print_bad_irq_dependency(struct task_struct *curr,
1860 const char *irqclass) 1966 const char *irqclass)
1861{ 1967{
1862 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1968 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1863 return 0; 1969 return;
1864 1970
1865 pr_warn("\n"); 1971 pr_warn("\n");
1866 pr_warn("=====================================================\n"); 1972 pr_warn("=====================================================\n");
@@ -1906,19 +2012,17 @@ print_bad_irq_dependency(struct task_struct *curr,
1906 2012
1907 pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); 2013 pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
1908 if (!save_trace(&prev_root->trace)) 2014 if (!save_trace(&prev_root->trace))
1909 return 0; 2015 return;
1910 print_shortest_lock_dependencies(backwards_entry, prev_root); 2016 print_shortest_lock_dependencies(backwards_entry, prev_root);
1911 2017
1912 pr_warn("\nthe dependencies between the lock to be acquired"); 2018 pr_warn("\nthe dependencies between the lock to be acquired");
1913 pr_warn(" and %s-irq-unsafe lock:\n", irqclass); 2019 pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
1914 if (!save_trace(&next_root->trace)) 2020 if (!save_trace(&next_root->trace))
1915 return 0; 2021 return;
1916 print_shortest_lock_dependencies(forwards_entry, next_root); 2022 print_shortest_lock_dependencies(forwards_entry, next_root);
1917 2023
1918 pr_warn("\nstack backtrace:\n"); 2024 pr_warn("\nstack backtrace:\n");
1919 dump_stack(); 2025 dump_stack();
1920
1921 return 0;
1922} 2026}
1923 2027
1924static const char *state_names[] = { 2028static const char *state_names[] = {
@@ -2065,8 +2169,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
2065 this.class = hlock_class(prev); 2169 this.class = hlock_class(prev);
2066 2170
2067 ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); 2171 ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
2068 if (ret < 0) 2172 if (ret < 0) {
2069 return print_bfs_bug(ret); 2173 print_bfs_bug(ret);
2174 return 0;
2175 }
2070 2176
2071 usage_mask &= LOCKF_USED_IN_IRQ_ALL; 2177 usage_mask &= LOCKF_USED_IN_IRQ_ALL;
2072 if (!usage_mask) 2178 if (!usage_mask)
@@ -2082,8 +2188,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
2082 that.class = hlock_class(next); 2188 that.class = hlock_class(next);
2083 2189
2084 ret = find_usage_forwards(&that, forward_mask, &target_entry1); 2190 ret = find_usage_forwards(&that, forward_mask, &target_entry1);
2085 if (ret < 0) 2191 if (ret < 0) {
2086 return print_bfs_bug(ret); 2192 print_bfs_bug(ret);
2193 return 0;
2194 }
2087 if (ret == 1) 2195 if (ret == 1)
2088 return ret; 2196 return ret;
2089 2197
@@ -2095,8 +2203,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
2095 backward_mask = original_mask(target_entry1->class->usage_mask); 2203 backward_mask = original_mask(target_entry1->class->usage_mask);
2096 2204
2097 ret = find_usage_backwards(&this, backward_mask, &target_entry); 2205 ret = find_usage_backwards(&this, backward_mask, &target_entry);
2098 if (ret < 0) 2206 if (ret < 0) {
2099 return print_bfs_bug(ret); 2207 print_bfs_bug(ret);
2208 return 0;
2209 }
2100 if (DEBUG_LOCKS_WARN_ON(ret == 1)) 2210 if (DEBUG_LOCKS_WARN_ON(ret == 1))
2101 return 1; 2211 return 1;
2102 2212
@@ -2110,11 +2220,13 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
2110 if (DEBUG_LOCKS_WARN_ON(ret == -1)) 2220 if (DEBUG_LOCKS_WARN_ON(ret == -1))
2111 return 1; 2221 return 1;
2112 2222
2113 return print_bad_irq_dependency(curr, &this, &that, 2223 print_bad_irq_dependency(curr, &this, &that,
2114 target_entry, target_entry1, 2224 target_entry, target_entry1,
2115 prev, next, 2225 prev, next,
2116 backward_bit, forward_bit, 2226 backward_bit, forward_bit,
2117 state_name(backward_bit)); 2227 state_name(backward_bit));
2228
2229 return 0;
2118} 2230}
2119 2231
2120static void inc_chains(void) 2232static void inc_chains(void)
@@ -2142,11 +2254,10 @@ static inline void inc_chains(void)
2142 nr_process_chains++; 2254 nr_process_chains++;
2143} 2255}
2144 2256
2145#endif 2257#endif /* CONFIG_TRACE_IRQFLAGS */
2146 2258
2147static void 2259static void
2148print_deadlock_scenario(struct held_lock *nxt, 2260print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
2149 struct held_lock *prv)
2150{ 2261{
2151 struct lock_class *next = hlock_class(nxt); 2262 struct lock_class *next = hlock_class(nxt);
2152 struct lock_class *prev = hlock_class(prv); 2263 struct lock_class *prev = hlock_class(prv);
@@ -2164,12 +2275,12 @@ print_deadlock_scenario(struct held_lock *nxt,
2164 printk(" May be due to missing lock nesting notation\n\n"); 2275 printk(" May be due to missing lock nesting notation\n\n");
2165} 2276}
2166 2277
2167static int 2278static void
2168print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 2279print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
2169 struct held_lock *next) 2280 struct held_lock *next)
2170{ 2281{
2171 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2282 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2172 return 0; 2283 return;
2173 2284
2174 pr_warn("\n"); 2285 pr_warn("\n");
2175 pr_warn("============================================\n"); 2286 pr_warn("============================================\n");
@@ -2188,8 +2299,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
2188 2299
2189 pr_warn("\nstack backtrace:\n"); 2300 pr_warn("\nstack backtrace:\n");
2190 dump_stack(); 2301 dump_stack();
2191
2192 return 0;
2193} 2302}
2194 2303
2195/* 2304/*
@@ -2201,8 +2310,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
2201 * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read 2310 * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
2202 */ 2311 */
2203static int 2312static int
2204check_deadlock(struct task_struct *curr, struct held_lock *next, 2313check_deadlock(struct task_struct *curr, struct held_lock *next)
2205 struct lockdep_map *next_instance, int read)
2206{ 2314{
2207 struct held_lock *prev; 2315 struct held_lock *prev;
2208 struct held_lock *nest = NULL; 2316 struct held_lock *nest = NULL;
@@ -2221,7 +2329,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
2221 * Allow read-after-read recursion of the same 2329 * Allow read-after-read recursion of the same
2222 * lock class (i.e. read_lock(lock)+read_lock(lock)): 2330 * lock class (i.e. read_lock(lock)+read_lock(lock)):
2223 */ 2331 */
2224 if ((read == 2) && prev->read) 2332 if ((next->read == 2) && prev->read)
2225 return 2; 2333 return 2;
2226 2334
2227 /* 2335 /*
@@ -2231,14 +2339,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
2231 if (nest) 2339 if (nest)
2232 return 2; 2340 return 2;
2233 2341
2234 return print_deadlock_bug(curr, prev, next); 2342 print_deadlock_bug(curr, prev, next);
2343 return 0;
2235 } 2344 }
2236 return 1; 2345 return 1;
2237} 2346}
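With the extra parameters gone, check_deadlock() now takes the read mode straight from *next. The sketch below models the same contract on plain arrays: scan the held-lock stack for an entry of the same class, allow read-after-read recursion, and use the 0/1/2 return convention documented above. The nest_lock case and the real structures are simplified away.

/*
 * Simplified self-deadlock scan: 0 = deadlock, 1 = OK, 2 = recursive read.
 */
#include <stdio.h>

struct hlock {
	int class_idx;
	int read;		/* 0 = write, 1 = read, 2 = recursive read */
};

static int check_deadlock(const struct hlock *held, int depth,
			  const struct hlock *next)
{
	for (int i = 0; i < depth; i++) {
		const struct hlock *prev = &held[i];

		if (prev->class_idx != next->class_idx)
			continue;

		/* read_lock(A); read_lock(A); is fine */
		if (next->read == 2 && prev->read)
			return 2;

		return 0;	/* same class taken again: deadlock */
	}
	return 1;
}

int main(void)
{
	struct hlock held[] = { { .class_idx = 5, .read = 1 } };
	struct hlock rd = { .class_idx = 5, .read = 2 };
	struct hlock wr = { .class_idx = 5, .read = 0 };

	printf("read-after-read : %d\n", check_deadlock(held, 1, &rd)); /* 2 */
	printf("write-after-read: %d\n", check_deadlock(held, 1, &wr)); /* 0 */
	return 0;
}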
2238 2347
2239/* 2348/*
2240 * There was a chain-cache miss, and we are about to add a new dependency 2349 * There was a chain-cache miss, and we are about to add a new dependency
2241 * to a previous lock. We recursively validate the following rules: 2350 * to a previous lock. We validate the following rules:
2242 * 2351 *
2243 * - would the adding of the <prev> -> <next> dependency create a 2352 * - would the adding of the <prev> -> <next> dependency create a
2244 * circular dependency in the graph? [== circular deadlock] 2353 * circular dependency in the graph? [== circular deadlock]
@@ -2262,9 +2371,7 @@ static int
2262check_prev_add(struct task_struct *curr, struct held_lock *prev, 2371check_prev_add(struct task_struct *curr, struct held_lock *prev,
2263 struct held_lock *next, int distance, struct lock_trace *trace) 2372 struct held_lock *next, int distance, struct lock_trace *trace)
2264{ 2373{
2265 struct lock_list *uninitialized_var(target_entry);
2266 struct lock_list *entry; 2374 struct lock_list *entry;
2267 struct lock_list this;
2268 int ret; 2375 int ret;
2269 2376
2270 if (!hlock_class(prev)->key || !hlock_class(next)->key) { 2377 if (!hlock_class(prev)->key || !hlock_class(next)->key) {
@@ -2288,28 +2395,16 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2288 /* 2395 /*
2289 * Prove that the new <prev> -> <next> dependency would not 2396 * Prove that the new <prev> -> <next> dependency would not
2290 * create a circular dependency in the graph. (We do this by 2397 * create a circular dependency in the graph. (We do this by
2291 * forward-recursing into the graph starting at <next>, and 2398 * a breadth-first search into the graph starting at <next>,
2292 * checking whether we can reach <prev>.) 2399 * and check whether we can reach <prev>.)
2293 * 2400 *
2294 * We are using global variables to control the recursion, to 2401 * The search is limited by the size of the circular queue (i.e.,
2295 * keep the stackframe size of the recursive functions low: 2402 * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes
2403 * in the graph whose neighbours are to be checked.
2296 */ 2404 */
2297 this.class = hlock_class(next); 2405 ret = check_noncircular(next, prev, trace);
2298 this.parent = NULL; 2406 if (unlikely(ret <= 0))
2299 ret = check_noncircular(&this, hlock_class(prev), &target_entry); 2407 return 0;
2300 if (unlikely(!ret)) {
2301 if (!trace->nr_entries) {
2302 /*
2303 * If save_trace fails here, the printing might
2304 * trigger a WARN but because of the !nr_entries it
2305 * should not do bad things.
2306 */
2307 save_trace(trace);
2308 }
2309 return print_circular_bug(&this, target_entry, next, prev);
2310 }
2311 else if (unlikely(ret < 0))
2312 return print_bfs_bug(ret);
2313 2408
2314 if (!check_irq_usage(curr, prev, next)) 2409 if (!check_irq_usage(curr, prev, next))
2315 return 0; 2410 return 0;
@@ -2340,19 +2435,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2340 } 2435 }
2341 } 2436 }
2342 2437
2438#ifdef CONFIG_LOCKDEP_SMALL
2343 /* 2439 /*
2344 * Is the <prev> -> <next> link redundant? 2440 * Is the <prev> -> <next> link redundant?
2345 */ 2441 */
2346 this.class = hlock_class(prev); 2442 ret = check_redundant(prev, next);
2347 this.parent = NULL; 2443 if (ret != 1)
2348 ret = check_redundant(&this, hlock_class(next), &target_entry); 2444 return ret;
2349 if (!ret) { 2445#endif
2350 debug_atomic_inc(nr_redundant);
2351 return 2;
2352 }
2353 if (ret < 0)
2354 return print_bfs_bug(ret);
2355
2356 2446
2357 if (!trace->nr_entries && !save_trace(trace)) 2447 if (!trace->nr_entries && !save_trace(trace))
2358 return 0; 2448 return 0;
@@ -2504,12 +2594,13 @@ static void
2504print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) 2594print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
2505{ 2595{
2506 struct held_lock *hlock; 2596 struct held_lock *hlock;
2507 u64 chain_key = 0; 2597 u64 chain_key = INITIAL_CHAIN_KEY;
2508 int depth = curr->lockdep_depth; 2598 int depth = curr->lockdep_depth;
2509 int i; 2599 int i = get_first_held_lock(curr, hlock_next);
2510 2600
2511 printk("depth: %u\n", depth + 1); 2601 printk("depth: %u (irq_context %u)\n", depth - i + 1,
2512 for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { 2602 hlock_next->irq_context);
2603 for (; i < depth; i++) {
2513 hlock = curr->held_locks + i; 2604 hlock = curr->held_locks + i;
2514 chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); 2605 chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
2515 2606
@@ -2523,13 +2614,13 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne
2523static void print_chain_keys_chain(struct lock_chain *chain) 2614static void print_chain_keys_chain(struct lock_chain *chain)
2524{ 2615{
2525 int i; 2616 int i;
2526 u64 chain_key = 0; 2617 u64 chain_key = INITIAL_CHAIN_KEY;
2527 int class_id; 2618 int class_id;
2528 2619
2529 printk("depth: %u\n", chain->depth); 2620 printk("depth: %u\n", chain->depth);
2530 for (i = 0; i < chain->depth; i++) { 2621 for (i = 0; i < chain->depth; i++) {
2531 class_id = chain_hlocks[chain->base + i]; 2622 class_id = chain_hlocks[chain->base + i];
2532 chain_key = print_chain_key_iteration(class_id + 1, chain_key); 2623 chain_key = print_chain_key_iteration(class_id, chain_key);
2533 2624
2534 print_lock_name(lock_classes + class_id); 2625 print_lock_name(lock_classes + class_id);
2535 printk("\n"); 2626 printk("\n");
@@ -2580,7 +2671,7 @@ static int check_no_collision(struct task_struct *curr,
2580 } 2671 }
2581 2672
2582 for (j = 0; j < chain->depth - 1; j++, i++) { 2673 for (j = 0; j < chain->depth - 1; j++, i++) {
2583 id = curr->held_locks[i].class_idx - 1; 2674 id = curr->held_locks[i].class_idx;
2584 2675
2585 if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { 2676 if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
2586 print_collision(curr, hlock, chain); 2677 print_collision(curr, hlock, chain);
@@ -2663,7 +2754,7 @@ static inline int add_chain_cache(struct task_struct *curr,
2663 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { 2754 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
2664 chain->base = nr_chain_hlocks; 2755 chain->base = nr_chain_hlocks;
2665 for (j = 0; j < chain->depth - 1; j++, i++) { 2756 for (j = 0; j < chain->depth - 1; j++, i++) {
2666 int lock_id = curr->held_locks[i].class_idx - 1; 2757 int lock_id = curr->held_locks[i].class_idx;
2667 chain_hlocks[chain->base + j] = lock_id; 2758 chain_hlocks[chain->base + j] = lock_id;
2668 } 2759 }
2669 chain_hlocks[chain->base + j] = class - lock_classes; 2760 chain_hlocks[chain->base + j] = class - lock_classes;
@@ -2753,8 +2844,9 @@ cache_hit:
2753 return 1; 2844 return 1;
2754} 2845}
2755 2846
2756static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, 2847static int validate_chain(struct task_struct *curr,
2757 struct held_lock *hlock, int chain_head, u64 chain_key) 2848 struct held_lock *hlock,
2849 int chain_head, u64 chain_key)
2758{ 2850{
2759 /* 2851 /*
2760 * Trylock needs to maintain the stack of held locks, but it 2852 * Trylock needs to maintain the stack of held locks, but it
@@ -2775,12 +2867,18 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2775 * - is softirq-safe, if this lock is hardirq-unsafe 2867 * - is softirq-safe, if this lock is hardirq-unsafe
2776 * 2868 *
2777 * And check whether the new lock's dependency graph 2869 * And check whether the new lock's dependency graph
2778 * could lead back to the previous lock. 2870 * could lead back to the previous lock:
2779 * 2871 *
2780 * any of these scenarios could lead to a deadlock. If 2872 * - within the current held-lock stack
2781 * All validations 2873 * - across our accumulated lock dependency records
2874 *
2875 * any of these scenarios could lead to a deadlock.
2782 */ 2876 */
2783 int ret = check_deadlock(curr, hlock, lock, hlock->read); 2877 /*
2878 * The simple case: does the current hold the same lock
2879 * already?
2880 */
2881 int ret = check_deadlock(curr, hlock);
2784 2882
2785 if (!ret) 2883 if (!ret)
2786 return 0; 2884 return 0;
@@ -2811,16 +2909,12 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2811} 2909}
2812#else 2910#else
2813static inline int validate_chain(struct task_struct *curr, 2911static inline int validate_chain(struct task_struct *curr,
2814 struct lockdep_map *lock, struct held_lock *hlock, 2912 struct held_lock *hlock,
2815 int chain_head, u64 chain_key) 2913 int chain_head, u64 chain_key)
2816{ 2914{
2817 return 1; 2915 return 1;
2818} 2916}
2819 2917#endif /* CONFIG_PROVE_LOCKING */
2820static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
2821{
2822}
2823#endif
2824 2918
2825/* 2919/*
2826 * We are building curr_chain_key incrementally, so double-check 2920 * We are building curr_chain_key incrementally, so double-check
@@ -2831,7 +2925,7 @@ static void check_chain_key(struct task_struct *curr)
2831#ifdef CONFIG_DEBUG_LOCKDEP 2925#ifdef CONFIG_DEBUG_LOCKDEP
2832 struct held_lock *hlock, *prev_hlock = NULL; 2926 struct held_lock *hlock, *prev_hlock = NULL;
2833 unsigned int i; 2927 unsigned int i;
2834 u64 chain_key = 0; 2928 u64 chain_key = INITIAL_CHAIN_KEY;
2835 2929
2836 for (i = 0; i < curr->lockdep_depth; i++) { 2930 for (i = 0; i < curr->lockdep_depth; i++) {
2837 hlock = curr->held_locks + i; 2931 hlock = curr->held_locks + i;
@@ -2847,15 +2941,17 @@ static void check_chain_key(struct task_struct *curr)
2847 (unsigned long long)hlock->prev_chain_key); 2941 (unsigned long long)hlock->prev_chain_key);
2848 return; 2942 return;
2849 } 2943 }
2944
2850 /* 2945 /*
2851 * Whoops ran out of static storage again? 2946 * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is
2947 * it registered lock class index?
2852 */ 2948 */
2853 if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) 2949 if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use)))
2854 return; 2950 return;
2855 2951
2856 if (prev_hlock && (prev_hlock->irq_context != 2952 if (prev_hlock && (prev_hlock->irq_context !=
2857 hlock->irq_context)) 2953 hlock->irq_context))
2858 chain_key = 0; 2954 chain_key = INITIAL_CHAIN_KEY;
2859 chain_key = iterate_chain_key(chain_key, hlock->class_idx); 2955 chain_key = iterate_chain_key(chain_key, hlock->class_idx);
2860 prev_hlock = hlock; 2956 prev_hlock = hlock;
2861 } 2957 }
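Because class_idx is now a plain zero-based index into lock_classes[], validity is checked with test_bit() against the lock_classes_in_use bitmap instead of a comparison with MAX_LOCKDEP_KEYS. A small user-space sketch of that bitmap idea follows; set_bit_()/test_bit_() are simplified stand-ins for the kernel helpers.

/*
 * Mark registered classes in a bitmap and validate an index with a
 * single bit test, so index 0 no longer needs to be reserved.
 */
#include <stdio.h>
#include <stdbool.h>

#define MAX_KEYS	8192
#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS	((MAX_KEYS + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long in_use[BITMAP_LONGS];

static void set_bit_(unsigned int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static bool test_bit_(unsigned int nr, const unsigned long *map)
{
	return map[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG));
}

int main(void)
{
	unsigned int class_idx = 0;	/* index 0 is now a valid class */

	set_bit_(class_idx, in_use);	/* done at registration time */

	if (!test_bit_(class_idx, in_use))
		printf("class_idx %u is not a registered class\n", class_idx);
	else
		printf("class_idx %u is registered\n", class_idx);
	return 0;
}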
@@ -2873,14 +2969,11 @@ static void check_chain_key(struct task_struct *curr)
2873#endif 2969#endif
2874} 2970}
2875 2971
2972#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2876static int mark_lock(struct task_struct *curr, struct held_lock *this, 2973static int mark_lock(struct task_struct *curr, struct held_lock *this,
2877 enum lock_usage_bit new_bit); 2974 enum lock_usage_bit new_bit);
2878 2975
2879#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 2976static void print_usage_bug_scenario(struct held_lock *lock)
2880
2881
2882static void
2883print_usage_bug_scenario(struct held_lock *lock)
2884{ 2977{
2885 struct lock_class *class = hlock_class(lock); 2978 struct lock_class *class = hlock_class(lock);
2886 2979
@@ -2897,12 +2990,12 @@ print_usage_bug_scenario(struct held_lock *lock)
2897 printk("\n *** DEADLOCK ***\n\n"); 2990 printk("\n *** DEADLOCK ***\n\n");
2898} 2991}
2899 2992
2900static int 2993static void
2901print_usage_bug(struct task_struct *curr, struct held_lock *this, 2994print_usage_bug(struct task_struct *curr, struct held_lock *this,
2902 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2995 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
2903{ 2996{
2904 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2997 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2905 return 0; 2998 return;
2906 2999
2907 pr_warn("\n"); 3000 pr_warn("\n");
2908 pr_warn("================================\n"); 3001 pr_warn("================================\n");
@@ -2932,8 +3025,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2932 3025
2933 pr_warn("\nstack backtrace:\n"); 3026 pr_warn("\nstack backtrace:\n");
2934 dump_stack(); 3027 dump_stack();
2935
2936 return 0;
2937} 3028}
2938 3029
2939/* 3030/*
@@ -2943,8 +3034,10 @@ static inline int
2943valid_state(struct task_struct *curr, struct held_lock *this, 3034valid_state(struct task_struct *curr, struct held_lock *this,
2944 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) 3035 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
2945{ 3036{
2946 if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) 3037 if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
2947 return print_usage_bug(curr, this, bad_bit, new_bit); 3038 print_usage_bug(curr, this, bad_bit, new_bit);
3039 return 0;
3040 }
2948 return 1; 3041 return 1;
2949} 3042}
2950 3043
@@ -2952,7 +3045,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
2952/* 3045/*
2953 * print irq inversion bug: 3046 * print irq inversion bug:
2954 */ 3047 */
2955static int 3048static void
2956print_irq_inversion_bug(struct task_struct *curr, 3049print_irq_inversion_bug(struct task_struct *curr,
2957 struct lock_list *root, struct lock_list *other, 3050 struct lock_list *root, struct lock_list *other,
2958 struct held_lock *this, int forwards, 3051 struct held_lock *this, int forwards,
@@ -2963,7 +3056,7 @@ print_irq_inversion_bug(struct task_struct *curr,
2963 int depth; 3056 int depth;
2964 3057
2965 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 3058 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2966 return 0; 3059 return;
2967 3060
2968 pr_warn("\n"); 3061 pr_warn("\n");
2969 pr_warn("========================================================\n"); 3062 pr_warn("========================================================\n");
@@ -3004,13 +3097,11 @@ print_irq_inversion_bug(struct task_struct *curr,
3004 3097
3005 pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 3098 pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
3006 if (!save_trace(&root->trace)) 3099 if (!save_trace(&root->trace))
3007 return 0; 3100 return;
3008 print_shortest_lock_dependencies(other, root); 3101 print_shortest_lock_dependencies(other, root);
3009 3102
3010 pr_warn("\nstack backtrace:\n"); 3103 pr_warn("\nstack backtrace:\n");
3011 dump_stack(); 3104 dump_stack();
3012
3013 return 0;
3014} 3105}
3015 3106
3016/* 3107/*
@@ -3028,13 +3119,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
3028 root.parent = NULL; 3119 root.parent = NULL;
3029 root.class = hlock_class(this); 3120 root.class = hlock_class(this);
3030 ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); 3121 ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
3031 if (ret < 0) 3122 if (ret < 0) {
3032 return print_bfs_bug(ret); 3123 print_bfs_bug(ret);
3124 return 0;
3125 }
3033 if (ret == 1) 3126 if (ret == 1)
3034 return ret; 3127 return ret;
3035 3128
3036 return print_irq_inversion_bug(curr, &root, target_entry, 3129 print_irq_inversion_bug(curr, &root, target_entry,
3037 this, 1, irqclass); 3130 this, 1, irqclass);
3131 return 0;
3038} 3132}
3039 3133
3040/* 3134/*
@@ -3052,13 +3146,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
3052 root.parent = NULL; 3146 root.parent = NULL;
3053 root.class = hlock_class(this); 3147 root.class = hlock_class(this);
3054 ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); 3148 ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
3055 if (ret < 0) 3149 if (ret < 0) {
3056 return print_bfs_bug(ret); 3150 print_bfs_bug(ret);
3151 return 0;
3152 }
3057 if (ret == 1) 3153 if (ret == 1)
3058 return ret; 3154 return ret;
3059 3155
3060 return print_irq_inversion_bug(curr, &root, target_entry, 3156 print_irq_inversion_bug(curr, &root, target_entry,
3061 this, 0, irqclass); 3157 this, 0, irqclass);
3158 return 0;
3062} 3159}
3063 3160
3064void print_irqtrace_events(struct task_struct *curr) 3161void print_irqtrace_events(struct task_struct *curr)
@@ -3141,7 +3238,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
3141 * Validate that the lock dependencies don't have conflicting usage 3238 * Validate that the lock dependencies don't have conflicting usage
3142 * states. 3239 * states.
3143 */ 3240 */
3144 if ((!read || !dir || STRICT_READ_CHECKS) && 3241 if ((!read || STRICT_READ_CHECKS) &&
3145 !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) 3242 !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
3146 return 0; 3243 return 0;
3147 3244
@@ -3366,8 +3463,12 @@ void trace_softirqs_off(unsigned long ip)
3366 debug_atomic_inc(redundant_softirqs_off); 3463 debug_atomic_inc(redundant_softirqs_off);
3367} 3464}
3368 3465
3369static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) 3466static int
3467mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
3370{ 3468{
3469 if (!check)
3470 goto lock_used;
3471
3371 /* 3472 /*
3372 * If non-trylock use in a hardirq or softirq context, then 3473 * If non-trylock use in a hardirq or softirq context, then
3373 * mark the lock as used in these contexts: 3474 * mark the lock as used in these contexts:
@@ -3411,6 +3512,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
3411 } 3512 }
3412 } 3513 }
3413 3514
3515lock_used:
3516 /* mark it as used: */
3517 if (!mark_lock(curr, hlock, LOCK_USED))
3518 return 0;
3519
3414 return 1; 3520 return 1;
3415} 3521}
3416 3522
@@ -3442,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr,
3442 return 0; 3548 return 0;
3443} 3549}
3444 3550
3445#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3446
3447static inline
3448int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
3449 enum lock_usage_bit new_bit)
3450{
3451 WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
3452 return 1;
3453}
3454
3455static inline int mark_irqflags(struct task_struct *curr,
3456 struct held_lock *hlock)
3457{
3458 return 1;
3459}
3460
3461static inline unsigned int task_irq_context(struct task_struct *task)
3462{
3463 return 0;
3464}
3465
3466static inline int separate_irq_context(struct task_struct *curr,
3467 struct held_lock *hlock)
3468{
3469 return 0;
3470}
3471
3472#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3473
3474/* 3551/*
3475 * Mark a lock with a usage bit, and validate the state transition: 3552 * Mark a lock with a usage bit, and validate the state transition:
3476 */ 3553 */
@@ -3479,6 +3556,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
3479{ 3556{
3480 unsigned int new_mask = 1 << new_bit, ret = 1; 3557 unsigned int new_mask = 1 << new_bit, ret = 1;
3481 3558
3559 if (new_bit >= LOCK_USAGE_STATES) {
3560 DEBUG_LOCKS_WARN_ON(1);
3561 return 0;
3562 }
3563
3482 /* 3564 /*
3483 * If already set then do not dirty the cacheline, 3565 * If already set then do not dirty the cacheline,
3484 * nor do any checks: 3566 * nor do any checks:
@@ -3502,25 +3584,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
3502 return 0; 3584 return 0;
3503 3585
3504 switch (new_bit) { 3586 switch (new_bit) {
3505#define LOCKDEP_STATE(__STATE) \
3506 case LOCK_USED_IN_##__STATE: \
3507 case LOCK_USED_IN_##__STATE##_READ: \
3508 case LOCK_ENABLED_##__STATE: \
3509 case LOCK_ENABLED_##__STATE##_READ:
3510#include "lockdep_states.h"
3511#undef LOCKDEP_STATE
3512 ret = mark_lock_irq(curr, this, new_bit);
3513 if (!ret)
3514 return 0;
3515 break;
3516 case LOCK_USED: 3587 case LOCK_USED:
3517 debug_atomic_dec(nr_unused_locks); 3588 debug_atomic_dec(nr_unused_locks);
3518 break; 3589 break;
3519 default: 3590 default:
3520 if (!debug_locks_off_graph_unlock()) 3591 ret = mark_lock_irq(curr, this, new_bit);
3592 if (!ret)
3521 return 0; 3593 return 0;
3522 WARN_ON(1);
3523 return 0;
3524 } 3594 }
3525 3595
3526 graph_unlock(); 3596 graph_unlock();
@@ -3538,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
3538 return ret; 3608 return ret;
3539} 3609}
3540 3610
3611#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3612
3613static inline int
3614mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
3615{
3616 return 1;
3617}
3618
3619static inline unsigned int task_irq_context(struct task_struct *task)
3620{
3621 return 0;
3622}
3623
3624static inline int separate_irq_context(struct task_struct *curr,
3625 struct held_lock *hlock)
3626{
3627 return 0;
3628}
3629
3630#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3631
3541/* 3632/*
3542 * Initialize a lock instance's lock-class mapping info: 3633 * Initialize a lock instance's lock-class mapping info:
3543 */ 3634 */
@@ -3601,15 +3692,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
3601struct lock_class_key __lockdep_no_validate__; 3692struct lock_class_key __lockdep_no_validate__;
3602EXPORT_SYMBOL_GPL(__lockdep_no_validate__); 3693EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
3603 3694
3604static int 3695static void
3605print_lock_nested_lock_not_held(struct task_struct *curr, 3696print_lock_nested_lock_not_held(struct task_struct *curr,
3606 struct held_lock *hlock, 3697 struct held_lock *hlock,
3607 unsigned long ip) 3698 unsigned long ip)
3608{ 3699{
3609 if (!debug_locks_off()) 3700 if (!debug_locks_off())
3610 return 0; 3701 return;
3611 if (debug_locks_silent) 3702 if (debug_locks_silent)
3612 return 0; 3703 return;
3613 3704
3614 pr_warn("\n"); 3705 pr_warn("\n");
3615 pr_warn("==================================\n"); 3706 pr_warn("==================================\n");
@@ -3631,8 +3722,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
3631 3722
3632 pr_warn("\nstack backtrace:\n"); 3723 pr_warn("\nstack backtrace:\n");
3633 dump_stack(); 3724 dump_stack();
3634
3635 return 0;
3636} 3725}
3637 3726
3638static int __lock_is_held(const struct lockdep_map *lock, int read); 3727static int __lock_is_held(const struct lockdep_map *lock, int read);
@@ -3697,24 +3786,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3697 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 3786 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
3698 return 0; 3787 return 0;
3699 3788
3700 class_idx = class - lock_classes + 1; 3789 class_idx = class - lock_classes;
3701 3790
3702 if (depth) { 3791 if (depth) {
3703 hlock = curr->held_locks + depth - 1; 3792 hlock = curr->held_locks + depth - 1;
3704 if (hlock->class_idx == class_idx && nest_lock) { 3793 if (hlock->class_idx == class_idx && nest_lock) {
3705 if (hlock->references) { 3794 if (!references)
3706 /* 3795 references++;
3707 * Check: unsigned int references:12, overflow.
3708 */
3709 if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1))
3710 return 0;
3711 3796
3797 if (!hlock->references)
3712 hlock->references++; 3798 hlock->references++;
3713 } else {
3714 hlock->references = 2;
3715 }
3716 3799
3717 return 1; 3800 hlock->references += references;
3801
3802 /* Overflow */
3803 if (DEBUG_LOCKS_WARN_ON(hlock->references < references))
3804 return 0;
3805
3806 return 2;
3718 } 3807 }
3719 } 3808 }
3720 3809
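The reworked nest_lock path above folds a repeated acquisition into the previous held_lock, returns 2 to signal the merge, and relies on an after-the-fact comparison to catch wrap-around of the narrow references bitfield. Below is a simplified model of that bookkeeping; the 12-bit width mirrors struct held_lock, everything else is illustrative.

/*
 * Fold a repeated nested acquisition into the previous entry's
 * reference count; 2 = merged, 0 = the count wrapped around.
 */
#include <stdio.h>

struct held {
	unsigned int references : 12;
};

static int merge_reference(struct held *hlock, unsigned int references)
{
	if (!references)
		references++;		/* an unreferenced hold counts as one */

	if (!hlock->references)
		hlock->references++;	/* first merge: account the original hold */

	hlock->references += references;

	/* wrap-around check in the spirit of the hunk above: the stored
	 * count must not end up below the amount just added */
	if (hlock->references < references) {
		printf("reference count overflowed\n");
		return 0;
	}
	return 2;
}

int main(void)
{
	struct held h = { .references = 0 };

	printf("merge -> %d, refs now %u\n", merge_reference(&h, 0), h.references);
	printf("merge -> %d, refs now %u\n", merge_reference(&h, 0), h.references);
	return 0;
}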
@@ -3741,11 +3830,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3741#endif 3830#endif
3742 hlock->pin_count = pin_count; 3831 hlock->pin_count = pin_count;
3743 3832
3744 if (check && !mark_irqflags(curr, hlock)) 3833 /* Initialize the lock usage bit */
3745 return 0; 3834 if (!mark_usage(curr, hlock, check))
3746
3747 /* mark it as used: */
3748 if (!mark_lock(curr, hlock, LOCK_USED))
3749 return 0; 3835 return 0;
3750 3836
3751 /* 3837 /*
@@ -3759,9 +3845,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3759 * the hash, not class->key. 3845 * the hash, not class->key.
3760 */ 3846 */
3761 /* 3847 /*
3762 * Whoops, we did it again.. ran straight out of our static allocation. 3848 * Whoops, we did it again.. class_idx is invalid.
3763 */ 3849 */
3764 if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) 3850 if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use)))
3765 return 0; 3851 return 0;
3766 3852
3767 chain_key = curr->curr_chain_key; 3853 chain_key = curr->curr_chain_key;
@@ -3769,27 +3855,29 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3769 /* 3855 /*
3770 * How can we have a chain hash when we ain't got no keys?! 3856 * How can we have a chain hash when we ain't got no keys?!
3771 */ 3857 */
3772 if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) 3858 if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY))
3773 return 0; 3859 return 0;
3774 chain_head = 1; 3860 chain_head = 1;
3775 } 3861 }
3776 3862
3777 hlock->prev_chain_key = chain_key; 3863 hlock->prev_chain_key = chain_key;
3778 if (separate_irq_context(curr, hlock)) { 3864 if (separate_irq_context(curr, hlock)) {
3779 chain_key = 0; 3865 chain_key = INITIAL_CHAIN_KEY;
3780 chain_head = 1; 3866 chain_head = 1;
3781 } 3867 }
3782 chain_key = iterate_chain_key(chain_key, class_idx); 3868 chain_key = iterate_chain_key(chain_key, class_idx);
3783 3869
3784 if (nest_lock && !__lock_is_held(nest_lock, -1)) 3870 if (nest_lock && !__lock_is_held(nest_lock, -1)) {
3785 return print_lock_nested_lock_not_held(curr, hlock, ip); 3871 print_lock_nested_lock_not_held(curr, hlock, ip);
3872 return 0;
3873 }
3786 3874
3787 if (!debug_locks_silent) { 3875 if (!debug_locks_silent) {
3788 WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); 3876 WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key);
3789 WARN_ON_ONCE(!hlock_class(hlock)->key); 3877 WARN_ON_ONCE(!hlock_class(hlock)->key);
3790 } 3878 }
3791 3879
3792 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3880 if (!validate_chain(curr, hlock, chain_head, chain_key))
3793 return 0; 3881 return 0;
3794 3882
3795 curr->curr_chain_key = chain_key; 3883 curr->curr_chain_key = chain_key;
@@ -3818,14 +3906,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3818 return 1; 3906 return 1;
3819} 3907}
3820 3908
3821static int 3909static void print_unlock_imbalance_bug(struct task_struct *curr,
3822print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, 3910 struct lockdep_map *lock,
3823 unsigned long ip) 3911 unsigned long ip)
3824{ 3912{
3825 if (!debug_locks_off()) 3913 if (!debug_locks_off())
3826 return 0; 3914 return;
3827 if (debug_locks_silent) 3915 if (debug_locks_silent)
3828 return 0; 3916 return;
3829 3917
3830 pr_warn("\n"); 3918 pr_warn("\n");
3831 pr_warn("=====================================\n"); 3919 pr_warn("=====================================\n");
@@ -3843,8 +3931,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3843 3931
3844 pr_warn("\nstack backtrace:\n"); 3932 pr_warn("\nstack backtrace:\n");
3845 dump_stack(); 3933 dump_stack();
3846
3847 return 0;
3848} 3934}
3849 3935
3850static int match_held_lock(const struct held_lock *hlock, 3936static int match_held_lock(const struct held_lock *hlock,
@@ -3876,7 +3962,7 @@ static int match_held_lock(const struct held_lock *hlock,
3876 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3962 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
3877 return 0; 3963 return 0;
3878 3964
3879 if (hlock->class_idx == class - lock_classes + 1) 3965 if (hlock->class_idx == class - lock_classes)
3880 return 1; 3966 return 1;
3881 } 3967 }
3882 3968
@@ -3920,22 +4006,33 @@ out:
3920} 4006}
3921 4007
3922static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, 4008static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
3923 int idx) 4009 int idx, unsigned int *merged)
3924{ 4010{
3925 struct held_lock *hlock; 4011 struct held_lock *hlock;
4012 int first_idx = idx;
3926 4013
3927 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 4014 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3928 return 0; 4015 return 0;
3929 4016
3930 for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { 4017 for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
3931 if (!__lock_acquire(hlock->instance, 4018 switch (__lock_acquire(hlock->instance,
3932 hlock_class(hlock)->subclass, 4019 hlock_class(hlock)->subclass,
3933 hlock->trylock, 4020 hlock->trylock,
3934 hlock->read, hlock->check, 4021 hlock->read, hlock->check,
3935 hlock->hardirqs_off, 4022 hlock->hardirqs_off,
3936 hlock->nest_lock, hlock->acquire_ip, 4023 hlock->nest_lock, hlock->acquire_ip,
3937 hlock->references, hlock->pin_count)) 4024 hlock->references, hlock->pin_count)) {
4025 case 0:
3938 return 1; 4026 return 1;
4027 case 1:
4028 break;
4029 case 2:
4030 *merged += (idx == first_idx);
4031 break;
4032 default:
4033 WARN_ON(1);
4034 return 0;
4035 }
3939 } 4036 }
3940 return 0; 4037 return 0;
3941} 4038}
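reacquire_held_locks() now reports through *merged how many replayed entries were folded into the lock beneath them, which is what lets callers such as __lock_release() relax their depth check to depth - merged. The sketch below models that accounting on plain arrays; unlike the kernel code it counts every fold, so treat it purely as an illustration of the invariant.

/*
 * Release one hold, replay the ones above it, fold duplicates of the
 * class on top of the stack, and check the relaxed depth invariant.
 */
#include <stdio.h>
#include <assert.h>

#define MAX_DEPTH 48

struct held { int class_idx; };

static struct held stack[MAX_DEPTH];
static int depth;

/* replay one saved hold; fold it into the top entry if the class matches */
static void reacquire(const struct held *h, unsigned int *merged)
{
	if (depth && stack[depth - 1].class_idx == h->class_idx) {
		(*merged)++;			/* merged, nothing pushed */
		return;
	}
	stack[depth++] = *h;
}

int main(void)
{
	/* the held-lock stack before the release; the two class-7 holds
	 * will merge when they are replayed back to back */
	struct held saved[] = { { 3 }, { 7 }, { 7 } };
	int old_depth = 3, release_idx = 0;
	unsigned int merged = 0;

	for (int i = 0; i < old_depth; i++)
		stack[i] = saved[i];
	depth = old_depth;

	/* release the hold at release_idx, then replay everything above it */
	depth = release_idx;
	for (int i = release_idx + 1; i < old_depth; i++)
		reacquire(&saved[i], &merged);

	/* the kernel's depth - merged check, with the released hold counted
	 * separately here */
	assert(depth == old_depth - 1 - merged);
	printf("old depth %d, new depth %d, merged %u\n",
	       old_depth, depth, merged);
	return 0;
}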
@@ -3946,9 +4043,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3946 unsigned long ip) 4043 unsigned long ip)
3947{ 4044{
3948 struct task_struct *curr = current; 4045 struct task_struct *curr = current;
4046 unsigned int depth, merged = 0;
3949 struct held_lock *hlock; 4047 struct held_lock *hlock;
3950 struct lock_class *class; 4048 struct lock_class *class;
3951 unsigned int depth;
3952 int i; 4049 int i;
3953 4050
3954 if (unlikely(!debug_locks)) 4051 if (unlikely(!debug_locks))
@@ -3963,24 +4060,26 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3963 return 0; 4060 return 0;
3964 4061
3965 hlock = find_held_lock(curr, lock, depth, &i); 4062 hlock = find_held_lock(curr, lock, depth, &i);
3966 if (!hlock) 4063 if (!hlock) {
3967 return print_unlock_imbalance_bug(curr, lock, ip); 4064 print_unlock_imbalance_bug(curr, lock, ip);
4065 return 0;
4066 }
3968 4067
3969 lockdep_init_map(lock, name, key, 0); 4068 lockdep_init_map(lock, name, key, 0);
3970 class = register_lock_class(lock, subclass, 0); 4069 class = register_lock_class(lock, subclass, 0);
3971 hlock->class_idx = class - lock_classes + 1; 4070 hlock->class_idx = class - lock_classes;
3972 4071
3973 curr->lockdep_depth = i; 4072 curr->lockdep_depth = i;
3974 curr->curr_chain_key = hlock->prev_chain_key; 4073 curr->curr_chain_key = hlock->prev_chain_key;
3975 4074
3976 if (reacquire_held_locks(curr, depth, i)) 4075 if (reacquire_held_locks(curr, depth, i, &merged))
3977 return 0; 4076 return 0;
3978 4077
3979 /* 4078 /*
3980 * I took it apart and put it back together again, except now I have 4079 * I took it apart and put it back together again, except now I have
3981 * these 'spare' parts.. where shall I put them. 4080 * these 'spare' parts.. where shall I put them.
3982 */ 4081 */
3983 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 4082 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged))
3984 return 0; 4083 return 0;
3985 return 1; 4084 return 1;
3986} 4085}
@@ -3988,8 +4087,8 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3988static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) 4087static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
3989{ 4088{
3990 struct task_struct *curr = current; 4089 struct task_struct *curr = current;
4090 unsigned int depth, merged = 0;
3991 struct held_lock *hlock; 4091 struct held_lock *hlock;
3992 unsigned int depth;
3993 int i; 4092 int i;
3994 4093
3995 if (unlikely(!debug_locks)) 4094 if (unlikely(!debug_locks))
@@ -4004,8 +4103,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
4004 return 0; 4103 return 0;
4005 4104
4006 hlock = find_held_lock(curr, lock, depth, &i); 4105 hlock = find_held_lock(curr, lock, depth, &i);
4007 if (!hlock) 4106 if (!hlock) {
4008 return print_unlock_imbalance_bug(curr, lock, ip); 4107 print_unlock_imbalance_bug(curr, lock, ip);
4108 return 0;
4109 }
4009 4110
4010 curr->lockdep_depth = i; 4111 curr->lockdep_depth = i;
4011 curr->curr_chain_key = hlock->prev_chain_key; 4112 curr->curr_chain_key = hlock->prev_chain_key;
@@ -4014,7 +4115,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
4014 hlock->read = 1; 4115 hlock->read = 1;
4015 hlock->acquire_ip = ip; 4116 hlock->acquire_ip = ip;
4016 4117
4017 if (reacquire_held_locks(curr, depth, i)) 4118 if (reacquire_held_locks(curr, depth, i, &merged))
4119 return 0;
4120
4121 /* Merging can't happen with unchanged classes.. */
4122 if (DEBUG_LOCKS_WARN_ON(merged))
4018 return 0; 4123 return 0;
4019 4124
4020 /* 4125 /*
@@ -4023,6 +4128,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
4023 */ 4128 */
4024 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 4129 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
4025 return 0; 4130 return 0;
4131
4026 return 1; 4132 return 1;
4027} 4133}
4028 4134
@@ -4034,11 +4140,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
4034 * @nested is an hysterical artifact, needs a tree wide cleanup. 4140 * @nested is an hysterical artifact, needs a tree wide cleanup.
4035 */ 4141 */
4036static int 4142static int
4037__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) 4143__lock_release(struct lockdep_map *lock, unsigned long ip)
4038{ 4144{
4039 struct task_struct *curr = current; 4145 struct task_struct *curr = current;
4146 unsigned int depth, merged = 1;
4040 struct held_lock *hlock; 4147 struct held_lock *hlock;
4041 unsigned int depth;
4042 int i; 4148 int i;
4043 4149
4044 if (unlikely(!debug_locks)) 4150 if (unlikely(!debug_locks))
@@ -4049,16 +4155,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
4049 * So we're all set to release this lock.. wait what lock? We don't 4155 * So we're all set to release this lock.. wait what lock? We don't
4050 * own any locks, you've been drinking again? 4156 * own any locks, you've been drinking again?
4051 */ 4157 */
4052 if (DEBUG_LOCKS_WARN_ON(depth <= 0)) 4158 if (depth <= 0) {
4053 return print_unlock_imbalance_bug(curr, lock, ip); 4159 print_unlock_imbalance_bug(curr, lock, ip);
4160 return 0;
4161 }
4054 4162
4055 /* 4163 /*
4056 * Check whether the lock exists in the current stack 4164 * Check whether the lock exists in the current stack
4057 * of held locks: 4165 * of held locks:
4058 */ 4166 */
4059 hlock = find_held_lock(curr, lock, depth, &i); 4167 hlock = find_held_lock(curr, lock, depth, &i);
4060 if (!hlock) 4168 if (!hlock) {
4061 return print_unlock_imbalance_bug(curr, lock, ip); 4169 print_unlock_imbalance_bug(curr, lock, ip);
4170 return 0;
4171 }
4062 4172
4063 if (hlock->instance == lock) 4173 if (hlock->instance == lock)
4064 lock_release_holdtime(hlock); 4174 lock_release_holdtime(hlock);
@@ -4093,14 +4203,15 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
4093 if (i == depth-1) 4203 if (i == depth-1)
4094 return 1; 4204 return 1;
4095 4205
4096 if (reacquire_held_locks(curr, depth, i + 1)) 4206 if (reacquire_held_locks(curr, depth, i + 1, &merged))
4097 return 0; 4207 return 0;
4098 4208
4099 /* 4209 /*
4100 * We had N bottles of beer on the wall, we drank one, but now 4210 * We had N bottles of beer on the wall, we drank one, but now
4101 * there's not N-1 bottles of beer left on the wall... 4211 * there's not N-1 bottles of beer left on the wall...
4212 * Pouring two of the bottles together is acceptable.
4102 */ 4213 */
4103 DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1); 4214 DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged);
4104 4215
4105 /* 4216 /*
4106 * Since reacquire_held_locks() would have called check_chain_key() 4217 * Since reacquire_held_locks() would have called check_chain_key()
@@ -4318,7 +4429,7 @@ void lock_release(struct lockdep_map *lock, int nested,
4318 check_flags(flags); 4429 check_flags(flags);
4319 current->lockdep_recursion = 1; 4430 current->lockdep_recursion = 1;
4320 trace_lock_release(lock, ip); 4431 trace_lock_release(lock, ip);
4321 if (__lock_release(lock, nested, ip)) 4432 if (__lock_release(lock, ip))
4322 check_chain_key(current); 4433 check_chain_key(current);
4323 current->lockdep_recursion = 0; 4434 current->lockdep_recursion = 0;
4324 raw_local_irq_restore(flags); 4435 raw_local_irq_restore(flags);
@@ -4401,14 +4512,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
4401EXPORT_SYMBOL_GPL(lock_unpin_lock); 4512EXPORT_SYMBOL_GPL(lock_unpin_lock);
4402 4513
4403#ifdef CONFIG_LOCK_STAT 4514#ifdef CONFIG_LOCK_STAT
4404static int 4515static void print_lock_contention_bug(struct task_struct *curr,
4405print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, 4516 struct lockdep_map *lock,
4406 unsigned long ip) 4517 unsigned long ip)
4407{ 4518{
4408 if (!debug_locks_off()) 4519 if (!debug_locks_off())
4409 return 0; 4520 return;
4410 if (debug_locks_silent) 4521 if (debug_locks_silent)
4411 return 0; 4522 return;
4412 4523
4413 pr_warn("\n"); 4524 pr_warn("\n");
4414 pr_warn("=================================\n"); 4525 pr_warn("=================================\n");
@@ -4426,8 +4537,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
4426 4537
4427 pr_warn("\nstack backtrace:\n"); 4538 pr_warn("\nstack backtrace:\n");
4428 dump_stack(); 4539 dump_stack();
4429
4430 return 0;
4431} 4540}
4432 4541
4433static void 4542static void
@@ -4572,9 +4681,7 @@ void lockdep_reset(void)
4572 int i; 4681 int i;
4573 4682
4574 raw_local_irq_save(flags); 4683 raw_local_irq_save(flags);
4575 current->curr_chain_key = 0; 4684 lockdep_init_task(current);
4576 current->lockdep_depth = 0;
4577 current->lockdep_recursion = 0;
4578 memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); 4685 memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
4579 nr_hardirq_chains = 0; 4686 nr_hardirq_chains = 0;
4580 nr_softirq_chains = 0; 4687 nr_softirq_chains = 0;
@@ -4614,9 +4721,9 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
4614 return; 4721 return;
4615 4722
4616recalc: 4723recalc:
4617 chain_key = 0; 4724 chain_key = INITIAL_CHAIN_KEY;
4618 for (i = chain->base; i < chain->base + chain->depth; i++) 4725 for (i = chain->base; i < chain->base + chain->depth; i++)
4619 chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); 4726 chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
4620 if (chain->depth && chain->chain_key == chain_key) 4727 if (chain->depth && chain->chain_key == chain_key)
4621 return; 4728 return;
4622 /* Overwrite the chain key for concurrent RCU readers. */ 4729 /* Overwrite the chain key for concurrent RCU readers. */
@@ -4690,6 +4797,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
4690 WRITE_ONCE(class->key, NULL); 4797 WRITE_ONCE(class->key, NULL);
4691 WRITE_ONCE(class->name, NULL); 4798 WRITE_ONCE(class->name, NULL);
4692 nr_lock_classes--; 4799 nr_lock_classes--;
4800 __clear_bit(class - lock_classes, lock_classes_in_use);
4693 } else { 4801 } else {
4694 WARN_ONCE(true, "%s() failed for class %s\n", __func__, 4802 WARN_ONCE(true, "%s() failed for class %s\n", __func__,
4695 class->name); 4803 class->name);
@@ -5035,6 +5143,7 @@ void __init lockdep_init(void)
5035 5143
5036 printk(" memory used by lock dependency info: %zu kB\n", 5144 printk(" memory used by lock dependency info: %zu kB\n",
5037 (sizeof(lock_classes) + 5145 (sizeof(lock_classes) +
5146 sizeof(lock_classes_in_use) +
5038 sizeof(classhash_table) + 5147 sizeof(classhash_table) +
5039 sizeof(list_entries) + 5148 sizeof(list_entries) +
5040 sizeof(list_entries_in_use) + 5149 sizeof(list_entries_in_use) +
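
To make the new depth accounting concrete: reacquire_held_locks() now reports, through its extra *merged argument, how many held_lock entries were coalesced while the stack was replayed, and its callers compare against depth - merged rather than depth. Below is a minimal standalone C sketch of just that arithmetic, not lockdep's real data structures (entry and rebuild are invented names for illustration); it shows why the recorded depth may legitimately shrink by exactly the number of merges.

#include <assert.h>
#include <stdio.h>

struct entry { int class; int read; int references; };

/*
 * Replay the held-lock stack, coalescing an entry into its predecessor
 * when class and read state match, and report how many entries were
 * merged away.  Returns the new depth.
 */
static int rebuild(struct entry *stack, int depth, int *merged)
{
	int i, out = 0;

	*merged = 0;
	for (i = 0; i < depth; i++) {
		if (out && stack[out - 1].class == stack[i].class &&
		    stack[out - 1].read == stack[i].read) {
			stack[out - 1].references += stack[i].references;
			(*merged)++;
			continue;
		}
		stack[out++] = stack[i];
	}
	return out;
}

int main(void)
{
	/* Two entries of the same class (e.g. after __lock_set_class()). */
	struct entry stack[] = {
		{ .class = 1, .read = 0, .references = 1 },
		{ .class = 1, .read = 0, .references = 1 },
		{ .class = 2, .read = 1, .references = 1 },
	};
	int merged, depth = rebuild(stack, 3, &merged);

	/* Mirrors DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged). */
	assert(depth == 3 - merged);
	printf("depth %d, merged %d\n", depth, merged);
	return 0;
}

Run as an ordinary user-space program this prints "depth 2, merged 1": the stack lost one entry, and that is exactly the slack the reworked DEBUG_LOCKS_WARN_ON() checks now allow.
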
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 150ec3f0c5b5..cc83568d5012 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -131,7 +131,6 @@ extern unsigned int nr_hardirq_chains;
131extern unsigned int nr_softirq_chains; 131extern unsigned int nr_softirq_chains;
132extern unsigned int nr_process_chains; 132extern unsigned int nr_process_chains;
133extern unsigned int max_lockdep_depth; 133extern unsigned int max_lockdep_depth;
134extern unsigned int max_recursion_depth;
135 134
136extern unsigned int max_bfs_queue_depth; 135extern unsigned int max_bfs_queue_depth;
137 136
@@ -160,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class)
160 * and we want to avoid too much cache bouncing. 159 * and we want to avoid too much cache bouncing.
161 */ 160 */
162struct lockdep_stats { 161struct lockdep_stats {
163 int chain_lookup_hits; 162 unsigned long chain_lookup_hits;
164 int chain_lookup_misses; 163 unsigned int chain_lookup_misses;
165 int hardirqs_on_events; 164 unsigned long hardirqs_on_events;
166 int hardirqs_off_events; 165 unsigned long hardirqs_off_events;
167 int redundant_hardirqs_on; 166 unsigned long redundant_hardirqs_on;
168 int redundant_hardirqs_off; 167 unsigned long redundant_hardirqs_off;
169 int softirqs_on_events; 168 unsigned long softirqs_on_events;
170 int softirqs_off_events; 169 unsigned long softirqs_off_events;
171 int redundant_softirqs_on; 170 unsigned long redundant_softirqs_on;
172 int redundant_softirqs_off; 171 unsigned long redundant_softirqs_off;
173 int nr_unused_locks; 172 int nr_unused_locks;
174 int nr_redundant_checks; 173 unsigned int nr_redundant_checks;
175 int nr_redundant; 174 unsigned int nr_redundant;
176 int nr_cyclic_checks; 175 unsigned int nr_cyclic_checks;
177 int nr_cyclic_check_recursions; 176 unsigned int nr_find_usage_forwards_checks;
178 int nr_find_usage_forwards_checks; 177 unsigned int nr_find_usage_backwards_checks;
179 int nr_find_usage_forwards_recursions;
180 int nr_find_usage_backwards_checks;
181 int nr_find_usage_backwards_recursions;
182 178
183 /* 179 /*
184 * Per lock class locking operation stat counts 180 * Per lock class locking operation stat counts
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 80a463d31a8d..c513031cd7e3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -975,7 +975,7 @@ static int __init lock_torture_init(void)
975 goto unwind; 975 goto unwind;
976 } 976 }
977 if (stutter > 0) { 977 if (stutter > 0) {
978 firsterr = torture_stutter_init(stutter); 978 firsterr = torture_stutter_init(stutter, stutter);
979 if (firsterr) 979 if (firsterr)
980 goto unwind; 980 goto unwind;
981 } 981 }
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index db578783dd36..0c601ae072b3 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/locking/mutex.c 3 * kernel/locking/mutex.c
3 * 4 *
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f17dad99eec8..364d38a0c444 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include <linux/atomic.h> 2#include <linux/atomic.h>
2#include <linux/rwsem.h> 3#include <linux/rwsem.h>
3#include <linux/percpu.h> 4#include <linux/percpu.h>
@@ -17,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
17 return -ENOMEM; 18 return -ENOMEM;
18 19
19 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ 20 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
20 rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); 21 rcu_sync_init(&sem->rss);
21 __init_rwsem(&sem->rw_sem, name, rwsem_key); 22 __init_rwsem(&sem->rw_sem, name, rwsem_key);
22 rcuwait_init(&sem->writer); 23 rcuwait_init(&sem->writer);
23 sem->readers_block = 0; 24 sem->readers_block = 0;
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index c7471c3fb798..fe9ca92faa2a 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,16 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Queued read/write locks 3 * Queued read/write locks
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. 5 * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
15 * 6 *
16 * Authors: Waiman Long <waiman.long@hp.com> 7 * Authors: Waiman Long <waiman.long@hp.com>
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index e14b32c69639..2473f10c6956 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -1,16 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Queued spinlock 3 * Queued spinlock
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. 5 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
15 * (C) Copyright 2013-2014,2018 Red Hat, Inc. 6 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
16 * (C) Copyright 2015 Intel Corp. 7 * (C) Copyright 2015 Intel Corp.
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 54152670ff24..e625bb410aa2 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -1,13 +1,5 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* 2/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 * 3 *
12 * Authors: Waiman Long <longman@redhat.com> 4 * Authors: Waiman Long <longman@redhat.com>
13 */ 5 */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 978d63a8261c..38fbf9fa7f1b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * RT-Mutexes: simple blocking mutual exclusion locks with PI support 3 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
3 * 4 *
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
deleted file mode 100644
index 6b3ee9948bf1..000000000000
--- a/kernel/locking/rwsem-xadd.c
+++ /dev/null
@@ -1,729 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2/* rwsem.c: R/W semaphores: contention handling functions
3 *
4 * Written by David Howells (dhowells@redhat.com).
5 * Derived from arch/i386/kernel/semaphore.c
6 *
7 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8 * and Michel Lespinasse <walken@google.com>
9 *
10 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12 */
13#include <linux/rwsem.h>
14#include <linux/init.h>
15#include <linux/export.h>
16#include <linux/sched/signal.h>
17#include <linux/sched/rt.h>
18#include <linux/sched/wake_q.h>
19#include <linux/sched/debug.h>
20#include <linux/osq_lock.h>
21
22#include "rwsem.h"
23
24/*
25 * Guide to the rw_semaphore's count field for common values.
26 * (32-bit case illustrated, similar for 64-bit)
27 *
28 * 0x0000000X (1) X readers active or attempting lock, no writer waiting
29 * X = #active_readers + #readers attempting to lock
30 * (X*ACTIVE_BIAS)
31 *
32 * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
33 * attempting to read lock or write lock.
34 *
35 * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
36 * X = #active readers + # readers attempting lock
37 * (X*ACTIVE_BIAS + WAITING_BIAS)
38 * (2) 1 writer attempting lock, no waiters for lock
39 * X-1 = #active readers + #readers attempting lock
40 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
41 * (3) 1 writer active, no waiters for lock
42 * X-1 = #active readers + #readers attempting lock
43 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
44 *
45 * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
46 * (WAITING_BIAS + ACTIVE_BIAS)
47 * (2) 1 writer active or attempting lock, no waiters for lock
48 * (ACTIVE_WRITE_BIAS)
49 *
50 * 0xffff0000 (1) There are writers or readers queued but none active
51 * or in the process of attempting lock.
52 * (WAITING_BIAS)
53 * Note: writer can attempt to steal lock for this count by adding
54 * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
55 *
56 * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
57 * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
58 *
59 * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
60 * the count becomes more than 0 for successful lock acquisition,
61 * i.e. the case where there are only readers or nobody has lock.
62 * (1st and 2nd case above).
63 *
64 * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
65 * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
66 * acquisition (i.e. nobody else has lock or attempts lock). If
67 * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
68 * are only waiters but none active (5th case above), and attempt to
69 * steal the lock.
70 *
71 */
72
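
As a quick cross-check of the table above, the listed values follow from the 32-bit biases the comment implies (ACTIVE_BIAS = 1, WAITING_BIAS = 0xffff0000); the real constants are defined in rwsem.h, not in this file, so treat the defines below purely as illustration.

#include <stdio.h>

/* Illustrative 32-bit values; the kernel's live in rwsem.h. */
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_WAITING_BIAS	(-0x00010000L)	/* 0xffff0000 as a u32 */
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	/* Three readers, nobody queued: 0x00000003 */
	printf("%08x\n", (unsigned int)(3 * RWSEM_ACTIVE_READ_BIAS));
	/* One reader plus queued waiters: 0xffff0001 */
	printf("%08x\n", (unsigned int)(RWSEM_ACTIVE_READ_BIAS + RWSEM_WAITING_BIAS));
	/* One writer holding the lock with waiters queued: 0xfffe0001 */
	printf("%08x\n", (unsigned int)(RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS));
	return 0;
}

The three printed values (00000003, ffff0001, fffe0001) match the table's entries for those states.
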
73/*
74 * Initialize an rwsem:
75 */
76void __init_rwsem(struct rw_semaphore *sem, const char *name,
77 struct lock_class_key *key)
78{
79#ifdef CONFIG_DEBUG_LOCK_ALLOC
80 /*
81 * Make sure we are not reinitializing a held semaphore:
82 */
83 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
84 lockdep_init_map(&sem->dep_map, name, key, 0);
85#endif
86 atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
87 raw_spin_lock_init(&sem->wait_lock);
88 INIT_LIST_HEAD(&sem->wait_list);
89#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
90 sem->owner = NULL;
91 osq_lock_init(&sem->osq);
92#endif
93}
94
95EXPORT_SYMBOL(__init_rwsem);
96
97enum rwsem_waiter_type {
98 RWSEM_WAITING_FOR_WRITE,
99 RWSEM_WAITING_FOR_READ
100};
101
102struct rwsem_waiter {
103 struct list_head list;
104 struct task_struct *task;
105 enum rwsem_waiter_type type;
106};
107
108enum rwsem_wake_type {
109 RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
110 RWSEM_WAKE_READERS, /* Wake readers only */
111 RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
112};
113
114/*
115 * handle the lock release when processes blocked on it that can now run
116 * - if we come here from up_xxxx(), then:
117 * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
118 * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
119 * - there must be someone on the queue
120 * - the wait_lock must be held by the caller
121 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
122 * to actually wakeup the blocked task(s) and drop the reference count,
123 * preferably when the wait_lock is released
124 * - woken process blocks are discarded from the list after having task zeroed
125 * - writers are only marked woken if downgrading is false
126 */
127static void __rwsem_mark_wake(struct rw_semaphore *sem,
128 enum rwsem_wake_type wake_type,
129 struct wake_q_head *wake_q)
130{
131 struct rwsem_waiter *waiter, *tmp;
132 long oldcount, woken = 0, adjustment = 0;
133
134 /*
135 * Take a peek at the queue head waiter such that we can determine
136 * the wakeup(s) to perform.
137 */
138 waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
139
140 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
141 if (wake_type == RWSEM_WAKE_ANY) {
142 /*
143 * Mark writer at the front of the queue for wakeup.
144 * Until the task is actually awoken later by
145 * the caller, other writers are able to steal it.
146 * Readers, on the other hand, will block as they
147 * will notice the queued writer.
148 */
149 wake_q_add(wake_q, waiter->task);
150 lockevent_inc(rwsem_wake_writer);
151 }
152
153 return;
154 }
155
156 /*
157 * Writers might steal the lock before we grant it to the next reader.
158 * We prefer to do the first reader grant before counting readers
159 * so we can bail out early if a writer stole the lock.
160 */
161 if (wake_type != RWSEM_WAKE_READ_OWNED) {
162 adjustment = RWSEM_ACTIVE_READ_BIAS;
163 try_reader_grant:
164 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
165 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
166 /*
167 * If the count is still less than RWSEM_WAITING_BIAS
168 * after removing the adjustment, it is assumed that
169 * a writer has stolen the lock. We have to undo our
170 * reader grant.
171 */
172 if (atomic_long_add_return(-adjustment, &sem->count) <
173 RWSEM_WAITING_BIAS)
174 return;
175
176 /* Last active locker left. Retry waking readers. */
177 goto try_reader_grant;
178 }
179 /*
180 * Set it to reader-owned to give spinners an early
181 * indication that readers now have the lock.
182 */
183 __rwsem_set_reader_owned(sem, waiter->task);
184 }
185
186 /*
187 * Grant an infinite number of read locks to the readers at the front
188 * of the queue. We know that woken will be at least 1 as we accounted
189 * for above. Note we increment the 'active part' of the count by the
190 * number of readers before waking any processes up.
191 */
192 list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
193 struct task_struct *tsk;
194
195 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
196 break;
197
198 woken++;
199 tsk = waiter->task;
200
201 get_task_struct(tsk);
202 list_del(&waiter->list);
203 /*
204 * Ensure calling get_task_struct() before setting the reader
205 * waiter to nil such that rwsem_down_read_failed() cannot
206 * race with do_exit() by always holding a reference count
207 * to the task to wakeup.
208 */
209 smp_store_release(&waiter->task, NULL);
210 /*
211 * Ensure issuing the wakeup (either by us or someone else)
212 * after setting the reader waiter to nil.
213 */
214 wake_q_add_safe(wake_q, tsk);
215 }
216
217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
218 lockevent_cond_inc(rwsem_wake_reader, woken);
219 if (list_empty(&sem->wait_list)) {
220 /* hit end of list above */
221 adjustment -= RWSEM_WAITING_BIAS;
222 }
223
224 if (adjustment)
225 atomic_long_add(adjustment, &sem->count);
226}
227
228/*
229 * This function must be called with the sem->wait_lock held to prevent
230 * race conditions between checking the rwsem wait list and setting the
231 * sem->count accordingly.
232 */
233static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
234{
235 /*
236 * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
237 */
238 if (count != RWSEM_WAITING_BIAS)
239 return false;
240
241 /*
242 * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
243 * are other tasks on the wait list, we need to add on WAITING_BIAS.
244 */
245 count = list_is_singular(&sem->wait_list) ?
246 RWSEM_ACTIVE_WRITE_BIAS :
247 RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
248
249 if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
250 == RWSEM_WAITING_BIAS) {
251 rwsem_set_owner(sem);
252 return true;
253 }
254
255 return false;
256}
257
258#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
259/*
260 * Try to acquire write lock before the writer has been put on wait queue.
261 */
262static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
263{
264 long count = atomic_long_read(&sem->count);
265
266 while (!count || count == RWSEM_WAITING_BIAS) {
267 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
268 count + RWSEM_ACTIVE_WRITE_BIAS)) {
269 rwsem_set_owner(sem);
270 lockevent_inc(rwsem_opt_wlock);
271 return true;
272 }
273 }
274 return false;
275}
276
277static inline bool owner_on_cpu(struct task_struct *owner)
278{
279 /*
280 * As lock holder preemption issue, we both skip spinning if
281 * task is not on cpu or its cpu is preempted
282 */
283 return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
284}
285
286static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
287{
288 struct task_struct *owner;
289 bool ret = true;
290
291 BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
292
293 if (need_resched())
294 return false;
295
296 rcu_read_lock();
297 owner = READ_ONCE(sem->owner);
298 if (owner) {
299 ret = is_rwsem_owner_spinnable(owner) &&
300 owner_on_cpu(owner);
301 }
302 rcu_read_unlock();
303 return ret;
304}
305
306/*
307 * Return true only if we can still spin on the owner field of the rwsem.
308 */
309static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
310{
311 struct task_struct *owner = READ_ONCE(sem->owner);
312
313 if (!is_rwsem_owner_spinnable(owner))
314 return false;
315
316 rcu_read_lock();
317 while (owner && (READ_ONCE(sem->owner) == owner)) {
318 /*
319 * Ensure we emit the owner->on_cpu, dereference _after_
320 * checking sem->owner still matches owner, if that fails,
321 * owner might point to free()d memory, if it still matches,
322 * the rcu_read_lock() ensures the memory stays valid.
323 */
324 barrier();
325
326 /*
327 * abort spinning when need_resched or owner is not running or
328 * owner's cpu is preempted.
329 */
330 if (need_resched() || !owner_on_cpu(owner)) {
331 rcu_read_unlock();
332 return false;
333 }
334
335 cpu_relax();
336 }
337 rcu_read_unlock();
338
339 /*
340 * If there is a new owner or the owner is not set, we continue
341 * spinning.
342 */
343 return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
344}
345
346static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
347{
348 bool taken = false;
349
350 preempt_disable();
351
352 /* sem->wait_lock should not be held when doing optimistic spinning */
353 if (!rwsem_can_spin_on_owner(sem))
354 goto done;
355
356 if (!osq_lock(&sem->osq))
357 goto done;
358
359 /*
360 * Optimistically spin on the owner field and attempt to acquire the
361 * lock whenever the owner changes. Spinning will be stopped when:
362 * 1) the owning writer isn't running; or
363 * 2) readers own the lock as we can't determine if they are
364 * actively running or not.
365 */
366 while (rwsem_spin_on_owner(sem)) {
367 /*
368 * Try to acquire the lock
369 */
370 if (rwsem_try_write_lock_unqueued(sem)) {
371 taken = true;
372 break;
373 }
374
375 /*
376 * When there's no owner, we might have preempted between the
377 * owner acquiring the lock and setting the owner field. If
378 * we're an RT task that will live-lock because we won't let
379 * the owner complete.
380 */
381 if (!sem->owner && (need_resched() || rt_task(current)))
382 break;
383
384 /*
385 * The cpu_relax() call is a compiler barrier which forces
386 * everything in this loop to be re-loaded. We don't need
387 * memory barriers as we'll eventually observe the right
388 * values at the cost of a few extra spins.
389 */
390 cpu_relax();
391 }
392 osq_unlock(&sem->osq);
393done:
394 preempt_enable();
395 lockevent_cond_inc(rwsem_opt_fail, !taken);
396 return taken;
397}
398
399/*
400 * Return true if the rwsem has active spinner
401 */
402static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
403{
404 return osq_is_locked(&sem->osq);
405}
406
407#else
408static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
409{
410 return false;
411}
412
413static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
414{
415 return false;
416}
417#endif
418
419/*
420 * Wait for the read lock to be granted
421 */
422static inline struct rw_semaphore __sched *
423__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
424{
425 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
426 struct rwsem_waiter waiter;
427 DEFINE_WAKE_Q(wake_q);
428
429 waiter.task = current;
430 waiter.type = RWSEM_WAITING_FOR_READ;
431
432 raw_spin_lock_irq(&sem->wait_lock);
433 if (list_empty(&sem->wait_list)) {
434 /*
435 * In case the wait queue is empty and the lock isn't owned
436 * by a writer, this reader can exit the slowpath and return
437 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
438 * been set in the count.
439 */
440 if (atomic_long_read(&sem->count) >= 0) {
441 raw_spin_unlock_irq(&sem->wait_lock);
442 rwsem_set_reader_owned(sem);
443 lockevent_inc(rwsem_rlock_fast);
444 return sem;
445 }
446 adjustment += RWSEM_WAITING_BIAS;
447 }
448 list_add_tail(&waiter.list, &sem->wait_list);
449
450 /* we're now waiting on the lock, but no longer actively locking */
451 count = atomic_long_add_return(adjustment, &sem->count);
452
453 /*
454 * If there are no active locks, wake the front queued process(es).
455 *
456 * If there are no writers and we are first in the queue,
457 * wake our own waiter to join the existing active readers !
458 */
459 if (count == RWSEM_WAITING_BIAS ||
460 (count > RWSEM_WAITING_BIAS &&
461 adjustment != -RWSEM_ACTIVE_READ_BIAS))
462 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
463
464 raw_spin_unlock_irq(&sem->wait_lock);
465 wake_up_q(&wake_q);
466
467 /* wait to be given the lock */
468 while (true) {
469 set_current_state(state);
470 if (!waiter.task)
471 break;
472 if (signal_pending_state(state, current)) {
473 raw_spin_lock_irq(&sem->wait_lock);
474 if (waiter.task)
475 goto out_nolock;
476 raw_spin_unlock_irq(&sem->wait_lock);
477 break;
478 }
479 schedule();
480 lockevent_inc(rwsem_sleep_reader);
481 }
482
483 __set_current_state(TASK_RUNNING);
484 lockevent_inc(rwsem_rlock);
485 return sem;
486out_nolock:
487 list_del(&waiter.list);
488 if (list_empty(&sem->wait_list))
489 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
490 raw_spin_unlock_irq(&sem->wait_lock);
491 __set_current_state(TASK_RUNNING);
492 lockevent_inc(rwsem_rlock_fail);
493 return ERR_PTR(-EINTR);
494}
495
496__visible struct rw_semaphore * __sched
497rwsem_down_read_failed(struct rw_semaphore *sem)
498{
499 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
500}
501EXPORT_SYMBOL(rwsem_down_read_failed);
502
503__visible struct rw_semaphore * __sched
504rwsem_down_read_failed_killable(struct rw_semaphore *sem)
505{
506 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
507}
508EXPORT_SYMBOL(rwsem_down_read_failed_killable);
509
510/*
511 * Wait until we successfully acquire the write lock
512 */
513static inline struct rw_semaphore *
514__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
515{
516 long count;
517 bool waiting = true; /* any queued threads before us */
518 struct rwsem_waiter waiter;
519 struct rw_semaphore *ret = sem;
520 DEFINE_WAKE_Q(wake_q);
521
522 /* undo write bias from down_write operation, stop active locking */
523 count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
524
525 /* do optimistic spinning and steal lock if possible */
526 if (rwsem_optimistic_spin(sem))
527 return sem;
528
529 /*
530 * Optimistic spinning failed, proceed to the slowpath
531 * and block until we can acquire the sem.
532 */
533 waiter.task = current;
534 waiter.type = RWSEM_WAITING_FOR_WRITE;
535
536 raw_spin_lock_irq(&sem->wait_lock);
537
538 /* account for this before adding a new element to the list */
539 if (list_empty(&sem->wait_list))
540 waiting = false;
541
542 list_add_tail(&waiter.list, &sem->wait_list);
543
544 /* we're now waiting on the lock, but no longer actively locking */
545 if (waiting) {
546 count = atomic_long_read(&sem->count);
547
548 /*
549 * If there were already threads queued before us and there are
550 * no active writers, the lock must be read owned; so we try to
551 * wake any read locks that were queued ahead of us.
552 */
553 if (count > RWSEM_WAITING_BIAS) {
554 __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
555 /*
556 * The wakeup is normally called _after_ the wait_lock
557 * is released, but given that we are proactively waking
558 * readers we can deal with the wake_q overhead as it is
559 * similar to releasing and taking the wait_lock again
560 * for attempting rwsem_try_write_lock().
561 */
562 wake_up_q(&wake_q);
563
564 /*
565 * Reinitialize wake_q after use.
566 */
567 wake_q_init(&wake_q);
568 }
569
570 } else
571 count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
572
573 /* wait until we successfully acquire the lock */
574 set_current_state(state);
575 while (true) {
576 if (rwsem_try_write_lock(count, sem))
577 break;
578 raw_spin_unlock_irq(&sem->wait_lock);
579
580 /* Block until there are no active lockers. */
581 do {
582 if (signal_pending_state(state, current))
583 goto out_nolock;
584
585 schedule();
586 lockevent_inc(rwsem_sleep_writer);
587 set_current_state(state);
588 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
589
590 raw_spin_lock_irq(&sem->wait_lock);
591 }
592 __set_current_state(TASK_RUNNING);
593 list_del(&waiter.list);
594 raw_spin_unlock_irq(&sem->wait_lock);
595 lockevent_inc(rwsem_wlock);
596
597 return ret;
598
599out_nolock:
600 __set_current_state(TASK_RUNNING);
601 raw_spin_lock_irq(&sem->wait_lock);
602 list_del(&waiter.list);
603 if (list_empty(&sem->wait_list))
604 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
605 else
606 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
607 raw_spin_unlock_irq(&sem->wait_lock);
608 wake_up_q(&wake_q);
609 lockevent_inc(rwsem_wlock_fail);
610
611 return ERR_PTR(-EINTR);
612}
613
614__visible struct rw_semaphore * __sched
615rwsem_down_write_failed(struct rw_semaphore *sem)
616{
617 return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
618}
619EXPORT_SYMBOL(rwsem_down_write_failed);
620
621__visible struct rw_semaphore * __sched
622rwsem_down_write_failed_killable(struct rw_semaphore *sem)
623{
624 return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
625}
626EXPORT_SYMBOL(rwsem_down_write_failed_killable);
627
628/*
629 * handle waking up a waiter on the semaphore
630 * - up_read/up_write has decremented the active part of count if we come here
631 */
632__visible
633struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
634{
635 unsigned long flags;
636 DEFINE_WAKE_Q(wake_q);
637
638 /*
639 * __rwsem_down_write_failed_common(sem)
640 * rwsem_optimistic_spin(sem)
641 * osq_unlock(sem->osq)
642 * ...
643 * atomic_long_add_return(&sem->count)
644 *
645 * - VS -
646 *
647 * __up_write()
648 * if (atomic_long_sub_return_release(&sem->count) < 0)
649 * rwsem_wake(sem)
650 * osq_is_locked(&sem->osq)
651 *
652 * And __up_write() must observe !osq_is_locked() when it observes the
653 * atomic_long_add_return() in order to not miss a wakeup.
654 *
655 * This boils down to:
656 *
657 * [S.rel] X = 1 [RmW] r0 = (Y += 0)
658 * MB RMB
659 * [RmW] Y += 1 [L] r1 = X
660 *
661 * exists (r0=1 /\ r1=0)
662 */
663 smp_rmb();
664
665 /*
666 * If a spinner is present, it is not necessary to do the wakeup.
667 * Try to do wakeup only if the trylock succeeds to minimize
668 * spinlock contention which may introduce too much delay in the
669 * unlock operation.
670 *
671 * spinning writer up_write/up_read caller
672 * --------------- -----------------------
673 * [S] osq_unlock() [L] osq
674 * MB RMB
675 * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
676 *
677 * Here, it is important to make sure that there won't be a missed
678 * wakeup while the rwsem is free and the only spinning writer goes
679 * to sleep without taking the rwsem. Even when the spinning writer
680 * is just going to break out of the waiting loop, it will still do
681 * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
682 * rwsem_has_spinner() is true, it will guarantee at least one
683 * trylock attempt on the rwsem later on.
684 */
685 if (rwsem_has_spinner(sem)) {
686 /*
687 * The smp_rmb() here is to make sure that the spinner
688 * state is consulted before reading the wait_lock.
689 */
690 smp_rmb();
691 if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
692 return sem;
693 goto locked;
694 }
695 raw_spin_lock_irqsave(&sem->wait_lock, flags);
696locked:
697
698 if (!list_empty(&sem->wait_list))
699 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
700
701 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
702 wake_up_q(&wake_q);
703
704 return sem;
705}
706EXPORT_SYMBOL(rwsem_wake);
707
708/*
709 * downgrade a write lock into a read lock
710 * - caller incremented waiting part of count and discovered it still negative
711 * - just wake up any readers at the front of the queue
712 */
713__visible
714struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
715{
716 unsigned long flags;
717 DEFINE_WAKE_Q(wake_q);
718
719 raw_spin_lock_irqsave(&sem->wait_lock, flags);
720
721 if (!list_empty(&sem->wait_list))
722 __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
723
724 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
725 wake_up_q(&wake_q);
726
727 return sem;
728}
729EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index ccbf18f560ff..37524a47f002 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -3,17 +3,1438 @@
3 * 3 *
4 * Written by David Howells (dhowells@redhat.com). 4 * Written by David Howells (dhowells@redhat.com).
5 * Derived from asm-i386/semaphore.h 5 * Derived from asm-i386/semaphore.h
6 *
7 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8 * and Michel Lespinasse <walken@google.com>
9 *
10 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12 *
13 * Rwsem count bit fields re-definition and rwsem rearchitecture by
14 * Waiman Long <longman@redhat.com> and
15 * Peter Zijlstra <peterz@infradead.org>.
6 */ 16 */
7 17
8#include <linux/types.h> 18#include <linux/types.h>
9#include <linux/kernel.h> 19#include <linux/kernel.h>
10#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/sched/rt.h>
22#include <linux/sched/task.h>
11#include <linux/sched/debug.h> 23#include <linux/sched/debug.h>
24#include <linux/sched/wake_q.h>
25#include <linux/sched/signal.h>
26#include <linux/sched/clock.h>
12#include <linux/export.h> 27#include <linux/export.h>
13#include <linux/rwsem.h> 28#include <linux/rwsem.h>
14#include <linux/atomic.h> 29#include <linux/atomic.h>
15 30
16#include "rwsem.h" 31#include "rwsem.h"
32#include "lock_events.h"
33
34/*
35 * The least significant 3 bits of the owner value have the following
36 * meanings when set.
37 * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
38 * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock.
39 * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock.
40 *
41 * When the rwsem is either owned by an anonymous writer, or it is
42 * reader-owned, but a spinning writer has timed out, both nonspinnable
43 * bits will be set to disable optimistic spinning by readers and writers.
44 * In the latter case, the last unlocking reader should then check the
45 * writer nonspinnable bit and clear it only to give writers preference
46 * to acquire the lock via optimistic spinning, but not readers. Similar
47 * action is also done in the reader slowpath.
48 *
49 * When a writer acquires a rwsem, it puts its task_struct pointer
50 * into the owner field. It is cleared after an unlock.
51 *
52 * When a reader acquires a rwsem, it will also put its task_struct
53 * pointer into the owner field with the RWSEM_READER_OWNED bit set.
54 * On unlock, the owner field will largely be left untouched. So
55 * for a free or reader-owned rwsem, the owner value may contain
56 * information about the last reader that acquires the rwsem.
57 *
58 * That information may be helpful in debugging cases where the system
59 * seems to hang on a reader owned rwsem especially if only one reader
60 * is involved. Ideally we would like to track all the readers that own
61 * a rwsem, but the overhead is simply too big.
62 *
63 * Reader optimistic spinning is helpful when the reader critical section
64 * is short and there aren't that many readers around. It makes readers
65 * relatively more preferred than writers. When a writer times out spinning
66 * on a reader-owned lock and sets the nonspinnable bits, there are two main
67 * reasons for that.
68 *
69 * 1) The reader critical section is long, perhaps the task sleeps after
70 * acquiring the read lock.
71 * 2) There are just too many readers contending the lock causing it to
72 * take a while to service all of them.
73 *
74 * In the former case, a long reader critical section will impede the progress
75 * of writers, which is usually more important for system performance. In
76 * the latter case, reader optimistic spinning tends to make the reader
77 * groups that contain readers that acquire the lock together smaller
78 * leading to more of them. That may hurt performance in some cases. In
79 * other words, the setting of nonspinnable bits indicates that reader
80 * optimistic spinning may not be helpful for those workloads that cause
81 * it.
82 *
83 * Therefore, any writers that had observed the setting of the writer
84 * nonspinnable bit for a given rwsem after they fail to acquire the lock
85 * via optimistic spinning will set the reader nonspinnable bit once they
86 * acquire the write lock. Similarly, readers that observe the setting
87 * of reader nonspinnable bit at slowpath entry will set the reader
88 * nonspinnable bits when they acquire the read lock via the wakeup path.
89 *
90 * Once the reader nonspinnable bit is on, it will only be reset when
91 * a writer is able to acquire the rwsem in the fast path or somehow a
92 * reader or writer in the slowpath doesn't observe the nonspinnable bit.
93 *
94 * This is to discourage reader optimistic spinning on that particular
95 * rwsem and make writers more preferred. This adaptive disabling of reader
96 * optimistic spinning will alleviate the negative side effect of this
97 * feature.
98 */
99#define RWSEM_READER_OWNED (1UL << 0)
100#define RWSEM_RD_NONSPINNABLE (1UL << 1)
101#define RWSEM_WR_NONSPINNABLE (1UL << 2)
102#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
103#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
104
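
A minimal standalone sketch of how these flag bits share the owner word with a task_struct pointer, mirroring the decode done by rwsem_owner_flags() further down; the struct task_struct here is only an aligned stand-in for illustration, not the kernel's.

#include <assert.h>
#include <stdio.h>

#define RWSEM_READER_OWNED	(1UL << 0)
#define RWSEM_RD_NONSPINNABLE	(1UL << 1)
#define RWSEM_WR_NONSPINNABLE	(1UL << 2)
#define RWSEM_NONSPINNABLE	(RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)

/* Stand-in; alignment >= 8 keeps the low three bits of its address free. */
struct task_struct { char comm[16]; };

int main(void)
{
	static _Alignas(8) struct task_struct reader = { "reader" };
	/* A reader-owned rwsem on which writers may no longer spin: */
	unsigned long owner = (unsigned long)&reader |
			      RWSEM_READER_OWNED | RWSEM_WR_NONSPINNABLE;

	/* Decode the word the same way rwsem_owner_flags() does. */
	unsigned long flags = owner & RWSEM_OWNER_FLAGS_MASK;
	struct task_struct *task =
		(struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);

	assert(task == &reader);
	assert(flags & RWSEM_READER_OWNED);
	assert(!(flags & RWSEM_RD_NONSPINNABLE));
	printf("task %p, flags %#lx\n", (void *)task, flags);
	return 0;
}

Because the task structure is sufficiently aligned that its low bits are zero, the three flag bits never collide with the pointer, which is what rwsem_owner() and rwsem_owner_flags() rely on.
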
105#ifdef CONFIG_DEBUG_RWSEMS
106# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
107 if (!debug_locks_silent && \
108 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
109 #c, atomic_long_read(&(sem)->count), \
110 atomic_long_read(&(sem)->owner), (long)current, \
111 list_empty(&(sem)->wait_list) ? "" : "not ")) \
112 debug_locks_off(); \
113 } while (0)
114#else
115# define DEBUG_RWSEMS_WARN_ON(c, sem)
116#endif
117
118/*
119 * On 64-bit architectures, the bit definitions of the count are:
120 *
121 * Bit 0 - writer locked bit
122 * Bit 1 - waiters present bit
123 * Bit 2 - lock handoff bit
124 * Bits 3-7 - reserved
125 * Bits 8-62 - 55-bit reader count
126 * Bit 63 - read fail bit
127 *
128 * On 32-bit architectures, the bit definitions of the count are:
129 *
130 * Bit 0 - writer locked bit
131 * Bit 1 - waiters present bit
132 * Bit 2 - lock handoff bit
133 * Bits 3-7 - reserved
134 * Bits 8-30 - 23-bit reader count
135 * Bit 31 - read fail bit
136 *
137 * It is not likely that the most significant bit (read fail bit) will ever
138 * be set. This guard bit is still checked anyway in the down_read() fastpath
139 * just in case we need to use up more of the reader bits for other purpose
140 * in the future.
141 *
142 * atomic_long_fetch_add() is used to obtain reader lock, whereas
143 * atomic_long_cmpxchg() will be used to obtain writer lock.
144 *
145 * There are three places where the lock handoff bit may be set or cleared.
146 * 1) rwsem_mark_wake() for readers.
147 * 2) rwsem_try_write_lock() for writers.
148 * 3) Error path of rwsem_down_write_slowpath().
149 *
150 * For all the above cases, wait_lock will be held. A writer must also
151 * be the first one in the wait_list to be eligible for setting the handoff
152 * bit. So concurrent setting/clearing of handoff bit is not possible.
153 */
154#define RWSEM_WRITER_LOCKED (1UL << 0)
155#define RWSEM_FLAG_WAITERS (1UL << 1)
156#define RWSEM_FLAG_HANDOFF (1UL << 2)
157#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
158
159#define RWSEM_READER_SHIFT 8
160#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
161#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1))
162#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
163#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
164#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
165 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
166
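
A few representative count words built from just the definitions above (standalone illustration; readers are counted from bit 8 and the flags occupy bits 0-2, as the comment describes):

#include <stdio.h>

#define RWSEM_WRITER_LOCKED	(1UL << 0)
#define RWSEM_FLAG_WAITERS	(1UL << 1)
#define RWSEM_FLAG_HANDOFF	(1UL << 2)
#define RWSEM_READER_SHIFT	8
#define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)

int main(void)
{
	/* One writer holding the lock, waiters queued: 0x3 */
	printf("%#lx\n", RWSEM_WRITER_LOCKED | RWSEM_FLAG_WAITERS);
	/* Three readers, no waiters: 0x300 */
	printf("%#lx\n", 3 * RWSEM_READER_BIAS);
	/* One reader, waiters queued, handoff requested: 0x106 */
	printf("%#lx\n", RWSEM_READER_BIAS | RWSEM_FLAG_WAITERS | RWSEM_FLAG_HANDOFF);
	return 0;
}

Readers add or subtract RWSEM_READER_BIAS, so the reader count occupies the bits above RWSEM_READER_SHIFT while the low flag bits can still be tested independently.
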
167/*
168 * All writes to owner are protected by WRITE_ONCE() to make sure that
169 * store tearing can't happen as optimistic spinners may read and use
170 * the owner value concurrently without lock. Read from owner, however,
171 * may not need READ_ONCE() as long as the pointer value is only used
172 * for comparison and isn't being dereferenced.
173 */
174static inline void rwsem_set_owner(struct rw_semaphore *sem)
175{
176 atomic_long_set(&sem->owner, (long)current);
177}
178
179static inline void rwsem_clear_owner(struct rw_semaphore *sem)
180{
181 atomic_long_set(&sem->owner, 0);
182}
183
184/*
185 * Test the flags in the owner field.
186 */
187static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
188{
189 return atomic_long_read(&sem->owner) & flags;
190}
191
192/*
193 * The task_struct pointer of the last owning reader will be left in
194 * the owner field.
195 *
196 * Note that the owner value just indicates the task has owned the rwsem
197 * previously, it may not be the real owner or one of the real owners
198 * anymore when that field is examined, so take it with a grain of salt.
199 *
200 * The reader non-spinnable bit is preserved.
201 */
202static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
203 struct task_struct *owner)
204{
205 unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
206 (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE);
207
208 atomic_long_set(&sem->owner, val);
209}
210
211static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
212{
213 __rwsem_set_reader_owned(sem, current);
214}
215
216/*
217 * Return true if the rwsem is owned by a reader.
218 */
219static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
220{
221#ifdef CONFIG_DEBUG_RWSEMS
222 /*
223 * Check the count to see if it is write-locked.
224 */
225 long count = atomic_long_read(&sem->count);
226
227 if (count & RWSEM_WRITER_MASK)
228 return false;
229#endif
230 return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
231}
232
233#ifdef CONFIG_DEBUG_RWSEMS
234/*
235 * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
236 * is a task pointer in owner of a reader-owned rwsem, it will be the
237 * real owner or one of the real owners. The only exception is when the
238 * unlock is done by up_read_non_owner().
239 */
240static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
241{
242 unsigned long val = atomic_long_read(&sem->owner);
243
244 while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
245 if (atomic_long_try_cmpxchg(&sem->owner, &val,
246 val & RWSEM_OWNER_FLAGS_MASK))
247 return;
248 }
249}
250#else
251static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
252{
253}
254#endif
255
256/*
257 * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag
258 * remains set. Otherwise, the operation will be aborted.
259 */
260static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
261{
262 unsigned long owner = atomic_long_read(&sem->owner);
263
264 do {
265 if (!(owner & RWSEM_READER_OWNED))
266 break;
267 if (owner & RWSEM_NONSPINNABLE)
268 break;
269 } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
270 owner | RWSEM_NONSPINNABLE));
271}
272
273static inline bool rwsem_read_trylock(struct rw_semaphore *sem)
274{
275 long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
276 if (WARN_ON_ONCE(cnt < 0))
277 rwsem_set_nonspinnable(sem);
278 return !(cnt & RWSEM_READ_FAILED_MASK);
279}
280
281/*
282 * Return just the real task structure pointer of the owner
283 */
284static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
285{
286 return (struct task_struct *)
287 (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
288}
289
290/*
291 * Return the real task structure pointer of the owner and the embedded
292 * flags in the owner. pflags must be non-NULL.
293 */
294static inline struct task_struct *
295rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
296{
297 unsigned long owner = atomic_long_read(&sem->owner);
298
299 *pflags = owner & RWSEM_OWNER_FLAGS_MASK;
300 return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
301}
302
303/*
304 * Guide to the rw_semaphore's count field.
305 *
306 * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
307 * by a writer.
308 *
309 * The lock is owned by readers when
310 * (1) the RWSEM_WRITER_LOCKED isn't set in count,
311 * (2) some of the reader bits are set in count, and
312 * (3) the owner field has the RWSEM_READER_OWNED bit set.
313 *
314 * Having some reader bits set is not enough to guarantee a reader-owned
315 * lock as the readers may be in the process of backing out from the count
316 * and a writer has just released the lock. So another writer may steal
317 * the lock immediately after that.
318 */
319
320/*
321 * Initialize an rwsem:
322 */
323void __init_rwsem(struct rw_semaphore *sem, const char *name,
324 struct lock_class_key *key)
325{
326#ifdef CONFIG_DEBUG_LOCK_ALLOC
327 /*
328 * Make sure we are not reinitializing a held semaphore:
329 */
330 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
331 lockdep_init_map(&sem->dep_map, name, key, 0);
332#endif
333 atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
334 raw_spin_lock_init(&sem->wait_lock);
335 INIT_LIST_HEAD(&sem->wait_list);
336 atomic_long_set(&sem->owner, 0L);
337#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
338 osq_lock_init(&sem->osq);
339#endif
340}
341EXPORT_SYMBOL(__init_rwsem);
342
343enum rwsem_waiter_type {
344 RWSEM_WAITING_FOR_WRITE,
345 RWSEM_WAITING_FOR_READ
346};
347
348struct rwsem_waiter {
349 struct list_head list;
350 struct task_struct *task;
351 enum rwsem_waiter_type type;
352 unsigned long timeout;
353 unsigned long last_rowner;
354};
355#define rwsem_first_waiter(sem) \
356 list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
357
358enum rwsem_wake_type {
359 RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
360 RWSEM_WAKE_READERS, /* Wake readers only */
361 RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
362};
363
364enum writer_wait_state {
365 WRITER_NOT_FIRST, /* Writer is not first in wait list */
366 WRITER_FIRST, /* Writer is first in wait list */
367 WRITER_HANDOFF /* Writer is first & handoff needed */
368};
369
370/*
371 * The typical HZ value is either 250 or 1000. So set the minimum waiting
372 * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
373 * queue before initiating the handoff protocol.
374 */
375#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250)
376
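
A quick check of what RWSEM_WAIT_TIMEOUT works out to for common HZ choices, using the usual DIV_ROUND_UP() definition (reproduced here so the example is standalone; it is not defined in this file):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int hz[] = { 100, 250, 300, 1000 };

	for (int i = 0; i < 4; i++) {
		int timeout = DIV_ROUND_UP(hz[i], 250);	/* RWSEM_WAIT_TIMEOUT */
		/* one jiffy is 1000/HZ milliseconds */
		printf("HZ=%4d -> %d jiffies (%d ms)\n",
		       hz[i], timeout, timeout * 1000 / hz[i]);
	}
	return 0;
}

For HZ=250 and HZ=1000 this gives 1 and 4 jiffies respectively, i.e. 4 ms in both cases, and a single 10 ms jiffy for HZ=100, consistent with the "at least 4ms or 1 jiffy" rule stated above.
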
377/*
378 * Magic number to batch-wakeup waiting readers, even when writers are
379 * also present in the queue. This both limits the amount of work the
380 * waking thread must do and also prevents any potential counter overflow,
381 * however unlikely.
382 */
383#define MAX_READERS_WAKEUP 0x100
384
385/*
386 * handle the lock release when processes blocked on it that can now run
387 * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
388 * have been set.
389 * - there must be someone on the queue
390 * - the wait_lock must be held by the caller
391 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
392 * to actually wakeup the blocked task(s) and drop the reference count,
393 * preferably when the wait_lock is released
394 * - woken process blocks are discarded from the list after having task zeroed
395 * - writers are only marked woken if downgrading is false
396 */
397static void rwsem_mark_wake(struct rw_semaphore *sem,
398 enum rwsem_wake_type wake_type,
399 struct wake_q_head *wake_q)
400{
401 struct rwsem_waiter *waiter, *tmp;
402 long oldcount, woken = 0, adjustment = 0;
403 struct list_head wlist;
404
405 lockdep_assert_held(&sem->wait_lock);
406
407 /*
408 * Take a peek at the queue head waiter such that we can determine
409 * the wakeup(s) to perform.
410 */
411 waiter = rwsem_first_waiter(sem);
412
413 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
414 if (wake_type == RWSEM_WAKE_ANY) {
415 /*
416 * Mark writer at the front of the queue for wakeup.
417 * Until the task is actually awoken later by
418 * the caller, other writers are able to steal it.
419 * Readers, on the other hand, will block as they
420 * will notice the queued writer.
421 */
422 wake_q_add(wake_q, waiter->task);
423 lockevent_inc(rwsem_wake_writer);
424 }
425
426 return;
427 }
428
429 /*
430 * No reader wakeup if there are too many of them already.
431 */
432 if (unlikely(atomic_long_read(&sem->count) < 0))
433 return;
434
435 /*
436 * Writers might steal the lock before we grant it to the next reader.
437 * We prefer to do the first reader grant before counting readers
438 * so we can bail out early if a writer stole the lock.
439 */
440 if (wake_type != RWSEM_WAKE_READ_OWNED) {
441 struct task_struct *owner;
442
443 adjustment = RWSEM_READER_BIAS;
444 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
445 if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
446 /*
447 * When we've been waiting "too" long (for writers
448 * to give up the lock), request a HANDOFF to
449 * force the issue.
450 */
451 if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
452 time_after(jiffies, waiter->timeout)) {
453 adjustment -= RWSEM_FLAG_HANDOFF;
454 lockevent_inc(rwsem_rlock_handoff);
455 }
456
457 atomic_long_add(-adjustment, &sem->count);
458 return;
459 }
460 /*
461 * Set it to reader-owned to give spinners an early
462 * indication that readers now have the lock.
463 * The reader nonspinnable bit seen at slowpath entry of
464 * the reader is copied over.
465 */
466 owner = waiter->task;
467 if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) {
468 owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE);
469 lockevent_inc(rwsem_opt_norspin);
470 }
471 __rwsem_set_reader_owned(sem, owner);
472 }
473
474 /*
475 * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
476 * queue. We know that the woken will be at least 1 as we accounted
477 * for above. Note we increment the 'active part' of the count by the
478 * number of readers before waking any processes up.
479 *
480 * This is an adaptation of the phase-fair R/W locks where at the
481 * reader phase (first waiter is a reader), all readers are eligible
482 * to acquire the lock at the same time irrespective of their order
483 * in the queue. The writers acquire the lock according to their
484 * order in the queue.
485 *
486 * We have to do wakeup in 2 passes to prevent the possibility that
487 * the reader count may be decremented before it is incremented. It
488 * is because the to-be-woken waiter may not have slept yet. So it
489 * may see waiter->task got cleared, finish its critical section and
490 * do an unlock before the reader count increment.
491 *
492 * 1) Collect the read-waiters in a separate list, count them and
493 * fully increment the reader count in rwsem.
494 * 2) For each waiter in the new list, clear waiter->task and
495 * put them into wake_q to be woken up later.
496 */
497 INIT_LIST_HEAD(&wlist);
498 list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
499 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
500 continue;
501
502 woken++;
503 list_move_tail(&waiter->list, &wlist);
504
505 /*
506 * Limit # of readers that can be woken up per wakeup call.
507 */
508 if (woken >= MAX_READERS_WAKEUP)
509 break;
510 }
511
512 adjustment = woken * RWSEM_READER_BIAS - adjustment;
513 lockevent_cond_inc(rwsem_wake_reader, woken);
514 if (list_empty(&sem->wait_list)) {
515 /* hit end of list above */
516 adjustment -= RWSEM_FLAG_WAITERS;
517 }
518
519 /*
520 * When we've woken a reader, we no longer need to force writers
521 * to give up the lock and we can clear HANDOFF.
522 */
523 if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
524 adjustment -= RWSEM_FLAG_HANDOFF;
525
526 if (adjustment)
527 atomic_long_add(adjustment, &sem->count);
528
529 /* 2nd pass */
530 list_for_each_entry_safe(waiter, tmp, &wlist, list) {
531 struct task_struct *tsk;
532
533 tsk = waiter->task;
534 get_task_struct(tsk);
535
536 /*
537 * Ensure calling get_task_struct() before setting the reader
538 * waiter to nil such that rwsem_down_read_slowpath() cannot
539 * race with do_exit() by always holding a reference count
540 * to the task to wakeup.
541 */
542 smp_store_release(&waiter->task, NULL);
543 /*
544 * Ensure issuing the wakeup (either by us or someone else)
545 * after setting the reader waiter to nil.
546 */
547 wake_q_add_safe(wake_q, tsk);
548 }
549}
550
551/*
552 * This function must be called with the sem->wait_lock held to prevent
553 * race conditions between checking the rwsem wait list and setting the
554 * sem->count accordingly.
555 *
556 * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
557 * bit is set or the lock is acquired with handoff bit cleared.
558 */
559static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
560 enum writer_wait_state wstate)
561{
562 long count, new;
563
564 lockdep_assert_held(&sem->wait_lock);
565
566 count = atomic_long_read(&sem->count);
567 do {
568 bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
569
570 if (has_handoff && wstate == WRITER_NOT_FIRST)
571 return false;
572
573 new = count;
574
575 if (count & RWSEM_LOCK_MASK) {
576 if (has_handoff || (wstate != WRITER_HANDOFF))
577 return false;
578
579 new |= RWSEM_FLAG_HANDOFF;
580 } else {
581 new |= RWSEM_WRITER_LOCKED;
582 new &= ~RWSEM_FLAG_HANDOFF;
583
584 if (list_is_singular(&sem->wait_list))
585 new &= ~RWSEM_FLAG_WAITERS;
586 }
587 } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
588
589 /*
590 * We have either acquired the lock with handoff bit cleared or
591 * set the handoff bit.
592 */
593 if (new & RWSEM_FLAG_HANDOFF)
594 return false;
595
596 rwsem_set_owner(sem);
597 return true;
598}
599
600#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
601/*
602 * Try to acquire read lock before the reader is put on wait queue.
603 * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff
604 * is ongoing.
605 */
606static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem)
607{
608 long count = atomic_long_read(&sem->count);
609
610 if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))
611 return false;
612
613 count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count);
614 if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
615 rwsem_set_reader_owned(sem);
616 lockevent_inc(rwsem_opt_rlock);
617 return true;
618 }
619
620 /* Back out the change */
621 atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
622 return false;
623}
624
625/*
626 * Try to acquire write lock before the writer has been put on wait queue.
627 */
628static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
629{
630 long count = atomic_long_read(&sem->count);
631
632 while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
633 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
634 count | RWSEM_WRITER_LOCKED)) {
635 rwsem_set_owner(sem);
636 lockevent_inc(rwsem_opt_wlock);
637 return true;
638 }
639 }
640 return false;
641}
642
643static inline bool owner_on_cpu(struct task_struct *owner)
644{
645 /*
646	 * Due to the lock holder preemption issue, we skip spinning if the
647	 * task is not running on a CPU or its CPU has been preempted.
648 */
649 return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
650}
651
652static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
653 unsigned long nonspinnable)
654{
655 struct task_struct *owner;
656 unsigned long flags;
657 bool ret = true;
658
659 BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
660
661 if (need_resched()) {
662 lockevent_inc(rwsem_opt_fail);
663 return false;
664 }
665
666 preempt_disable();
667 rcu_read_lock();
668 owner = rwsem_owner_flags(sem, &flags);
669 if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner)))
670 ret = false;
671 rcu_read_unlock();
672 preempt_enable();
673
674 lockevent_cond_inc(rwsem_opt_fail, !ret);
675 return ret;
676}
677
678/*
679 * The rwsem_spin_on_owner() function returns the following 4 values
680 * depending on the lock owner state.
681 * OWNER_NULL : owner is currently NULL
682 * OWNER_WRITER: when owner changes and is a writer
683 * OWNER_READER: when owner changes and the new owner may be a reader.
684 * OWNER_NONSPINNABLE:
685 * when optimistic spinning has to stop because either the
686 * owner stops running, is unknown, or its timeslice has
687 * been used up.
688 */
689enum owner_state {
690 OWNER_NULL = 1 << 0,
691 OWNER_WRITER = 1 << 1,
692 OWNER_READER = 1 << 2,
693 OWNER_NONSPINNABLE = 1 << 3,
694};
695#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
696
697static inline enum owner_state
698rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable)
699{
700 if (flags & nonspinnable)
701 return OWNER_NONSPINNABLE;
702
703 if (flags & RWSEM_READER_OWNED)
704 return OWNER_READER;
705
706 return owner ? OWNER_WRITER : OWNER_NULL;
707}
708
709static noinline enum owner_state
710rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
711{
712 struct task_struct *new, *owner;
713 unsigned long flags, new_flags;
714 enum owner_state state;
715
716 owner = rwsem_owner_flags(sem, &flags);
717 state = rwsem_owner_state(owner, flags, nonspinnable);
718 if (state != OWNER_WRITER)
719 return state;
720
721 rcu_read_lock();
722 for (;;) {
723 if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
724 state = OWNER_NONSPINNABLE;
725 break;
726 }
727
728 new = rwsem_owner_flags(sem, &new_flags);
729 if ((new != owner) || (new_flags != flags)) {
730 state = rwsem_owner_state(new, new_flags, nonspinnable);
731 break;
732 }
733
734 /*
735		 * Ensure we emit the owner->on_cpu dereference _after_
736		 * checking that sem->owner still matches owner. If that fails,
737		 * owner might point to free()d memory; if it still matches,
738		 * the rcu_read_lock() ensures the memory stays valid.
739 */
740 barrier();
741
742 if (need_resched() || !owner_on_cpu(owner)) {
743 state = OWNER_NONSPINNABLE;
744 break;
745 }
746
747 cpu_relax();
748 }
749 rcu_read_unlock();
750
751 return state;
752}
753
754/*
755 * Calculate reader-owned rwsem spinning threshold for writer
756 *
757 * The more readers own the rwsem, the longer it will take for them to
758 * wind down and free the rwsem. So the empirical formula used to
759 * determine the actual spinning time limit here is:
760 *
761 * Spinning threshold = (10 + nr_readers/2)us
762 *
763 * The limit is capped to a maximum of 25us (30 readers). This is just
764 * a heuristic and is subject to change in the future.
765 */
766static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
767{
768 long count = atomic_long_read(&sem->count);
769 int readers = count >> RWSEM_READER_SHIFT;
770 u64 delta;
771
772 if (readers > 30)
773 readers = 30;
774 delta = (20 + readers) * NSEC_PER_USEC / 2;
775
776 return sched_clock() + delta;
777}
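As a quick check of the formula in the comment above, here is a standalone userspace sketch (not part of the patch) that mirrors the delta computation, assuming NSEC_PER_USEC == 1000 as in the kernel:

#include <stdio.h>

#define NSEC_PER_USEC 1000ULL		/* assumed, matches the kernel value */

/* Mirror of the threshold arithmetic: (20 + readers) / 2 us == (10 + readers/2) us */
static unsigned long long rspin_delta_ns(int readers)
{
	if (readers > 30)		/* cap: 30 or more readers -> 25 us */
		readers = 30;
	return (20 + readers) * NSEC_PER_USEC / 2;
}

int main(void)
{
	int r;

	for (r = 1; r <= 40; r += 13)	/* 1, 14, 27, 40 readers */
		printf("%2d readers -> %llu ns\n", r, rspin_delta_ns(r));
	return 0;			/* prints 10500, 17000, 23500, 25000 */
}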
778
779static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
780{
781 bool taken = false;
782 int prev_owner_state = OWNER_NULL;
783 int loop = 0;
784 u64 rspin_threshold = 0;
785 unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE
786 : RWSEM_RD_NONSPINNABLE;
787
788 preempt_disable();
789
790 /* sem->wait_lock should not be held when doing optimistic spinning */
791 if (!osq_lock(&sem->osq))
792 goto done;
793
794 /*
795 * Optimistically spin on the owner field and attempt to acquire the
796 * lock whenever the owner changes. Spinning will be stopped when:
797 * 1) the owning writer isn't running; or
798 * 2) readers own the lock and spinning time has exceeded limit.
799 */
800 for (;;) {
801 enum owner_state owner_state;
802
803 owner_state = rwsem_spin_on_owner(sem, nonspinnable);
804 if (!(owner_state & OWNER_SPINNABLE))
805 break;
806
807 /*
808 * Try to acquire the lock
809 */
810 taken = wlock ? rwsem_try_write_lock_unqueued(sem)
811 : rwsem_try_read_lock_unqueued(sem);
812
813 if (taken)
814 break;
815
816 /*
817 * Time-based reader-owned rwsem optimistic spinning
818 */
819 if (wlock && (owner_state == OWNER_READER)) {
820 /*
821			 * Re-initialize rspin_threshold every time the
822			 * owner state changes from non-reader to reader.
823 * This allows a writer to steal the lock in between
824 * 2 reader phases and have the threshold reset at
825 * the beginning of the 2nd reader phase.
826 */
827 if (prev_owner_state != OWNER_READER) {
828 if (rwsem_test_oflags(sem, nonspinnable))
829 break;
830 rspin_threshold = rwsem_rspin_threshold(sem);
831 loop = 0;
832 }
833
834 /*
835 * Check time threshold once every 16 iterations to
836 * avoid calling sched_clock() too frequently so
837 * as to reduce the average latency between the times
838 * when the lock becomes free and when the spinner
839 * is ready to do a trylock.
840 */
841 else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
842 rwsem_set_nonspinnable(sem);
843 lockevent_inc(rwsem_opt_nospin);
844 break;
845 }
846 }
847
848 /*
849		 * An RT task cannot do optimistic spinning if it cannot
850		 * be sure the lock holder is running, or a live-lock may
851		 * happen if the current task and the lock holder happen
852		 * to run on the same CPU. However, aborting optimistic
853		 * spinning as soon as a NULL owner is detected may miss an
854		 * opportunity where spinning could continue without causing
855		 * a problem.
856 *
857 * There are 2 possible cases where an RT task may be able
858 * to continue spinning.
859 *
860 * 1) The lock owner is in the process of releasing the
861 * lock, sem->owner is cleared but the lock has not
862 * been released yet.
863		 * 2) The lock was free and the owner cleared, but another
864		 *    task just came in and acquired the lock before
865		 *    we could get it. The new owner may be a spinnable
866		 *    writer.
867 *
868		 * To take advantage of the two scenarios listed above, the RT
869 * task is made to retry one more time to see if it can
870 * acquire the lock or continue spinning on the new owning
871 * writer. Of course, if the time lag is long enough or the
872 * new owner is not a writer or spinnable, the RT task will
873 * quit spinning.
874 *
875 * If the owner is a writer, the need_resched() check is
876		 * done inside rwsem_spin_on_owner(). If the owner is not
877		 * a writer, the need_resched() check needs to be done here.
878 */
879 if (owner_state != OWNER_WRITER) {
880 if (need_resched())
881 break;
882 if (rt_task(current) &&
883 (prev_owner_state != OWNER_WRITER))
884 break;
885 }
886 prev_owner_state = owner_state;
887
888 /*
889 * The cpu_relax() call is a compiler barrier which forces
890 * everything in this loop to be re-loaded. We don't need
891 * memory barriers as we'll eventually observe the right
892 * values at the cost of a few extra spins.
893 */
894 cpu_relax();
895 }
896 osq_unlock(&sem->osq);
897done:
898 preempt_enable();
899 lockevent_cond_inc(rwsem_opt_fail, !taken);
900 return taken;
901}
902
903/*
904 * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should
905 * only be called when the reader count reaches 0.
906 *
907 * This gives writers a better chance to acquire the rwsem before
908 * readers when the rwsem has been held by readers for a relatively long
909 * period of time. A race can happen in which an optimistic spinner has
910 * just stolen the rwsem and set the owner, but clearing the
911 * RWSEM_WR_NONSPINNABLE bit then will do no harm anyway.
912 */
913static inline void clear_wr_nonspinnable(struct rw_semaphore *sem)
914{
915 if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE))
916 atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner);
917}
918
919/*
920 * This function is called when the reader fails to acquire the lock via
921 * optimistic spinning. In this case we will still attempt a trylock if
922 * comparing the current rwsem state with the state at slowpath entry
923 * indicates that the reader is still in a valid reader phase.
924 * This happens when the following conditions are true:
925 *
926 * 1) The lock is currently reader owned, and
927 * 2) The lock was previously not reader-owned or the last read owner has changed.
928 *
929 * In the former case, we have transitioned from a writer phase to a
930 * reader-phase while spinning. In the latter case, it means the reader
931 * phase hasn't ended when we entered the optimistic spinning loop. In
932 * both cases, the reader is eligible to acquire the lock. This is the
933 * secondary path where a read lock is acquired optimistically.
934 *
935 * The reader non-spinnable bit wasn't set at the time of entry, or it
936 * would not be here at all.
937 */
938static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
939 unsigned long last_rowner)
940{
941 unsigned long owner = atomic_long_read(&sem->owner);
942
943 if (!(owner & RWSEM_READER_OWNED))
944 return false;
945
946 if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) &&
947 rwsem_try_read_lock_unqueued(sem)) {
948 lockevent_inc(rwsem_opt_rlock2);
949 lockevent_add(rwsem_opt_fail, -1);
950 return true;
951 }
952 return false;
953}
954#else
955static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
956 unsigned long nonspinnable)
957{
958 return false;
959}
960
961static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
962{
963 return false;
964}
965
966static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { }
967
968static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
969 unsigned long last_rowner)
970{
971 return false;
972}
973#endif
974
975/*
976 * Wait for the read lock to be granted
977 */
978static struct rw_semaphore __sched *
979rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
980{
981 long count, adjustment = -RWSEM_READER_BIAS;
982 struct rwsem_waiter waiter;
983 DEFINE_WAKE_Q(wake_q);
984 bool wake = false;
985
986 /*
987 * Save the current read-owner of rwsem, if available, and the
988 * reader nonspinnable bit.
989 */
990 waiter.last_rowner = atomic_long_read(&sem->owner);
991 if (!(waiter.last_rowner & RWSEM_READER_OWNED))
992 waiter.last_rowner &= RWSEM_RD_NONSPINNABLE;
993
994 if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE))
995 goto queue;
996
997 /*
998 * Undo read bias from down_read() and do optimistic spinning.
999 */
1000 atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
1001 adjustment = 0;
1002 if (rwsem_optimistic_spin(sem, false)) {
1003 /*
1004 * Wake up other readers in the wait list if the front
1005 * waiter is a reader.
1006 */
1007 if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) {
1008 raw_spin_lock_irq(&sem->wait_lock);
1009 if (!list_empty(&sem->wait_list))
1010 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
1011 &wake_q);
1012 raw_spin_unlock_irq(&sem->wait_lock);
1013 wake_up_q(&wake_q);
1014 }
1015 return sem;
1016 } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) {
1017 return sem;
1018 }
1019
1020queue:
1021 waiter.task = current;
1022 waiter.type = RWSEM_WAITING_FOR_READ;
1023 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1024
1025 raw_spin_lock_irq(&sem->wait_lock);
1026 if (list_empty(&sem->wait_list)) {
1027 /*
1028		 * In case the wait queue is empty, the lock isn't owned
1029		 * by a writer and the handoff bit isn't set, this reader can
1030 * exit the slowpath and return immediately as its
1031 * RWSEM_READER_BIAS has already been set in the count.
1032 */
1033 if (adjustment && !(atomic_long_read(&sem->count) &
1034 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
1035 raw_spin_unlock_irq(&sem->wait_lock);
1036 rwsem_set_reader_owned(sem);
1037 lockevent_inc(rwsem_rlock_fast);
1038 return sem;
1039 }
1040 adjustment += RWSEM_FLAG_WAITERS;
1041 }
1042 list_add_tail(&waiter.list, &sem->wait_list);
1043
1044 /* we're now waiting on the lock, but no longer actively locking */
1045 if (adjustment)
1046 count = atomic_long_add_return(adjustment, &sem->count);
1047 else
1048 count = atomic_long_read(&sem->count);
1049
1050 /*
1051 * If there are no active locks, wake the front queued process(es).
1052 *
1053 * If there are no writers and we are first in the queue,
1054	 * wake our own waiter to join the existing active readers!
1055 */
1056 if (!(count & RWSEM_LOCK_MASK)) {
1057 clear_wr_nonspinnable(sem);
1058 wake = true;
1059 }
1060 if (wake || (!(count & RWSEM_WRITER_MASK) &&
1061 (adjustment & RWSEM_FLAG_WAITERS)))
1062 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1063
1064 raw_spin_unlock_irq(&sem->wait_lock);
1065 wake_up_q(&wake_q);
1066
1067 /* wait to be given the lock */
1068 while (true) {
1069 set_current_state(state);
1070 if (!waiter.task)
1071 break;
1072 if (signal_pending_state(state, current)) {
1073 raw_spin_lock_irq(&sem->wait_lock);
1074 if (waiter.task)
1075 goto out_nolock;
1076 raw_spin_unlock_irq(&sem->wait_lock);
1077 break;
1078 }
1079 schedule();
1080 lockevent_inc(rwsem_sleep_reader);
1081 }
1082
1083 __set_current_state(TASK_RUNNING);
1084 lockevent_inc(rwsem_rlock);
1085 return sem;
1086out_nolock:
1087 list_del(&waiter.list);
1088 if (list_empty(&sem->wait_list)) {
1089 atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
1090 &sem->count);
1091 }
1092 raw_spin_unlock_irq(&sem->wait_lock);
1093 __set_current_state(TASK_RUNNING);
1094 lockevent_inc(rwsem_rlock_fail);
1095 return ERR_PTR(-EINTR);
1096}
1097
1098/*
1099 * This function is called by a write lock owner. So the owner value
1100 * won't get changed by others.
1101 */
1102static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem,
1103 bool disable)
1104{
1105 if (unlikely(disable)) {
1106 atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner);
1107 lockevent_inc(rwsem_opt_norspin);
1108 }
1109}
1110
1111/*
1112 * Wait until we successfully acquire the write lock
1113 */
1114static struct rw_semaphore *
1115rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
1116{
1117 long count;
1118 bool disable_rspin;
1119 enum writer_wait_state wstate;
1120 struct rwsem_waiter waiter;
1121 struct rw_semaphore *ret = sem;
1122 DEFINE_WAKE_Q(wake_q);
1123
1124 /* do optimistic spinning and steal lock if possible */
1125 if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
1126 rwsem_optimistic_spin(sem, true))
1127 return sem;
1128
1129 /*
1130	 * Disable reader optimistic spinning for this rwsem after
1131	 * acquiring the write lock when any of the nonspinnable
1132	 * bits is observed to be set.
1133 */
1134 disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE;
1135
1136 /*
1137 * Optimistic spinning failed, proceed to the slowpath
1138 * and block until we can acquire the sem.
1139 */
1140 waiter.task = current;
1141 waiter.type = RWSEM_WAITING_FOR_WRITE;
1142 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1143
1144 raw_spin_lock_irq(&sem->wait_lock);
1145
1146 /* account for this before adding a new element to the list */
1147 wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
1148
1149 list_add_tail(&waiter.list, &sem->wait_list);
1150
1151 /* we're now waiting on the lock */
1152 if (wstate == WRITER_NOT_FIRST) {
1153 count = atomic_long_read(&sem->count);
1154
1155 /*
1156 * If there were already threads queued before us and:
1157		 * 1) there are no active locks, wake the front
1158 * queued process(es) as the handoff bit might be set.
1159 * 2) there are no active writers and some readers, the lock
1160 * must be read owned; so we try to wake any read lock
1161 * waiters that were queued ahead of us.
1162 */
1163 if (count & RWSEM_WRITER_MASK)
1164 goto wait;
1165
1166 rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
1167 ? RWSEM_WAKE_READERS
1168 : RWSEM_WAKE_ANY, &wake_q);
1169
1170 if (!wake_q_empty(&wake_q)) {
1171 /*
1172 * We want to minimize wait_lock hold time especially
1173 * when a large number of readers are to be woken up.
1174 */
1175 raw_spin_unlock_irq(&sem->wait_lock);
1176 wake_up_q(&wake_q);
1177 wake_q_init(&wake_q); /* Used again, reinit */
1178 raw_spin_lock_irq(&sem->wait_lock);
1179 }
1180 } else {
1181 atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
1182 }
1183
1184wait:
1185 /* wait until we successfully acquire the lock */
1186 set_current_state(state);
1187 while (true) {
1188 if (rwsem_try_write_lock(sem, wstate))
1189 break;
1190
1191 raw_spin_unlock_irq(&sem->wait_lock);
1192
1193 /* Block until there are no active lockers. */
1194 for (;;) {
1195 if (signal_pending_state(state, current))
1196 goto out_nolock;
1197
1198 schedule();
1199 lockevent_inc(rwsem_sleep_writer);
1200 set_current_state(state);
1201 /*
1202 * If HANDOFF bit is set, unconditionally do
1203 * a trylock.
1204 */
1205 if (wstate == WRITER_HANDOFF)
1206 break;
1207
1208 if ((wstate == WRITER_NOT_FIRST) &&
1209 (rwsem_first_waiter(sem) == &waiter))
1210 wstate = WRITER_FIRST;
1211
1212 count = atomic_long_read(&sem->count);
1213 if (!(count & RWSEM_LOCK_MASK))
1214 break;
1215
1216 /*
1217 * The setting of the handoff bit is deferred
1218 * until rwsem_try_write_lock() is called.
1219 */
1220 if ((wstate == WRITER_FIRST) && (rt_task(current) ||
1221 time_after(jiffies, waiter.timeout))) {
1222 wstate = WRITER_HANDOFF;
1223 lockevent_inc(rwsem_wlock_handoff);
1224 break;
1225 }
1226 }
1227
1228 raw_spin_lock_irq(&sem->wait_lock);
1229 }
1230 __set_current_state(TASK_RUNNING);
1231 list_del(&waiter.list);
1232 rwsem_disable_reader_optspin(sem, disable_rspin);
1233 raw_spin_unlock_irq(&sem->wait_lock);
1234 lockevent_inc(rwsem_wlock);
1235
1236 return ret;
1237
1238out_nolock:
1239 __set_current_state(TASK_RUNNING);
1240 raw_spin_lock_irq(&sem->wait_lock);
1241 list_del(&waiter.list);
1242
1243 if (unlikely(wstate == WRITER_HANDOFF))
1244 atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
1245
1246 if (list_empty(&sem->wait_list))
1247 atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
1248 else
1249 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1250 raw_spin_unlock_irq(&sem->wait_lock);
1251 wake_up_q(&wake_q);
1252 lockevent_inc(rwsem_wlock_fail);
1253
1254 return ERR_PTR(-EINTR);
1255}
1256
1257/*
1258 * handle waking up a waiter on the semaphore
1259 * - up_read/up_write has decremented the active part of count if we come here
1260 */
1261static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
1262{
1263 unsigned long flags;
1264 DEFINE_WAKE_Q(wake_q);
1265
1266 raw_spin_lock_irqsave(&sem->wait_lock, flags);
1267
1268 if (!list_empty(&sem->wait_list))
1269 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1270
1271 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1272 wake_up_q(&wake_q);
1273
1274 return sem;
1275}
1276
1277/*
1278 * downgrade a write lock into a read lock
1279 * - the caller has replaced its writer bit in the count with a reader bias and found the waiters bit set
1280 * - just wake up any readers at the front of the queue
1281 */
1282static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
1283{
1284 unsigned long flags;
1285 DEFINE_WAKE_Q(wake_q);
1286
1287 raw_spin_lock_irqsave(&sem->wait_lock, flags);
1288
1289 if (!list_empty(&sem->wait_list))
1290 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
1291
1292 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1293 wake_up_q(&wake_q);
1294
1295 return sem;
1296}
1297
1298/*
1299 * lock for reading
1300 */
1301inline void __down_read(struct rw_semaphore *sem)
1302{
1303 if (!rwsem_read_trylock(sem)) {
1304 rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
1305 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1306 } else {
1307 rwsem_set_reader_owned(sem);
1308 }
1309}
1310
1311static inline int __down_read_killable(struct rw_semaphore *sem)
1312{
1313 if (!rwsem_read_trylock(sem)) {
1314 if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
1315 return -EINTR;
1316 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1317 } else {
1318 rwsem_set_reader_owned(sem);
1319 }
1320 return 0;
1321}
1322
1323static inline int __down_read_trylock(struct rw_semaphore *sem)
1324{
1325 /*
1326 * Optimize for the case when the rwsem is not locked at all.
1327 */
1328 long tmp = RWSEM_UNLOCKED_VALUE;
1329
1330 do {
1331 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1332 tmp + RWSEM_READER_BIAS)) {
1333 rwsem_set_reader_owned(sem);
1334 return 1;
1335 }
1336 } while (!(tmp & RWSEM_READ_FAILED_MASK));
1337 return 0;
1338}
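For context, a hedged sketch of how a caller typically uses the trylock path: down_read_trylock() returns nonzero on success, so the caller can skip the work instead of sleeping. The mydev structure and function name are hypothetical:

#include <linux/errno.h>
#include <linux/rwsem.h>

struct mydev {				/* hypothetical structure, for illustration only */
	struct rw_semaphore lock;
	int stats;
};

/* Opportunistic read: return -EBUSY instead of sleeping when the lock is contended. */
static int mydev_read_stats_nonblock(struct mydev *dev)
{
	int val;

	if (!down_read_trylock(&dev->lock))
		return -EBUSY;		/* writer active, waiters queued or handoff pending */
	val = dev->stats;
	up_read(&dev->lock);
	return val;
}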
1339
1340/*
1341 * lock for writing
1342 */
1343static inline void __down_write(struct rw_semaphore *sem)
1344{
1345 long tmp = RWSEM_UNLOCKED_VALUE;
1346
1347 if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1348 RWSEM_WRITER_LOCKED)))
1349 rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE);
1350 else
1351 rwsem_set_owner(sem);
1352}
1353
1354static inline int __down_write_killable(struct rw_semaphore *sem)
1355{
1356 long tmp = RWSEM_UNLOCKED_VALUE;
1357
1358 if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1359 RWSEM_WRITER_LOCKED))) {
1360 if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE)))
1361 return -EINTR;
1362 } else {
1363 rwsem_set_owner(sem);
1364 }
1365 return 0;
1366}
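A minimal usage sketch for the killable write path (illustrative, not from the patch): the caller must handle -EINTR, which is returned when a fatal signal ends the wait in rwsem_down_write_slowpath():

#include <linux/errno.h>
#include <linux/rwsem.h>

/* Hypothetical helper: update a value under the write lock, but remain killable. */
static int update_killable(struct rw_semaphore *sem, int *val, int new_val)
{
	if (down_write_killable(sem))
		return -EINTR;		/* fatal signal received while waiting */
	*val = new_val;
	up_write(sem);
	return 0;
}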
1367
1368static inline int __down_write_trylock(struct rw_semaphore *sem)
1369{
1370 long tmp = RWSEM_UNLOCKED_VALUE;
1371
1372 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1373 RWSEM_WRITER_LOCKED)) {
1374 rwsem_set_owner(sem);
1375 return true;
1376 }
1377 return false;
1378}
1379
1380/*
1381 * unlock after reading
1382 */
1383inline void __up_read(struct rw_semaphore *sem)
1384{
1385 long tmp;
1386
1387 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1388 rwsem_clear_reader_owned(sem);
1389 tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
1390 DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
1391 if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
1392 RWSEM_FLAG_WAITERS)) {
1393 clear_wr_nonspinnable(sem);
1394 rwsem_wake(sem, tmp);
1395 }
1396}
1397
1398/*
1399 * unlock after writing
1400 */
1401static inline void __up_write(struct rw_semaphore *sem)
1402{
1403 long tmp;
1404
1405 /*
1406 * sem->owner may differ from current if the ownership is transferred
1407 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
1408 */
1409 DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
1410 !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
1411 rwsem_clear_owner(sem);
1412 tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
1413 if (unlikely(tmp & RWSEM_FLAG_WAITERS))
1414 rwsem_wake(sem, tmp);
1415}
1416
1417/*
1418 * downgrade write lock to read lock
1419 */
1420static inline void __downgrade_write(struct rw_semaphore *sem)
1421{
1422 long tmp;
1423
1424 /*
1425 * When downgrading from exclusive to shared ownership,
1426 * anything inside the write-locked region cannot leak
1427 * into the read side. In contrast, anything in the
1428 * read-locked region is ok to be re-ordered into the
1429 * write side. As such, rely on RELEASE semantics.
1430 */
1431 DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
1432 tmp = atomic_long_fetch_add_release(
1433 -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
1434 rwsem_set_reader_owned(sem);
1435 if (tmp & RWSEM_FLAG_WAITERS)
1436 rwsem_downgrade_wake(sem);
1437}
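And a hedged sketch of the downgrade pattern this helper serves: publish under the write lock, downgrade so queued readers are woken, then keep reading under the now-shared lock. The function and list parameters are illustrative only:

#include <linux/list.h>
#include <linux/rwsem.h>

/* Illustrative only: add an entry exclusively, then downgrade and keep reading. */
static void publish_and_scan(struct rw_semaphore *sem, struct list_head *head,
			     struct list_head *new_entry)
{
	down_write(sem);
	list_add_tail(new_entry, head);	/* exclusive section */
	downgrade_write(sem);		/* queued readers are woken, we keep a read lock */

	/* read-side work may continue here concurrently with other readers */
	up_read(sem);
}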
17 1438
18/* 1439/*
19 * lock for reading 1440 * lock for reading
@@ -25,7 +1446,6 @@ void __sched down_read(struct rw_semaphore *sem)
25 1446
26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 1447 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
27} 1448}
28
29EXPORT_SYMBOL(down_read); 1449EXPORT_SYMBOL(down_read);
30 1450
31int __sched down_read_killable(struct rw_semaphore *sem) 1451int __sched down_read_killable(struct rw_semaphore *sem)
@@ -40,7 +1460,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)
40 1460
41 return 0; 1461 return 0;
42} 1462}
43
44EXPORT_SYMBOL(down_read_killable); 1463EXPORT_SYMBOL(down_read_killable);
45 1464
46/* 1465/*
@@ -54,7 +1473,6 @@ int down_read_trylock(struct rw_semaphore *sem)
54 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); 1473 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
55 return ret; 1474 return ret;
56} 1475}
57
58EXPORT_SYMBOL(down_read_trylock); 1476EXPORT_SYMBOL(down_read_trylock);
59 1477
60/* 1478/*
@@ -64,10 +1482,8 @@ void __sched down_write(struct rw_semaphore *sem)
64{ 1482{
65 might_sleep(); 1483 might_sleep();
66 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 1484 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
67
68 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 1485 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
69} 1486}
70
71EXPORT_SYMBOL(down_write); 1487EXPORT_SYMBOL(down_write);
72 1488
73/* 1489/*
@@ -78,14 +1494,14 @@ int __sched down_write_killable(struct rw_semaphore *sem)
78 might_sleep(); 1494 might_sleep();
79 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 1495 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
80 1496
81 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { 1497 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1498 __down_write_killable)) {
82 rwsem_release(&sem->dep_map, 1, _RET_IP_); 1499 rwsem_release(&sem->dep_map, 1, _RET_IP_);
83 return -EINTR; 1500 return -EINTR;
84 } 1501 }
85 1502
86 return 0; 1503 return 0;
87} 1504}
88
89EXPORT_SYMBOL(down_write_killable); 1505EXPORT_SYMBOL(down_write_killable);
90 1506
91/* 1507/*
@@ -100,7 +1516,6 @@ int down_write_trylock(struct rw_semaphore *sem)
100 1516
101 return ret; 1517 return ret;
102} 1518}
103
104EXPORT_SYMBOL(down_write_trylock); 1519EXPORT_SYMBOL(down_write_trylock);
105 1520
106/* 1521/*
@@ -109,10 +1524,8 @@ EXPORT_SYMBOL(down_write_trylock);
109void up_read(struct rw_semaphore *sem) 1524void up_read(struct rw_semaphore *sem)
110{ 1525{
111 rwsem_release(&sem->dep_map, 1, _RET_IP_); 1526 rwsem_release(&sem->dep_map, 1, _RET_IP_);
112
113 __up_read(sem); 1527 __up_read(sem);
114} 1528}
115
116EXPORT_SYMBOL(up_read); 1529EXPORT_SYMBOL(up_read);
117 1530
118/* 1531/*
@@ -121,10 +1534,8 @@ EXPORT_SYMBOL(up_read);
121void up_write(struct rw_semaphore *sem) 1534void up_write(struct rw_semaphore *sem)
122{ 1535{
123 rwsem_release(&sem->dep_map, 1, _RET_IP_); 1536 rwsem_release(&sem->dep_map, 1, _RET_IP_);
124
125 __up_write(sem); 1537 __up_write(sem);
126} 1538}
127
128EXPORT_SYMBOL(up_write); 1539EXPORT_SYMBOL(up_write);
129 1540
130/* 1541/*
@@ -133,10 +1544,8 @@ EXPORT_SYMBOL(up_write);
133void downgrade_write(struct rw_semaphore *sem) 1544void downgrade_write(struct rw_semaphore *sem)
134{ 1545{
135 lock_downgrade(&sem->dep_map, _RET_IP_); 1546 lock_downgrade(&sem->dep_map, _RET_IP_);
136
137 __downgrade_write(sem); 1547 __downgrade_write(sem);
138} 1548}
139
140EXPORT_SYMBOL(downgrade_write); 1549EXPORT_SYMBOL(downgrade_write);
141 1550
142#ifdef CONFIG_DEBUG_LOCK_ALLOC 1551#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -145,40 +1554,32 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
145{ 1554{
146 might_sleep(); 1555 might_sleep();
147 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 1556 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
148
149 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 1557 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
150} 1558}
151
152EXPORT_SYMBOL(down_read_nested); 1559EXPORT_SYMBOL(down_read_nested);
153 1560
154void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) 1561void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
155{ 1562{
156 might_sleep(); 1563 might_sleep();
157 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 1564 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
158
159 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 1565 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
160} 1566}
161
162EXPORT_SYMBOL(_down_write_nest_lock); 1567EXPORT_SYMBOL(_down_write_nest_lock);
163 1568
164void down_read_non_owner(struct rw_semaphore *sem) 1569void down_read_non_owner(struct rw_semaphore *sem)
165{ 1570{
166 might_sleep(); 1571 might_sleep();
167
168 __down_read(sem); 1572 __down_read(sem);
169 __rwsem_set_reader_owned(sem, NULL); 1573 __rwsem_set_reader_owned(sem, NULL);
170} 1574}
171
172EXPORT_SYMBOL(down_read_non_owner); 1575EXPORT_SYMBOL(down_read_non_owner);
173 1576
174void down_write_nested(struct rw_semaphore *sem, int subclass) 1577void down_write_nested(struct rw_semaphore *sem, int subclass)
175{ 1578{
176 might_sleep(); 1579 might_sleep();
177 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 1580 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
178
179 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 1581 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
180} 1582}
181
182EXPORT_SYMBOL(down_write_nested); 1583EXPORT_SYMBOL(down_write_nested);
183 1584
184int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) 1585int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
@@ -186,23 +1587,21 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
186 might_sleep(); 1587 might_sleep();
187 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 1588 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
188 1589
189 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { 1590 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1591 __down_write_killable)) {
190 rwsem_release(&sem->dep_map, 1, _RET_IP_); 1592 rwsem_release(&sem->dep_map, 1, _RET_IP_);
191 return -EINTR; 1593 return -EINTR;
192 } 1594 }
193 1595
194 return 0; 1596 return 0;
195} 1597}
196
197EXPORT_SYMBOL(down_write_killable_nested); 1598EXPORT_SYMBOL(down_write_killable_nested);
198 1599
199void up_read_non_owner(struct rw_semaphore *sem) 1600void up_read_non_owner(struct rw_semaphore *sem)
200{ 1601{
201 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), 1602 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
202 sem);
203 __up_read(sem); 1603 __up_read(sem);
204} 1604}
205
206EXPORT_SYMBOL(up_read_non_owner); 1605EXPORT_SYMBOL(up_read_non_owner);
207 1606
208#endif 1607#endif
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 64877f5294e3..2534ce49f648 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,304 +1,10 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * The least significant 2 bits of the owner value has the following
4 * meanings when set.
5 * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers
6 * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned,
7 * i.e. the owner(s) cannot be readily determined. It can be reader
8 * owned or the owning writer is indeterminate.
9 *
10 * When a writer acquires a rwsem, it puts its task_struct pointer
11 * into the owner field. It is cleared after an unlock.
12 *
13 * When a reader acquires a rwsem, it will also puts its task_struct
14 * pointer into the owner field with both the RWSEM_READER_OWNED and
15 * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will
16 * largely be left untouched. So for a free or reader-owned rwsem,
17 * the owner value may contain information about the last reader that
18 * acquires the rwsem. The anonymous bit is set because that particular
19 * reader may or may not still own the lock.
20 *
21 * That information may be helpful in debugging cases where the system
22 * seems to hang on a reader owned rwsem especially if only one reader
23 * is involved. Ideally we would like to track all the readers that own
24 * a rwsem, but the overhead is simply too big.
25 */
26#include "lock_events.h"
27 2
28#define RWSEM_READER_OWNED (1UL << 0) 3#ifndef __INTERNAL_RWSEM_H
29#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) 4#define __INTERNAL_RWSEM_H
5#include <linux/rwsem.h>
30 6
31#ifdef CONFIG_DEBUG_RWSEMS 7extern void __down_read(struct rw_semaphore *sem);
32# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ 8extern void __up_read(struct rw_semaphore *sem);
33 if (!debug_locks_silent && \
34 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
35 #c, atomic_long_read(&(sem)->count), \
36 (long)((sem)->owner), (long)current, \
37 list_empty(&(sem)->wait_list) ? "" : "not ")) \
38 debug_locks_off(); \
39 } while (0)
40#else
41# define DEBUG_RWSEMS_WARN_ON(c, sem)
42#endif
43 9
44/* 10#endif /* __INTERNAL_RWSEM_H */
45 * R/W semaphores originally for PPC using the stuff in lib/rwsem.c.
46 * Adapted largely from include/asm-i386/rwsem.h
47 * by Paul Mackerras <paulus@samba.org>.
48 */
49
50/*
51 * the semaphore definition
52 */
53#ifdef CONFIG_64BIT
54# define RWSEM_ACTIVE_MASK 0xffffffffL
55#else
56# define RWSEM_ACTIVE_MASK 0x0000ffffL
57#endif
58
59#define RWSEM_ACTIVE_BIAS 0x00000001L
60#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
61#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
62#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
63
64#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
65/*
66 * All writes to owner are protected by WRITE_ONCE() to make sure that
67 * store tearing can't happen as optimistic spinners may read and use
68 * the owner value concurrently without lock. Read from owner, however,
69 * may not need READ_ONCE() as long as the pointer value is only used
70 * for comparison and isn't being dereferenced.
71 */
72static inline void rwsem_set_owner(struct rw_semaphore *sem)
73{
74 WRITE_ONCE(sem->owner, current);
75}
76
77static inline void rwsem_clear_owner(struct rw_semaphore *sem)
78{
79 WRITE_ONCE(sem->owner, NULL);
80}
81
82/*
83 * The task_struct pointer of the last owning reader will be left in
84 * the owner field.
85 *
86 * Note that the owner value just indicates the task has owned the rwsem
87 * previously, it may not be the real owner or one of the real owners
88 * anymore when that field is examined, so take it with a grain of salt.
89 */
90static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
91 struct task_struct *owner)
92{
93 unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
94 | RWSEM_ANONYMOUSLY_OWNED;
95
96 WRITE_ONCE(sem->owner, (struct task_struct *)val);
97}
98
99static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
100{
101 __rwsem_set_reader_owned(sem, current);
102}
103
104/*
105 * Return true if the a rwsem waiter can spin on the rwsem's owner
106 * and steal the lock, i.e. the lock is not anonymously owned.
107 * N.B. !owner is considered spinnable.
108 */
109static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
110{
111 return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
112}
113
114/*
115 * Return true if rwsem is owned by an anonymous writer or readers.
116 */
117static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
118{
119 return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
120}
121
122#ifdef CONFIG_DEBUG_RWSEMS
123/*
124 * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
125 * is a task pointer in owner of a reader-owned rwsem, it will be the
126 * real owner or one of the real owners. The only exception is when the
127 * unlock is done by up_read_non_owner().
128 */
129#define rwsem_clear_reader_owned rwsem_clear_reader_owned
130static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
131{
132 unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
133 | RWSEM_ANONYMOUSLY_OWNED;
134 if (READ_ONCE(sem->owner) == (struct task_struct *)val)
135 cmpxchg_relaxed((unsigned long *)&sem->owner, val,
136 RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
137}
138#endif
139
140#else
141static inline void rwsem_set_owner(struct rw_semaphore *sem)
142{
143}
144
145static inline void rwsem_clear_owner(struct rw_semaphore *sem)
146{
147}
148
149static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
150 struct task_struct *owner)
151{
152}
153
154static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
155{
156}
157#endif
158
159#ifndef rwsem_clear_reader_owned
160static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
161{
162}
163#endif
164
165extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
166extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
167extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
168extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
169extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
170extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
171
172/*
173 * lock for reading
174 */
175static inline void __down_read(struct rw_semaphore *sem)
176{
177 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
178 rwsem_down_read_failed(sem);
179 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
180 RWSEM_READER_OWNED), sem);
181 } else {
182 rwsem_set_reader_owned(sem);
183 }
184}
185
186static inline int __down_read_killable(struct rw_semaphore *sem)
187{
188 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
189 if (IS_ERR(rwsem_down_read_failed_killable(sem)))
190 return -EINTR;
191 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
192 RWSEM_READER_OWNED), sem);
193 } else {
194 rwsem_set_reader_owned(sem);
195 }
196 return 0;
197}
198
199static inline int __down_read_trylock(struct rw_semaphore *sem)
200{
201 /*
202 * Optimize for the case when the rwsem is not locked at all.
203 */
204 long tmp = RWSEM_UNLOCKED_VALUE;
205
206 lockevent_inc(rwsem_rtrylock);
207 do {
208 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
209 tmp + RWSEM_ACTIVE_READ_BIAS)) {
210 rwsem_set_reader_owned(sem);
211 return 1;
212 }
213 } while (tmp >= 0);
214 return 0;
215}
216
217/*
218 * lock for writing
219 */
220static inline void __down_write(struct rw_semaphore *sem)
221{
222 long tmp;
223
224 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
225 &sem->count);
226 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
227 rwsem_down_write_failed(sem);
228 rwsem_set_owner(sem);
229}
230
231static inline int __down_write_killable(struct rw_semaphore *sem)
232{
233 long tmp;
234
235 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
236 &sem->count);
237 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
238 if (IS_ERR(rwsem_down_write_failed_killable(sem)))
239 return -EINTR;
240 rwsem_set_owner(sem);
241 return 0;
242}
243
244static inline int __down_write_trylock(struct rw_semaphore *sem)
245{
246 long tmp;
247
248 lockevent_inc(rwsem_wtrylock);
249 tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
250 RWSEM_ACTIVE_WRITE_BIAS);
251 if (tmp == RWSEM_UNLOCKED_VALUE) {
252 rwsem_set_owner(sem);
253 return true;
254 }
255 return false;
256}
257
258/*
259 * unlock after reading
260 */
261static inline void __up_read(struct rw_semaphore *sem)
262{
263 long tmp;
264
265 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
266 sem);
267 rwsem_clear_reader_owned(sem);
268 tmp = atomic_long_dec_return_release(&sem->count);
269 if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
270 rwsem_wake(sem);
271}
272
273/*
274 * unlock after writing
275 */
276static inline void __up_write(struct rw_semaphore *sem)
277{
278 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
279 rwsem_clear_owner(sem);
280 if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
281 &sem->count) < 0))
282 rwsem_wake(sem);
283}
284
285/*
286 * downgrade write lock to read lock
287 */
288static inline void __downgrade_write(struct rw_semaphore *sem)
289{
290 long tmp;
291
292 /*
293 * When downgrading from exclusive to shared ownership,
294 * anything inside the write-locked region cannot leak
295 * into the read side. In contrast, anything in the
296 * read-locked region is ok to be re-ordered into the
297 * write side. As such, rely on RELEASE semantics.
298 */
299 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
300 tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
301 rwsem_set_reader_owned(sem);
302 if (tmp < 0)
303 rwsem_downgrade_wake(sem);
304}
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 561acdd39960..d9dd94defc0a 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -1,9 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (c) 2008 Intel Corporation 3 * Copyright (c) 2008 Intel Corporation
3 * Author: Matthew Wilcox <willy@linux.intel.com> 4 * Author: Matthew Wilcox <willy@linux.intel.com>
4 * 5 *
5 * Distributed under the terms of the GNU GPL, version 2
6 *
7 * This file implements counting semaphores. 6 * This file implements counting semaphores.
8 * A counting semaphore may be acquired 'n' times before sleeping. 7 * A counting semaphore may be acquired 'n' times before sleeping.
9 * See mutex.c for single-acquisition sleeping locks which enforce 8 * See mutex.c for single-acquisition sleeping locks which enforce
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 65a3b7e55b9f..3e82f449b4ff 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -1,19 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Module-based API test facility for ww_mutexes 3 * Module-based API test facility for ww_mutexes
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 */ 4 */
18 5
19#include <linux/kernel.h> 6#include <linux/kernel.h>
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 1490e63f69a9..6e1970719dc2 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -95,6 +95,7 @@ static void devm_memremap_pages_release(void *data)
95 pgmap->kill(pgmap->ref); 95 pgmap->kill(pgmap->ref);
96 for_each_device_pfn(pfn, pgmap) 96 for_each_device_pfn(pfn, pgmap)
97 put_page(pfn_to_page(pfn)); 97 put_page(pfn_to_page(pfn));
98 pgmap->cleanup(pgmap->ref);
98 99
99 /* pages are dead and unused, undo the arch mapping */ 100 /* pages are dead and unused, undo the arch mapping */
100 align_start = res->start & ~(SECTION_SIZE - 1); 101 align_start = res->start & ~(SECTION_SIZE - 1);
@@ -133,8 +134,8 @@ static void devm_memremap_pages_release(void *data)
133 * 2/ The altmap field may optionally be initialized, in which case altmap_valid 134 * 2/ The altmap field may optionally be initialized, in which case altmap_valid
134 * must be set to true 135 * must be set to true
135 * 136 *
136 * 3/ pgmap->ref must be 'live' on entry and will be killed at 137 * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
137 * devm_memremap_pages_release() time, or if this routine fails. 138 * at devm_memremap_pages_release() time, or if this routine fails.
138 * 139 *
139 * 4/ res is expected to be a host memory range that could feasibly be 140 * 4/ res is expected to be a host memory range that could feasibly be
140 * treated as a "System RAM" range, i.e. not a device mmio range, but 141 * treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -156,8 +157,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
156 pgprot_t pgprot = PAGE_KERNEL; 157 pgprot_t pgprot = PAGE_KERNEL;
157 int error, nid, is_ram; 158 int error, nid, is_ram;
158 159
159 if (!pgmap->ref || !pgmap->kill) 160 if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
161 WARN(1, "Missing reference count teardown definition\n");
160 return ERR_PTR(-EINVAL); 162 return ERR_PTR(-EINVAL);
163 }
161 164
162 align_start = res->start & ~(SECTION_SIZE - 1); 165 align_start = res->start & ~(SECTION_SIZE - 1);
163 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) 166 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -168,14 +171,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
168 if (conflict_pgmap) { 171 if (conflict_pgmap) {
169 dev_WARN(dev, "Conflicting mapping in same section\n"); 172 dev_WARN(dev, "Conflicting mapping in same section\n");
170 put_dev_pagemap(conflict_pgmap); 173 put_dev_pagemap(conflict_pgmap);
171 return ERR_PTR(-ENOMEM); 174 error = -ENOMEM;
175 goto err_array;
172 } 176 }
173 177
174 conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); 178 conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
175 if (conflict_pgmap) { 179 if (conflict_pgmap) {
176 dev_WARN(dev, "Conflicting mapping in same section\n"); 180 dev_WARN(dev, "Conflicting mapping in same section\n");
177 put_dev_pagemap(conflict_pgmap); 181 put_dev_pagemap(conflict_pgmap);
178 return ERR_PTR(-ENOMEM); 182 error = -ENOMEM;
183 goto err_array;
179 } 184 }
180 185
181 is_ram = region_intersects(align_start, align_size, 186 is_ram = region_intersects(align_start, align_size,
@@ -267,10 +272,18 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
267 pgmap_array_delete(res); 272 pgmap_array_delete(res);
268 err_array: 273 err_array:
269 pgmap->kill(pgmap->ref); 274 pgmap->kill(pgmap->ref);
275 pgmap->cleanup(pgmap->ref);
276
270 return ERR_PTR(error); 277 return ERR_PTR(error);
271} 278}
272EXPORT_SYMBOL_GPL(devm_memremap_pages); 279EXPORT_SYMBOL_GPL(devm_memremap_pages);
273 280
281void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
282{
283 devm_release_action(dev, devm_memremap_pages_release, pgmap);
284}
285EXPORT_SYMBOL_GPL(devm_memunmap_pages);
286
274unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) 287unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
275{ 288{
276 /* number of pfns from base where pfn_to_page() is valid */ 289 /* number of pfns from base where pfn_to_page() is valid */
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
index 79c9be2dbbe9..33783abc377b 100644
--- a/kernel/module-internal.h
+++ b/kernel/module-internal.h
@@ -1,12 +1,8 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* Module internals 2/* Module internals
2 * 3 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 4 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 5 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */ 6 */
11 7
12#include <linux/elf.h> 8#include <linux/elf.h>
@@ -20,7 +16,7 @@ struct load_info {
20 unsigned long len; 16 unsigned long len;
21 Elf_Shdr *sechdrs; 17 Elf_Shdr *sechdrs;
22 char *secstrings, *strtab; 18 char *secstrings, *strtab;
23 unsigned long symoffs, stroffs; 19 unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
24 struct _ddebug *debug; 20 struct _ddebug *debug;
25 unsigned int num_debug; 21 unsigned int num_debug;
26 bool sig_ok; 22 bool sig_ok;
diff --git a/kernel/module.c b/kernel/module.c
index a9e1e7f2c224..a2cee14a83f3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,20 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 Copyright (C) 2002 Richard Henderson 3 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. 4 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
4 5
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 6*/
19#include <linux/export.h> 7#include <linux/export.h>
20#include <linux/extable.h> 8#include <linux/extable.h>
@@ -2642,6 +2630,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2642 info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); 2630 info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
2643 info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); 2631 info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
2644 mod->core_layout.size += strtab_size; 2632 mod->core_layout.size += strtab_size;
2633 info->core_typeoffs = mod->core_layout.size;
2634 mod->core_layout.size += ndst * sizeof(char);
2645 mod->core_layout.size = debug_align(mod->core_layout.size); 2635 mod->core_layout.size = debug_align(mod->core_layout.size);
2646 2636
2647 /* Put string table section at end of init part of module. */ 2637 /* Put string table section at end of init part of module. */
@@ -2655,6 +2645,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2655 __alignof__(struct mod_kallsyms)); 2645 __alignof__(struct mod_kallsyms));
2656 info->mod_kallsyms_init_off = mod->init_layout.size; 2646 info->mod_kallsyms_init_off = mod->init_layout.size;
2657 mod->init_layout.size += sizeof(struct mod_kallsyms); 2647 mod->init_layout.size += sizeof(struct mod_kallsyms);
2648 info->init_typeoffs = mod->init_layout.size;
2649 mod->init_layout.size += nsrc * sizeof(char);
2658 mod->init_layout.size = debug_align(mod->init_layout.size); 2650 mod->init_layout.size = debug_align(mod->init_layout.size);
2659} 2651}
2660 2652
@@ -2678,20 +2670,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2678 mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); 2670 mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
2679 /* Make sure we get permanent strtab: don't use info->strtab. */ 2671 /* Make sure we get permanent strtab: don't use info->strtab. */
2680 mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; 2672 mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
2673 mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
2681 2674
2682 /* Set types up while we still have access to sections. */ 2675 /*
2683 for (i = 0; i < mod->kallsyms->num_symtab; i++) 2676 * Now populate the cut down core kallsyms for after init
2684 mod->kallsyms->symtab[i].st_size 2677 * and set types up while we still have access to sections.
2685 = elf_type(&mod->kallsyms->symtab[i], info); 2678 */
2686
2687 /* Now populate the cut down core kallsyms for after init. */
2688 mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; 2679 mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
2689 mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; 2680 mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
2681 mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
2690 src = mod->kallsyms->symtab; 2682 src = mod->kallsyms->symtab;
2691 for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { 2683 for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
2684 mod->kallsyms->typetab[i] = elf_type(src + i, info);
2692 if (i == 0 || is_livepatch_module(mod) || 2685 if (i == 0 || is_livepatch_module(mod) ||
2693 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, 2686 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
2694 info->index.pcpu)) { 2687 info->index.pcpu)) {
2688 mod->core_kallsyms.typetab[ndst] =
2689 mod->kallsyms->typetab[i];
2695 dst[ndst] = src[i]; 2690 dst[ndst] = src[i];
2696 dst[ndst++].st_name = s - mod->core_kallsyms.strtab; 2691 dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
2697 s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], 2692 s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
@@ -3088,6 +3083,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
3088 sizeof(*mod->tracepoints_ptrs), 3083 sizeof(*mod->tracepoints_ptrs),
3089 &mod->num_tracepoints); 3084 &mod->num_tracepoints);
3090#endif 3085#endif
3086#ifdef CONFIG_TREE_SRCU
3087 mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
3088 sizeof(*mod->srcu_struct_ptrs),
3089 &mod->num_srcu_structs);
3090#endif
3091#ifdef CONFIG_BPF_EVENTS 3091#ifdef CONFIG_BPF_EVENTS
3092 mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", 3092 mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
3093 sizeof(*mod->bpf_raw_events), 3093 sizeof(*mod->bpf_raw_events),
@@ -4091,7 +4091,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
4091 const Elf_Sym *sym = &kallsyms->symtab[symnum]; 4091 const Elf_Sym *sym = &kallsyms->symtab[symnum];
4092 4092
4093 *value = kallsyms_symbol_value(sym); 4093 *value = kallsyms_symbol_value(sym);
4094 *type = sym->st_size; 4094 *type = kallsyms->typetab[symnum];
4095 strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); 4095 strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
4096 strlcpy(module_name, mod->name, MODULE_NAME_LEN); 4096 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
4097 *exported = is_exported(name, *value, mod); 4097 *exported = is_exported(name, *value, mod);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 6b9a926fd86b..b10fb1986ca9 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -1,12 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* Module signature checker 2/* Module signature checker
2 * 3 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 4 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 5 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */ 6 */
11 7
12#include <linux/kernel.h> 8#include <linux/kernel.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 6196af8a8223..d9f5081d578d 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include <linux/kdebug.h> 2#include <linux/kdebug.h>
2#include <linux/kprobes.h> 3#include <linux/kprobes.h>
3#include <linux/export.h> 4#include <linux/export.h>
@@ -22,6 +23,7 @@ static int notifier_chain_register(struct notifier_block **nl,
22 struct notifier_block *n) 23 struct notifier_block *n)
23{ 24{
24 while ((*nl) != NULL) { 25 while ((*nl) != NULL) {
26 WARN_ONCE(((*nl) == n), "double register detected");
25 if (n->priority > (*nl)->priority) 27 if (n->priority > (*nl)->priority)
26 break; 28 break;
27 nl = &((*nl)->next); 29 nl = &((*nl)->next);
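The new WARN_ONCE() catches a notifier_block that is added to the same chain twice. A hypothetical module showing the registration pattern it polices (names are illustrative only):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot_cb(struct notifier_block *nb, unsigned long action,
                        void *data)
{
        return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call = my_reboot_cb,
};

static int __init my_init(void)
{
        /*
         * Registering &my_reboot_nb a second time without unregistering
         * it first is exactly the bug the new WARN_ONCE() flags.
         */
        return register_reboot_notifier(&my_reboot_nb);
}

static void __exit my_exit(void)
{
        unregister_reboot_notifier(&my_reboot_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");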
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f6c5d330059a..c815f58e6bc0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -1,13 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2006 IBM Corporation 3 * Copyright (C) 2006 IBM Corporation
3 * 4 *
4 * Author: Serge Hallyn <serue@us.ibm.com> 5 * Author: Serge Hallyn <serue@us.ibm.com>
5 * 6 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 *
11 * Jun 2006 - namespaces support 7 * Jun 2006 - namespaces support
12 * OpenVZ, SWsoft Inc. 8 * OpenVZ, SWsoft Inc.
13 * Pavel Emelianov <xemul@openvz.org> 9 * Pavel Emelianov <xemul@openvz.org>
diff --git a/kernel/panic.c b/kernel/panic.c
index c1fcaad337b7..4d9f55bf7d38 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/panic.c 3 * linux/kernel/panic.c
3 * 4 *
@@ -51,6 +52,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
51#define PANIC_PRINT_TIMER_INFO 0x00000004 52#define PANIC_PRINT_TIMER_INFO 0x00000004
52#define PANIC_PRINT_LOCK_INFO 0x00000008 53#define PANIC_PRINT_LOCK_INFO 0x00000008
53#define PANIC_PRINT_FTRACE_INFO 0x00000010 54#define PANIC_PRINT_FTRACE_INFO 0x00000010
55#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
54unsigned long panic_print; 56unsigned long panic_print;
55 57
56ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 58ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -134,6 +136,9 @@ EXPORT_SYMBOL(nmi_panic);
134 136
135static void panic_print_sys_info(void) 137static void panic_print_sys_info(void)
136{ 138{
139 if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
140 console_flush_on_panic(CONSOLE_REPLAY_ALL);
141
137 if (panic_print & PANIC_PRINT_TASK_INFO) 142 if (panic_print & PANIC_PRINT_TASK_INFO)
138 show_state(); 143 show_state();
139 144
@@ -277,7 +282,7 @@ void panic(const char *fmt, ...)
277 * panic() is not being callled from OOPS. 282 * panic() is not being callled from OOPS.
278 */ 283 */
279 debug_locks_off(); 284 debug_locks_off();
280 console_flush_on_panic(); 285 console_flush_on_panic(CONSOLE_FLUSH_PENDING);
281 286
282 panic_print_sys_info(); 287 panic_print_sys_info();
283 288
@@ -306,6 +311,8 @@ void panic(const char *fmt, ...)
306 * shutting down. But if there is a chance of 311 * shutting down. But if there is a chance of
307 * rebooting the system it will be rebooted. 312 * rebooting the system it will be rebooted.
308 */ 313 */
314 if (panic_reboot_mode != REBOOT_UNDEFINED)
315 reboot_mode = panic_reboot_mode;
309 emergency_restart(); 316 emergency_restart();
310 } 317 }
311#ifdef __sparc__ 318#ifdef __sparc__
@@ -321,6 +328,9 @@ void panic(const char *fmt, ...)
321 disabled_wait(); 328 disabled_wait();
322#endif 329#endif
323 pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); 330 pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
331
332 /* Do not scroll important messages printed above */
333 suppress_printk = 1;
324 local_irq_enable(); 334 local_irq_enable();
325 for (i = 0; ; i += PANIC_TIMER_STEP) { 335 for (i = 0; ; i += PANIC_TIMER_STEP) {
326 touch_softlockup_watchdog(); 336 touch_softlockup_watchdog();
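The new PANIC_PRINT_ALL_PRINTK_MSG bit reuses the existing panic_print mask, so requesting a full replay of the printk ring buffer on panic should amount to booting with panic_print=0x20 (or writing 0x20 to /proc/sys/kernel/panic_print), optionally OR-ed with whichever of the earlier PANIC_PRINT_* bits are wanted.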
diff --git a/kernel/params.c b/kernel/params.c
index ce89f757e6da..cf448785d058 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -1,19 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* Helpers for initial module or kernel cmdline parsing 2/* Helpers for initial module or kernel cmdline parsing
2 Copyright (C) 2001 Rusty Russell. 3 Copyright (C) 2001 Rusty Russell.
3 4
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 5*/
18#include <linux/kernel.h> 6#include <linux/kernel.h>
19#include <linux/string.h> 7#include <linux/string.h>
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..16263b526560 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Generic pidhash and scalable, time-bounded PID allocator 3 * Generic pidhash and scalable, time-bounded PID allocator
3 * 4 *
@@ -32,12 +33,13 @@
32#include <linux/init.h> 33#include <linux/init.h>
33#include <linux/rculist.h> 34#include <linux/rculist.h>
34#include <linux/memblock.h> 35#include <linux/memblock.h>
35#include <linux/hash.h>
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_ns.h> 39#include <linux/proc_ns.h>
40#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
41#include <linux/anon_inodes.h>
42#include <linux/sched/signal.h>
41#include <linux/sched/task.h> 43#include <linux/sched/task.h>
42#include <linux/idr.h> 44#include <linux/idr.h>
43 45
@@ -214,6 +216,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
214 for (type = 0; type < PIDTYPE_MAX; ++type) 216 for (type = 0; type < PIDTYPE_MAX; ++type)
215 INIT_HLIST_HEAD(&pid->tasks[type]); 217 INIT_HLIST_HEAD(&pid->tasks[type]);
216 218
219 init_waitqueue_head(&pid->wait_pidfd);
220
217 upid = pid->numbers + ns->level; 221 upid = pid->numbers + ns->level;
218 spin_lock_irq(&pidmap_lock); 222 spin_lock_irq(&pidmap_lock);
219 if (!(ns->pid_allocated & PIDNS_ADDING)) 223 if (!(ns->pid_allocated & PIDNS_ADDING))
@@ -451,6 +455,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
451 return idr_get_next(&ns->idr, &nr); 455 return idr_get_next(&ns->idr, &nr);
452} 456}
453 457
458/**
459 * pidfd_create() - Create a new pid file descriptor.
460 *
461 * @pid: struct pid that the pidfd will reference
462 *
463 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
464 *
465 * Note, that this function can only be called after the fd table has
466 * been unshared to avoid leaking the pidfd to the new process.
467 *
468 * Return: On success, a cloexec pidfd is returned.
469 * On error, a negative errno number will be returned.
470 */
471static int pidfd_create(struct pid *pid)
472{
473 int fd;
474
475 fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
476 O_RDWR | O_CLOEXEC);
477 if (fd < 0)
478 put_pid(pid);
479
480 return fd;
481}
482
483/**
484 * pidfd_open() - Open new pid file descriptor.
485 *
486 * @pid: pid for which to retrieve a pidfd
487 * @flags: flags to pass
488 *
489 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
490 * the process identified by @pid. Currently, the process identified by
491 * @pid must be a thread-group leader. This restriction currently exists
492 * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
493 * be used with CLONE_THREAD) and pidfd polling (only supports thread group
494 * leaders).
495 *
496 * Return: On success, a cloexec pidfd is returned.
497 * On error, a negative errno number will be returned.
498 */
499SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
500{
501 int fd, ret;
502 struct pid *p;
503
504 if (flags)
505 return -EINVAL;
506
507 if (pid <= 0)
508 return -EINVAL;
509
510 p = find_get_pid(pid);
511 if (!p)
512 return -ESRCH;
513
514 ret = 0;
515 rcu_read_lock();
516 if (!pid_task(p, PIDTYPE_TGID))
517 ret = -EINVAL;
518 rcu_read_unlock();
519
520 fd = ret ?: pidfd_create(p);
521 put_pid(p);
522 return fd;
523}
524
454void __init pid_idr_init(void) 525void __init pid_idr_init(void)
455{ 526{
456 /* Verify no one has done anything silly: */ 527 /* Verify no one has done anything silly: */
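A minimal user-space sketch (not part of this patch) of driving the new syscall; it assumes __NR_pidfd_open has been wired up (434 in the generic syscall table) and relies on the pidfd becoming readable once the target exits, per the pidfd-polling behaviour referenced in the kernel-doc above:

#define _GNU_SOURCE
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434     /* assumed syscall number */
#endif

static int pidfd_open(pid_t pid, unsigned int flags)
{
        return syscall(__NR_pidfd_open, pid, flags);
}

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {         /* child: exit after a short delay */
                sleep(1);
                _exit(0);
        }

        int fd = pidfd_open(pid, 0);
        if (fd < 0) {
                perror("pidfd_open");
                return 1;
        }

        struct pollfd pfd = { .fd = fd, .events = POLLIN };
        poll(&pfd, 1, -1);      /* readable once the child has exited */
        printf("process %d exited\n", (int)pid);
        close(fd);
        return 0;
}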
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index aa6e72fb7c08..6d726cef241c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Pid namespaces 3 * Pid namespaces
3 * 4 *
@@ -325,7 +326,7 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
325 } 326 }
326 327
327 read_lock(&tasklist_lock); 328 read_lock(&tasklist_lock);
328 force_sig(SIGKILL, pid_ns->child_reaper); 329 send_sig(SIGKILL, pid_ns->child_reaper, 1);
329 read_unlock(&tasklist_lock); 330 read_unlock(&tasklist_lock);
330 331
331 do_exit(0); 332 do_exit(0);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9bbaaab14b36..ff8592ddedee 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1config SUSPEND 2config SUSPEND
2 bool "Suspend to RAM and standby" 3 bool "Suspend to RAM and standby"
3 depends on ARCH_SUSPEND_POSSIBLE 4 depends on ARCH_SUSPEND_POSSIBLE
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 7d66ee68aaaf..0a9326f5f421 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
223 * All CPUs of a domain must have the same micro-architecture 223 * All CPUs of a domain must have the same micro-architecture
224 * since they all share the same table. 224 * since they all share the same table.
225 */ 225 */
226 cap = arch_scale_cpu_capacity(NULL, cpu); 226 cap = arch_scale_cpu_capacity(cpu);
227 if (prev_cap && prev_cap != cap) { 227 if (prev_cap && prev_cap != cap) {
228 pr_err("CPUs of %*pbl must have the same capacity\n", 228 pr_err("CPUs of %*pbl must have the same capacity\n",
229 cpumask_pr_args(span)); 229 cpumask_pr_args(span));
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index c8c272df7154..cd7434e6000d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. 3 * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
3 * 4 *
@@ -6,8 +7,6 @@
6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> 7 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 8 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> 9 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
9 *
10 * This file is released under the GPLv2.
11 */ 10 */
12 11
13#define pr_fmt(fmt) "PM: " fmt 12#define pr_fmt(fmt) "PM: " fmt
@@ -129,7 +128,7 @@ static int hibernation_test(int level) { return 0; }
129static int platform_begin(int platform_mode) 128static int platform_begin(int platform_mode)
130{ 129{
131 return (platform_mode && hibernation_ops) ? 130 return (platform_mode && hibernation_ops) ?
132 hibernation_ops->begin() : 0; 131 hibernation_ops->begin(PMSG_FREEZE) : 0;
133} 132}
134 133
135/** 134/**
@@ -257,6 +256,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop,
257 (kps % 1000) / 10); 256 (kps % 1000) / 10);
258} 257}
259 258
259__weak int arch_resume_nosmt(void)
260{
261 return 0;
262}
263
260/** 264/**
261 * create_image - Create a hibernation image. 265 * create_image - Create a hibernation image.
262 * @platform_mode: Whether or not to use the platform driver. 266 * @platform_mode: Whether or not to use the platform driver.
@@ -324,6 +328,10 @@ static int create_image(int platform_mode)
324 Enable_cpus: 328 Enable_cpus:
325 suspend_enable_secondary_cpus(); 329 suspend_enable_secondary_cpus();
326 330
331 /* Allow architectures to do nosmt-specific post-resume dances */
332 if (!in_suspend)
333 error = arch_resume_nosmt();
334
327 Platform_finish: 335 Platform_finish:
328 platform_finish(platform_mode); 336 platform_finish(platform_mode);
329 337
@@ -542,7 +550,7 @@ int hibernation_platform_enter(void)
542 * hibernation_ops->finish() before saving the image, so we should let 550 * hibernation_ops->finish() before saving the image, so we should let
543 * the firmware know that we're going to enter the sleep state after all 551 * the firmware know that we're going to enter the sleep state after all
544 */ 552 */
545 error = hibernation_ops->begin(); 553 error = hibernation_ops->begin(PMSG_HIBERNATE);
546 if (error) 554 if (error)
547 goto Close; 555 goto Close;
548 556
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4f43e724f6eb..bdbd605c4215 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -1,11 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/power/main.c - PM subsystem core functionality. 3 * kernel/power/main.c - PM subsystem core functionality.
3 * 4 *
4 * Copyright (c) 2003 Patrick Mochel 5 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 6 * Copyright (c) 2003 Open Source Development Lab
6 *
7 * This file is released under the GPLv2
8 *
9 */ 7 */
10 8
11#include <linux/export.h> 9#include <linux/export.h>
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9e58bdc8a562..44bee462ff57 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -75,8 +75,6 @@ static inline void hibernate_reserved_size_init(void) {}
75static inline void hibernate_image_size_init(void) {} 75static inline void hibernate_image_size_init(void) {}
76#endif /* !CONFIG_HIBERNATION */ 76#endif /* !CONFIG_HIBERNATION */
77 77
78extern int pfn_is_nosave(unsigned long);
79
80#define power_attr(_name) \ 78#define power_attr(_name) \
81static struct kobj_attribute _name##_attr = { \ 79static struct kobj_attribute _name##_attr = { \
82 .attr = { \ 80 .attr = { \
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 7ef6866b521d..6d475281c730 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -1,7 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * poweroff.c - sysrq handler to gracefully power down machine. 3 * poweroff.c - sysrq handler to gracefully power down machine.
3 *
4 * This file is released under the GPL v2
5 */ 4 */
6 5
7#include <linux/kernel.h> 6#include <linux/kernel.h>
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9d22131afc1e..33e3febaba53 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * This module exposes the interface to kernel space for specifying 3 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 4 * QoS dependencies. It provides infrastructure for registration of:
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bc9558ab1e5b..83105874f255 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/power/snapshot.c 3 * linux/kernel/power/snapshot.c
3 * 4 *
@@ -5,9 +6,6 @@
5 * 6 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8 *
9 * This file is released under the GPLv2.
10 *
11 */ 9 */
12 10
13#define pr_fmt(fmt) "PM: " fmt 11#define pr_fmt(fmt) "PM: " fmt
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ef908c134b34..c874a7026e24 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -1,11 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/power/suspend.c - Suspend to RAM and standby functionality. 3 * kernel/power/suspend.c - Suspend to RAM and standby functionality.
3 * 4 *
4 * Copyright (c) 2003 Patrick Mochel 5 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 6 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
7 *
8 * This file is released under the GPLv2.
9 */ 8 */
10 9
11#define pr_fmt(fmt) "PM: " fmt 10#define pr_fmt(fmt) "PM: " fmt
@@ -62,11 +61,17 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
62enum s2idle_states __read_mostly s2idle_state; 61enum s2idle_states __read_mostly s2idle_state;
63static DEFINE_RAW_SPINLOCK(s2idle_lock); 62static DEFINE_RAW_SPINLOCK(s2idle_lock);
64 63
65bool pm_suspend_via_s2idle(void) 64/**
65 * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend.
66 *
67 * Return 'true' if suspend-to-idle has been selected as the default system
68 * suspend method.
69 */
70bool pm_suspend_default_s2idle(void)
66{ 71{
67 return mem_sleep_current == PM_SUSPEND_TO_IDLE; 72 return mem_sleep_current == PM_SUSPEND_TO_IDLE;
68} 73}
69EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle); 74EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle);
70 75
71void s2idle_set_ops(const struct platform_s2idle_ops *ops) 76void s2idle_set_ops(const struct platform_s2idle_ops *ops)
72{ 77{
@@ -488,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state)
488 493
489 pm_suspend_target_state = state; 494 pm_suspend_target_state = state;
490 495
496 if (state == PM_SUSPEND_TO_IDLE)
497 pm_set_suspend_no_platform();
498
491 error = platform_suspend_begin(state); 499 error = platform_suspend_begin(state);
492 if (error) 500 if (error)
493 goto Close; 501 goto Close;
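Callers of the renamed helper are unchanged apart from the name; a hypothetical user deciding how aggressively to quiesce its hardware might look like this:

#include <linux/suspend.h>

static void my_prepare_for_sleep(void)
{
        if (pm_suspend_default_s2idle()) {
                /* suspend-to-idle: timers and IRQs keep running, stay armed */
        } else {
                /* full S3 path: quiesce the device completely */
        }
}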
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 6a897e8b2a88..60564b58de07 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -1,9 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. 3 * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
3 * 4 *
4 * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> 5 * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
5 *
6 * This file is released under the GPLv2.
7 */ 6 */
8 7
9#include <linux/init.h> 8#include <linux/init.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7f6c1a288d3..ca0fcb5ced71 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/power/swap.c 3 * linux/kernel/power/swap.c
3 * 4 *
@@ -7,9 +8,6 @@
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 8 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 9 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> 10 * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
10 *
11 * This file is released under the GPLv2.
12 *
13 */ 11 */
14 12
15#define pr_fmt(fmt) "PM: " fmt 13#define pr_fmt(fmt) "PM: " fmt
@@ -976,12 +974,11 @@ static int get_swap_reader(struct swap_map_handle *handle,
976 last = handle->maps = NULL; 974 last = handle->maps = NULL;
977 offset = swsusp_header->image; 975 offset = swsusp_header->image;
978 while (offset) { 976 while (offset) {
979 tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); 977 tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL);
980 if (!tmp) { 978 if (!tmp) {
981 release_swap_reader(handle); 979 release_swap_reader(handle);
982 return -ENOMEM; 980 return -ENOMEM;
983 } 981 }
984 memset(tmp, 0, sizeof(*tmp));
985 if (!handle->maps) 982 if (!handle->maps)
986 handle->maps = tmp; 983 handle->maps = tmp;
987 if (last) 984 if (last)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index cb24e840a3e6..77438954cc2b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -1,12 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/power/user.c 3 * linux/kernel/power/user.c
3 * 4 *
4 * This file provides the user space interface for software suspend/resume. 5 * This file provides the user space interface for software suspend/resume.
5 * 6 *
6 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This file is released under the GPLv2.
9 *
10 */ 8 */
11 9
12#include <linux/suspend.h> 10#include <linux/suspend.h>
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4a2ffc39eb95..4d052fc6bcde 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1obj-y = printk.o 2obj-y = printk.o
2obj-$(CONFIG_PRINTK) += printk_safe.o 3obj-$(CONFIG_PRINTK) += printk_safe.o
3obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o 4obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 0f1898820cba..c8e6ab689d42 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -1,18 +1,6 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* 2/*
2 * internal.h - printk internal definitions 3 * internal.h - printk internal definitions
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */ 4 */
17#include <linux/percpu.h> 5#include <linux/percpu.h>
18 6
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02ca827b8fac..1888f6a3b694 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/printk.c 3 * linux/kernel/printk.c
3 * 4 *
@@ -86,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem);
86struct console *console_drivers; 87struct console *console_drivers;
87EXPORT_SYMBOL_GPL(console_drivers); 88EXPORT_SYMBOL_GPL(console_drivers);
88 89
90/*
91 * System may need to suppress printk message under certain
92 * circumstances, like after kernel panic happens.
93 */
94int __read_mostly suppress_printk;
95
89#ifdef CONFIG_LOCKDEP 96#ifdef CONFIG_LOCKDEP
90static struct lockdep_map console_lock_dep_map = { 97static struct lockdep_map console_lock_dep_map = {
91 .name = "console_lock" 98 .name = "console_lock"
@@ -1943,6 +1950,10 @@ asmlinkage int vprintk_emit(int facility, int level,
1943 unsigned long flags; 1950 unsigned long flags;
1944 u64 curr_log_seq; 1951 u64 curr_log_seq;
1945 1952
1953 /* Suppress unimportant messages after panic happens */
1954 if (unlikely(suppress_printk))
1955 return 0;
1956
1946 if (level == LOGLEVEL_SCHED) { 1957 if (level == LOGLEVEL_SCHED) {
1947 level = LOGLEVEL_DEFAULT; 1958 level = LOGLEVEL_DEFAULT;
1948 in_sched = true; 1959 in_sched = true;
@@ -2525,10 +2536,11 @@ void console_unblank(void)
2525 2536
2526/** 2537/**
2527 * console_flush_on_panic - flush console content on panic 2538 * console_flush_on_panic - flush console content on panic
2539 * @mode: flush all messages in buffer or just the pending ones
2528 * 2540 *
2529 * Immediately output all pending messages no matter what. 2541 * Immediately output all pending messages no matter what.
2530 */ 2542 */
2531void console_flush_on_panic(void) 2543void console_flush_on_panic(enum con_flush_mode mode)
2532{ 2544{
2533 /* 2545 /*
2534 * If someone else is holding the console lock, trylock will fail 2546 * If someone else is holding the console lock, trylock will fail
@@ -2539,6 +2551,15 @@ void console_flush_on_panic(void)
2539 */ 2551 */
2540 console_trylock(); 2552 console_trylock();
2541 console_may_schedule = 0; 2553 console_may_schedule = 0;
2554
2555 if (mode == CONSOLE_REPLAY_ALL) {
2556 unsigned long flags;
2557
2558 logbuf_lock_irqsave(flags);
2559 console_seq = log_first_seq;
2560 console_idx = log_first_idx;
2561 logbuf_unlock_irqrestore(flags);
2562 }
2542 console_unlock(); 2563 console_unlock();
2543} 2564}
2544 2565
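The new @mode argument lets panic() choose between flushing only what is still pending (the old behaviour) and rewinding console_seq/console_idx to replay the entire log buffer. The companion include/linux/console.h change is expected to declare the selector roughly as:

/* Assumed shape of the new selector in <linux/console.h>. */
enum con_flush_mode {
        CONSOLE_FLUSH_PENDING,
        CONSOLE_REPLAY_ALL,
};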
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 0913b4d385de..b4045e782743 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -1,18 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * printk_safe.c - Safe printk for printk-deadlock-prone contexts 3 * printk_safe.c - Safe printk for printk-deadlock-prone contexts
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */ 4 */
17 5
18#include <linux/preempt.h> 6#include <linux/preempt.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 9c08a2c7cb1d..af7c94bf5fa1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/profile.c 3 * linux/kernel/profile.c
3 * Simple profiling. Manages a direct-mapped profile hit count buffer, 4 * Simple profiling. Manages a direct-mapped profile hit count buffer,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6f357f4fc859..83a531cea2f3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/ptrace.c 3 * linux/kernel/ptrace.c
3 * 4 *
@@ -78,9 +79,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
78 */ 79 */
79static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) 80static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
80{ 81{
81 rcu_read_lock(); 82 __ptrace_link(child, new_parent, current_cred());
82 __ptrace_link(child, new_parent, __task_cred(new_parent));
83 rcu_read_unlock();
84} 83}
85 84
86/** 85/**
@@ -117,6 +116,9 @@ void __ptrace_unlink(struct task_struct *child)
117 BUG_ON(!child->ptrace); 116 BUG_ON(!child->ptrace);
118 117
119 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); 118 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
119#ifdef TIF_SYSCALL_EMU
120 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
121#endif
120 122
121 child->parent = child->real_parent; 123 child->parent = child->real_parent;
122 list_del_init(&child->ptrace_entry); 124 list_del_init(&child->ptrace_entry);
@@ -323,6 +325,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
323 return -EPERM; 325 return -EPERM;
324ok: 326ok:
325 rcu_read_unlock(); 327 rcu_read_unlock();
328 /*
329 * If a task drops privileges and becomes nondumpable (through a syscall
330 * like setresuid()) while we are trying to access it, we must ensure
331 * that the dumpability is read after the credentials; otherwise,
332 * we may be able to attach to a task that we shouldn't be able to
333 * attach to (as if the task had dropped privileges without becoming
334 * nondumpable).
335 * Pairs with a write barrier in commit_creds().
336 */
337 smp_rmb();
326 mm = task->mm; 338 mm = task->mm;
327 if (mm && 339 if (mm &&
328 ((get_dumpable(mm) != SUID_DUMP_USER) && 340 ((get_dumpable(mm) != SUID_DUMP_USER) &&
@@ -704,6 +716,10 @@ static int ptrace_peek_siginfo(struct task_struct *child,
704 if (arg.nr < 0) 716 if (arg.nr < 0)
705 return -EINVAL; 717 return -EINVAL;
706 718
719 /* Ensure arg.off fits in an unsigned long */
720 if (arg.off > ULONG_MAX)
721 return 0;
722
707 if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) 723 if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
708 pending = &child->signal->shared_pending; 724 pending = &child->signal->shared_pending;
709 else 725 else
@@ -711,18 +727,20 @@ static int ptrace_peek_siginfo(struct task_struct *child,
711 727
712 for (i = 0; i < arg.nr; ) { 728 for (i = 0; i < arg.nr; ) {
713 kernel_siginfo_t info; 729 kernel_siginfo_t info;
714 s32 off = arg.off + i; 730 unsigned long off = arg.off + i;
731 bool found = false;
715 732
716 spin_lock_irq(&child->sighand->siglock); 733 spin_lock_irq(&child->sighand->siglock);
717 list_for_each_entry(q, &pending->list, list) { 734 list_for_each_entry(q, &pending->list, list) {
718 if (!off--) { 735 if (!off--) {
736 found = true;
719 copy_siginfo(&info, &q->info); 737 copy_siginfo(&info, &q->info);
720 break; 738 break;
721 } 739 }
722 } 740 }
723 spin_unlock_irq(&child->sighand->siglock); 741 spin_unlock_irq(&child->sighand->siglock);
724 742
725 if (off >= 0) /* beyond the end of the list */ 743 if (!found) /* beyond the end of the list */
726 break; 744 break;
727 745
728#ifdef CONFIG_COMPAT 746#ifdef CONFIG_COMPAT
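The ptrace_peek_siginfo() change guards against user-supplied offsets that do not survive the old 32-bit signed arithmetic. A small user-space illustration (not part of the patch) of the truncation that the unsigned off plus the ULONG_MAX check now avoid:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t user_off = 0x100000000ULL;             /* offset as supplied in arg.off */
        int32_t old_off = (int32_t)(uint32_t)user_off;  /* what "s32 off" used to see: 0 */

        printf("requested offset %" PRIu64 ", 32-bit view %" PRId32 "\n",
               user_off, old_off);
        return 0;
}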
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 37301430970e..480edf328b51 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# RCU-related configuration options 3# RCU-related configuration options
3# 4#
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 0ec7d1d33a14..5ec3ea4028e2 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# RCU-related debugging configuration options 3# RCU-related debugging configuration options
3# 4#
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 4b58c907b4b7..5290b01de534 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -11,11 +11,6 @@
11#define __LINUX_RCU_H 11#define __LINUX_RCU_H
12 12
13#include <trace/events/rcu.h> 13#include <trace/events/rcu.h>
14#ifdef CONFIG_RCU_TRACE
15#define RCU_TRACE(stmt) stmt
16#else /* #ifdef CONFIG_RCU_TRACE */
17#define RCU_TRACE(stmt)
18#endif /* #else #ifdef CONFIG_RCU_TRACE */
19 14
20/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ 15/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
21#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) 16#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
@@ -216,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
216 211
217 rcu_lock_acquire(&rcu_callback_map); 212 rcu_lock_acquire(&rcu_callback_map);
218 if (__is_kfree_rcu_offset(offset)) { 213 if (__is_kfree_rcu_offset(offset)) {
219 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) 214 trace_rcu_invoke_kfree_callback(rn, head, offset);
220 kfree((void *)head - offset); 215 kfree((void *)head - offset);
221 rcu_lock_release(&rcu_callback_map); 216 rcu_lock_release(&rcu_callback_map);
222 return true; 217 return true;
223 } else { 218 } else {
224 RCU_TRACE(trace_rcu_invoke_callback(rn, head);) 219 trace_rcu_invoke_callback(rn, head);
225 f = head->func; 220 f = head->func;
226 WRITE_ONCE(head->func, (rcu_callback_t)0L); 221 WRITE_ONCE(head->func, (rcu_callback_t)0L);
227 f(head); 222 f(head);
@@ -451,6 +446,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t);
451enum rcutorture_type { 446enum rcutorture_type {
452 RCU_FLAVOR, 447 RCU_FLAVOR,
453 RCU_TASKS_FLAVOR, 448 RCU_TASKS_FLAVOR,
449 RCU_TRIVIAL_FLAVOR,
454 SRCU_FLAVOR, 450 SRCU_FLAVOR,
455 INVALID_RCU_FLAVOR 451 INVALID_RCU_FLAVOR
456}; 452};
@@ -484,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
484#endif 480#endif
485#endif 481#endif
486 482
483#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
484long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
485#endif
486
487#ifdef CONFIG_TINY_SRCU 487#ifdef CONFIG_TINY_SRCU
488 488
489static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, 489static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efaa5b3f4d3f..fce4e7e6f502 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -299,6 +299,7 @@ struct rcu_torture_ops {
299 int irq_capable; 299 int irq_capable;
300 int can_boost; 300 int can_boost;
301 int extendables; 301 int extendables;
302 int slow_gps;
302 const char *name; 303 const char *name;
303}; 304};
304 305
@@ -667,9 +668,51 @@ static struct rcu_torture_ops tasks_ops = {
667 .fqs = NULL, 668 .fqs = NULL,
668 .stats = NULL, 669 .stats = NULL,
669 .irq_capable = 1, 670 .irq_capable = 1,
671 .slow_gps = 1,
670 .name = "tasks" 672 .name = "tasks"
671}; 673};
672 674
675/*
676 * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
677 * This implementation does not necessarily work well with CPU hotplug.
678 */
679
680static void synchronize_rcu_trivial(void)
681{
682 int cpu;
683
684 for_each_online_cpu(cpu) {
685 rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
686 WARN_ON_ONCE(raw_smp_processor_id() != cpu);
687 }
688}
689
690static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
691{
692 preempt_disable();
693 return 0;
694}
695
696static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
697{
698 preempt_enable();
699}
700
701static struct rcu_torture_ops trivial_ops = {
702 .ttype = RCU_TRIVIAL_FLAVOR,
703 .init = rcu_sync_torture_init,
704 .readlock = rcu_torture_read_lock_trivial,
705 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
706 .readunlock = rcu_torture_read_unlock_trivial,
707 .get_gp_seq = rcu_no_completed,
708 .sync = synchronize_rcu_trivial,
709 .exp_sync = synchronize_rcu_trivial,
710 .fqs = NULL,
711 .stats = NULL,
712 .irq_capable = 1,
713 .name = "trivial"
714};
715
673static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) 716static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
674{ 717{
675 if (!cur_ops->gp_diff) 718 if (!cur_ops->gp_diff)
@@ -1010,10 +1053,17 @@ rcu_torture_writer(void *arg)
1010 !rcu_gp_is_normal(); 1053 !rcu_gp_is_normal();
1011 } 1054 }
1012 rcu_torture_writer_state = RTWS_STUTTER; 1055 rcu_torture_writer_state = RTWS_STUTTER;
1013 if (stutter_wait("rcu_torture_writer")) 1056 if (stutter_wait("rcu_torture_writer") &&
1057 !READ_ONCE(rcu_fwd_cb_nodelay) &&
1058 !cur_ops->slow_gps &&
1059 !torture_must_stop())
1014 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) 1060 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
1015 if (list_empty(&rcu_tortures[i].rtort_free)) 1061 if (list_empty(&rcu_tortures[i].rtort_free) &&
1016 WARN_ON_ONCE(1); 1062 rcu_access_pointer(rcu_torture_current) !=
1063 &rcu_tortures[i]) {
1064 rcu_ftrace_dump(DUMP_ALL);
1065 WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
1066 }
1017 } while (!torture_must_stop()); 1067 } while (!torture_must_stop());
1018 /* Reset expediting back to unexpedited. */ 1068 /* Reset expediting back to unexpedited. */
1019 if (expediting > 0) 1069 if (expediting > 0)
@@ -1358,8 +1408,9 @@ rcu_torture_stats_print(void)
1358 } 1408 }
1359 1409
1360 pr_alert("%s%s ", torture_type, TORTURE_FLAG); 1410 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1361 pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1411 pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1362 rcu_torture_current, 1412 rcu_torture_current,
1413 rcu_torture_current ? "ver" : "VER",
1363 rcu_torture_current_version, 1414 rcu_torture_current_version,
1364 list_empty(&rcu_torture_freelist), 1415 list_empty(&rcu_torture_freelist),
1365 atomic_read(&n_rcu_torture_alloc), 1416 atomic_read(&n_rcu_torture_alloc),
@@ -1661,6 +1712,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
1661 spin_unlock_irqrestore(&rcu_fwd_lock, flags); 1712 spin_unlock_irqrestore(&rcu_fwd_lock, flags);
1662} 1713}
1663 1714
1715// Give the scheduler a chance, even on nohz_full CPUs.
1716static void rcu_torture_fwd_prog_cond_resched(void)
1717{
1718 if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
1719 if (need_resched())
1720 schedule();
1721 } else {
1722 cond_resched();
1723 }
1724}
1725
1664/* 1726/*
1665 * Free all callbacks on the rcu_fwd_cb_head list, either because the 1727 * Free all callbacks on the rcu_fwd_cb_head list, either because the
1666 * test is over or because we hit an OOM event. 1728 * test is over or because we hit an OOM event.
@@ -1674,16 +1736,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
1674 for (;;) { 1736 for (;;) {
1675 spin_lock_irqsave(&rcu_fwd_lock, flags); 1737 spin_lock_irqsave(&rcu_fwd_lock, flags);
1676 rfcp = rcu_fwd_cb_head; 1738 rfcp = rcu_fwd_cb_head;
1677 if (!rfcp) 1739 if (!rfcp) {
1740 spin_unlock_irqrestore(&rcu_fwd_lock, flags);
1678 break; 1741 break;
1742 }
1679 rcu_fwd_cb_head = rfcp->rfc_next; 1743 rcu_fwd_cb_head = rfcp->rfc_next;
1680 if (!rcu_fwd_cb_head) 1744 if (!rcu_fwd_cb_head)
1681 rcu_fwd_cb_tail = &rcu_fwd_cb_head; 1745 rcu_fwd_cb_tail = &rcu_fwd_cb_head;
1682 spin_unlock_irqrestore(&rcu_fwd_lock, flags); 1746 spin_unlock_irqrestore(&rcu_fwd_lock, flags);
1683 kfree(rfcp); 1747 kfree(rfcp);
1684 freed++; 1748 freed++;
1749 rcu_torture_fwd_prog_cond_resched();
1685 } 1750 }
1686 spin_unlock_irqrestore(&rcu_fwd_lock, flags);
1687 return freed; 1751 return freed;
1688} 1752}
1689 1753
@@ -1707,6 +1771,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
1707 } 1771 }
1708 1772
1709 /* Tight loop containing cond_resched(). */ 1773 /* Tight loop containing cond_resched(). */
1774 WRITE_ONCE(rcu_fwd_cb_nodelay, true);
1775 cur_ops->sync(); /* Later readers see above write. */
1710 if (selfpropcb) { 1776 if (selfpropcb) {
1711 WRITE_ONCE(fcs.stop, 0); 1777 WRITE_ONCE(fcs.stop, 0);
1712 cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); 1778 cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
@@ -1724,7 +1790,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
1724 udelay(10); 1790 udelay(10);
1725 cur_ops->readunlock(idx); 1791 cur_ops->readunlock(idx);
1726 if (!fwd_progress_need_resched || need_resched()) 1792 if (!fwd_progress_need_resched || need_resched())
1727 cond_resched(); 1793 rcu_torture_fwd_prog_cond_resched();
1728 } 1794 }
1729 (*tested_tries)++; 1795 (*tested_tries)++;
1730 if (!time_before(jiffies, stopat) && 1796 if (!time_before(jiffies, stopat) &&
@@ -1745,6 +1811,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
1745 WARN_ON(READ_ONCE(fcs.stop) != 2); 1811 WARN_ON(READ_ONCE(fcs.stop) != 2);
1746 destroy_rcu_head_on_stack(&fcs.rh); 1812 destroy_rcu_head_on_stack(&fcs.rh);
1747 } 1813 }
1814 schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
1815 WRITE_ONCE(rcu_fwd_cb_nodelay, false);
1748} 1816}
1749 1817
1750/* Carry out call_rcu() forward-progress testing. */ 1818/* Carry out call_rcu() forward-progress testing. */
@@ -1765,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void)
1765 1833
1766 if (READ_ONCE(rcu_fwd_emergency_stop)) 1834 if (READ_ONCE(rcu_fwd_emergency_stop))
1767 return; /* Get out of the way quickly, no GP wait! */ 1835 return; /* Get out of the way quickly, no GP wait! */
1836 if (!cur_ops->call)
1837 return; /* Can't do call_rcu() fwd prog without ->call. */
1768 1838
1769 /* Loop continuously posting RCU callbacks. */ 1839 /* Loop continuously posting RCU callbacks. */
1770 WRITE_ONCE(rcu_fwd_cb_nodelay, true); 1840 WRITE_ONCE(rcu_fwd_cb_nodelay, true);
@@ -1805,7 +1875,7 @@ static void rcu_torture_fwd_prog_cr(void)
1805 rfcp->rfc_gps = 0; 1875 rfcp->rfc_gps = 0;
1806 } 1876 }
1807 cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); 1877 cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
1808 cond_resched(); 1878 rcu_torture_fwd_prog_cond_resched();
1809 } 1879 }
1810 stoppedat = jiffies; 1880 stoppedat = jiffies;
1811 n_launders_cb_snap = READ_ONCE(n_launders_cb); 1881 n_launders_cb_snap = READ_ONCE(n_launders_cb);
@@ -1814,7 +1884,6 @@ static void rcu_torture_fwd_prog_cr(void)
1814 cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ 1884 cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
1815 (void)rcu_torture_fwd_prog_cbfree(); 1885 (void)rcu_torture_fwd_prog_cbfree();
1816 1886
1817 WRITE_ONCE(rcu_fwd_cb_nodelay, false);
1818 if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { 1887 if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) {
1819 WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); 1888 WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
1820 pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", 1889 pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
@@ -1825,6 +1894,8 @@ static void rcu_torture_fwd_prog_cr(void)
1825 n_max_gps, n_max_cbs, cver, gps); 1894 n_max_gps, n_max_cbs, cver, gps);
1826 rcu_torture_fwd_cb_hist(); 1895 rcu_torture_fwd_cb_hist();
1827 } 1896 }
1897 schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
1898 WRITE_ONCE(rcu_fwd_cb_nodelay, false);
1828} 1899}
1829 1900
1830 1901
@@ -2240,7 +2311,7 @@ rcu_torture_init(void)
2240 int firsterr = 0; 2311 int firsterr = 0;
2241 static struct rcu_torture_ops *torture_ops[] = { 2312 static struct rcu_torture_ops *torture_ops[] = {
2242 &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, 2313 &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
2243 &busted_srcud_ops, &tasks_ops, 2314 &busted_srcud_ops, &tasks_ops, &trivial_ops,
2244 }; 2315 };
2245 2316
2246 if (!torture_init_begin(torture_type, verbose)) 2317 if (!torture_init_begin(torture_type, verbose))
@@ -2363,7 +2434,10 @@ rcu_torture_init(void)
2363 if (stutter < 0) 2434 if (stutter < 0)
2364 stutter = 0; 2435 stutter = 0;
2365 if (stutter) { 2436 if (stutter) {
2366 firsterr = torture_stutter_init(stutter * HZ); 2437 int t;
2438
2439 t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
2440 firsterr = torture_stutter_init(stutter * HZ, t);
2367 if (firsterr) 2441 if (firsterr)
2368 goto unwind; 2442 goto unwind;
2369 } 2443 }
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9b761e546de8..cf0e886314f2 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -831,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp)
831 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same 831 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
832 * srcu_struct structure. 832 * srcu_struct structure.
833 */ 833 */
834void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, 834static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
835 rcu_callback_t func, bool do_norm) 835 rcu_callback_t func, bool do_norm)
836{ 836{
837 unsigned long flags; 837 unsigned long flags;
838 int idx; 838 int idx;
@@ -1310,3 +1310,68 @@ void __init srcu_init(void)
1310 queue_work(rcu_gp_wq, &ssp->work.work); 1310 queue_work(rcu_gp_wq, &ssp->work.work);
1311 } 1311 }
1312} 1312}
1313
1314#ifdef CONFIG_MODULES
1315
1316/* Initialize any global-scope srcu_struct structures used by this module. */
1317static int srcu_module_coming(struct module *mod)
1318{
1319 int i;
1320 struct srcu_struct **sspp = mod->srcu_struct_ptrs;
1321 int ret;
1322
1323 for (i = 0; i < mod->num_srcu_structs; i++) {
1324 ret = init_srcu_struct(*(sspp++));
1325 if (WARN_ON_ONCE(ret))
1326 return ret;
1327 }
1328 return 0;
1329}
1330
1331/* Clean up any global-scope srcu_struct structures used by this module. */
1332static void srcu_module_going(struct module *mod)
1333{
1334 int i;
1335 struct srcu_struct **sspp = mod->srcu_struct_ptrs;
1336
1337 for (i = 0; i < mod->num_srcu_structs; i++)
1338 cleanup_srcu_struct(*(sspp++));
1339}
1340
1341/* Handle one module, either coming or going. */
1342static int srcu_module_notify(struct notifier_block *self,
1343 unsigned long val, void *data)
1344{
1345 struct module *mod = data;
1346 int ret = 0;
1347
1348 switch (val) {
1349 case MODULE_STATE_COMING:
1350 ret = srcu_module_coming(mod);
1351 break;
1352 case MODULE_STATE_GOING:
1353 srcu_module_going(mod);
1354 break;
1355 default:
1356 break;
1357 }
1358 return ret;
1359}
1360
1361static struct notifier_block srcu_module_nb = {
1362 .notifier_call = srcu_module_notify,
1363 .priority = 0,
1364};
1365
1366static __init int init_srcu_module_notifier(void)
1367{
1368 int ret;
1369
1370 ret = register_module_notifier(&srcu_module_nb);
1371 if (ret)
1372 pr_warn("Failed to register srcu module notifier\n");
1373 return ret;
1374}
1375late_initcall(init_srcu_module_notifier);
1376
1377#endif /* #ifdef CONFIG_MODULES */
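Together with the ___srcu_struct_ptrs section picked up by find_module_sections() earlier in this series, the notifier lets modules use statically defined SRCU. A sketch of what such a module might look like, assuming the companion srcu.h change records DEFINE_STATIC_SRCU() instances in that section so the MODULE_STATE_COMING hook initializes them and MODULE_STATE_GOING cleans them up:

#include <linux/module.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_srcu);    /* assumed to land in ___srcu_struct_ptrs */

static int __init my_init(void)
{
        int idx = srcu_read_lock(&my_srcu);
        /* ... read-side critical section ... */
        srcu_read_unlock(&my_srcu, idx);
        return 0;
}

static void __exit my_exit(void)
{
        /* wait for in-flight readers before the module text goes away */
        synchronize_srcu(&my_srcu);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");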
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index a8304d90573f..d4558ab7a07d 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,65 +10,18 @@
10#include <linux/rcu_sync.h> 10#include <linux/rcu_sync.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12 12
13#ifdef CONFIG_PROVE_RCU 13enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
14#define __INIT_HELD(func) .held = func,
15#else
16#define __INIT_HELD(func)
17#endif
18
19static const struct {
20 void (*sync)(void);
21 void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
22 void (*wait)(void);
23#ifdef CONFIG_PROVE_RCU
24 int (*held)(void);
25#endif
26} gp_ops[] = {
27 [RCU_SYNC] = {
28 .sync = synchronize_rcu,
29 .call = call_rcu,
30 .wait = rcu_barrier,
31 __INIT_HELD(rcu_read_lock_held)
32 },
33 [RCU_SCHED_SYNC] = {
34 .sync = synchronize_rcu,
35 .call = call_rcu,
36 .wait = rcu_barrier,
37 __INIT_HELD(rcu_read_lock_sched_held)
38 },
39 [RCU_BH_SYNC] = {
40 .sync = synchronize_rcu,
41 .call = call_rcu,
42 .wait = rcu_barrier,
43 __INIT_HELD(rcu_read_lock_bh_held)
44 },
45};
46
47enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
48enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
49 14
50#define rss_lock gp_wait.lock 15#define rss_lock gp_wait.lock
51 16
52#ifdef CONFIG_PROVE_RCU
53void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
54{
55 RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
56 "suspicious rcu_sync_is_idle() usage");
57}
58
59EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
60#endif
61
62/** 17/**
63 * rcu_sync_init() - Initialize an rcu_sync structure 18 * rcu_sync_init() - Initialize an rcu_sync structure
64 * @rsp: Pointer to rcu_sync structure to be initialized 19 * @rsp: Pointer to rcu_sync structure to be initialized
65 * @type: Flavor of RCU with which to synchronize rcu_sync structure
66 */ 20 */
67void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) 21void rcu_sync_init(struct rcu_sync *rsp)
68{ 22{
69 memset(rsp, 0, sizeof(*rsp)); 23 memset(rsp, 0, sizeof(*rsp));
70 init_waitqueue_head(&rsp->gp_wait); 24 init_waitqueue_head(&rsp->gp_wait);
71 rsp->gp_type = type;
72} 25}
73 26
74/** 27/**
@@ -86,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp)
86 rsp->gp_state = GP_PASSED; 39 rsp->gp_state = GP_PASSED;
87} 40}
88 41
89/**
90 * rcu_sync_enter() - Force readers onto slowpath
91 * @rsp: Pointer to rcu_sync structure to use for synchronization
92 *
93 * This function is used by updaters who need readers to make use of
94 * a slowpath during the update. After this function returns, all
95 * subsequent calls to rcu_sync_is_idle() will return false, which
96 * tells readers to stay off their fastpaths. A later call to
97 * rcu_sync_exit() re-enables reader slowpaths.
98 *
99 * When called in isolation, rcu_sync_enter() must wait for a grace
100 * period, however, closely spaced calls to rcu_sync_enter() can
101 * optimize away the grace-period wait via a state machine implemented
102 * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
103 */
104void rcu_sync_enter(struct rcu_sync *rsp)
105{
106 bool need_wait, need_sync;
107 42
108 spin_lock_irq(&rsp->rss_lock); 43static void rcu_sync_func(struct rcu_head *rhp);
109 need_wait = rsp->gp_count++;
110 need_sync = rsp->gp_state == GP_IDLE;
111 if (need_sync)
112 rsp->gp_state = GP_PENDING;
113 spin_unlock_irq(&rsp->rss_lock);
114 44
115 WARN_ON_ONCE(need_wait && need_sync); 45static void rcu_sync_call(struct rcu_sync *rsp)
116 if (need_sync) { 46{
117 gp_ops[rsp->gp_type].sync(); 47 call_rcu(&rsp->cb_head, rcu_sync_func);
118 rsp->gp_state = GP_PASSED;
119 wake_up_all(&rsp->gp_wait);
120 } else if (need_wait) {
121 wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
122 } else {
123 /*
124 * Possible when there's a pending CB from a rcu_sync_exit().
125 * Nobody has yet been allowed the 'fast' path and thus we can
126 * avoid doing any sync(). The callback will get 'dropped'.
127 */
128 WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
129 }
130} 48}
131 49
132/** 50/**
133 * rcu_sync_func() - Callback function managing reader access to fastpath 51 * rcu_sync_func() - Callback function managing reader access to fastpath
134 * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization 52 * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
135 * 53 *
136 * This function is passed to one of the call_rcu() functions by 54 * This function is passed to call_rcu() function by rcu_sync_enter() and
137 * rcu_sync_exit(), so that it is invoked after a grace period following the 55 * rcu_sync_exit(), so that it is invoked after a grace period following the
138 * that invocation of rcu_sync_exit(). It takes action based on events that 56 * that invocation of enter/exit.
57 *
58 * If it is called by rcu_sync_enter() it signals that all the readers were
59 * switched onto slow path.
60 *
61 * If it is called by rcu_sync_exit() it takes action based on events that
139 * have taken place in the meantime, so that closely spaced rcu_sync_enter() 62 * have taken place in the meantime, so that closely spaced rcu_sync_enter()
140 * and rcu_sync_exit() pairs need not wait for a grace period. 63 * and rcu_sync_exit() pairs need not wait for a grace period.
141 * 64 *
@@ -152,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp)
152 struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); 75 struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
153 unsigned long flags; 76 unsigned long flags;
154 77
155 WARN_ON_ONCE(rsp->gp_state != GP_PASSED); 78 WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
156 WARN_ON_ONCE(rsp->cb_state == CB_IDLE); 79 WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
157 80
158 spin_lock_irqsave(&rsp->rss_lock, flags); 81 spin_lock_irqsave(&rsp->rss_lock, flags);
159 if (rsp->gp_count) { 82 if (rsp->gp_count) {
160 /* 83 /*
161 * A new rcu_sync_begin() has happened; drop the callback. 84 * We're at least a GP after the GP_IDLE->GP_ENTER transition.
162 */ 85 */
163 rsp->cb_state = CB_IDLE; 86 WRITE_ONCE(rsp->gp_state, GP_PASSED);
164 } else if (rsp->cb_state == CB_REPLAY) { 87 wake_up_locked(&rsp->gp_wait);
88 } else if (rsp->gp_state == GP_REPLAY) {
165 /* 89 /*
166 * A new rcu_sync_exit() has happened; requeue the callback 90 * A new rcu_sync_exit() has happened; requeue the callback to
167 * to catch a later GP. 91 * catch a later GP.
168 */ 92 */
169 rsp->cb_state = CB_PENDING; 93 WRITE_ONCE(rsp->gp_state, GP_EXIT);
170 gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); 94 rcu_sync_call(rsp);
171 } else { 95 } else {
172 /* 96 /*
173 * We're at least a GP after rcu_sync_exit(); eveybody will now 97 * We're at least a GP after the last rcu_sync_exit(); eveybody
174 * have observed the write side critical section. Let 'em rip!. 98 * will now have observed the write side critical section.
99 * Let 'em rip!.
175 */ 100 */
176 rsp->cb_state = CB_IDLE; 101 WRITE_ONCE(rsp->gp_state, GP_IDLE);
177 rsp->gp_state = GP_IDLE;
178 } 102 }
179 spin_unlock_irqrestore(&rsp->rss_lock, flags); 103 spin_unlock_irqrestore(&rsp->rss_lock, flags);
180} 104}
181 105
182/** 106/**
183 * rcu_sync_exit() - Allow readers back onto fast patch after grace period 107 * rcu_sync_enter() - Force readers onto slowpath
108 * @rsp: Pointer to rcu_sync structure to use for synchronization
109 *
110 * This function is used by updaters who need readers to make use of
111 * a slowpath during the update. After this function returns, all
112 * subsequent calls to rcu_sync_is_idle() will return false, which
113 * tells readers to stay off their fastpaths. A later call to
114 * rcu_sync_exit() re-enables reader slowpaths.
115 *
116 * When called in isolation, rcu_sync_enter() must wait for a grace
117 * period, however, closely spaced calls to rcu_sync_enter() can
118 * optimize away the grace-period wait via a state machine implemented
119 * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
120 */
121void rcu_sync_enter(struct rcu_sync *rsp)
122{
123 int gp_state;
124
125 spin_lock_irq(&rsp->rss_lock);
126 gp_state = rsp->gp_state;
127 if (gp_state == GP_IDLE) {
128 WRITE_ONCE(rsp->gp_state, GP_ENTER);
129 WARN_ON_ONCE(rsp->gp_count);
130 /*
131 * Note that we could simply do rcu_sync_call(rsp) here and
132 * avoid the "if (gp_state == GP_IDLE)" block below.
133 *
134 * However, synchronize_rcu() can be faster if rcu_expedited
135 * or rcu_blocking_is_gp() is true.
136 *
137 * Another reason is that we can't wait for rcu callback if
138 * we are called at early boot time but this shouldn't happen.
139 */
140 }
141 rsp->gp_count++;
142 spin_unlock_irq(&rsp->rss_lock);
143
144 if (gp_state == GP_IDLE) {
145 /*
146 * See the comment above, this simply does the "synchronous"
147 * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED.
148 */
149 synchronize_rcu();
150 rcu_sync_func(&rsp->cb_head);
151 /* Not really needed, wait_event() would see GP_PASSED. */
152 return;
153 }
154
155 wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED);
156}
157
158/**
159 * rcu_sync_exit() - Allow readers back onto fast path after grace period
184 * @rsp: Pointer to rcu_sync structure to use for synchronization 160 * @rsp: Pointer to rcu_sync structure to use for synchronization
185 * 161 *
186 * This function is used by updaters who have completed, and can therefore 162 * This function is used by updaters who have completed, and can therefore
@@ -191,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp)
191 */ 167 */
192void rcu_sync_exit(struct rcu_sync *rsp) 168void rcu_sync_exit(struct rcu_sync *rsp)
193{ 169{
170 WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
171 WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
172
194 spin_lock_irq(&rsp->rss_lock); 173 spin_lock_irq(&rsp->rss_lock);
195 if (!--rsp->gp_count) { 174 if (!--rsp->gp_count) {
196 if (rsp->cb_state == CB_IDLE) { 175 if (rsp->gp_state == GP_PASSED) {
197 rsp->cb_state = CB_PENDING; 176 WRITE_ONCE(rsp->gp_state, GP_EXIT);
198 gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); 177 rcu_sync_call(rsp);
199 } else if (rsp->cb_state == CB_PENDING) { 178 } else if (rsp->gp_state == GP_EXIT) {
200 rsp->cb_state = CB_REPLAY; 179 WRITE_ONCE(rsp->gp_state, GP_REPLAY);
201 } 180 }
202 } 181 }
203 spin_unlock_irq(&rsp->rss_lock); 182 spin_unlock_irq(&rsp->rss_lock);
@@ -209,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp)
209 */ 188 */
210void rcu_sync_dtor(struct rcu_sync *rsp) 189void rcu_sync_dtor(struct rcu_sync *rsp)
211{ 190{
212 int cb_state; 191 int gp_state;
213 192
214 WARN_ON_ONCE(rsp->gp_count); 193 WARN_ON_ONCE(READ_ONCE(rsp->gp_count));
194 WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
215 195
216 spin_lock_irq(&rsp->rss_lock); 196 spin_lock_irq(&rsp->rss_lock);
217 if (rsp->cb_state == CB_REPLAY) 197 if (rsp->gp_state == GP_REPLAY)
218 rsp->cb_state = CB_PENDING; 198 WRITE_ONCE(rsp->gp_state, GP_EXIT);
219 cb_state = rsp->cb_state; 199 gp_state = rsp->gp_state;
220 spin_unlock_irq(&rsp->rss_lock); 200 spin_unlock_irq(&rsp->rss_lock);
221 201
222 if (cb_state != CB_IDLE) { 202 if (gp_state != GP_IDLE) {
223 gp_ops[rsp->gp_type].wait(); 203 rcu_barrier();
224 WARN_ON_ONCE(rsp->cb_state != CB_IDLE); 204 WARN_ON_ONCE(rsp->gp_state != GP_IDLE);
225 } 205 }
226} 206}
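The new GP_IDLE/GP_ENTER/GP_PASSED/GP_EXIT/GP_REPLAY handling above replaces the old cb_state machine. As a rough aid, here is a minimal userspace sketch of those transitions: the grace period is collapsed into a direct call, locking and wakeups are dropped, and the callback side (rcu_sync_func(), not part of this hunk) is an assumption based on the transitions the enter/exit paths rely on.

/*
 * Userspace model only: no locking, no real grace periods, and
 * sync_func() is an assumption standing in for rcu_sync_func().
 */
#include <assert.h>
#include <stdio.h>

enum { GP_IDLE, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
static const char * const name[] = { "IDLE", "ENTER", "PASSED", "EXIT", "REPLAY" };

static int gp_state = GP_IDLE;
static int gp_count;

static void sync_func(void)		/* stands in for rcu_sync_func() */
{
	if (gp_count)
		gp_state = GP_PASSED;	/* writers still present */
	else if (gp_state == GP_REPLAY)
		gp_state = GP_EXIT;	/* another GP needed: requeue */
	else
		gp_state = GP_IDLE;	/* readers may use fastpaths again */
}

static void sync_enter(void)		/* mirrors rcu_sync_enter() */
{
	int was_idle = (gp_state == GP_IDLE);

	if (was_idle)
		gp_state = GP_ENTER;
	gp_count++;
	if (was_idle)
		sync_func();		/* the "synchronous" GP + callback */
	assert(gp_state >= GP_PASSED);
}

static void sync_exit(void)		/* mirrors rcu_sync_exit() */
{
	if (!--gp_count) {
		if (gp_state == GP_PASSED)
			gp_state = GP_EXIT;	/* callback now pending */
		else if (gp_state == GP_EXIT)
			gp_state = GP_REPLAY;	/* callback must run again */
	}
}

int main(void)
{
	sync_enter();				/* IDLE -> ENTER -> PASSED */
	sync_exit();				/* PASSED -> EXIT */
	printf("after exit: %s\n", name[gp_state]);
	sync_func();				/* pending callback: EXIT -> IDLE */
	printf("after cb:   %s\n", name[gp_state]);
	return 0;
}

Running the model prints EXIT after the last writer leaves and IDLE once the simulated callback completes, mirroring the kerneldoc comments on rcu_sync_enter()/rcu_sync_exit() above.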
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b4d88a594785..a14e5fbbea46 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -51,6 +51,12 @@
51#include <linux/tick.h> 51#include <linux/tick.h>
52#include <linux/sysrq.h> 52#include <linux/sysrq.h>
53#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/gfp.h>
55#include <linux/oom.h>
56#include <linux/smpboot.h>
57#include <linux/jiffies.h>
58#include <linux/sched/isolation.h>
59#include "../time/tick-internal.h"
54 60
55#include "tree.h" 61#include "tree.h"
56#include "rcu.h" 62#include "rcu.h"
@@ -92,6 +98,9 @@ struct rcu_state rcu_state = {
92/* Dump rcu_node combining tree at boot to verify correct setup. */ 98/* Dump rcu_node combining tree at boot to verify correct setup. */
93static bool dump_tree; 99static bool dump_tree;
94module_param(dump_tree, bool, 0444); 100module_param(dump_tree, bool, 0444);
101/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
102static bool use_softirq = 1;
103module_param(use_softirq, bool, 0444);
95/* Control rcu_node-tree auto-balancing at boot time. */ 104/* Control rcu_node-tree auto-balancing at boot time. */
96static bool rcu_fanout_exact; 105static bool rcu_fanout_exact;
97module_param(rcu_fanout_exact, bool, 0444); 106module_param(rcu_fanout_exact, bool, 0444);
@@ -138,7 +147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
138static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); 147static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
139static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 148static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
140static void invoke_rcu_core(void); 149static void invoke_rcu_core(void);
141static void invoke_rcu_callbacks(struct rcu_data *rdp);
142static void rcu_report_exp_rdp(struct rcu_data *rdp); 150static void rcu_report_exp_rdp(struct rcu_data *rdp);
143static void sync_sched_exp_online_cleanup(int cpu); 151static void sync_sched_exp_online_cleanup(int cpu);
144 152
@@ -368,19 +376,33 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void)
368} 376}
369 377
370/** 378/**
371 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 379 * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
372 * 380 *
373 * If the current CPU is idle or running at a first-level (not nested) 381 * If the current CPU is idle and running at a first-level (not nested)
374 * interrupt from idle, return true. The caller must have at least 382 * interrupt from idle, return true. The caller must have at least
375 * disabled preemption. 383 * disabled preemption.
376 */ 384 */
377static int rcu_is_cpu_rrupt_from_idle(void) 385static int rcu_is_cpu_rrupt_from_idle(void)
378{ 386{
379 return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && 387 /* Called only from within the scheduling-clock interrupt */
380 __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; 388 lockdep_assert_in_irq();
389
390 /* Check for counter underflows */
391 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
392 "RCU dynticks_nesting counter underflow!");
393 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
394 "RCU dynticks_nmi_nesting counter underflow/zero!");
395
396 /* Are we at first interrupt nesting level? */
397 if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
398 return false;
399
400 /* Does CPU appear to be idle from an RCU standpoint? */
401 return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
381} 402}
382 403
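A quick sanity check of the rewritten predicate: the sketch below evaluates the same two conditions (first-level interrupt, idle nesting value) over a few made-up counter values. It is not kernel code, just the boolean logic extracted for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the two checks above: first-level interrupt taken from idle. */
static bool rrupt_from_idle(long nesting, long nmi_nesting)
{
	return nmi_nesting == 1 && nesting == 0;
}

int main(void)
{
	printf("%d\n", rrupt_from_idle(0, 1));	/* 1: idle CPU, first-level irq */
	printf("%d\n", rrupt_from_idle(1, 1));	/* 0: task context interrupted */
	printf("%d\n", rrupt_from_idle(0, 2));	/* 0: nested interrupt from idle */
	return 0;
}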
383#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ 404#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */
405#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */
384static long blimit = DEFAULT_RCU_BLIMIT; 406static long blimit = DEFAULT_RCU_BLIMIT;
385#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ 407#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
386static long qhimark = DEFAULT_RCU_QHIMARK; 408static long qhimark = DEFAULT_RCU_QHIMARK;
@@ -1969,14 +1991,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
1969 */ 1991 */
1970int rcutree_dying_cpu(unsigned int cpu) 1992int rcutree_dying_cpu(unsigned int cpu)
1971{ 1993{
1972 RCU_TRACE(bool blkd;) 1994 bool blkd;
1973 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) 1995 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1974 RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) 1996 struct rcu_node *rnp = rdp->mynode;
1975 1997
1976 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 1998 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
1977 return 0; 1999 return 0;
1978 2000
1979 RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) 2001 blkd = !!(rnp->qsmask & rdp->grpmask);
1980 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, 2002 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq,
1981 blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); 2003 blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
1982 return 0; 2004 return 0;
@@ -2113,7 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
2113 2135
2114 /* Reinstate batch limit if we have worked down the excess. */ 2136 /* Reinstate batch limit if we have worked down the excess. */
2115 count = rcu_segcblist_n_cbs(&rdp->cblist); 2137 count = rcu_segcblist_n_cbs(&rdp->cblist);
2116 if (rdp->blimit == LONG_MAX && count <= qlowmark) 2138 if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
2117 rdp->blimit = blimit; 2139 rdp->blimit = blimit;
2118 2140
2119 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2141 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
@@ -2253,7 +2275,7 @@ void rcu_force_quiescent_state(void)
2253EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 2275EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
2254 2276
2255/* Perform RCU core processing work for the current CPU. */ 2277/* Perform RCU core processing work for the current CPU. */
2256static __latent_entropy void rcu_core(struct softirq_action *unused) 2278static __latent_entropy void rcu_core(void)
2257{ 2279{
2258 unsigned long flags; 2280 unsigned long flags;
2259 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); 2281 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
@@ -2287,37 +2309,126 @@ static __latent_entropy void rcu_core(struct softirq_action *unused)
2287 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); 2309 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
2288 2310
2289 /* If there are callbacks ready, invoke them. */ 2311 /* If there are callbacks ready, invoke them. */
2290 if (rcu_segcblist_ready_cbs(&rdp->cblist)) 2312 if (rcu_segcblist_ready_cbs(&rdp->cblist) &&
2291 invoke_rcu_callbacks(rdp); 2313 likely(READ_ONCE(rcu_scheduler_fully_active)))
2314 rcu_do_batch(rdp);
2292 2315
2293 /* Do any needed deferred wakeups of rcuo kthreads. */ 2316 /* Do any needed deferred wakeups of rcuo kthreads. */
2294 do_nocb_deferred_wakeup(rdp); 2317 do_nocb_deferred_wakeup(rdp);
2295 trace_rcu_utilization(TPS("End RCU core")); 2318 trace_rcu_utilization(TPS("End RCU core"));
2296} 2319}
2297 2320
2321static void rcu_core_si(struct softirq_action *h)
2322{
2323 rcu_core();
2324}
2325
2326static void rcu_wake_cond(struct task_struct *t, int status)
2327{
2328 /*
2329 * If the thread is yielding, only wake it when this
2330 * is invoked from idle
2331 */
2332 if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
2333 wake_up_process(t);
2334}
2335
2336static void invoke_rcu_core_kthread(void)
2337{
2338 struct task_struct *t;
2339 unsigned long flags;
2340
2341 local_irq_save(flags);
2342 __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
2343 t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
2344 if (t != NULL && t != current)
2345 rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
2346 local_irq_restore(flags);
2347}
2348
2298/* 2349/*
2299 * Schedule RCU callback invocation. If the running implementation of RCU 2350 * Wake up this CPU's rcuc kthread to do RCU core processing.
2300 * does not support RCU priority boosting, just do a direct call, otherwise
2301 * wake up the per-CPU kernel kthread. Note that because we are running
2302 * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task
2303 * cannot disappear out from under us.
2304 */ 2351 */
2305static void invoke_rcu_callbacks(struct rcu_data *rdp) 2352static void invoke_rcu_core(void)
2306{ 2353{
2307 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) 2354 if (!cpu_online(smp_processor_id()))
2308 return;
2309 if (likely(!rcu_state.boost)) {
2310 rcu_do_batch(rdp);
2311 return; 2355 return;
2356 if (use_softirq)
2357 raise_softirq(RCU_SOFTIRQ);
2358 else
2359 invoke_rcu_core_kthread();
2360}
2361
2362static void rcu_cpu_kthread_park(unsigned int cpu)
2363{
2364 per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
2365}
2366
2367static int rcu_cpu_kthread_should_run(unsigned int cpu)
2368{
2369 return __this_cpu_read(rcu_data.rcu_cpu_has_work);
2370}
2371
2372/*
2373 * Per-CPU kernel thread that invokes RCU callbacks. This replaces
2374 * the RCU softirq used in configurations of RCU that do not support RCU
2375 * priority boosting.
2376 */
2377static void rcu_cpu_kthread(unsigned int cpu)
2378{
2379 unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
2380 char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
2381 int spincnt;
2382
2383 for (spincnt = 0; spincnt < 10; spincnt++) {
2384 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
2385 local_bh_disable();
2386 *statusp = RCU_KTHREAD_RUNNING;
2387 local_irq_disable();
2388 work = *workp;
2389 *workp = 0;
2390 local_irq_enable();
2391 if (work)
2392 rcu_core();
2393 local_bh_enable();
2394 if (*workp == 0) {
2395 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
2396 *statusp = RCU_KTHREAD_WAITING;
2397 return;
2398 }
2312 } 2399 }
2313 invoke_rcu_callbacks_kthread(); 2400 *statusp = RCU_KTHREAD_YIELDING;
2401 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
2402 schedule_timeout_interruptible(2);
2403 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
2404 *statusp = RCU_KTHREAD_WAITING;
2314} 2405}
2315 2406
2316static void invoke_rcu_core(void) 2407static struct smp_hotplug_thread rcu_cpu_thread_spec = {
2408 .store = &rcu_data.rcu_cpu_kthread_task,
2409 .thread_should_run = rcu_cpu_kthread_should_run,
2410 .thread_fn = rcu_cpu_kthread,
2411 .thread_comm = "rcuc/%u",
2412 .setup = rcu_cpu_kthread_setup,
2413 .park = rcu_cpu_kthread_park,
2414};
2415
2416/*
2417 * Spawn per-CPU RCU core processing kthreads.
2418 */
2419static int __init rcu_spawn_core_kthreads(void)
2317{ 2420{
2318 if (cpu_online(smp_processor_id())) 2421 int cpu;
2319 raise_softirq(RCU_SOFTIRQ); 2422
2423 for_each_possible_cpu(cpu)
2424 per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
2425 if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
2426 return 0;
2427 WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
2428 "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
2429 return 0;
2320} 2430}
2431early_initcall(rcu_spawn_core_kthreads);
2321 2432
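With the new use_softirq switch, RCU core processing is driven either by RCU_SOFTIRQ (the default) or by the per-CPU rcuc kthreads spawned above. A hedged userspace sketch of that decision, with the softirq raise and kthread wakeup reduced to prints and the rcu_wake_cond() yielding check folded in:

#include <stdbool.h>
#include <stdio.h>

enum { KTHREAD_WAITING, KTHREAD_YIELDING };

static bool use_softirq = true;		/* models the rcutree.use_softirq boot parameter */

/* Models invoke_rcu_core(): softirq by default, rcuc kthread otherwise. */
static void invoke_core(bool cpu_online, int kthread_status, bool from_idle)
{
	if (!cpu_online)
		return;
	if (use_softirq) {
		printf("raise RCU_SOFTIRQ\n");
		return;
	}
	/* rcu_wake_cond(): don't disturb a yielding kthread unless called from idle */
	if (kthread_status != KTHREAD_YIELDING || from_idle)
		printf("wake rcuc kthread\n");
}

int main(void)
{
	invoke_core(true, KTHREAD_WAITING, false);	/* raise RCU_SOFTIRQ */
	use_softirq = false;
	invoke_core(true, KTHREAD_WAITING, false);	/* wake rcuc kthread */
	invoke_core(true, KTHREAD_YIELDING, false);	/* nothing: kthread is yielding */
	return 0;
}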
2322/* 2433/*
2323 * Handle any core-RCU processing required by a call_rcu() invocation. 2434 * Handle any core-RCU processing required by a call_rcu() invocation.
@@ -2354,7 +2465,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
2354 rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); 2465 rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
2355 } else { 2466 } else {
2356 /* Give the grace period a kick. */ 2467 /* Give the grace period a kick. */
2357 rdp->blimit = LONG_MAX; 2468 rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
2358 if (rcu_state.n_force_qs == rdp->n_force_qs_snap && 2469 if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
2359 rcu_segcblist_first_pend_cb(&rdp->cblist) != head) 2470 rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
2360 rcu_force_quiescent_state(); 2471 rcu_force_quiescent_state();
@@ -3355,7 +3466,8 @@ void __init rcu_init(void)
3355 rcu_init_one(); 3466 rcu_init_one();
3356 if (dump_tree) 3467 if (dump_tree)
3357 rcu_dump_rcu_node_tree(); 3468 rcu_dump_rcu_node_tree();
3358 open_softirq(RCU_SOFTIRQ, rcu_core); 3469 if (use_softirq)
3470 open_softirq(RCU_SOFTIRQ, rcu_core_si);
3359 3471
3360 /* 3472 /*
3361 * We don't need protection against CPU-hotplug here because 3473 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e253d11af3c4..7acaf3a62d39 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -154,13 +154,15 @@ struct rcu_data {
154 bool core_needs_qs; /* Core waits for quiesc state. */ 154 bool core_needs_qs; /* Core waits for quiesc state. */
155 bool beenonline; /* CPU online at least once. */ 155 bool beenonline; /* CPU online at least once. */
156 bool gpwrap; /* Possible ->gp_seq wrap. */ 156 bool gpwrap; /* Possible ->gp_seq wrap. */
157 bool deferred_qs; /* This CPU awaiting a deferred QS? */ 157 bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
158 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 158 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
159 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 159 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
160 unsigned long ticks_this_gp; /* The number of scheduling-clock */ 160 unsigned long ticks_this_gp; /* The number of scheduling-clock */
161 /* ticks this CPU has handled */ 161 /* ticks this CPU has handled */
162 /* during and after the last grace */ 162 /* during and after the last grace */
163 /* period it is aware of. */ 163 /* period it is aware of. */
164 struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
165 bool defer_qs_iw_pending; /* Scheduler attention pending? */
164 166
165 /* 2) batch handling */ 167 /* 2) batch handling */
166 struct rcu_segcblist cblist; /* Segmented callback list, with */ 168 struct rcu_segcblist cblist; /* Segmented callback list, with */
@@ -407,8 +409,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
407static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); 409static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
408static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 410static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
409static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 411static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
410static void invoke_rcu_callbacks_kthread(void);
411static bool rcu_is_callbacks_kthread(void); 412static bool rcu_is_callbacks_kthread(void);
413static void rcu_cpu_kthread_setup(unsigned int cpu);
412static void __init rcu_spawn_boost_kthreads(void); 414static void __init rcu_spawn_boost_kthreads(void);
413static void rcu_prepare_kthreads(int cpu); 415static void rcu_prepare_kthreads(int cpu);
414static void rcu_cleanup_after_idle(void); 416static void rcu_cleanup_after_idle(void);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 9c990df880d1..af7e7b9c86af 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -250,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
250 */ 250 */
251static void rcu_report_exp_rdp(struct rcu_data *rdp) 251static void rcu_report_exp_rdp(struct rcu_data *rdp)
252{ 252{
253 WRITE_ONCE(rdp->deferred_qs, false); 253 WRITE_ONCE(rdp->exp_deferred_qs, false);
254 rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); 254 rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
255} 255}
256 256
@@ -259,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s)
259{ 259{
260 if (rcu_exp_gp_seq_done(s)) { 260 if (rcu_exp_gp_seq_done(s)) {
261 trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); 261 trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
262 /* Ensure test happens before caller kfree(). */ 262 smp_mb(); /* Ensure test happens before caller kfree(). */
263 smp_mb__before_atomic(); /* ^^^ */
264 return true; 263 return true;
265 } 264 }
266 return false; 265 return false;
@@ -384,7 +383,12 @@ retry_ipi:
384 mask_ofl_test |= mask; 383 mask_ofl_test |= mask;
385 continue; 384 continue;
386 } 385 }
386 if (get_cpu() == cpu) {
387 put_cpu();
388 continue;
389 }
387 ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); 390 ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
391 put_cpu();
388 if (!ret) { 392 if (!ret) {
389 mask_ofl_ipi &= ~mask; 393 mask_ofl_ipi &= ~mask;
390 continue; 394 continue;
@@ -611,7 +615,7 @@ static void rcu_exp_handler(void *unused)
611 rcu_dynticks_curr_cpu_in_eqs()) { 615 rcu_dynticks_curr_cpu_in_eqs()) {
612 rcu_report_exp_rdp(rdp); 616 rcu_report_exp_rdp(rdp);
613 } else { 617 } else {
614 rdp->deferred_qs = true; 618 rdp->exp_deferred_qs = true;
615 set_tsk_need_resched(t); 619 set_tsk_need_resched(t);
616 set_preempt_need_resched(); 620 set_preempt_need_resched();
617 } 621 }
@@ -633,7 +637,7 @@ static void rcu_exp_handler(void *unused)
633 if (t->rcu_read_lock_nesting > 0) { 637 if (t->rcu_read_lock_nesting > 0) {
634 raw_spin_lock_irqsave_rcu_node(rnp, flags); 638 raw_spin_lock_irqsave_rcu_node(rnp, flags);
635 if (rnp->expmask & rdp->grpmask) { 639 if (rnp->expmask & rdp->grpmask) {
636 rdp->deferred_qs = true; 640 rdp->exp_deferred_qs = true;
637 t->rcu_read_unlock_special.b.exp_hint = true; 641 t->rcu_read_unlock_special.b.exp_hint = true;
638 } 642 }
639 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 643 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -656,7 +660,7 @@ static void rcu_exp_handler(void *unused)
656 * 660 *
657 * Otherwise, force a context switch after the CPU enables everything. 661 * Otherwise, force a context switch after the CPU enables everything.
658 */ 662 */
659 rdp->deferred_qs = true; 663 rdp->exp_deferred_qs = true;
660 if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || 664 if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
661 WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { 665 WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) {
662 rcu_preempt_deferred_qs(t); 666 rcu_preempt_deferred_qs(t);
@@ -694,6 +698,16 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
694 698
695#else /* #ifdef CONFIG_PREEMPT_RCU */ 699#else /* #ifdef CONFIG_PREEMPT_RCU */
696 700
701/* Request an expedited quiescent state. */
702static void rcu_exp_need_qs(void)
703{
704 __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
705 /* Store .exp before .rcu_urgent_qs. */
706 smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
707 set_tsk_need_resched(current);
708 set_preempt_need_resched();
709}
710
697/* Invoked on each online non-idle CPU for expedited quiescent state. */ 711/* Invoked on each online non-idle CPU for expedited quiescent state. */
698static void rcu_exp_handler(void *unused) 712static void rcu_exp_handler(void *unused)
699{ 713{
@@ -709,25 +723,38 @@ static void rcu_exp_handler(void *unused)
709 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); 723 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
710 return; 724 return;
711 } 725 }
712 __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); 726 rcu_exp_need_qs();
713 /* Store .exp before .rcu_urgent_qs. */
714 smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
715 set_tsk_need_resched(current);
716 set_preempt_need_resched();
717} 727}
718 728
719/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ 729/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
720static void sync_sched_exp_online_cleanup(int cpu) 730static void sync_sched_exp_online_cleanup(int cpu)
721{ 731{
732 unsigned long flags;
733 int my_cpu;
722 struct rcu_data *rdp; 734 struct rcu_data *rdp;
723 int ret; 735 int ret;
724 struct rcu_node *rnp; 736 struct rcu_node *rnp;
725 737
726 rdp = per_cpu_ptr(&rcu_data, cpu); 738 rdp = per_cpu_ptr(&rcu_data, cpu);
727 rnp = rdp->mynode; 739 rnp = rdp->mynode;
728 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) 740 my_cpu = get_cpu();
741 /* Quiescent state either not needed or already requested, leave. */
742 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
743 __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) {
744 put_cpu();
745 return;
746 }
747 /* Quiescent state needed on current CPU, so set it up locally. */
748 if (my_cpu == cpu) {
749 local_irq_save(flags);
750 rcu_exp_need_qs();
751 local_irq_restore(flags);
752 put_cpu();
729 return; 753 return;
754 }
755 /* Quiescent state needed on some other CPU, send IPI. */
730 ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); 756 ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
757 put_cpu();
731 WARN_ON_ONCE(ret); 758 WARN_ON_ONCE(ret);
732} 759}
733 760
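The reworked sync_sched_exp_online_cleanup() now has three outcomes rather than two. A small model of that choice; the predicates are passed in as plain booleans and nothing here is a kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Models the three outcomes of sync_sched_exp_online_cleanup() above. */
static void exp_online_cleanup(bool qs_needed, bool qs_already_requested,
			       int my_cpu, int target_cpu)
{
	if (!qs_needed || qs_already_requested) {
		printf("cpu %d: nothing to do\n", target_cpu);
		return;
	}
	if (my_cpu == target_cpu) {
		printf("cpu %d: request expedited QS locally\n", target_cpu);
		return;
	}
	printf("cpu %d: send IPI to run rcu_exp_handler()\n", target_cpu);
}

int main(void)
{
	exp_online_cleanup(false, false, 0, 1);	/* nothing to do */
	exp_online_cleanup(true, false, 1, 1);	/* current CPU: local request */
	exp_online_cleanup(true, false, 0, 1);	/* other CPU: IPI */
	return 0;
}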
@@ -765,7 +792,6 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
765 */ 792 */
766void synchronize_rcu_expedited(void) 793void synchronize_rcu_expedited(void)
767{ 794{
768 struct rcu_data *rdp;
769 struct rcu_exp_work rew; 795 struct rcu_exp_work rew;
770 struct rcu_node *rnp; 796 struct rcu_node *rnp;
771 unsigned long s; 797 unsigned long s;
@@ -802,7 +828,6 @@ void synchronize_rcu_expedited(void)
802 } 828 }
803 829
804 /* Wait for expedited grace period to complete. */ 830 /* Wait for expedited grace period to complete. */
805 rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
806 rnp = rcu_get_root(); 831 rnp = rcu_get_root();
807 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], 832 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
808 sync_exp_work_done(s)); 833 sync_exp_work_done(s));
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1102765f91fd..acb225023ed1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -11,29 +11,7 @@
11 * Paul E. McKenney <paulmck@linux.ibm.com> 11 * Paul E. McKenney <paulmck@linux.ibm.com>
12 */ 12 */
13 13
14#include <linux/delay.h>
15#include <linux/gfp.h>
16#include <linux/oom.h>
17#include <linux/sched/debug.h>
18#include <linux/smpboot.h>
19#include <linux/sched/isolation.h>
20#include <uapi/linux/sched/types.h>
21#include "../time/tick-internal.h"
22
23#ifdef CONFIG_RCU_BOOST
24#include "../locking/rtmutex_common.h" 14#include "../locking/rtmutex_common.h"
25#else /* #ifdef CONFIG_RCU_BOOST */
26
27/*
28 * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
29 * all uses are in dead code. Provide a definition to keep the compiler
30 * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
31 * This probably needs to be excluded from -rt builds.
32 */
33#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
34#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
35
36#endif /* #else #ifdef CONFIG_RCU_BOOST */
37 15
38#ifdef CONFIG_RCU_NOCB_CPU 16#ifdef CONFIG_RCU_NOCB_CPU
39static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ 17static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void)
94 pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); 72 pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
95 if (gp_cleanup_delay) 73 if (gp_cleanup_delay)
 96 pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); 74 pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
75 if (!use_softirq)
76 pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
97 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) 77 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
98 pr_info("\tRCU debug extended QS entry/exit.\n"); 78 pr_info("\tRCU debug extended QS entry/exit.\n");
99 rcupdate_announce_bootup_oddness(); 79 rcupdate_announce_bootup_oddness();
@@ -257,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
257 * no need to check for a subsequent expedited GP. (Though we are 237 * no need to check for a subsequent expedited GP. (Though we are
258 * still in a quiescent state in any case.) 238 * still in a quiescent state in any case.)
259 */ 239 */
260 if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) 240 if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
261 rcu_report_exp_rdp(rdp); 241 rcu_report_exp_rdp(rdp);
262 else 242 else
263 WARN_ON_ONCE(rdp->deferred_qs); 243 WARN_ON_ONCE(rdp->exp_deferred_qs);
264} 244}
265 245
266/* 246/*
@@ -357,7 +337,7 @@ void rcu_note_context_switch(bool preempt)
357 * means that we continue to block the current grace period. 337 * means that we continue to block the current grace period.
358 */ 338 */
359 rcu_qs(); 339 rcu_qs();
360 if (rdp->deferred_qs) 340 if (rdp->exp_deferred_qs)
361 rcu_report_exp_rdp(rdp); 341 rcu_report_exp_rdp(rdp);
362 trace_rcu_utilization(TPS("End context switch")); 342 trace_rcu_utilization(TPS("End context switch"));
363 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 343 barrier(); /* Avoid RCU read-side critical sections leaking up. */
@@ -471,14 +451,15 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
471 */ 451 */
472 special = t->rcu_read_unlock_special; 452 special = t->rcu_read_unlock_special;
473 rdp = this_cpu_ptr(&rcu_data); 453 rdp = this_cpu_ptr(&rcu_data);
474 if (!special.s && !rdp->deferred_qs) { 454 if (!special.s && !rdp->exp_deferred_qs) {
475 local_irq_restore(flags); 455 local_irq_restore(flags);
476 return; 456 return;
477 } 457 }
458 t->rcu_read_unlock_special.b.deferred_qs = false;
478 if (special.b.need_qs) { 459 if (special.b.need_qs) {
479 rcu_qs(); 460 rcu_qs();
480 t->rcu_read_unlock_special.b.need_qs = false; 461 t->rcu_read_unlock_special.b.need_qs = false;
481 if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { 462 if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) {
482 local_irq_restore(flags); 463 local_irq_restore(flags);
483 return; 464 return;
484 } 465 }
@@ -490,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
490 * tasks are handled when removing the task from the 471 * tasks are handled when removing the task from the
491 * blocked-tasks list below. 472 * blocked-tasks list below.
492 */ 473 */
493 if (rdp->deferred_qs) { 474 if (rdp->exp_deferred_qs) {
494 rcu_report_exp_rdp(rdp); 475 rcu_report_exp_rdp(rdp);
495 if (!t->rcu_read_unlock_special.s) { 476 if (!t->rcu_read_unlock_special.s) {
496 local_irq_restore(flags); 477 local_irq_restore(flags);
@@ -579,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
579 */ 560 */
580static bool rcu_preempt_need_deferred_qs(struct task_struct *t) 561static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
581{ 562{
582 return (__this_cpu_read(rcu_data.deferred_qs) || 563 return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
583 READ_ONCE(t->rcu_read_unlock_special.s)) && 564 READ_ONCE(t->rcu_read_unlock_special.s)) &&
584 t->rcu_read_lock_nesting <= 0; 565 t->rcu_read_lock_nesting <= 0;
585} 566}
@@ -607,6 +588,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t)
607} 588}
608 589
609/* 590/*
591 * Minimal handler to give the scheduler a chance to re-evaluate.
592 */
593static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
594{
595 struct rcu_data *rdp;
596
597 rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
598 rdp->defer_qs_iw_pending = false;
599}
600
601/*
610 * Handle special cases during rcu_read_unlock(), such as needing to 602 * Handle special cases during rcu_read_unlock(), such as needing to
611 * notify RCU core processing or task having blocked during the RCU 603 * notify RCU core processing or task having blocked during the RCU
612 * read-side critical section. 604 * read-side critical section.
@@ -625,16 +617,41 @@ static void rcu_read_unlock_special(struct task_struct *t)
625 local_irq_save(flags); 617 local_irq_save(flags);
626 irqs_were_disabled = irqs_disabled_flags(flags); 618 irqs_were_disabled = irqs_disabled_flags(flags);
627 if (preempt_bh_were_disabled || irqs_were_disabled) { 619 if (preempt_bh_were_disabled || irqs_were_disabled) {
628 WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); 620 bool exp;
629 /* Need to defer quiescent state until everything is enabled. */ 621 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
630 if (irqs_were_disabled) { 622 struct rcu_node *rnp = rdp->mynode;
631 /* Enabling irqs does not reschedule, so... */ 623
624 t->rcu_read_unlock_special.b.exp_hint = false;
625 exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
626 (rdp->grpmask & rnp->expmask) ||
627 tick_nohz_full_cpu(rdp->cpu);
628 // Need to defer quiescent state until everything is enabled.
629 if ((exp || in_irq()) && irqs_were_disabled && use_softirq &&
630 (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
631 // Using softirq, safe to awaken, and we get
632 // no help from enabling irqs, unlike bh/preempt.
632 raise_softirq_irqoff(RCU_SOFTIRQ); 633 raise_softirq_irqoff(RCU_SOFTIRQ);
634 } else if (exp && irqs_were_disabled && !use_softirq &&
635 !t->rcu_read_unlock_special.b.deferred_qs) {
636 // Safe to awaken and we get no help from enabling
637 // irqs, unlike bh/preempt.
638 invoke_rcu_core();
633 } else { 639 } else {
634 /* Enabling BH or preempt does reschedule, so... */ 640 // Enabling BH or preempt does reschedule, so...
641 // Also if no expediting or NO_HZ_FULL, slow is OK.
635 set_tsk_need_resched(current); 642 set_tsk_need_resched(current);
636 set_preempt_need_resched(); 643 set_preempt_need_resched();
644 if (IS_ENABLED(CONFIG_IRQ_WORK) &&
645 !rdp->defer_qs_iw_pending && exp) {
646 // Get scheduler to re-evaluate and call hooks.
647 // If !IRQ_WORK, FQS scan will eventually IPI.
648 init_irq_work(&rdp->defer_qs_iw,
649 rcu_preempt_deferred_qs_handler);
650 rdp->defer_qs_iw_pending = true;
651 irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
652 }
637 } 653 }
654 t->rcu_read_unlock_special.b.deferred_qs = true;
638 local_irq_restore(flags); 655 local_irq_restore(flags);
639 return; 656 return;
640 } 657 }
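The deferral choice above is the subtle part of this hunk. The sketch below reproduces just the boolean logic deciding between raising RCU_SOFTIRQ, waking the rcuc kthread via invoke_rcu_core(), and falling back to need_resched plus (optionally) irq_work; inputs are supplied as plain flags purely for illustration.

#include <stdbool.h>
#include <stdio.h>

/*
 * Models the choice made in rcu_read_unlock_special() when the unlock
 * happens with preemption/BH/irqs disabled.  "exp" stands for "an
 * expedited GP (or nohz_full) is waiting on this CPU/task".
 */
static const char *defer_choice(bool exp, bool in_irq, bool irqs_disabled,
				bool use_softirq, bool deferred_qs)
{
	if ((exp || in_irq) && irqs_disabled && use_softirq &&
	    (in_irq || !deferred_qs))
		return "raise RCU_SOFTIRQ";
	if (exp && irqs_disabled && !use_softirq && !deferred_qs)
		return "invoke_rcu_core() -> wake rcuc";
	return "set need_resched (+ irq_work if expedited)";
}

int main(void)
{
	printf("%s\n", defer_choice(true, true, true, true, false));
	printf("%s\n", defer_choice(true, false, true, false, false));
	printf("%s\n", defer_choice(false, false, false, true, false));
	return 0;
}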
@@ -760,7 +777,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
760 i = 0; 777 i = 0;
761 list_for_each(lhp, &rnp->blkd_tasks) { 778 list_for_each(lhp, &rnp->blkd_tasks) {
762 pr_cont(" %p", lhp); 779 pr_cont(" %p", lhp);
763 if (++i >= 10) 780 if (++i >= ncheck)
764 break; 781 break;
765 } 782 }
766 pr_cont("\n"); 783 pr_cont("\n");
@@ -944,18 +961,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
944 961
945#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 962#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
946 963
964/*
965 * If boosting, set rcuc kthreads to realtime priority.
966 */
967static void rcu_cpu_kthread_setup(unsigned int cpu)
968{
947#ifdef CONFIG_RCU_BOOST 969#ifdef CONFIG_RCU_BOOST
970 struct sched_param sp;
948 971
949static void rcu_wake_cond(struct task_struct *t, int status) 972 sp.sched_priority = kthread_prio;
950{ 973 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
951 /* 974#endif /* #ifdef CONFIG_RCU_BOOST */
952 * If the thread is yielding, only wake it when this
953 * is invoked from idle
954 */
955 if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
956 wake_up_process(t);
957} 975}
958 976
977#ifdef CONFIG_RCU_BOOST
978
959/* 979/*
960 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 980 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
961 * or ->boost_tasks, advancing the pointer to the next task in the 981 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1091,23 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1091} 1111}
1092 1112
1093/* 1113/*
1094 * Wake up the per-CPU kthread to invoke RCU callbacks.
1095 */
1096static void invoke_rcu_callbacks_kthread(void)
1097{
1098 unsigned long flags;
1099
1100 local_irq_save(flags);
1101 __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
1102 if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL &&
1103 current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) {
1104 rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task),
1105 __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
1106 }
1107 local_irq_restore(flags);
1108}
1109
1110/*
1111 * Is the current CPU running the RCU-callbacks kthread? 1114 * Is the current CPU running the RCU-callbacks kthread?
1112 * Caller must have preemption disabled. 1115 * Caller must have preemption disabled.
1113 */ 1116 */
@@ -1160,59 +1163,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
1160 return 0; 1163 return 0;
1161} 1164}
1162 1165
1163static void rcu_cpu_kthread_setup(unsigned int cpu)
1164{
1165 struct sched_param sp;
1166
1167 sp.sched_priority = kthread_prio;
1168 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1169}
1170
1171static void rcu_cpu_kthread_park(unsigned int cpu)
1172{
1173 per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1174}
1175
1176static int rcu_cpu_kthread_should_run(unsigned int cpu)
1177{
1178 return __this_cpu_read(rcu_data.rcu_cpu_has_work);
1179}
1180
1181/*
1182 * Per-CPU kernel thread that invokes RCU callbacks. This replaces
1183 * the RCU softirq used in configurations of RCU that do not support RCU
1184 * priority boosting.
1185 */
1186static void rcu_cpu_kthread(unsigned int cpu)
1187{
1188 unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
1189 char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
1190 int spincnt;
1191
1192 for (spincnt = 0; spincnt < 10; spincnt++) {
1193 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
1194 local_bh_disable();
1195 *statusp = RCU_KTHREAD_RUNNING;
1196 local_irq_disable();
1197 work = *workp;
1198 *workp = 0;
1199 local_irq_enable();
1200 if (work)
1201 rcu_do_batch(this_cpu_ptr(&rcu_data));
1202 local_bh_enable();
1203 if (*workp == 0) {
1204 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
1205 *statusp = RCU_KTHREAD_WAITING;
1206 return;
1207 }
1208 }
1209 *statusp = RCU_KTHREAD_YIELDING;
1210 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
1211 schedule_timeout_interruptible(2);
1212 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
1213 *statusp = RCU_KTHREAD_WAITING;
1214}
1215
1216/* 1166/*
1217 * Set the per-rcu_node kthread's affinity to cover all CPUs that are 1167 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1218 * served by the rcu_node in question. The CPU hotplug lock is still 1168 * served by the rcu_node in question. The CPU hotplug lock is still
@@ -1243,27 +1193,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1243 free_cpumask_var(cm); 1193 free_cpumask_var(cm);
1244} 1194}
1245 1195
1246static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1247 .store = &rcu_data.rcu_cpu_kthread_task,
1248 .thread_should_run = rcu_cpu_kthread_should_run,
1249 .thread_fn = rcu_cpu_kthread,
1250 .thread_comm = "rcuc/%u",
1251 .setup = rcu_cpu_kthread_setup,
1252 .park = rcu_cpu_kthread_park,
1253};
1254
1255/* 1196/*
1256 * Spawn boost kthreads -- called as soon as the scheduler is running. 1197 * Spawn boost kthreads -- called as soon as the scheduler is running.
1257 */ 1198 */
1258static void __init rcu_spawn_boost_kthreads(void) 1199static void __init rcu_spawn_boost_kthreads(void)
1259{ 1200{
1260 struct rcu_node *rnp; 1201 struct rcu_node *rnp;
1261 int cpu;
1262 1202
1263 for_each_possible_cpu(cpu)
1264 per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
1265 if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
1266 return;
1267 rcu_for_each_leaf_node(rnp) 1203 rcu_for_each_leaf_node(rnp)
1268 (void)rcu_spawn_one_boost_kthread(rnp); 1204 (void)rcu_spawn_one_boost_kthread(rnp);
1269} 1205}
@@ -1286,11 +1222,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1286 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1222 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1287} 1223}
1288 1224
1289static void invoke_rcu_callbacks_kthread(void)
1290{
1291 WARN_ON_ONCE(1);
1292}
1293
1294static bool rcu_is_callbacks_kthread(void) 1225static bool rcu_is_callbacks_kthread(void)
1295{ 1226{
1296 return false; 1227 return false;
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index f65a73a97323..065183391f75 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -630,7 +630,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
630 time_before(j, rcu_state.gp_req_activity + gpssdelay) || 630 time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
631 time_before(j, rcu_state.gp_activity + gpssdelay) || 631 time_before(j, rcu_state.gp_activity + gpssdelay) ||
632 atomic_xchg(&warned, 1)) { 632 atomic_xchg(&warned, 1)) {
633 raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ 633 if (rnp_root != rnp)
634 /* irqs remain disabled. */
635 raw_spin_unlock_rcu_node(rnp_root);
634 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 636 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
635 return; 637 return;
636 } 638 }
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c3bf44ba42e5..61df2bf08563 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -423,6 +423,19 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
423 do { } while (0) 423 do { } while (0)
424#endif 424#endif
425 425
426#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
427/* Get rcutorture access to sched_setaffinity(). */
428long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
429{
430 int ret;
431
432 ret = sched_setaffinity(pid, in_mask);
433 WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
434 return ret;
435}
436EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
437#endif
438
426#ifdef CONFIG_RCU_STALL_COMMON 439#ifdef CONFIG_RCU_STALL_COMMON
427int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 440int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
428EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); 441EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index e1b79b6a2735..c4d472b7f1b4 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/reboot.c 3 * linux/kernel/reboot.c
3 * 4 *
@@ -31,6 +32,7 @@ EXPORT_SYMBOL(cad_pid);
31#define DEFAULT_REBOOT_MODE 32#define DEFAULT_REBOOT_MODE
32#endif 33#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; 34enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
35enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
34 36
35/* 37/*
36 * This variable is used privately to keep track of whether or not 38 * This variable is used privately to keep track of whether or not
@@ -519,6 +521,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
519static int __init reboot_setup(char *str) 521static int __init reboot_setup(char *str)
520{ 522{
521 for (;;) { 523 for (;;) {
524 enum reboot_mode *mode;
525
522 /* 526 /*
523 * Having anything passed on the command line via 527 * Having anything passed on the command line via
524 * reboot= will cause us to disable DMI checking 528 * reboot= will cause us to disable DMI checking
@@ -526,17 +530,24 @@ static int __init reboot_setup(char *str)
526 */ 530 */
527 reboot_default = 0; 531 reboot_default = 0;
528 532
533 if (!strncmp(str, "panic_", 6)) {
534 mode = &panic_reboot_mode;
535 str += 6;
536 } else {
537 mode = &reboot_mode;
538 }
539
529 switch (*str) { 540 switch (*str) {
530 case 'w': 541 case 'w':
531 reboot_mode = REBOOT_WARM; 542 *mode = REBOOT_WARM;
532 break; 543 break;
533 544
534 case 'c': 545 case 'c':
535 reboot_mode = REBOOT_COLD; 546 *mode = REBOOT_COLD;
536 break; 547 break;
537 548
538 case 'h': 549 case 'h':
539 reboot_mode = REBOOT_HARD; 550 *mode = REBOOT_HARD;
540 break; 551 break;
541 552
542 case 's': 553 case 's':
@@ -553,11 +564,11 @@ static int __init reboot_setup(char *str)
553 if (rc) 564 if (rc)
554 return rc; 565 return rc;
555 } else 566 } else
556 reboot_mode = REBOOT_SOFT; 567 *mode = REBOOT_SOFT;
557 break; 568 break;
558 } 569 }
559 case 'g': 570 case 'g':
560 reboot_mode = REBOOT_GPIO; 571 *mode = REBOOT_GPIO;
561 break; 572 break;
562 573
563 case 'b': 574 case 'b':
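To see the new panic_ prefix in action, here is a standalone sketch of the parsing added to reboot_setup() above, with the modes reduced to strings and only the w/c/h cases modeled (the real parser also handles 's', 'g', 'b' and comma-separated lists).

#include <stdio.h>
#include <string.h>

static const char *reboot_mode = "undefined";
static const char *panic_reboot_mode = "undefined";

/* Models the panic_ prefix handling added to reboot_setup() above. */
static void parse_one(const char *str)
{
	const char **mode;

	if (!strncmp(str, "panic_", 6)) {
		mode = &panic_reboot_mode;
		str += 6;
	} else {
		mode = &reboot_mode;
	}

	switch (*str) {
	case 'w': *mode = "warm"; break;
	case 'c': *mode = "cold"; break;
	case 'h': *mode = "hard"; break;
	}
}

int main(void)
{
	parse_one("warm");		/* reboot=warm */
	parse_one("panic_cold");	/* reboot=panic_cold */
	printf("reboot_mode=%s panic_reboot_mode=%s\n",
	       reboot_mode, panic_reboot_mode);
	return 0;
}

So reboot=warm keeps its old meaning, while reboot=panic_cold only affects the mode used when rebooting after a panic.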
diff --git a/kernel/resource.c b/kernel/resource.c
index 8c15f846e8ef..158f04ec1d4f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/resource.c 3 * linux/kernel/resource.c
3 * 4 *
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 9424ee90589e..27c48eb7de40 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -277,7 +277,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
277 277
278error: 278error:
279 sig = ksig ? ksig->sig : 0; 279 sig = ksig ? ksig->sig : 0;
280 force_sigsegv(sig, t); 280 force_sigsegv(sig);
281} 281}
282 282
283#ifdef CONFIG_DEBUG_RSEQ 283#ifdef CONFIG_DEBUG_RSEQ
@@ -296,7 +296,7 @@ void rseq_syscall(struct pt_regs *regs)
296 return; 296 return;
297 if (!access_ok(t->rseq, sizeof(*t->rseq)) || 297 if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
298 rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) 298 rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
299 force_sig(SIGSEGV, t); 299 force_sig(SIGSEGV);
300} 300}
301 301
302#endif 302#endif
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2d4ff5353ded..2067080bb235 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -259,7 +259,6 @@ out:
259} 259}
260#endif /* CONFIG_PROC_FS */ 260#endif /* CONFIG_PROC_FS */
261 261
262#ifdef CONFIG_SCHED_DEBUG
263int autogroup_path(struct task_group *tg, char *buf, int buflen) 262int autogroup_path(struct task_group *tg, char *buf, int buflen)
264{ 263{
265 if (!task_group_is_autogroup(tg)) 264 if (!task_group_is_autogroup(tg))
@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
267 266
268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 267 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
269} 268}
270#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e3e3b979f9bd..1152259a4ca0 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * sched_clock() for unstable CPU clocks 3 * sched_clock() for unstable CPU clocks
3 * 4 *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 102dfcf0a29a..fa43ce3962e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/sched/core.c 3 * kernel/sched/core.c
3 * 4 *
@@ -22,6 +23,17 @@
22#define CREATE_TRACE_POINTS 23#define CREATE_TRACE_POINTS
23#include <trace/events/sched.h> 24#include <trace/events/sched.h>
24 25
26/*
 27 * Export tracepoints that act as bare tracehooks (i.e., have no trace event
28 * associated with them) to allow external modules to probe them.
29 */
30EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
31EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
32EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
33EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
34EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
35EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
36
25DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 37DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
26 38
27#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) 39#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
@@ -760,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load)
760 } 772 }
761} 773}
762 774
775#ifdef CONFIG_UCLAMP_TASK
776/* Max allowed minimum utilization */
777unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
778
779/* Max allowed maximum utilization */
780unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
781
 782/* All clamps are required to be less than or equal to these values */
783static struct uclamp_se uclamp_default[UCLAMP_CNT];
784
785/* Integer rounded range for each bucket */
786#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
787
788#define for_each_clamp_id(clamp_id) \
789 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
790
791static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
792{
793 return clamp_value / UCLAMP_BUCKET_DELTA;
794}
795
796static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
797{
798 return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
799}
800
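The bucketing arithmetic is easier to follow with numbers. The sketch below assumes SCHED_CAPACITY_SCALE of 1024 and a bucket count of 5 (both assumptions made for the example, not taken from this patch) and reproduces uclamp_bucket_id()/uclamp_bucket_base_value():

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024	/* assumed for the example */
#define UCLAMP_BUCKETS		5	/* assumed bucket count */
/* DIV_ROUND_CLOSEST(1024, 5) == 205 */
#define UCLAMP_BUCKET_DELTA \
	((SCHED_CAPACITY_SCALE + UCLAMP_BUCKETS / 2) / UCLAMP_BUCKETS)

static unsigned int bucket_id(unsigned int clamp_value)
{
	return clamp_value / UCLAMP_BUCKET_DELTA;
}

int main(void)
{
	unsigned int v[] = { 0, 204, 205, 512, 1024 };

	for (unsigned int i = 0; i < sizeof(v) / sizeof(v[0]); i++)
		printf("clamp %4u -> bucket %u (base %u)\n",
		       v[i], bucket_id(v[i]),
		       UCLAMP_BUCKET_DELTA * bucket_id(v[i]));
	return 0;
}

With these assumed values the bucket delta is 205, so clamp 204 still lands in bucket 0 while 205 starts bucket 1 and 1024 ends up in the top bucket (4).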
801static inline unsigned int uclamp_none(int clamp_id)
802{
803 if (clamp_id == UCLAMP_MIN)
804 return 0;
805 return SCHED_CAPACITY_SCALE;
806}
807
808static inline void uclamp_se_set(struct uclamp_se *uc_se,
809 unsigned int value, bool user_defined)
810{
811 uc_se->value = value;
812 uc_se->bucket_id = uclamp_bucket_id(value);
813 uc_se->user_defined = user_defined;
814}
815
816static inline unsigned int
817uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
818 unsigned int clamp_value)
819{
820 /*
821 * Avoid blocked utilization pushing up the frequency when we go
822 * idle (which drops the max-clamp) by retaining the last known
823 * max-clamp.
824 */
825 if (clamp_id == UCLAMP_MAX) {
826 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
827 return clamp_value;
828 }
829
830 return uclamp_none(UCLAMP_MIN);
831}
832
833static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
834 unsigned int clamp_value)
835{
836 /* Reset max-clamp retention only on idle exit */
837 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
838 return;
839
840 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
841}
842
843static inline
844unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
845 unsigned int clamp_value)
846{
847 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
848 int bucket_id = UCLAMP_BUCKETS - 1;
849
850 /*
851 * Since both min and max clamps are max aggregated, find the
 852 * topmost bucket with tasks in it.
853 */
854 for ( ; bucket_id >= 0; bucket_id--) {
855 if (!bucket[bucket_id].tasks)
856 continue;
857 return bucket[bucket_id].value;
858 }
859
860 /* No tasks -- default clamp values */
861 return uclamp_idle_value(rq, clamp_id, clamp_value);
862}
863
864/*
865 * The effective clamp bucket index of a task depends on, by increasing
866 * priority:
867 * - the task specific clamp value, when explicitly requested from userspace
868 * - the system default clamp value, defined by the sysadmin
869 */
870static inline struct uclamp_se
871uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
872{
873 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
874 struct uclamp_se uc_max = uclamp_default[clamp_id];
875
876 /* System default restrictions always apply */
877 if (unlikely(uc_req.value > uc_max.value))
878 return uc_max;
879
880 return uc_req;
881}
882
883unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
884{
885 struct uclamp_se uc_eff;
886
887 /* Task currently refcounted: use back-annotated (effective) value */
888 if (p->uclamp[clamp_id].active)
889 return p->uclamp[clamp_id].value;
890
891 uc_eff = uclamp_eff_get(p, clamp_id);
892
893 return uc_eff.value;
894}
895
896/*
897 * When a task is enqueued on a rq, the clamp bucket currently defined by the
898 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
899 * updates the rq's clamp value if required.
900 *
 901 * Tasks can have a task-specific value requested from user-space; track
 902 * within each bucket the maximum value for the tasks refcounted in it.
 903 * This "local max aggregation" makes it possible to track the exact "requested"
 904 * value for each bucket when all of its RUNNABLE tasks require the same clamp.
905 */
906static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
907 unsigned int clamp_id)
908{
909 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
910 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
911 struct uclamp_bucket *bucket;
912
913 lockdep_assert_held(&rq->lock);
914
915 /* Update task effective clamp */
916 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
917
918 bucket = &uc_rq->bucket[uc_se->bucket_id];
919 bucket->tasks++;
920 uc_se->active = true;
921
922 uclamp_idle_reset(rq, clamp_id, uc_se->value);
923
924 /*
925 * Local max aggregation: rq buckets always track the max
926 * "requested" clamp value of its RUNNABLE tasks.
927 */
928 if (bucket->tasks == 1 || uc_se->value > bucket->value)
929 bucket->value = uc_se->value;
930
931 if (uc_se->value > READ_ONCE(uc_rq->value))
932 WRITE_ONCE(uc_rq->value, uc_se->value);
933}
934
935/*
936 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
937 * is released. If this is the last task reference counting the rq's max
938 * active clamp value, then the rq's clamp value is updated.
939 *
940 * Both refcounted tasks and rq's cached clamp values are expected to be
941 * always valid. If it's detected they are not, as defensive programming,
942 * enforce the expected state and warn.
943 */
944static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
945 unsigned int clamp_id)
946{
947 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
948 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
949 struct uclamp_bucket *bucket;
950 unsigned int bkt_clamp;
951 unsigned int rq_clamp;
952
953 lockdep_assert_held(&rq->lock);
954
955 bucket = &uc_rq->bucket[uc_se->bucket_id];
956 SCHED_WARN_ON(!bucket->tasks);
957 if (likely(bucket->tasks))
958 bucket->tasks--;
959 uc_se->active = false;
960
961 /*
 962 * Keep "local max aggregation" simple and accept that some RUNNABLE
 963 * tasks in the same bucket may (possibly) be overboosted.
964 * The rq clamp bucket value is reset to its base value whenever
965 * there are no more RUNNABLE tasks refcounting it.
966 */
967 if (likely(bucket->tasks))
968 return;
969
970 rq_clamp = READ_ONCE(uc_rq->value);
971 /*
972 * Defensive programming: this should never happen. If it happens,
973 * e.g. due to future modification, warn and fixup the expected value.
974 */
975 SCHED_WARN_ON(bucket->value > rq_clamp);
976 if (bucket->value >= rq_clamp) {
977 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
978 WRITE_ONCE(uc_rq->value, bkt_clamp);
979 }
980}
981
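When the last task in the highest bucket is dequeued, the rq-wide clamp must fall back to the next non-empty bucket, as described in the comments above. A userspace model with made-up bucket contents:

#include <stdio.h>

#define NBUCKETS 5

struct bucket { unsigned int tasks, value; };

/* Models uclamp_rq_max_value(): the highest non-empty bucket wins. */
static unsigned int rq_max_value(const struct bucket *b, unsigned int idle_value)
{
	for (int i = NBUCKETS - 1; i >= 0; i--)
		if (b[i].tasks)
			return b[i].value;
	return idle_value;	/* no tasks: fall back to the default/idle value */
}

int main(void)
{
	struct bucket b[NBUCKETS] = {
		[1] = { .tasks = 2, .value = 300 },
		[4] = { .tasks = 1, .value = 900 },
	};
	unsigned int rq_clamp = 900;

	/* Dequeue the only task in bucket 4: the rq clamp must drop. */
	b[4].tasks--;
	if (b[4].value >= rq_clamp)
		rq_clamp = rq_max_value(b, 0);
	printf("rq clamp after dequeue: %u\n", rq_clamp);	/* prints 300 */
	return 0;
}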
982static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
983{
984 unsigned int clamp_id;
985
986 if (unlikely(!p->sched_class->uclamp_enabled))
987 return;
988
989 for_each_clamp_id(clamp_id)
990 uclamp_rq_inc_id(rq, p, clamp_id);
991
992 /* Reset clamp idle holding when there is one RUNNABLE task */
993 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
994 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
995}
996
997static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
998{
999 unsigned int clamp_id;
1000
1001 if (unlikely(!p->sched_class->uclamp_enabled))
1002 return;
1003
1004 for_each_clamp_id(clamp_id)
1005 uclamp_rq_dec_id(rq, p, clamp_id);
1006}
1007
1008int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1009 void __user *buffer, size_t *lenp,
1010 loff_t *ppos)
1011{
1012 int old_min, old_max;
1013 static DEFINE_MUTEX(mutex);
1014 int result;
1015
1016 mutex_lock(&mutex);
1017 old_min = sysctl_sched_uclamp_util_min;
1018 old_max = sysctl_sched_uclamp_util_max;
1019
1020 result = proc_dointvec(table, write, buffer, lenp, ppos);
1021 if (result)
1022 goto undo;
1023 if (!write)
1024 goto done;
1025
1026 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1027 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1028 result = -EINVAL;
1029 goto undo;
1030 }
1031
1032 if (old_min != sysctl_sched_uclamp_util_min) {
1033 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1034 sysctl_sched_uclamp_util_min, false);
1035 }
1036 if (old_max != sysctl_sched_uclamp_util_max) {
1037 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1038 sysctl_sched_uclamp_util_max, false);
1039 }
1040
1041 /*
 1042 * Updating all the RUNNABLE tasks is expensive; keep it simple and just
 1043 * do a lazy update at each task's next enqueue.
1044 */
1045 goto done;
1046
1047undo:
1048 sysctl_sched_uclamp_util_min = old_min;
1049 sysctl_sched_uclamp_util_max = old_max;
1050done:
1051 mutex_unlock(&mutex);
1052
1053 return result;
1054}
1055
1056static int uclamp_validate(struct task_struct *p,
1057 const struct sched_attr *attr)
1058{
1059 unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1060 unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1061
1062 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1063 lower_bound = attr->sched_util_min;
1064 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1065 upper_bound = attr->sched_util_max;
1066
1067 if (lower_bound > upper_bound)
1068 return -EINVAL;
1069 if (upper_bound > SCHED_CAPACITY_SCALE)
1070 return -EINVAL;
1071
1072 return 0;
1073}
1074
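uclamp_validate() simply enforces min <= max <= SCHED_CAPACITY_SCALE on the (possibly partially updated) request. A simplified sketch that skips the SCHED_FLAG_UTIL_CLAMP_MIN/MAX selection and takes both bounds directly:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024	/* assumed for the example */

/* Models uclamp_validate(): the requested min must not exceed the requested max. */
static int validate(unsigned int util_min, unsigned int util_max)
{
	if (util_min > util_max)
		return -1;		/* -EINVAL in the kernel */
	if (util_max > SCHED_CAPACITY_SCALE)
		return -1;
	return 0;
}

int main(void)
{
	printf("%d\n", validate(200, 800));	/*  0: accepted */
	printf("%d\n", validate(900, 800));	/* -1: min > max */
	printf("%d\n", validate(0, 2048));	/* -1: above capacity scale */
	return 0;
}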
1075static void __setscheduler_uclamp(struct task_struct *p,
1076 const struct sched_attr *attr)
1077{
1078 unsigned int clamp_id;
1079
1080 /*
1081 * On scheduling class change, reset to default clamps for tasks
1082 * without a task-specific value.
1083 */
1084 for_each_clamp_id(clamp_id) {
1085 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1086 unsigned int clamp_value = uclamp_none(clamp_id);
1087
1088 /* Keep using defined clamps across class changes */
1089 if (uc_se->user_defined)
1090 continue;
1091
1092 /* By default, RT tasks always get 100% boost */
1093 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1094 clamp_value = uclamp_none(UCLAMP_MAX);
1095
1096 uclamp_se_set(uc_se, clamp_value, false);
1097 }
1098
1099 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1100 return;
1101
1102 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1103 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1104 attr->sched_util_min, true);
1105 }
1106
1107 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1108 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1109 attr->sched_util_max, true);
1110 }
1111}
1112
1113static void uclamp_fork(struct task_struct *p)
1114{
1115 unsigned int clamp_id;
1116
1117 for_each_clamp_id(clamp_id)
1118 p->uclamp[clamp_id].active = false;
1119
1120 if (likely(!p->sched_reset_on_fork))
1121 return;
1122
1123 for_each_clamp_id(clamp_id) {
1124 unsigned int clamp_value = uclamp_none(clamp_id);
1125
1126 /* By default, RT tasks always get 100% boost */
1127 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1128 clamp_value = uclamp_none(UCLAMP_MAX);
1129
1130 uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
1131 }
1132}
1133
1134static void __init init_uclamp(void)
1135{
1136 struct uclamp_se uc_max = {};
1137 unsigned int clamp_id;
1138 int cpu;
1139
1140 for_each_possible_cpu(cpu) {
1141 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
1142 cpu_rq(cpu)->uclamp_flags = 0;
1143 }
1144
1145 for_each_clamp_id(clamp_id) {
1146 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1147 uclamp_none(clamp_id), false);
1148 }
1149
1150 /* System defaults allow max clamp values for both indexes */
1151 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1152 for_each_clamp_id(clamp_id)
1153 uclamp_default[clamp_id] = uc_max;
1154}
1155
1156#else /* CONFIG_UCLAMP_TASK */
1157static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1158static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1159static inline int uclamp_validate(struct task_struct *p,
1160 const struct sched_attr *attr)
1161{
1162 return -EOPNOTSUPP;
1163}
1164static void __setscheduler_uclamp(struct task_struct *p,
1165 const struct sched_attr *attr) { }
1166static inline void uclamp_fork(struct task_struct *p) { }
1167static inline void init_uclamp(void) { }
1168#endif /* CONFIG_UCLAMP_TASK */
1169
763static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1170static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
764{ 1171{
765 if (!(flags & ENQUEUE_NOCLOCK)) 1172 if (!(flags & ENQUEUE_NOCLOCK))
@@ -770,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
770 psi_enqueue(p, flags & ENQUEUE_WAKEUP); 1177 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
771 } 1178 }
772 1179
1180 uclamp_rq_inc(rq, p);
773 p->sched_class->enqueue_task(rq, p, flags); 1181 p->sched_class->enqueue_task(rq, p, flags);
774} 1182}
775 1183
@@ -783,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
783 psi_dequeue(p, flags & DEQUEUE_SLEEP); 1191 psi_dequeue(p, flags & DEQUEUE_SLEEP);
784 } 1192 }
785 1193
1194 uclamp_rq_dec(rq, p);
786 p->sched_class->dequeue_task(rq, p, flags); 1195 p->sched_class->dequeue_task(rq, p, flags);
787} 1196}
788 1197
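uclamp_rq_inc()/uclamp_rq_dec() are hooked into the common enqueue/dequeue path so the runqueue always knows the clamp values requested by its runnable tasks. A simplified standalone model of that bookkeeping is sketched below — per-value reference counts plus a "max of the runnable requests" query. This assumes max-aggregation across runnable tasks; the real implementation groups values into a small number of buckets and runs under rq->lock, both of which this toy version deliberately omits.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024          /* assumed utilization scale */

/* Toy runqueue-side state: how many runnable tasks currently request each
 * clamp value. Every value gets its own counter here to keep the sketch
 * short; the kernel uses a handful of buckets instead. */
static unsigned int nr_tasks_at[SCHED_CAPACITY_SCALE + 1];

static void rq_clamp_inc(unsigned int value) { nr_tasks_at[value]++; }
static void rq_clamp_dec(unsigned int value) { nr_tasks_at[value]--; }

/* Effective runqueue clamp: the highest value requested by any runnable
 * task, falling back to @none when nothing is enqueued. */
static unsigned int rq_clamp_max(unsigned int none)
{
	for (int v = SCHED_CAPACITY_SCALE; v >= 0; v--)
		if (nr_tasks_at[v])
			return (unsigned int)v;
	return none;
}

int main(void)
{
	rq_clamp_inc(300);                                /* task A enqueued, boost request 300 */
	rq_clamp_inc(700);                                /* task B enqueued, boost request 700 */
	printf("rq min clamp = %u\n", rq_clamp_max(0));   /* 700 */
	rq_clamp_dec(700);                                /* task B dequeued */
	printf("rq min clamp = %u\n", rq_clamp_max(0));   /* 300 */
	rq_clamp_dec(300);
	printf("rq min clamp = %u\n", rq_clamp_max(0));   /* 0: no runnable task left */
	return 0;
}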
@@ -929,7 +1338,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
929 */ 1338 */
930static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 1339static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
931{ 1340{
932 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 1341 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
933 return false; 1342 return false;
934 1343
935 if (is_per_cpu_kthread(p)) 1344 if (is_per_cpu_kthread(p))
@@ -1024,7 +1433,7 @@ static int migration_cpu_stop(void *data)
1024 local_irq_disable(); 1433 local_irq_disable();
1025 /* 1434 /*
1026 * We need to explicitly wake pending tasks before running 1435 * We need to explicitly wake pending tasks before running
1027 * __migrate_task() such that we will not miss enforcing cpus_allowed 1436 * __migrate_task() such that we will not miss enforcing cpus_ptr
1028 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 1437 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
1029 */ 1438 */
1030 sched_ttwu_pending(); 1439 sched_ttwu_pending();
@@ -1055,7 +1464,7 @@ static int migration_cpu_stop(void *data)
1055 */ 1464 */
1056void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) 1465void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1057{ 1466{
1058 cpumask_copy(&p->cpus_allowed, new_mask); 1467 cpumask_copy(&p->cpus_mask, new_mask);
1059 p->nr_cpus_allowed = cpumask_weight(new_mask); 1468 p->nr_cpus_allowed = cpumask_weight(new_mask);
1060} 1469}
1061 1470
@@ -1125,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
1125 goto out; 1534 goto out;
1126 } 1535 }
1127 1536
1128 if (cpumask_equal(&p->cpus_allowed, new_mask)) 1537 if (cpumask_equal(p->cpus_ptr, new_mask))
1129 goto out; 1538 goto out;
1130 1539
1131 if (!cpumask_intersects(new_mask, cpu_valid_mask)) { 1540 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1285,10 +1694,10 @@ static int migrate_swap_stop(void *data)
1285 if (task_cpu(arg->src_task) != arg->src_cpu) 1694 if (task_cpu(arg->src_task) != arg->src_cpu)
1286 goto unlock; 1695 goto unlock;
1287 1696
1288 if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) 1697 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
1289 goto unlock; 1698 goto unlock;
1290 1699
1291 if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) 1700 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
1292 goto unlock; 1701 goto unlock;
1293 1702
1294 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1703 __migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1330,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
1330 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1739 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1331 goto out; 1740 goto out;
1332 1741
1333 if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) 1742 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
1334 goto out; 1743 goto out;
1335 1744
1336 if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) 1745 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
1337 goto out; 1746 goto out;
1338 1747
1339 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1748 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1478,7 +1887,7 @@ void kick_process(struct task_struct *p)
1478EXPORT_SYMBOL_GPL(kick_process); 1887EXPORT_SYMBOL_GPL(kick_process);
1479 1888
1480/* 1889/*
1481 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1890 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
1482 * 1891 *
1483 * A few notes on cpu_active vs cpu_online: 1892 * A few notes on cpu_active vs cpu_online:
1484 * 1893 *
@@ -1518,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1518 for_each_cpu(dest_cpu, nodemask) { 1927 for_each_cpu(dest_cpu, nodemask) {
1519 if (!cpu_active(dest_cpu)) 1928 if (!cpu_active(dest_cpu))
1520 continue; 1929 continue;
1521 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 1930 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
1522 return dest_cpu; 1931 return dest_cpu;
1523 } 1932 }
1524 } 1933 }
1525 1934
1526 for (;;) { 1935 for (;;) {
1527 /* Any allowed, online CPU? */ 1936 /* Any allowed, online CPU? */
1528 for_each_cpu(dest_cpu, &p->cpus_allowed) { 1937 for_each_cpu(dest_cpu, p->cpus_ptr) {
1529 if (!is_cpu_allowed(p, dest_cpu)) 1938 if (!is_cpu_allowed(p, dest_cpu))
1530 continue; 1939 continue;
1531 1940
@@ -1569,7 +1978,7 @@ out:
1569} 1978}
1570 1979
1571/* 1980/*
1572 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1981 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
1573 */ 1982 */
1574static inline 1983static inline
1575int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1984int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1579,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1579 if (p->nr_cpus_allowed > 1) 1988 if (p->nr_cpus_allowed > 1)
1580 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1989 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1581 else 1990 else
1582 cpu = cpumask_any(&p->cpus_allowed); 1991 cpu = cpumask_any(p->cpus_ptr);
1583 1992
1584 /* 1993 /*
1585 * In order not to call set_task_cpu() on a blocking task we need 1994 * In order not to call set_task_cpu() on a blocking task we need
1586 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1995 * to rely on ttwu() to place the task on a valid ->cpus_ptr
1587 * CPU. 1996 * CPU.
1588 * 1997 *
1589 * Since this is common to all placement strategies, this lives here. 1998 * Since this is common to all placement strategies, this lives here.
@@ -1990,6 +2399,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1990 unsigned long flags; 2399 unsigned long flags;
1991 int cpu, success = 0; 2400 int cpu, success = 0;
1992 2401
2402 if (p == current) {
2403 /*
2404 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
2405 * == smp_processor_id()'. Together this means we can special
2406 * case the whole 'p->on_rq && ttwu_remote()' case below
2407 * without taking any locks.
2408 *
2409 * In particular:
2410 * - we rely on Program-Order guarantees for all the ordering,
2411 * - we're serialized against set_special_state() by virtue of
2412 * it disabling IRQs (this allows not taking ->pi_lock).
2413 */
2414 if (!(p->state & state))
2415 return false;
2416
2417 success = 1;
2418 cpu = task_cpu(p);
2419 trace_sched_waking(p);
2420 p->state = TASK_RUNNING;
2421 trace_sched_wakeup(p);
2422 goto out;
2423 }
2424
1993 /* 2425 /*
1994 * If we are going to wake up a thread waiting for CONDITION we 2426 * If we are going to wake up a thread waiting for CONDITION we
1995 * need to ensure that CONDITION=1 done by the caller can not be 2427 * need to ensure that CONDITION=1 done by the caller can not be
@@ -1999,7 +2431,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1999 raw_spin_lock_irqsave(&p->pi_lock, flags); 2431 raw_spin_lock_irqsave(&p->pi_lock, flags);
2000 smp_mb__after_spinlock(); 2432 smp_mb__after_spinlock();
2001 if (!(p->state & state)) 2433 if (!(p->state & state))
2002 goto out; 2434 goto unlock;
2003 2435
2004 trace_sched_waking(p); 2436 trace_sched_waking(p);
2005 2437
@@ -2029,7 +2461,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2029 */ 2461 */
2030 smp_rmb(); 2462 smp_rmb();
2031 if (p->on_rq && ttwu_remote(p, wake_flags)) 2463 if (p->on_rq && ttwu_remote(p, wake_flags))
2032 goto stat; 2464 goto unlock;
2033 2465
2034#ifdef CONFIG_SMP 2466#ifdef CONFIG_SMP
2035 /* 2467 /*
@@ -2089,10 +2521,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2089#endif /* CONFIG_SMP */ 2521#endif /* CONFIG_SMP */
2090 2522
2091 ttwu_queue(p, cpu, wake_flags); 2523 ttwu_queue(p, cpu, wake_flags);
2092stat: 2524unlock:
2093 ttwu_stat(p, cpu, wake_flags);
2094out:
2095 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2525 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2526out:
2527 if (success)
2528 ttwu_stat(p, cpu, wake_flags);
2096 2529
2097 return success; 2530 return success;
2098} 2531}
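Besides the p == current fast path, the hunk above moves ttwu_stat() out from under p->pi_lock: the lock now covers only the state transition, and the statistics update runs afterwards, gated on success. The same restructuring in plain pthreads terms, purely as an illustration of the control flow (all names here are invented for the sketch):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int state;                   /* 0 = sleeping, 1 = running (toy stand-in) */
static unsigned long wakeup_count;  /* accounting that does not need the lock */

/* Flip state under the lock, but do the bookkeeping only after the lock is
 * dropped, mirroring ttwu_stat() after the new 'unlock:'/'out:' labels.
 * Single-threaded demo: real code would use per-CPU schedstat counters. */
static bool try_wake(void)
{
	bool success = false;

	pthread_mutex_lock(&lock);
	if (state == 0) {
		state = 1;
		success = true;
	}
	pthread_mutex_unlock(&lock);

	if (success)
		wakeup_count++;          /* accounting outside the critical section */
	return success;
}

int main(void)
{
	printf("first wake:  %d\n", try_wake());   /* 1: transition happened */
	printf("second wake: %d\n", try_wake());   /* 0: already running */
	printf("wakeups accounted: %lu\n", wakeup_count);
	return 0;
}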
@@ -2299,6 +2732,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2299 */ 2732 */
2300 p->prio = current->normal_prio; 2733 p->prio = current->normal_prio;
2301 2734
2735 uclamp_fork(p);
2736
2302 /* 2737 /*
2303 * Revert to default priority/policy on fork if requested. 2738 * Revert to default priority/policy on fork if requested.
2304 */ 2739 */
@@ -2394,7 +2829,7 @@ void wake_up_new_task(struct task_struct *p)
2394#ifdef CONFIG_SMP 2829#ifdef CONFIG_SMP
2395 /* 2830 /*
2396 * Fork balancing, do it here and not earlier because: 2831 * Fork balancing, do it here and not earlier because:
2397 * - cpus_allowed can change in the fork path 2832 * - cpus_ptr can change in the fork path
2398 * - any previously selected CPU might disappear through hotplug 2833 * - any previously selected CPU might disappear through hotplug
2399 * 2834 *
2400 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, 2835 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -3032,7 +3467,6 @@ void scheduler_tick(void)
3032 3467
3033 update_rq_clock(rq); 3468 update_rq_clock(rq);
3034 curr->sched_class->task_tick(rq, curr, 0); 3469 curr->sched_class->task_tick(rq, curr, 0);
3035 cpu_load_update_active(rq);
3036 calc_global_load_tick(rq); 3470 calc_global_load_tick(rq);
3037 psi_task_tick(rq); 3471 psi_task_tick(rq);
3038 3472
@@ -4070,6 +4504,13 @@ static void __setscheduler_params(struct task_struct *p,
4070static void __setscheduler(struct rq *rq, struct task_struct *p, 4504static void __setscheduler(struct rq *rq, struct task_struct *p,
4071 const struct sched_attr *attr, bool keep_boost) 4505 const struct sched_attr *attr, bool keep_boost)
4072{ 4506{
4507 /*
4508 * If params can't change, scheduling class changes aren't allowed
4509 * either.
4510 */
4511 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4512 return;
4513
4073 __setscheduler_params(p, attr); 4514 __setscheduler_params(p, attr);
4074 4515
4075 /* 4516 /*
@@ -4207,6 +4648,13 @@ recheck:
4207 return retval; 4648 return retval;
4208 } 4649 }
4209 4650
4651 /* Update task specific "requested" clamps */
4652 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
4653 retval = uclamp_validate(p, attr);
4654 if (retval)
4655 return retval;
4656 }
4657
4210 /* 4658 /*
4211 * Make sure no PI-waiters arrive (or leave) while we are 4659 * Make sure no PI-waiters arrive (or leave) while we are
4212 * changing the priority of the task: 4660 * changing the priority of the task:
@@ -4236,6 +4684,8 @@ recheck:
4236 goto change; 4684 goto change;
4237 if (dl_policy(policy) && dl_param_changed(p, attr)) 4685 if (dl_policy(policy) && dl_param_changed(p, attr))
4238 goto change; 4686 goto change;
4687 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
4688 goto change;
4239 4689
4240 p->sched_reset_on_fork = reset_on_fork; 4690 p->sched_reset_on_fork = reset_on_fork;
4241 task_rq_unlock(rq, p, &rf); 4691 task_rq_unlock(rq, p, &rf);
@@ -4266,7 +4716,7 @@ change:
4266 * the entire root_domain to become SCHED_DEADLINE. We 4716 * the entire root_domain to become SCHED_DEADLINE. We
4267 * will also fail if there's no bandwidth available. 4717 * will also fail if there's no bandwidth available.
4268 */ 4718 */
4269 if (!cpumask_subset(span, &p->cpus_allowed) || 4719 if (!cpumask_subset(span, p->cpus_ptr) ||
4270 rq->rd->dl_bw.bw == 0) { 4720 rq->rd->dl_bw.bw == 0) {
4271 task_rq_unlock(rq, p, &rf); 4721 task_rq_unlock(rq, p, &rf);
4272 return -EPERM; 4722 return -EPERM;
@@ -4316,7 +4766,9 @@ change:
4316 put_prev_task(rq, p); 4766 put_prev_task(rq, p);
4317 4767
4318 prev_class = p->sched_class; 4768 prev_class = p->sched_class;
4769
4319 __setscheduler(rq, p, attr, pi); 4770 __setscheduler(rq, p, attr, pi);
4771 __setscheduler_uclamp(p, attr);
4320 4772
4321 if (queued) { 4773 if (queued) {
4322 /* 4774 /*
@@ -4492,6 +4944,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
4492 if (ret) 4944 if (ret)
4493 return -EFAULT; 4945 return -EFAULT;
4494 4946
4947 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
4948 size < SCHED_ATTR_SIZE_VER1)
4949 return -EINVAL;
4950
4495 /* 4951 /*
4496 * XXX: Do we want to be lenient like existing syscalls; or do we want 4952 * XXX: Do we want to be lenient like existing syscalls; or do we want
4497 * to be strict and return an error on out-of-bounds values? 4953 * to be strict and return an error on out-of-bounds values?
@@ -4555,14 +5011,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4555 5011
4556 if ((int)attr.sched_policy < 0) 5012 if ((int)attr.sched_policy < 0)
4557 return -EINVAL; 5013 return -EINVAL;
5014 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
5015 attr.sched_policy = SETPARAM_POLICY;
4558 5016
4559 rcu_read_lock(); 5017 rcu_read_lock();
4560 retval = -ESRCH; 5018 retval = -ESRCH;
4561 p = find_process_by_pid(pid); 5019 p = find_process_by_pid(pid);
4562 if (p != NULL) 5020 if (likely(p))
4563 retval = sched_setattr(p, &attr); 5021 get_task_struct(p);
4564 rcu_read_unlock(); 5022 rcu_read_unlock();
4565 5023
5024 if (likely(p)) {
5025 retval = sched_setattr(p, &attr);
5026 put_task_struct(p);
5027 }
5028
4566 return retval; 5029 return retval;
4567} 5030}
4568 5031
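The syscall-facing changes above — the SCHED_ATTR_SIZE_VER1 size check, the SCHED_FLAG_KEEP_POLICY translation to SETPARAM_POLICY, and the get/put_task_struct pairing around sched_setattr() — are what a caller hits when requesting clamps from userspace. A hedged sketch of such a caller follows; the struct layout and the 0x20/0x40 flag values mirror the uapi additions of this series and are assumptions if your headers predate them, and the raw syscall is used because glibc has no wrapper.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_FLAG_UTIL_CLAMP_MIN
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20   /* assumed uapi values */
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
#endif

struct sched_attr_v1 {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;   /* new in SCHED_ATTR_SIZE_VER1 */
	uint32_t sched_util_max;   /* new in SCHED_ATTR_SIZE_VER1 */
};

int main(void)
{
	struct sched_attr_v1 attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);            /* must be >= SCHED_ATTR_SIZE_VER1 (56) */
	attr.sched_policy = 0;               /* SCHED_NORMAL */
	attr.sched_flags = SCHED_FLAG_UTIL_CLAMP_MIN | SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = 128;           /* ask for a modest boost */
	attr.sched_util_max = 512;           /* and cap utilization at half scale */

	if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0)
		perror("sched_setattr");         /* e.g. EINVAL if min > max */
	else
		puts("util clamps applied to the calling task");
	return 0;
}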
@@ -4713,6 +5176,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4713 else 5176 else
4714 attr.sched_nice = task_nice(p); 5177 attr.sched_nice = task_nice(p);
4715 5178
5179#ifdef CONFIG_UCLAMP_TASK
5180 attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
5181 attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
5182#endif
5183
4716 rcu_read_unlock(); 5184 rcu_read_unlock();
4717 5185
4718 retval = sched_read_attr(uattr, &attr, size); 5186 retval = sched_read_attr(uattr, &attr, size);
@@ -4865,7 +5333,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
4865 goto out_unlock; 5333 goto out_unlock;
4866 5334
4867 raw_spin_lock_irqsave(&p->pi_lock, flags); 5335 raw_spin_lock_irqsave(&p->pi_lock, flags);
4868 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 5336 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
4869 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5337 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4870 5338
4871out_unlock: 5339out_unlock:
@@ -5122,7 +5590,7 @@ long __sched io_schedule_timeout(long timeout)
5122} 5590}
5123EXPORT_SYMBOL(io_schedule_timeout); 5591EXPORT_SYMBOL(io_schedule_timeout);
5124 5592
5125void io_schedule(void) 5593void __sched io_schedule(void)
5126{ 5594{
5127 int token; 5595 int token;
5128 5596
@@ -5442,7 +5910,7 @@ int task_can_attach(struct task_struct *p,
5442 * allowed nodes is unnecessary. Thus, cpusets are not 5910 * allowed nodes is unnecessary. Thus, cpusets are not
5443 * applicable for such threads. This prevents checking for 5911 * applicable for such threads. This prevents checking for
5444 * success of set_cpus_allowed_ptr() on all attached tasks 5912 * success of set_cpus_allowed_ptr() on all attached tasks
5445 * before cpus_allowed may be changed. 5913 * before cpus_mask may be changed.
5446 */ 5914 */
5447 if (p->flags & PF_NO_SETAFFINITY) { 5915 if (p->flags & PF_NO_SETAFFINITY) {
5448 ret = -EINVAL; 5916 ret = -EINVAL;
@@ -5469,7 +5937,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
5469 if (curr_cpu == target_cpu) 5937 if (curr_cpu == target_cpu)
5470 return 0; 5938 return 0;
5471 5939
5472 if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) 5940 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
5473 return -EINVAL; 5941 return -EINVAL;
5474 5942
5475 /* TODO: This is not properly updating schedstats */ 5943 /* TODO: This is not properly updating schedstats */
@@ -5607,7 +6075,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5607 put_prev_task(rq, next); 6075 put_prev_task(rq, next);
5608 6076
5609 /* 6077 /*
5610 * Rules for changing task_struct::cpus_allowed are holding 6078 * Rules for changing task_struct::cpus_mask are holding
5611 * both pi_lock and rq->lock, such that holding either 6079 * both pi_lock and rq->lock, such that holding either
5612 * stabilizes the mask. 6080 * stabilizes the mask.
5613 * 6081 *
@@ -5901,8 +6369,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
5901 6369
5902void __init sched_init(void) 6370void __init sched_init(void)
5903{ 6371{
5904 int i, j;
5905 unsigned long alloc_size = 0, ptr; 6372 unsigned long alloc_size = 0, ptr;
6373 int i;
5906 6374
5907 wait_bit_init(); 6375 wait_bit_init();
5908 6376
@@ -6004,10 +6472,6 @@ void __init sched_init(void)
6004#ifdef CONFIG_RT_GROUP_SCHED 6472#ifdef CONFIG_RT_GROUP_SCHED
6005 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6473 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6006#endif 6474#endif
6007
6008 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6009 rq->cpu_load[j] = 0;
6010
6011#ifdef CONFIG_SMP 6475#ifdef CONFIG_SMP
6012 rq->sd = NULL; 6476 rq->sd = NULL;
6013 rq->rd = NULL; 6477 rq->rd = NULL;
@@ -6062,6 +6526,8 @@ void __init sched_init(void)
6062 6526
6063 psi_init(); 6527 psi_init();
6064 6528
6529 init_uclamp();
6530
6065 scheduler_running = 1; 6531 scheduler_running = 1;
6066} 6532}
6067 6533
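init_uclamp(), now called at the end of sched_init(), seeds every clamp index with its "no clamping" value and sets the system-wide defaults to the full capacity scale. A small sketch of those defaults, under the assumption that uclamp_none() returns 0 for UCLAMP_MIN (no boost) and SCHED_CAPACITY_SCALE for UCLAMP_MAX (no cap):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024   /* assumed capacity/utilization scale */

enum uclamp_id { UCLAMP_MIN, UCLAMP_MAX, UCLAMP_CNT };

/* Assumed behaviour of uclamp_none(): the value that means "no clamping"
 * for each index. */
static unsigned int uclamp_none(enum uclamp_id id)
{
	return id == UCLAMP_MIN ? 0 : SCHED_CAPACITY_SCALE;
}

static unsigned int task_request[UCLAMP_CNT];    /* per-task requested clamps */
static unsigned int system_default[UCLAMP_CNT];  /* system-wide defaults */

int main(void)
{
	/* Mirror of the init_uclamp() loops: tasks start unclamped, and the
	 * system defaults allow the whole range for both indexes. */
	for (int id = 0; id < UCLAMP_CNT; id++) {
		task_request[id] = uclamp_none((enum uclamp_id)id);
		system_default[id] = uclamp_none(UCLAMP_MAX);
	}

	printf("task:   min=%u max=%u\n", task_request[UCLAMP_MIN], task_request[UCLAMP_MAX]);
	printf("system: min=%u max=%u\n", system_default[UCLAMP_MIN], system_default[UCLAMP_MAX]);
	return 0;
}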
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 50316455ea66..5cc4012572ec 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,14 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/sched/cpudl.c 3 * kernel/sched/cpudl.c
3 * 4 *
4 * Global CPU deadline management 5 * Global CPU deadline management
5 * 6 *
6 * Author: Juri Lelli <j.lelli@sssup.it> 7 * Author: Juri Lelli <j.lelli@sssup.it>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 */ 8 */
13#include "sched.h" 9#include "sched.h"
14 10
@@ -124,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
124 const struct sched_dl_entity *dl_se = &p->dl; 120 const struct sched_dl_entity *dl_se = &p->dl;
125 121
126 if (later_mask && 122 if (later_mask &&
127 cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { 123 cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
128 return 1; 124 return 1;
129 } else { 125 } else {
130 int best_cpu = cpudl_maximum(cp); 126 int best_cpu = cpudl_maximum(cp);
131 127
132 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 128 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
133 129
134 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && 130 if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
135 dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 131 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
136 if (later_mask) 132 if (later_mask)
137 cpumask_set_cpu(best_cpu, later_mask); 133 cpumask_set_cpu(best_cpu, later_mask);
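Throughout this series, readers of the affinity mask switch from &p->cpus_allowed to the pointer p->cpus_ptr, while writers such as set_cpus_allowed_common() keep updating the underlying p->cpus_mask. The point of the indirection — an assumption drawn from the broader series rather than these hunks alone — is that the pointer can be redirected temporarily (for example to a single-CPU mask) without losing the task's real affinity. A standalone model of that split, with the cpumask reduced to a plain bitmask:

#include <stdio.h>

/* Toy task: 'mask' is what userspace asked for (cpus_mask), 'ptr' is what
 * the scheduler consults (cpus_ptr). Normally ptr points at mask. */
struct toy_task {
	unsigned long mask;           /* one bit per CPU */
	const unsigned long *ptr;
};

static int cpu_allowed(const struct toy_task *p, int cpu)
{
	return (*p->ptr >> cpu) & 1;  /* readers only ever look through the pointer */
}

int main(void)
{
	static const unsigned long pin_to_cpu2 = 1UL << 2;
	struct toy_task p = { .mask = 0xf };   /* allowed on CPUs 0-3 */

	p.ptr = &p.mask;                       /* default: pointer follows the mask */
	printf("cpu1 allowed: %d\n", cpu_allowed(&p, 1));   /* 1 */

	p.ptr = &pin_to_cpu2;                  /* temporarily pin without touching mask */
	printf("cpu1 allowed: %d\n", cpu_allowed(&p, 1));   /* 0 */
	printf("cpu2 allowed: %d\n", cpu_allowed(&p, 2));   /* 1 */

	p.ptr = &p.mask;                       /* restore: original affinity intact */
	printf("cpu1 allowed: %d\n", cpu_allowed(&p, 1));   /* 1 */
	return 0;
}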
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 962cf343f798..636ca6f88c8e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -196,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
196 * based on the task model parameters and gives the minimal utilization 196 * based on the task model parameters and gives the minimal utilization
197 * required to meet deadlines. 197 * required to meet deadlines.
198 */ 198 */
199unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 199unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
200 unsigned long max, enum schedutil_type type) 200 unsigned long max, enum schedutil_type type,
201 struct task_struct *p)
201{ 202{
202 unsigned long dl_util, util, irq; 203 unsigned long dl_util, util, irq;
203 struct rq *rq = cpu_rq(cpu); 204 struct rq *rq = cpu_rq(cpu);
204 205
205 if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) 206 if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
207 type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
206 return max; 208 return max;
209 }
207 210
208 /* 211 /*
209 * Early check to see if IRQ/steal time saturates the CPU, can be 212 * Early check to see if IRQ/steal time saturates the CPU, can be
@@ -219,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
219 * CFS tasks and we use the same metric to track the effective 222 * CFS tasks and we use the same metric to track the effective
220 * utilization (PELT windows are synchronized) we can directly add them 223 * utilization (PELT windows are synchronized) we can directly add them
221 * to obtain the CPU's actual utilization. 224 * to obtain the CPU's actual utilization.
225 *
226 * CFS and RT utilization can be boosted or capped, depending on
227 * utilization clamp constraints requested by currently RUNNABLE
228 * tasks.
229 * When there are no CFS RUNNABLE tasks, clamps are released and
230 * frequency will be gracefully reduced with the utilization decay.
222 */ 231 */
223 util = util_cfs; 232 util = util_cfs + cpu_util_rt(rq);
224 util += cpu_util_rt(rq); 233 if (type == FREQUENCY_UTIL)
234 util = uclamp_util_with(rq, util, p);
225 235
226 dl_util = cpu_util_dl(rq); 236 dl_util = cpu_util_dl(rq);
227 237
@@ -276,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
276{ 286{
277 struct rq *rq = cpu_rq(sg_cpu->cpu); 287 struct rq *rq = cpu_rq(sg_cpu->cpu);
278 unsigned long util = cpu_util_cfs(rq); 288 unsigned long util = cpu_util_cfs(rq);
279 unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); 289 unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
280 290
281 sg_cpu->max = max; 291 sg_cpu->max = max;
282 sg_cpu->bw_dl = cpu_bw_dl(rq); 292 sg_cpu->bw_dl = cpu_bw_dl(rq);
283 293
284 return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); 294 return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
285} 295}
286 296
287/** 297/**
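schedutil_cpu_util() now sums CFS and RT utilization and, for frequency selection, filters that sum through the runqueue clamps before the deadline and IRQ terms are folded in. A simplified, self-contained version of just that step, where uclamp_util_with() is approximated by an ordinary clamp between the runqueue's aggregated min and max values (an assumption made for the sketch):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

/* Clamp the aggregated CFS+RT utilization into [util_min, util_max], which
 * is roughly what uclamp_util_with() does with the runqueue clamps. */
static unsigned long freq_util(unsigned long util_cfs, unsigned long util_rt,
			       unsigned long util_min, unsigned long util_max)
{
	unsigned long util = util_cfs + util_rt;

	if (util < util_min)
		util = util_min;      /* boosted: run faster than raw demand */
	if (util > util_max)
		util = util_max;      /* capped: ignore demand above the cap */
	if (util > SCHED_CAPACITY_SCALE)
		util = SCHED_CAPACITY_SCALE;
	return util;
}

int main(void)
{
	/* A lightly loaded CPU boosted to at least 40% of capacity: */
	printf("util = %lu\n", freq_util(100, 50, 410, 1024));   /* 410 */
	/* A busy CPU capped at 50% of capacity: */
	printf("util = %lu\n", freq_util(700, 100, 0, 512));     /* 512 */
	return 0;
}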
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index daaadf939ccb..b7abca987d94 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/sched/cpupri.c 3 * kernel/sched/cpupri.c
3 * 4 *
@@ -20,11 +21,6 @@
20 * searches). For tasks with affinity restrictions, the algorithm has a 21 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that 22 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived. 23 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */ 24 */
29#include "sched.h" 25#include "sched.h"
30 26
@@ -98,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
98 if (skip) 94 if (skip)
99 continue; 95 continue;
100 96
101 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 97 if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
102 continue; 98 continue;
103 99
104 if (lowest_mask) { 100 if (lowest_mask) {
105 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 101 cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
106 102
107 /* 103 /*
108 * We have to ensure that we have at least one bit 104 * We have to ensure that we have at least one bit
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ba4a143bdcf3..2305ce89a26c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Simple CPU accounting cgroup controller 3 * Simple CPU accounting cgroup controller
3 */ 4 */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 43901fa3f269..ef5b9f6b1d42 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
538 * If we cannot preempt any rq, fall back to pick any 538 * If we cannot preempt any rq, fall back to pick any
539 * online CPU: 539 * online CPU:
540 */ 540 */
541 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
542 if (cpu >= nr_cpu_ids) { 542 if (cpu >= nr_cpu_ids) {
543 /* 543 /*
544 * Failed to find any suitable CPU. 544 * Failed to find any suitable CPU.
@@ -726,7 +726,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
726 * refill the runtime and set the deadline a period in the future, 726 * refill the runtime and set the deadline a period in the future,
727 * because keeping the current (absolute) deadline of the task would 727 * because keeping the current (absolute) deadline of the task would
728 * result in breaking guarantees promised to other tasks (refer to 728 * result in breaking guarantees promised to other tasks (refer to
729 * Documentation/scheduler/sched-deadline.txt for more information). 729 * Documentation/scheduler/sched-deadline.rst for more information).
730 * 730 *
731 * This function returns true if: 731 * This function returns true if:
732 * 732 *
@@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq)
1195 &curr->dl); 1195 &curr->dl);
1196 } else { 1196 } else {
1197 unsigned long scale_freq = arch_scale_freq_capacity(cpu); 1197 unsigned long scale_freq = arch_scale_freq_capacity(cpu);
1198 unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); 1198 unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
1199 1199
1200 scaled_delta_exec = cap_scale(delta_exec, scale_freq); 1200 scaled_delta_exec = cap_scale(delta_exec, scale_freq);
1201 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); 1201 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq)
1824static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1824static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1825{ 1825{
1826 if (!task_running(rq, p) && 1826 if (!task_running(rq, p) &&
1827 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1827 cpumask_test_cpu(cpu, p->cpus_ptr))
1828 return 1; 1828 return 1;
1829 return 0; 1829 return 0;
1830} 1830}
@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1974 /* Retry if something changed. */ 1974 /* Retry if something changed. */
1975 if (double_lock_balance(rq, later_rq)) { 1975 if (double_lock_balance(rq, later_rq)) {
1976 if (unlikely(task_rq(task) != rq || 1976 if (unlikely(task_rq(task) != rq ||
1977 !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || 1977 !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
1978 task_running(rq, task) || 1978 task_running(rq, task) ||
1979 !dl_task(task) || 1979 !dl_task(task) ||
1980 !task_on_rq_queued(task))) { 1980 !task_on_rq_queued(task))) {
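The deadline.c hunks also pick up the arch_scale_cpu_capacity() signature change (the sched_domain argument is gone); the scaled runtime accounting it feeds is unchanged: consumed runtime is shrunk by both the current frequency scale and the CPU's capacity scale. Assuming cap_scale(v, s) is (v * s) >> 10 on the usual 1024-point scale, the arithmetic looks like this:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)

/* Assumed definition of cap_scale(): multiply by a 0..1024 scale factor. */
static unsigned long long cap_scale(unsigned long long v, unsigned long scale)
{
	return (v * scale) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	unsigned long long delta_exec = 1000000;   /* 1 ms of wall-clock runtime */
	unsigned long scale_freq = 512;            /* CPU running at half its max frequency */
	unsigned long scale_cpu  = 768;            /* little core: 75% of the biggest CPU */

	/* Mirror of update_curr_dl(): scale by frequency, then by capacity. */
	unsigned long long scaled = cap_scale(delta_exec, scale_freq);
	scaled = cap_scale(scaled, scale_cpu);

	printf("charged runtime: %llu ns (of %llu ns elapsed)\n", scaled, delta_exec);
	return 0;
}

With these example factors only 375000 ns of the elapsed millisecond is charged against the reservation, which is the intent of capacity- and frequency-invariant accounting.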
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 678bfb9bd87f..f7e4579e746c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,13 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/sched/debug.c 3 * kernel/sched/debug.c
3 * 4 *
4 * Print the CFS rbtree and other debugging details 5 * Print the CFS rbtree and other debugging details
5 * 6 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar 7 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */ 8 */
12#include "sched.h" 9#include "sched.h"
13 10
@@ -236,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
236 *tablep = NULL; 233 *tablep = NULL;
237} 234}
238 235
239static int min_load_idx = 0;
240static int max_load_idx = CPU_LOAD_IDX_MAX-1;
241
242static void 236static void
243set_table_entry(struct ctl_table *entry, 237set_table_entry(struct ctl_table *entry,
244 const char *procname, void *data, int maxlen, 238 const char *procname, void *data, int maxlen,
245 umode_t mode, proc_handler *proc_handler, 239 umode_t mode, proc_handler *proc_handler)
246 bool load_idx)
247{ 240{
248 entry->procname = procname; 241 entry->procname = procname;
249 entry->data = data; 242 entry->data = data;
250 entry->maxlen = maxlen; 243 entry->maxlen = maxlen;
251 entry->mode = mode; 244 entry->mode = mode;
252 entry->proc_handler = proc_handler; 245 entry->proc_handler = proc_handler;
253
254 if (load_idx) {
255 entry->extra1 = &min_load_idx;
256 entry->extra2 = &max_load_idx;
257 }
258} 246}
259 247
260static struct ctl_table * 248static struct ctl_table *
261sd_alloc_ctl_domain_table(struct sched_domain *sd) 249sd_alloc_ctl_domain_table(struct sched_domain *sd)
262{ 250{
263 struct ctl_table *table = sd_alloc_ctl_entry(14); 251 struct ctl_table *table = sd_alloc_ctl_entry(9);
264 252
265 if (table == NULL) 253 if (table == NULL)
266 return NULL; 254 return NULL;
267 255
268 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 256 set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
269 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 257 set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
270 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 258 set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
271 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 259 set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
272 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 260 set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
273 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 261 set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
274 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 262 set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
275 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); 263 set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
276 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); 264 /* &table[8] is terminator */
277 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
278 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
279 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
280 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
281 /* &table[13] is terminator */
282 265
283 return table; 266 return table;
284} 267}
@@ -656,8 +639,6 @@ do { \
656 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 639 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
657 640
658 P(nr_running); 641 P(nr_running);
659 SEQ_printf(m, " .%-30s: %lu\n", "load",
660 rq->load.weight);
661 P(nr_switches); 642 P(nr_switches);
662 P(nr_load_updates); 643 P(nr_load_updates);
663 P(nr_uninterruptible); 644 P(nr_uninterruptible);
@@ -665,11 +646,6 @@ do { \
665 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); 646 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
666 PN(clock); 647 PN(clock);
667 PN(clock_task); 648 PN(clock_task);
668 P(cpu_load[0]);
669 P(cpu_load[1]);
670 P(cpu_load[2]);
671 P(cpu_load[3]);
672 P(cpu_load[4]);
673#undef P 649#undef P
674#undef PN 650#undef PN
675 651
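With the load-index knobs gone, sd_alloc_ctl_domain_table() shrinks from 14 slots to 9, the last slot staying zeroed as the terminator (the "&table[8] is terminator" comment). The allocation convention — number of real entries plus one zeroed sentinel — is easy to get wrong, so here is a small model of it with a cut-down stand-in for struct ctl_table:

#include <stdio.h>
#include <stdlib.h>

/* Cut-down stand-in for struct ctl_table: a NULL procname marks the end. */
struct toy_ctl {
	const char *procname;
	int         value;
};

/* Allocate n zero-initialized slots; callers fill at most n-1 of them so the
 * final zeroed slot acts as the terminator, as in sd_alloc_ctl_entry(). */
static struct toy_ctl *alloc_table(int n)
{
	return calloc(n, sizeof(struct toy_ctl));
}

int main(void)
{
	/* 8 real entries + 1 terminator = 9, matching the new allocation size. */
	struct toy_ctl *table = alloc_table(9);
	const char *names[] = { "min_interval", "max_interval", "busy_factor",
				"imbalance_pct", "cache_nice_tries", "flags",
				"max_newidle_lb_cost", "name" };

	for (int i = 0; i < 8; i++)
		table[i] = (struct toy_ctl){ .procname = names[i], .value = i };

	for (struct toy_ctl *e = table; e->procname; e++)   /* walk until terminator */
		printf("%s\n", e->procname);

	free(table);
	return 0;
}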
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f5e528..036be95a87e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
275 return grp->my_q; 275 return grp->my_q;
276} 276}
277 277
278static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
279{
280 if (!path)
281 return;
282
283 if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
284 autogroup_path(cfs_rq->tg, path, len);
285 else if (cfs_rq && cfs_rq->tg->css.cgroup)
286 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
287 else
288 strlcpy(path, "(null)", len);
289}
290
278static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 291static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
279{ 292{
280 struct rq *rq = rq_of(cfs_rq); 293 struct rq *rq = rq_of(cfs_rq);
@@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
449 return NULL; 462 return NULL;
450} 463}
451 464
465static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
466{
467 if (path)
468 strlcpy(path, "(null)", len);
469}
470
452static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 471static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
453{ 472{
454 return true; 473 return true;
@@ -764,7 +783,7 @@ void post_init_entity_util_avg(struct task_struct *p)
764 struct sched_entity *se = &p->se; 783 struct sched_entity *se = &p->se;
765 struct cfs_rq *cfs_rq = cfs_rq_of(se); 784 struct cfs_rq *cfs_rq = cfs_rq_of(se);
766 struct sched_avg *sa = &se->avg; 785 struct sched_avg *sa = &se->avg;
767 long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); 786 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
768 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; 787 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
769 788
770 if (cap > 0) { 789 if (cap > 0) {
@@ -1466,9 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1466 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 1485 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1467} 1486}
1468 1487
1469static unsigned long weighted_cpuload(struct rq *rq); 1488static unsigned long cpu_runnable_load(struct rq *rq);
1470static unsigned long source_load(int cpu, int type);
1471static unsigned long target_load(int cpu, int type);
1472 1489
1473/* Cached statistics for all CPUs within a node */ 1490/* Cached statistics for all CPUs within a node */
1474struct numa_stats { 1491struct numa_stats {
@@ -1489,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1489 for_each_cpu(cpu, cpumask_of_node(nid)) { 1506 for_each_cpu(cpu, cpumask_of_node(nid)) {
1490 struct rq *rq = cpu_rq(cpu); 1507 struct rq *rq = cpu_rq(cpu);
1491 1508
1492 ns->load += weighted_cpuload(rq); 1509 ns->load += cpu_runnable_load(rq);
1493 ns->compute_capacity += capacity_of(cpu); 1510 ns->compute_capacity += capacity_of(cpu);
1494 } 1511 }
1495 1512
@@ -1621,7 +1638,7 @@ static void task_numa_compare(struct task_numa_env *env,
1621 * be incurred if the tasks were swapped. 1638 * be incurred if the tasks were swapped.
1622 */ 1639 */
1623 /* Skip this swap candidate if cannot move to the source cpu */ 1640 /* Skip this swap candidate if cannot move to the source cpu */
1624 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1641 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1625 goto unlock; 1642 goto unlock;
1626 1643
1627 /* 1644 /*
@@ -1718,7 +1735,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
1718 1735
1719 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 1736 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1720 /* Skip this CPU if the source task cannot migrate */ 1737 /* Skip this CPU if the source task cannot migrate */
1721 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) 1738 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1722 continue; 1739 continue;
1723 1740
1724 env->dst_cpu = cpu; 1741 env->dst_cpu = cpu;
@@ -2686,8 +2703,6 @@ static void
2686account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2703account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2687{ 2704{
2688 update_load_add(&cfs_rq->load, se->load.weight); 2705 update_load_add(&cfs_rq->load, se->load.weight);
2689 if (!parent_entity(se))
2690 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2691#ifdef CONFIG_SMP 2706#ifdef CONFIG_SMP
2692 if (entity_is_task(se)) { 2707 if (entity_is_task(se)) {
2693 struct rq *rq = rq_of(cfs_rq); 2708 struct rq *rq = rq_of(cfs_rq);
@@ -2703,8 +2718,6 @@ static void
2703account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2718account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2704{ 2719{
2705 update_load_sub(&cfs_rq->load, se->load.weight); 2720 update_load_sub(&cfs_rq->load, se->load.weight);
2706 if (!parent_entity(se))
2707 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2708#ifdef CONFIG_SMP 2721#ifdef CONFIG_SMP
2709 if (entity_is_task(se)) { 2722 if (entity_is_task(se)) {
2710 account_numa_dequeue(rq_of(cfs_rq), task_of(se)); 2723 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
@@ -3334,6 +3347,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
3334 update_tg_cfs_util(cfs_rq, se, gcfs_rq); 3347 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3335 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); 3348 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3336 3349
3350 trace_pelt_cfs_tp(cfs_rq);
3351 trace_pelt_se_tp(se);
3352
3337 return 1; 3353 return 1;
3338} 3354}
3339 3355
@@ -3486,6 +3502,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3486 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3502 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3487 3503
3488 cfs_rq_util_change(cfs_rq, flags); 3504 cfs_rq_util_change(cfs_rq, flags);
3505
3506 trace_pelt_cfs_tp(cfs_rq);
3489} 3507}
3490 3508
3491/** 3509/**
@@ -3505,6 +3523,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3505 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3523 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3506 3524
3507 cfs_rq_util_change(cfs_rq, 0); 3525 cfs_rq_util_change(cfs_rq, 0);
3526
3527 trace_pelt_cfs_tp(cfs_rq);
3508} 3528}
3509 3529
3510/* 3530/*
@@ -4100,7 +4120,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4100 * least twice that of our own weight (i.e. dont track it 4120 * least twice that of our own weight (i.e. dont track it
4101 * when there are only lesser-weight tasks around): 4121 * when there are only lesser-weight tasks around):
4102 */ 4122 */
4103 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 4123 if (schedstat_enabled() &&
4124 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4104 schedstat_set(se->statistics.slice_max, 4125 schedstat_set(se->statistics.slice_max,
4105 max((u64)schedstat_val(se->statistics.slice_max), 4126 max((u64)schedstat_val(se->statistics.slice_max),
4106 se->sum_exec_runtime - se->prev_sum_exec_runtime)); 4127 se->sum_exec_runtime - se->prev_sum_exec_runtime));
@@ -4734,6 +4755,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4734 if (runtime_refresh_within(cfs_b, min_left)) 4755 if (runtime_refresh_within(cfs_b, min_left))
4735 return; 4756 return;
4736 4757
4758 /* don't push forwards an existing deferred unthrottle */
4759 if (cfs_b->slack_started)
4760 return;
4761 cfs_b->slack_started = true;
4762
4737 hrtimer_start(&cfs_b->slack_timer, 4763 hrtimer_start(&cfs_b->slack_timer,
4738 ns_to_ktime(cfs_bandwidth_slack_period), 4764 ns_to_ktime(cfs_bandwidth_slack_period),
4739 HRTIMER_MODE_REL); 4765 HRTIMER_MODE_REL);
@@ -4787,6 +4813,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4787 4813
4788 /* confirm we're still not at a refresh boundary */ 4814 /* confirm we're still not at a refresh boundary */
4789 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4815 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4816 cfs_b->slack_started = false;
4790 if (cfs_b->distribute_running) { 4817 if (cfs_b->distribute_running) {
4791 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 4818 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4792 return; 4819 return;
@@ -4950,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4950 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4977 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4951 cfs_b->slack_timer.function = sched_cfs_slack_timer; 4978 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4952 cfs_b->distribute_running = 0; 4979 cfs_b->distribute_running = 0;
4980 cfs_b->slack_started = false;
4953} 4981}
4954 4982
4955static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4983static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -5153,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu)
5153 5181
5154static inline void update_overutilized_status(struct rq *rq) 5182static inline void update_overutilized_status(struct rq *rq)
5155{ 5183{
5156 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) 5184 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5157 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); 5185 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5186 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5187 }
5158} 5188}
5159#else 5189#else
5160static inline void update_overutilized_status(struct rq *rq) { } 5190static inline void update_overutilized_status(struct rq *rq) { }
@@ -5325,71 +5355,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5325DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); 5355DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5326 5356
5327#ifdef CONFIG_NO_HZ_COMMON 5357#ifdef CONFIG_NO_HZ_COMMON
5328/*
5329 * per rq 'load' array crap; XXX kill this.
5330 */
5331
5332/*
5333 * The exact cpuload calculated at every tick would be:
5334 *
5335 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5336 *
5337 * If a CPU misses updates for n ticks (as it was idle) and update gets
5338 * called on the n+1-th tick when CPU may be busy, then we have:
5339 *
5340 * load_n = (1 - 1/2^i)^n * load_0
5341 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5342 *
5343 * decay_load_missed() below does efficient calculation of
5344 *
5345 * load' = (1 - 1/2^i)^n * load
5346 *
5347 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5348 * This allows us to precompute the above in said factors, thereby allowing the
5349 * reduction of an arbitrary n in O(log_2 n) steps. (See also
5350 * fixed_power_int())
5351 *
5352 * The calculation is approximated on a 128 point scale.
5353 */
5354#define DEGRADE_SHIFT 7
5355
5356static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5357static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5358 { 0, 0, 0, 0, 0, 0, 0, 0 },
5359 { 64, 32, 8, 0, 0, 0, 0, 0 },
5360 { 96, 72, 40, 12, 1, 0, 0, 0 },
5361 { 112, 98, 75, 43, 15, 1, 0, 0 },
5362 { 120, 112, 98, 76, 45, 16, 2, 0 }
5363};
5364
5365/*
5366 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5367 * would be when CPU is idle and so we just decay the old load without
5368 * adding any new load.
5369 */
5370static unsigned long
5371decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5372{
5373 int j = 0;
5374
5375 if (!missed_updates)
5376 return load;
5377
5378 if (missed_updates >= degrade_zero_ticks[idx])
5379 return 0;
5380
5381 if (idx == 1)
5382 return load >> missed_updates;
5383
5384 while (missed_updates) {
5385 if (missed_updates % 2)
5386 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5387
5388 missed_updates >>= 1;
5389 j++;
5390 }
5391 return load;
5392}
5393 5358
5394static struct { 5359static struct {
5395 cpumask_var_t idle_cpus_mask; 5360 cpumask_var_t idle_cpus_mask;
@@ -5401,234 +5366,11 @@ static struct {
5401 5366
5402#endif /* CONFIG_NO_HZ_COMMON */ 5367#endif /* CONFIG_NO_HZ_COMMON */
5403 5368
5404/** 5369static unsigned long cpu_runnable_load(struct rq *rq)
5405 * __cpu_load_update - update the rq->cpu_load[] statistics
5406 * @this_rq: The rq to update statistics for
5407 * @this_load: The current load
5408 * @pending_updates: The number of missed updates
5409 *
5410 * Update rq->cpu_load[] statistics. This function is usually called every
5411 * scheduler tick (TICK_NSEC).
5412 *
5413 * This function computes a decaying average:
5414 *
5415 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5416 *
5417 * Because of NOHZ it might not get called on every tick which gives need for
5418 * the @pending_updates argument.
5419 *
5420 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5421 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5422 * = A * (A * load[i]_n-2 + B) + B
5423 * = A * (A * (A * load[i]_n-3 + B) + B) + B
5424 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5425 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5426 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5427 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5428 *
5429 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5430 * any change in load would have resulted in the tick being turned back on.
5431 *
5432 * For regular NOHZ, this reduces to:
5433 *
5434 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5435 *
5436 * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
5437 * term.
5438 */
5439static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5440 unsigned long pending_updates)
5441{
5442 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5443 int i, scale;
5444
5445 this_rq->nr_load_updates++;
5446
5447 /* Update our load: */
5448 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5449 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5450 unsigned long old_load, new_load;
5451
5452 /* scale is effectively 1 << i now, and >> i divides by scale */
5453
5454 old_load = this_rq->cpu_load[i];
5455#ifdef CONFIG_NO_HZ_COMMON
5456 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5457 if (tickless_load) {
5458 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5459 /*
5460 * old_load can never be a negative value because a
5461 * decayed tickless_load cannot be greater than the
5462 * original tickless_load.
5463 */
5464 old_load += tickless_load;
5465 }
5466#endif
5467 new_load = this_load;
5468 /*
5469 * Round up the averaging division if load is increasing. This
5470 * prevents us from getting stuck on 9 if the load is 10, for
5471 * example.
5472 */
5473 if (new_load > old_load)
5474 new_load += scale - 1;
5475
5476 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5477 }
5478}
5479
5480/* Used instead of source_load when we know the type == 0 */
5481static unsigned long weighted_cpuload(struct rq *rq)
5482{ 5370{
5483 return cfs_rq_runnable_load_avg(&rq->cfs); 5371 return cfs_rq_runnable_load_avg(&rq->cfs);
5484} 5372}
5485 5373
5486#ifdef CONFIG_NO_HZ_COMMON
5487/*
5488 * There is no sane way to deal with nohz on smp when using jiffies because the
5489 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5490 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5491 *
5492 * Therefore we need to avoid the delta approach from the regular tick when
5493 * possible since that would seriously skew the load calculation. This is why we
5494 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5495 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5496 * loop exit, nohz_idle_balance, nohz full exit...)
5497 *
5498 * This means we might still be one tick off for nohz periods.
5499 */
5500
5501static void cpu_load_update_nohz(struct rq *this_rq,
5502 unsigned long curr_jiffies,
5503 unsigned long load)
5504{
5505 unsigned long pending_updates;
5506
5507 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5508 if (pending_updates) {
5509 this_rq->last_load_update_tick = curr_jiffies;
5510 /*
5511 * In the regular NOHZ case, we were idle, this means load 0.
5512 * In the NOHZ_FULL case, we were non-idle, we should consider
5513 * its weighted load.
5514 */
5515 cpu_load_update(this_rq, load, pending_updates);
5516 }
5517}
5518
5519/*
5520 * Called from nohz_idle_balance() to update the load ratings before doing the
5521 * idle balance.
5522 */
5523static void cpu_load_update_idle(struct rq *this_rq)
5524{
5525 /*
5526 * bail if there's load or we're actually up-to-date.
5527 */
5528 if (weighted_cpuload(this_rq))
5529 return;
5530
5531 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5532}
5533
5534/*
5535 * Record CPU load on nohz entry so we know the tickless load to account
5536 * on nohz exit. cpu_load[0] happens then to be updated more frequently
5537 * than other cpu_load[idx] but it should be fine as cpu_load readers
5538 * shouldn't rely into synchronized cpu_load[*] updates.
5539 */
5540void cpu_load_update_nohz_start(void)
5541{
5542 struct rq *this_rq = this_rq();
5543
5544 /*
5545 * This is all lockless but should be fine. If weighted_cpuload changes
5546 * concurrently we'll exit nohz. And cpu_load write can race with
5547 * cpu_load_update_idle() but both updater would be writing the same.
5548 */
5549 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5550}
5551
5552/*
5553 * Account the tickless load in the end of a nohz frame.
5554 */
5555void cpu_load_update_nohz_stop(void)
5556{
5557 unsigned long curr_jiffies = READ_ONCE(jiffies);
5558 struct rq *this_rq = this_rq();
5559 unsigned long load;
5560 struct rq_flags rf;
5561
5562 if (curr_jiffies == this_rq->last_load_update_tick)
5563 return;
5564
5565 load = weighted_cpuload(this_rq);
5566 rq_lock(this_rq, &rf);
5567 update_rq_clock(this_rq);
5568 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5569 rq_unlock(this_rq, &rf);
5570}
5571#else /* !CONFIG_NO_HZ_COMMON */
5572static inline void cpu_load_update_nohz(struct rq *this_rq,
5573 unsigned long curr_jiffies,
5574 unsigned long load) { }
5575#endif /* CONFIG_NO_HZ_COMMON */
5576
5577static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5578{
5579#ifdef CONFIG_NO_HZ_COMMON
5580 /* See the mess around cpu_load_update_nohz(). */
5581 this_rq->last_load_update_tick = READ_ONCE(jiffies);
5582#endif
5583 cpu_load_update(this_rq, load, 1);
5584}
5585
5586/*
5587 * Called from scheduler_tick()
5588 */
5589void cpu_load_update_active(struct rq *this_rq)
5590{
5591 unsigned long load = weighted_cpuload(this_rq);
5592
5593 if (tick_nohz_tick_stopped())
5594 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5595 else
5596 cpu_load_update_periodic(this_rq, load);
5597}
5598
5599/*
5600 * Return a low guess at the load of a migration-source CPU weighted
5601 * according to the scheduling class and "nice" value.
5602 *
5603 * We want to under-estimate the load of migration sources, to
5604 * balance conservatively.
5605 */
5606static unsigned long source_load(int cpu, int type)
5607{
5608 struct rq *rq = cpu_rq(cpu);
5609 unsigned long total = weighted_cpuload(rq);
5610
5611 if (type == 0 || !sched_feat(LB_BIAS))
5612 return total;
5613
5614 return min(rq->cpu_load[type-1], total);
5615}
5616
5617/*
5618 * Return a high guess at the load of a migration-target CPU weighted
5619 * according to the scheduling class and "nice" value.
5620 */
5621static unsigned long target_load(int cpu, int type)
5622{
5623 struct rq *rq = cpu_rq(cpu);
5624 unsigned long total = weighted_cpuload(rq);
5625
5626 if (type == 0 || !sched_feat(LB_BIAS))
5627 return total;
5628
5629 return max(rq->cpu_load[type-1], total);
5630}
5631
5632static unsigned long capacity_of(int cpu) 5374static unsigned long capacity_of(int cpu)
5633{ 5375{
5634 return cpu_rq(cpu)->cpu_capacity; 5376 return cpu_rq(cpu)->cpu_capacity;
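The large removal above takes out the per-rq cpu_load[] machinery together with source_load()/target_load(); the surviving consumer is the PELT-based cpu_runnable_load(). For reference, the decaying average the removed code maintained was load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load, with decay_load_missed() fast-forwarding over ticks missed while idle. A direct floating-point rendition of that recurrence (the kernel used fixed-point degrade factors instead of a loop):

#include <stdio.h>

/* One tick of the removed cpu_load[] recurrence for index i:
 *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load                     */
static double cpu_load_step(double load, double cur_load, int i)
{
	double w = 1.0 / (double)(1UL << i);
	return (1.0 - w) * load + w * cur_load;
}

/* decay_load_missed() equivalent: n idle ticks contribute cur_load = 0,
 * so the old value simply decays by (1 - 1/2^i)^n.                      */
static double decay_missed(double load, int missed, int i)
{
	while (missed--)
		load = cpu_load_step(load, 0.0, i);
	return load;
}

int main(void)
{
	double load = 1000.0;

	/* Higher indexes track a slower-moving average. */
	printf("1 busy tick, idx 1: %.1f\n", cpu_load_step(load, 200.0, 1)); /* 600.0 */
	printf("1 busy tick, idx 2: %.1f\n", cpu_load_step(load, 200.0, 2)); /* 800.0 */

	/* 8 idle ticks at idx 1 decay 1000 down to 1000 * (1/2)^8. */
	printf("8 idle ticks, idx 1: %.2f\n", decay_missed(load, 8, 1));     /* ~3.91 */
	return 0;
}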
@@ -5638,7 +5380,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
5638{ 5380{
5639 struct rq *rq = cpu_rq(cpu); 5381 struct rq *rq = cpu_rq(cpu);
5640 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); 5382 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5641 unsigned long load_avg = weighted_cpuload(rq); 5383 unsigned long load_avg = cpu_runnable_load(rq);
5642 5384
5643 if (nr_running) 5385 if (nr_running)
5644 return load_avg / nr_running; 5386 return load_avg / nr_running;
@@ -5736,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5736 s64 this_eff_load, prev_eff_load; 5478 s64 this_eff_load, prev_eff_load;
5737 unsigned long task_load; 5479 unsigned long task_load;
5738 5480
5739 this_eff_load = target_load(this_cpu, sd->wake_idx); 5481 this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
5740 5482
5741 if (sync) { 5483 if (sync) {
5742 unsigned long current_load = task_h_load(current); 5484 unsigned long current_load = task_h_load(current);
@@ -5754,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5754 this_eff_load *= 100; 5496 this_eff_load *= 100;
5755 this_eff_load *= capacity_of(prev_cpu); 5497 this_eff_load *= capacity_of(prev_cpu);
5756 5498
5757 prev_eff_load = source_load(prev_cpu, sd->wake_idx); 5499 prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
5758 prev_eff_load -= task_load; 5500 prev_eff_load -= task_load;
5759 if (sched_feat(WA_BIAS)) 5501 if (sched_feat(WA_BIAS))
5760 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5502 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
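wake_affine_weight() now feeds plain cpu_runnable_load() into both sides instead of the index-biased source_load()/target_load(). The comparison itself remains a cross-multiplication: each side's load is weighted by the other CPU's capacity and by an imbalance percentage, so no division is needed. A simplified sketch of that comparison with made-up numbers; the helper name and the specific inputs are invented for the example:

#include <stdio.h>
#include <stdbool.h>

/* Decide whether pulling the waking task to this_cpu looks cheaper than
 * leaving it on prev_cpu. Loads and capacities are plain numbers here;
 * imbalance_pct > 100 biases the decision toward staying put. */
static bool prefer_this_cpu(long this_load, long this_capacity,
			    long prev_load, long prev_capacity,
			    int imbalance_pct)
{
	/* this side pays full price, prev side gets the imbalance discount */
	long this_eff = this_load * 100 * prev_capacity;
	long prev_eff = prev_load * (100 + (imbalance_pct - 100) / 2) * this_capacity;

	return this_eff < prev_eff;
}

int main(void)
{
	/* Equal capacities: this CPU is clearly less loaded, so pull. */
	printf("%d\n", prefer_this_cpu(200, 1024, 600, 1024, 117));   /* 1 */

	/* This CPU is slightly busier; the margin keeps the task where it was. */
	printf("%d\n", prefer_this_cpu(560, 1024, 510, 1024, 117));   /* 0 */
	return 0;
}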
@@ -5815,14 +5557,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5815 unsigned long this_runnable_load = ULONG_MAX; 5557 unsigned long this_runnable_load = ULONG_MAX;
5816 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; 5558 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5817 unsigned long most_spare = 0, this_spare = 0; 5559 unsigned long most_spare = 0, this_spare = 0;
5818 int load_idx = sd->forkexec_idx;
5819 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; 5560 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5820 unsigned long imbalance = scale_load_down(NICE_0_LOAD) * 5561 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5821 (sd->imbalance_pct-100) / 100; 5562 (sd->imbalance_pct-100) / 100;
5822 5563
5823 if (sd_flag & SD_BALANCE_WAKE)
5824 load_idx = sd->wake_idx;
5825
5826 do { 5564 do {
5827 unsigned long load, avg_load, runnable_load; 5565 unsigned long load, avg_load, runnable_load;
5828 unsigned long spare_cap, max_spare_cap; 5566 unsigned long spare_cap, max_spare_cap;
@@ -5831,7 +5569,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5831 5569
5832 /* Skip over this group if it has no CPUs allowed */ 5570 /* Skip over this group if it has no CPUs allowed */
5833 if (!cpumask_intersects(sched_group_span(group), 5571 if (!cpumask_intersects(sched_group_span(group),
5834 &p->cpus_allowed)) 5572 p->cpus_ptr))
5835 continue; 5573 continue;
5836 5574
5837 local_group = cpumask_test_cpu(this_cpu, 5575 local_group = cpumask_test_cpu(this_cpu,
@@ -5846,12 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5846 max_spare_cap = 0; 5584 max_spare_cap = 0;
5847 5585
5848 for_each_cpu(i, sched_group_span(group)) { 5586 for_each_cpu(i, sched_group_span(group)) {
5849 /* Bias balancing toward CPUs of our domain */ 5587 load = cpu_runnable_load(cpu_rq(i));
5850 if (local_group)
5851 load = source_load(i, load_idx);
5852 else
5853 load = target_load(i, load_idx);
5854
5855 runnable_load += load; 5588 runnable_load += load;
5856 5589
5857 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); 5590 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
@@ -5963,7 +5696,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5963 return cpumask_first(sched_group_span(group)); 5696 return cpumask_first(sched_group_span(group));
5964 5697
5965 /* Traverse only the allowed CPUs */ 5698 /* Traverse only the allowed CPUs */
5966 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { 5699 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5967 if (available_idle_cpu(i)) { 5700 if (available_idle_cpu(i)) {
5968 struct rq *rq = cpu_rq(i); 5701 struct rq *rq = cpu_rq(i);
5969 struct cpuidle_state *idle = idle_get_state(rq); 5702 struct cpuidle_state *idle = idle_get_state(rq);
@@ -5987,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5987 shallowest_idle_cpu = i; 5720 shallowest_idle_cpu = i;
5988 } 5721 }
5989 } else if (shallowest_idle_cpu == -1) { 5722 } else if (shallowest_idle_cpu == -1) {
5990 load = weighted_cpuload(cpu_rq(i)); 5723 load = cpu_runnable_load(cpu_rq(i));
5991 if (load < min_load) { 5724 if (load < min_load) {
5992 min_load = load; 5725 min_load = load;
5993 least_loaded_cpu = i; 5726 least_loaded_cpu = i;
@@ -6003,7 +5736,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6003{ 5736{
6004 int new_cpu = cpu; 5737 int new_cpu = cpu;
6005 5738
6006 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) 5739 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
6007 return prev_cpu; 5740 return prev_cpu;
6008 5741
6009 /* 5742 /*
@@ -6120,7 +5853,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
6120 if (!test_idle_cores(target, false)) 5853 if (!test_idle_cores(target, false))
6121 return -1; 5854 return -1;
6122 5855
6123 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); 5856 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6124 5857
6125 for_each_cpu_wrap(core, cpus, target) { 5858 for_each_cpu_wrap(core, cpus, target) {
6126 bool idle = true; 5859 bool idle = true;
@@ -6154,7 +5887,7 @@ static int select_idle_smt(struct task_struct *p, int target)
6154 return -1; 5887 return -1;
6155 5888
6156 for_each_cpu(cpu, cpu_smt_mask(target)) { 5889 for_each_cpu(cpu, cpu_smt_mask(target)) {
6157 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 5890 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6158 continue; 5891 continue;
6159 if (available_idle_cpu(cpu)) 5892 if (available_idle_cpu(cpu))
6160 return cpu; 5893 return cpu;
@@ -6189,6 +5922,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
6189 u64 time, cost; 5922 u64 time, cost;
6190 s64 delta; 5923 s64 delta;
6191 int cpu, nr = INT_MAX; 5924 int cpu, nr = INT_MAX;
5925 int this = smp_processor_id();
6192 5926
6193 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); 5927 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6194 if (!this_sd) 5928 if (!this_sd)
@@ -6212,18 +5946,18 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
6212 nr = 4; 5946 nr = 4;
6213 } 5947 }
6214 5948
6215 time = local_clock(); 5949 time = cpu_clock(this);
6216 5950
6217 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { 5951 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6218 if (!--nr) 5952 if (!--nr)
6219 return -1; 5953 return -1;
6220 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 5954 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6221 continue; 5955 continue;
6222 if (available_idle_cpu(cpu)) 5956 if (available_idle_cpu(cpu))
6223 break; 5957 break;
6224 } 5958 }
6225 5959
6226 time = local_clock() - time; 5960 time = cpu_clock(this) - time;
6227 cost = this_sd->avg_scan_cost; 5961 cost = this_sd->avg_scan_cost;
6228 delta = (s64)(time - cost) / 8; 5962 delta = (s64)(time - cost) / 8;
6229 this_sd->avg_scan_cost += delta; 5963 this_sd->avg_scan_cost += delta;
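The avg_scan_cost bookkeeping above is a plain exponentially weighted moving average with a 1/8 gain. A standalone sketch of the same update rule on made-up scan times:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t avg = 0;
	/* hypothetical per-wakeup scan times, in ns */
	int64_t samples[] = { 4000, 4800, 3600, 10000, 4200 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		int64_t delta = (samples[i] - avg) / 8;	/* same 1/8 gain as avg_scan_cost */
		avg += delta;
		printf("sample=%lld avg=%lld\n", (long long)samples[i], (long long)avg);
	}
	return 0;
}

The occasional expensive scan (10000 here) nudges the average up by only an eighth of the difference, which is what keeps the proportional scan cutoff stable.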
@@ -6254,7 +5988,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6254 recent_used_cpu != target && 5988 recent_used_cpu != target &&
6255 cpus_share_cache(recent_used_cpu, target) && 5989 cpus_share_cache(recent_used_cpu, target) &&
6256 available_idle_cpu(recent_used_cpu) && 5990 available_idle_cpu(recent_used_cpu) &&
6257 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 5991 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6258 /* 5992 /*
6259 * Replace recent_used_cpu with prev as it is a potential 5993 * Replace recent_used_cpu with prev as it is a potential
6260 * candidate for the next wake: 5994 * candidate for the next wake:
@@ -6498,11 +6232,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6498static long 6232static long
6499compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) 6233compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6500{ 6234{
6501 long util, max_util, sum_util, energy = 0; 6235 unsigned int max_util, util_cfs, cpu_util, cpu_cap;
6236 unsigned long sum_util, energy = 0;
6237 struct task_struct *tsk;
6502 int cpu; 6238 int cpu;
6503 6239
6504 for (; pd; pd = pd->next) { 6240 for (; pd; pd = pd->next) {
6241 struct cpumask *pd_mask = perf_domain_span(pd);
6242
6243 /*
6244 * The energy model mandates all the CPUs of a performance
6245 * domain have the same capacity.
6246 */
6247 cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6505 max_util = sum_util = 0; 6248 max_util = sum_util = 0;
6249
6506 /* 6250 /*
6507 * The capacity state of CPUs of the current rd can be driven by 6251 * The capacity state of CPUs of the current rd can be driven by
6508 * CPUs of another rd if they belong to the same performance 6252 * CPUs of another rd if they belong to the same performance
@@ -6513,11 +6257,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6513 * it will not appear in its pd list and will not be accounted 6257 * it will not appear in its pd list and will not be accounted
6514 * by compute_energy(). 6258 * by compute_energy().
6515 */ 6259 */
6516 for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { 6260 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6517 util = cpu_util_next(cpu, p, dst_cpu); 6261 util_cfs = cpu_util_next(cpu, p, dst_cpu);
6518 util = schedutil_energy_util(cpu, util); 6262
6519 max_util = max(util, max_util); 6263 /*
6520 sum_util += util; 6264 * Busy time computation: utilization clamping is not
6265 * required since the ratio (sum_util / cpu_capacity)
6266 * is already enough to scale the EM reported power
6267 * consumption at the (eventually clamped) cpu_capacity.
6268 */
6269 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6270 ENERGY_UTIL, NULL);
6271
6272 /*
6273 * Performance domain frequency: utilization clamping
6274 * must be considered since it affects the selection
6275 * of the performance domain frequency.
6276 * NOTE: in case RT tasks are running, by default the
6277 * FREQUENCY_UTIL's utilization can be max OPP.
6278 */
6279 tsk = cpu == dst_cpu ? p : NULL;
6280 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6281 FREQUENCY_UTIL, tsk);
6282 max_util = max(max_util, cpu_util);
6521 } 6283 }
6522 6284
6523 energy += em_pd_energy(pd->em_pd, max_util, sum_util); 6285 energy += em_pd_energy(pd->em_pd, max_util, sum_util);
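Each performance domain now contributes two aggregates: sum_util (total busy time, which scales energy linearly) and max_util (the largest single request, which selects the OPP for the whole domain). A toy standalone sketch of that split (userspace C; the three-entry cost table and the simplified em_pd_energy()-style arithmetic are assumptions for illustration only):

#include <stdio.h>

struct perf_state { unsigned long cap; unsigned long cost; };

/* hypothetical 3-OPP domain; numbers are illustrative only */
static const struct perf_state table[] = {
	{ 256, 50 }, { 512, 130 }, { 1024, 400 },
};

static unsigned long toy_pd_energy(unsigned long max_util, unsigned long sum_util)
{
	const struct perf_state *ps = &table[2];

	/* pick the lowest OPP able to serve the biggest single request */
	for (int i = 0; i < 3; i++) {
		if (table[i].cap >= max_util) {
			ps = &table[i];
			break;
		}
	}
	/* energy grows with total busy time at the chosen OPP's cost */
	return ps->cost * sum_util / ps->cap;
}

int main(void)
{
	printf("energy=%lu\n", toy_pd_energy(300, 700));
	return 0;
}

This is why FREQUENCY_UTIL (clamped, drives max_util) and ENERGY_UTIL (unclamped, drives sum_util) are computed separately in the loop above.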
@@ -6600,7 +6362,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6600 int max_spare_cap_cpu = -1; 6362 int max_spare_cap_cpu = -1;
6601 6363
6602 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { 6364 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6603 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 6365 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6604 continue; 6366 continue;
6605 6367
6606 /* Skip CPUs that will be overutilized. */ 6368 /* Skip CPUs that will be overutilized. */
@@ -6689,7 +6451,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6689 } 6451 }
6690 6452
6691 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && 6453 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
6692 cpumask_test_cpu(cpu, &p->cpus_allowed); 6454 cpumask_test_cpu(cpu, p->cpus_ptr);
6693 } 6455 }
6694 6456
6695 rcu_read_lock(); 6457 rcu_read_lock();
@@ -7445,14 +7207,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7445 /* 7207 /*
7446 * We do not migrate tasks that are: 7208 * We do not migrate tasks that are:
7447 * 1) throttled_lb_pair, or 7209 * 1) throttled_lb_pair, or
7448 * 2) cannot be migrated to this CPU due to cpus_allowed, or 7210 * 2) cannot be migrated to this CPU due to cpus_ptr, or
7449 * 3) running (obviously), or 7211 * 3) running (obviously), or
7450 * 4) are cache-hot on their current CPU. 7212 * 4) are cache-hot on their current CPU.
7451 */ 7213 */
7452 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 7214 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7453 return 0; 7215 return 0;
7454 7216
7455 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { 7217 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7456 int cpu; 7218 int cpu;
7457 7219
7458 schedstat_inc(p->se.statistics.nr_failed_migrations_affine); 7220 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7472,7 +7234,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7472 7234
7473 /* Prevent to re-select dst_cpu via env's CPUs: */ 7235 /* Prevent to re-select dst_cpu via env's CPUs: */
7474 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7236 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7475 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7237 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7476 env->flags |= LBF_DST_PINNED; 7238 env->flags |= LBF_DST_PINNED;
7477 env->new_dst_cpu = cpu; 7239 env->new_dst_cpu = cpu;
7478 break; 7240 break;
@@ -7558,7 +7320,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
7558static const unsigned int sched_nr_migrate_break = 32; 7320static const unsigned int sched_nr_migrate_break = 32;
7559 7321
7560/* 7322/*
7561 * detach_tasks() -- tries to detach up to imbalance weighted load from 7323 * detach_tasks() -- tries to detach up to imbalance runnable load from
7562 * busiest_rq, as part of a balancing operation within domain "sd". 7324 * busiest_rq, as part of a balancing operation within domain "sd".
7563 * 7325 *
7564 * Returns number of detached tasks if successful and 0 otherwise. 7326 * Returns number of detached tasks if successful and 0 otherwise.
@@ -7626,7 +7388,7 @@ static int detach_tasks(struct lb_env *env)
7626 7388
7627 /* 7389 /*
7628 * We only want to steal up to the prescribed amount of 7390 * We only want to steal up to the prescribed amount of
7629 * weighted load. 7391 * runnable load.
7630 */ 7392 */
7631 if (env->imbalance <= 0) 7393 if (env->imbalance <= 0)
7632 break; 7394 break;
@@ -7695,6 +7457,7 @@ static void attach_tasks(struct lb_env *env)
7695 rq_unlock(env->dst_rq, &rf); 7457 rq_unlock(env->dst_rq, &rf);
7696} 7458}
7697 7459
7460#ifdef CONFIG_NO_HZ_COMMON
7698static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) 7461static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7699{ 7462{
7700 if (cfs_rq->avg.load_avg) 7463 if (cfs_rq->avg.load_avg)
@@ -7722,6 +7485,19 @@ static inline bool others_have_blocked(struct rq *rq)
7722 return false; 7485 return false;
7723} 7486}
7724 7487
7488static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
7489{
7490 rq->last_blocked_load_update_tick = jiffies;
7491
7492 if (!has_blocked)
7493 rq->has_blocked_load = 0;
7494}
7495#else
7496static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
7497static inline bool others_have_blocked(struct rq *rq) { return false; }
7498static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
7499#endif
7500
7725#ifdef CONFIG_FAIR_GROUP_SCHED 7501#ifdef CONFIG_FAIR_GROUP_SCHED
7726 7502
7727static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7503static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7787,11 +7563,7 @@ static void update_blocked_averages(int cpu)
7787 if (others_have_blocked(rq)) 7563 if (others_have_blocked(rq))
7788 done = false; 7564 done = false;
7789 7565
7790#ifdef CONFIG_NO_HZ_COMMON 7566 update_blocked_load_status(rq, !done);
7791 rq->last_blocked_load_update_tick = jiffies;
7792 if (done)
7793 rq->has_blocked_load = 0;
7794#endif
7795 rq_unlock_irqrestore(rq, &rf); 7567 rq_unlock_irqrestore(rq, &rf);
7796} 7568}
7797 7569
@@ -7857,11 +7629,7 @@ static inline void update_blocked_averages(int cpu)
7857 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); 7629 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7858 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); 7630 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7859 update_irq_load_avg(rq, 0); 7631 update_irq_load_avg(rq, 0);
7860#ifdef CONFIG_NO_HZ_COMMON 7632 update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
7861 rq->last_blocked_load_update_tick = jiffies;
7862 if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
7863 rq->has_blocked_load = 0;
7864#endif
7865 rq_unlock_irqrestore(rq, &rf); 7633 rq_unlock_irqrestore(rq, &rf);
7866} 7634}
7867 7635
@@ -7879,7 +7647,6 @@ static unsigned long task_h_load(struct task_struct *p)
7879struct sg_lb_stats { 7647struct sg_lb_stats {
7880 unsigned long avg_load; /*Avg load across the CPUs of the group */ 7648 unsigned long avg_load; /*Avg load across the CPUs of the group */
7881 unsigned long group_load; /* Total load over the CPUs of the group */ 7649 unsigned long group_load; /* Total load over the CPUs of the group */
7882 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
7883 unsigned long load_per_task; 7650 unsigned long load_per_task;
7884 unsigned long group_capacity; 7651 unsigned long group_capacity;
7885 unsigned long group_util; /* Total utilization of the group */ 7652 unsigned long group_util; /* Total utilization of the group */
@@ -7933,38 +7700,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7933 }; 7700 };
7934} 7701}
7935 7702
7936/**
7937 * get_sd_load_idx - Obtain the load index for a given sched domain.
7938 * @sd: The sched_domain whose load_idx is to be obtained.
7939 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
7940 *
7941 * Return: The load index.
7942 */
7943static inline int get_sd_load_idx(struct sched_domain *sd,
7944 enum cpu_idle_type idle)
7945{
7946 int load_idx;
7947
7948 switch (idle) {
7949 case CPU_NOT_IDLE:
7950 load_idx = sd->busy_idx;
7951 break;
7952
7953 case CPU_NEWLY_IDLE:
7954 load_idx = sd->newidle_idx;
7955 break;
7956 default:
7957 load_idx = sd->idle_idx;
7958 break;
7959 }
7960
7961 return load_idx;
7962}
7963
7964static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) 7703static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
7965{ 7704{
7966 struct rq *rq = cpu_rq(cpu); 7705 struct rq *rq = cpu_rq(cpu);
7967 unsigned long max = arch_scale_cpu_capacity(sd, cpu); 7706 unsigned long max = arch_scale_cpu_capacity(cpu);
7968 unsigned long used, free; 7707 unsigned long used, free;
7969 unsigned long irq; 7708 unsigned long irq;
7970 7709
@@ -7989,7 +7728,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7989 unsigned long capacity = scale_rt_capacity(sd, cpu); 7728 unsigned long capacity = scale_rt_capacity(sd, cpu);
7990 struct sched_group *sdg = sd->groups; 7729 struct sched_group *sdg = sd->groups;
7991 7730
7992 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); 7731 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
7993 7732
7994 if (!capacity) 7733 if (!capacity)
7995 capacity = 1; 7734 capacity = 1;
@@ -8099,7 +7838,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8099 7838
8100/* 7839/*
8101 * Group imbalance indicates (and tries to solve) the problem where balancing 7840 * Group imbalance indicates (and tries to solve) the problem where balancing
8102 * groups is inadequate due to ->cpus_allowed constraints. 7841 * groups is inadequate due to ->cpus_ptr constraints.
8103 * 7842 *
8104 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a 7843 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
8105 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. 7844 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8249,9 +7988,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8249 struct sg_lb_stats *sgs, 7988 struct sg_lb_stats *sgs,
8250 int *sg_status) 7989 int *sg_status)
8251{ 7990{
8252 int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8253 int load_idx = get_sd_load_idx(env->sd, env->idle);
8254 unsigned long load;
8255 int i, nr_running; 7991 int i, nr_running;
8256 7992
8257 memset(sgs, 0, sizeof(*sgs)); 7993 memset(sgs, 0, sizeof(*sgs));
@@ -8262,13 +7998,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8262 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) 7998 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8263 env->flags |= LBF_NOHZ_AGAIN; 7999 env->flags |= LBF_NOHZ_AGAIN;
8264 8000
8265 /* Bias balancing toward CPUs of our domain: */ 8001 sgs->group_load += cpu_runnable_load(rq);
8266 if (local_group)
8267 load = target_load(i, load_idx);
8268 else
8269 load = source_load(i, load_idx);
8270
8271 sgs->group_load += load;
8272 sgs->group_util += cpu_util(i); 8002 sgs->group_util += cpu_util(i);
8273 sgs->sum_nr_running += rq->cfs.h_nr_running; 8003 sgs->sum_nr_running += rq->cfs.h_nr_running;
8274 8004
@@ -8283,7 +8013,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8283 sgs->nr_numa_running += rq->nr_numa_running; 8013 sgs->nr_numa_running += rq->nr_numa_running;
8284 sgs->nr_preferred_running += rq->nr_preferred_running; 8014 sgs->nr_preferred_running += rq->nr_preferred_running;
8285#endif 8015#endif
8286 sgs->sum_weighted_load += weighted_cpuload(rq);
8287 /* 8016 /*
8288 * No need to call idle_cpu() if nr_running is not 0 8017 * No need to call idle_cpu() if nr_running is not 0
8289 */ 8018 */
@@ -8302,7 +8031,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8302 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; 8031 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8303 8032
8304 if (sgs->sum_nr_running) 8033 if (sgs->sum_nr_running)
8305 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 8034 sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
8306 8035
8307 sgs->group_weight = group->group_weight; 8036 sgs->group_weight = group->group_weight;
8308 8037
@@ -8516,8 +8245,12 @@ next_group:
8516 8245
8517 /* Update over-utilization (tipping point, U >= 0) indicator */ 8246 /* Update over-utilization (tipping point, U >= 0) indicator */
8518 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); 8247 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
8248 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
8519 } else if (sg_status & SG_OVERUTILIZED) { 8249 } else if (sg_status & SG_OVERUTILIZED) {
8520 WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); 8250 struct root_domain *rd = env->dst_rq->rd;
8251
8252 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
8253 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
8521 } 8254 }
8522} 8255}
8523 8256
@@ -8723,7 +8456,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8723 * find_busiest_group - Returns the busiest group within the sched_domain 8456 * find_busiest_group - Returns the busiest group within the sched_domain
8724 * if there is an imbalance. 8457 * if there is an imbalance.
8725 * 8458 *
8726 * Also calculates the amount of weighted load which should be moved 8459 * Also calculates the amount of runnable load which should be moved
8727 * to restore balance. 8460 * to restore balance.
8728 * 8461 *
8729 * @env: The load balancing environment. 8462 * @env: The load balancing environment.
@@ -8768,7 +8501,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8768 /* 8501 /*
8769 * If the busiest group is imbalanced the below checks don't 8502 * If the busiest group is imbalanced the below checks don't
8770 * work because they assume all things are equal, which typically 8503 * work because they assume all things are equal, which typically
8771 * isn't true due to cpus_allowed constraints and the like. 8504 * isn't true due to cpus_ptr constraints and the like.
8772 */ 8505 */
8773 if (busiest->group_type == group_imbalanced) 8506 if (busiest->group_type == group_imbalanced)
8774 goto force_balance; 8507 goto force_balance;
@@ -8842,7 +8575,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8842 int i; 8575 int i;
8843 8576
8844 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 8577 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8845 unsigned long capacity, wl; 8578 unsigned long capacity, load;
8846 enum fbq_type rt; 8579 enum fbq_type rt;
8847 8580
8848 rq = cpu_rq(i); 8581 rq = cpu_rq(i);
@@ -8896,30 +8629,30 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8896 rq->nr_running == 1) 8629 rq->nr_running == 1)
8897 continue; 8630 continue;
8898 8631
8899 wl = weighted_cpuload(rq); 8632 load = cpu_runnable_load(rq);
8900 8633
8901 /* 8634 /*
8902 * When comparing with imbalance, use weighted_cpuload() 8635 * When comparing with imbalance, use cpu_runnable_load()
8903 * which is not scaled with the CPU capacity. 8636 * which is not scaled with the CPU capacity.
8904 */ 8637 */
8905 8638
8906 if (rq->nr_running == 1 && wl > env->imbalance && 8639 if (rq->nr_running == 1 && load > env->imbalance &&
8907 !check_cpu_capacity(rq, env->sd)) 8640 !check_cpu_capacity(rq, env->sd))
8908 continue; 8641 continue;
8909 8642
8910 /* 8643 /*
8911 * For the load comparisons with the other CPU's, consider 8644 * For the load comparisons with the other CPU's, consider
8912 * the weighted_cpuload() scaled with the CPU capacity, so 8645 * the cpu_runnable_load() scaled with the CPU capacity, so
8913 * that the load can be moved away from the CPU that is 8646 * that the load can be moved away from the CPU that is
8914 * potentially running at a lower capacity. 8647 * potentially running at a lower capacity.
8915 * 8648 *
8916 * Thus we're looking for max(wl_i / capacity_i), crosswise 8649 * Thus we're looking for max(load_i / capacity_i), crosswise
8917 * multiplication to rid ourselves of the division works out 8650 * multiplication to rid ourselves of the division works out
8918 * to: wl_i * capacity_j > wl_j * capacity_i; where j is 8651 * to: load_i * capacity_j > load_j * capacity_i; where j is
8919 * our previous maximum. 8652 * our previous maximum.
8920 */ 8653 */
8921 if (wl * busiest_capacity > busiest_load * capacity) { 8654 if (load * busiest_capacity > busiest_load * capacity) {
8922 busiest_load = wl; 8655 busiest_load = load;
8923 busiest_capacity = capacity; 8656 busiest_capacity = capacity;
8924 busiest = rq; 8657 busiest = rq;
8925 } 8658 }
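The crosswise multiplication described in the comment above is the usual way to compare two ratios in integer arithmetic without dividing. A standalone check with made-up loads and capacities:

#include <stdio.h>

int main(void)
{
	/* hypothetical candidate vs. current busiest */
	unsigned long load_i = 900,  capacity_i = 512;	/* small CPU */
	unsigned long load_j = 1200, capacity_j = 1024;	/* big CPU */

	/* load_i/capacity_i > load_j/capacity_j  <=>  load_i*capacity_j > load_j*capacity_i */
	if (load_i * capacity_j > load_j * capacity_i)
		printf("CPU i is relatively busier (%lu > %lu)\n",
		       load_i * capacity_j, load_j * capacity_i);
	return 0;
}

Here 900/512 (~1.76) beats 1200/1024 (~1.17), so the lower absolute load still wins once scaled by capacity, without any division or truncation loss.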
@@ -9210,7 +8943,7 @@ more_balance:
9210 * if the curr task on busiest CPU can't be 8943 * if the curr task on busiest CPU can't be
9211 * moved to this_cpu: 8944 * moved to this_cpu:
9212 */ 8945 */
9213 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 8946 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9214 raw_spin_unlock_irqrestore(&busiest->lock, 8947 raw_spin_unlock_irqrestore(&busiest->lock,
9215 flags); 8948 flags);
9216 env.flags |= LBF_ALL_PINNED; 8949 env.flags |= LBF_ALL_PINNED;
@@ -9879,7 +9612,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9879 9612
9880 rq_lock_irqsave(rq, &rf); 9613 rq_lock_irqsave(rq, &rf);
9881 update_rq_clock(rq); 9614 update_rq_clock(rq);
9882 cpu_load_update_idle(rq);
9883 rq_unlock_irqrestore(rq, &rf); 9615 rq_unlock_irqrestore(rq, &rf);
9884 9616
9885 if (flags & NOHZ_BALANCE_KICK) 9617 if (flags & NOHZ_BALANCE_KICK)
@@ -10690,6 +10422,10 @@ const struct sched_class fair_sched_class = {
10690#ifdef CONFIG_FAIR_GROUP_SCHED 10422#ifdef CONFIG_FAIR_GROUP_SCHED
10691 .task_change_group = task_change_group_fair, 10423 .task_change_group = task_change_group_fair,
10692#endif 10424#endif
10425
10426#ifdef CONFIG_UCLAMP_TASK
10427 .uclamp_enabled = 1,
10428#endif
10693}; 10429};
10694 10430
10695#ifdef CONFIG_SCHED_DEBUG 10431#ifdef CONFIG_SCHED_DEBUG
@@ -10737,3 +10473,83 @@ __init void init_sched_fair_class(void)
10737#endif /* SMP */ 10473#endif /* SMP */
10738 10474
10739} 10475}
10476
10477/*
10478 * Helper functions to facilitate extracting info from tracepoints.
10479 */
10480
10481const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
10482{
10483#ifdef CONFIG_SMP
10484 return cfs_rq ? &cfs_rq->avg : NULL;
10485#else
10486 return NULL;
10487#endif
10488}
10489EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
10490
10491char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
10492{
10493 if (!cfs_rq) {
10494 if (str)
10495 strlcpy(str, "(null)", len);
10496 else
10497 return NULL;
10498 }
10499
10500 cfs_rq_tg_path(cfs_rq, str, len);
10501 return str;
10502}
10503EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
10504
10505int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
10506{
10507 return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
10508}
10509EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
10510
10511const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
10512{
10513#ifdef CONFIG_SMP
10514 return rq ? &rq->avg_rt : NULL;
10515#else
10516 return NULL;
10517#endif
10518}
10519EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
10520
10521const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
10522{
10523#ifdef CONFIG_SMP
10524 return rq ? &rq->avg_dl : NULL;
10525#else
10526 return NULL;
10527#endif
10528}
10529EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
10530
10531const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
10532{
10533#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
10534 return rq ? &rq->avg_irq : NULL;
10535#else
10536 return NULL;
10537#endif
10538}
10539EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
10540
10541int sched_trace_rq_cpu(struct rq *rq)
10542{
10543 return rq ? cpu_of(rq) : -1;
10544}
10545EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
10546
10547const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
10548{
10549#ifdef CONFIG_SMP
10550 return rd ? rd->span : NULL;
10551#else
10552 return NULL;
10553#endif
10554}
10555EXPORT_SYMBOL_GPL(sched_trace_rd_span);
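These exported helpers let out-of-tree probes attached to the new bare PELT tracepoints (added in pelt.c below) read scheduler state without depending on private struct layouts. A hypothetical module sketch, assuming the pelt_cfs_tp tracepoint is exported to modules and that the register_trace_pelt_cfs_tp() helper generated by DECLARE_TRACE is available; the probe name and output format are made up:

#include <linux/module.h>
#include <linux/sched.h>
#include <trace/events/sched.h>

/* Hypothetical probe: dump the CFS util_avg of the runqueue the tracepoint fired on. */
static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
	const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
	char path[64];

	if (!avg)
		return;
	pr_info("cpu%d %s util_avg=%lu\n",
		sched_trace_cfs_rq_cpu(cfs_rq),
		sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path)),
		avg->util_avg);
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
	tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");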
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 858589b83377..2410db5e9a35 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
39 39
40SCHED_FEAT(HRTICK, false) 40SCHED_FEAT(HRTICK, false)
41SCHED_FEAT(DOUBLE_TICK, false) 41SCHED_FEAT(DOUBLE_TICK, false)
42SCHED_FEAT(LB_BIAS, false)
43 42
44/* 43/*
45 * Decrement CPU capacity based on time not spent running tasks 44 * Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5516bae0c1b..80940939b733 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Generic entry points for the idle threads and 3 * Generic entry points for the idle threads and
3 * implementation of the idle task scheduling class. 4 * implementation of the idle task scheduling class.
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 687302051a27..123ea07a3f3b 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Housekeeping management. Manage the targets for routine code that can run on 3 * Housekeeping management. Manage the targets for routine code that can run on
3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 4 * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 3cd8a3a795d2..aa8d75804108 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -1,17 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> 3 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 * 4 *
4 * membarrier system call 5 * membarrier system call
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */ 6 */
16#include "sched.h" 7#include "sched.h"
17 8
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index befce29bd882..a96db50d40e0 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,6 +28,8 @@
28#include "sched.h" 28#include "sched.h"
29#include "pelt.h" 29#include "pelt.h"
30 30
31#include <trace/events/sched.h>
32
31/* 33/*
32 * Approximate: 34 * Approximate:
33 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) 35 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
265{ 267{
266 if (___update_load_sum(now, &se->avg, 0, 0, 0)) { 268 if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
267 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 269 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
270 trace_pelt_se_tp(se);
268 return 1; 271 return 1;
269 } 272 }
270 273
@@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
278 281
279 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 282 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
280 cfs_se_util_change(&se->avg); 283 cfs_se_util_change(&se->avg);
284 trace_pelt_se_tp(se);
281 return 1; 285 return 1;
282 } 286 }
283 287
@@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
292 cfs_rq->curr != NULL)) { 296 cfs_rq->curr != NULL)) {
293 297
294 ___update_load_avg(&cfs_rq->avg, 1, 1); 298 ___update_load_avg(&cfs_rq->avg, 1, 1);
299 trace_pelt_cfs_tp(cfs_rq);
295 return 1; 300 return 1;
296 } 301 }
297 302
@@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
317 running)) { 322 running)) {
318 323
319 ___update_load_avg(&rq->avg_rt, 1, 1); 324 ___update_load_avg(&rq->avg_rt, 1, 1);
325 trace_pelt_rt_tp(rq);
320 return 1; 326 return 1;
321 } 327 }
322 328
@@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
340 running)) { 346 running)) {
341 347
342 ___update_load_avg(&rq->avg_dl, 1, 1); 348 ___update_load_avg(&rq->avg_dl, 1, 1);
349 trace_pelt_dl_tp(rq);
343 return 1; 350 return 1;
344 } 351 }
345 352
@@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
366 * reflect the real amount of computation 373 * reflect the real amount of computation
367 */ 374 */
368 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); 375 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
369 running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 376 running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
370 377
371 /* 378 /*
372 * We know the time that has been used by interrupt since last update 379 * We know the time that has been used by interrupt since last update
@@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running)
388 1, 395 1,
389 1); 396 1);
390 397
391 if (ret) 398 if (ret) {
392 ___update_load_avg(&rq->avg_irq, 1, 1); 399 ___update_load_avg(&rq->avg_irq, 1, 1);
400 trace_pelt_irq_tp(rq);
401 }
393 402
394 return ret; 403 return ret;
395} 404}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7489d5f56960..afff644da065 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
79 * Scale the elapsed time to reflect the real amount of 79 * Scale the elapsed time to reflect the real amount of
80 * computation 80 * computation
81 */ 81 */
82 delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 82 delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); 83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
84 84
85 rq->clock_pelt += delta; 85 rq->clock_pelt += delta;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 0e97ca9306ef..7acc632c3b82 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -4,6 +4,9 @@
4 * Copyright (c) 2018 Facebook, Inc. 4 * Copyright (c) 2018 Facebook, Inc.
5 * Author: Johannes Weiner <hannes@cmpxchg.org> 5 * Author: Johannes Weiner <hannes@cmpxchg.org>
6 * 6 *
7 * Polling support by Suren Baghdasaryan <surenb@google.com>
8 * Copyright (c) 2018 Google, Inc.
9 *
7 * When CPU, memory and IO are contended, tasks experience delays that 10 * When CPU, memory and IO are contended, tasks experience delays that
8 * reduce throughput and introduce latencies into the workload. Memory 11 * reduce throughput and introduce latencies into the workload. Memory
9 * and IO contention, in addition, can cause a full loss of forward 12 * and IO contention, in addition, can cause a full loss of forward
@@ -129,9 +132,13 @@
129#include <linux/seq_file.h> 132#include <linux/seq_file.h>
130#include <linux/proc_fs.h> 133#include <linux/proc_fs.h>
131#include <linux/seqlock.h> 134#include <linux/seqlock.h>
135#include <linux/uaccess.h>
132#include <linux/cgroup.h> 136#include <linux/cgroup.h>
133#include <linux/module.h> 137#include <linux/module.h>
134#include <linux/sched.h> 138#include <linux/sched.h>
139#include <linux/ctype.h>
140#include <linux/file.h>
141#include <linux/poll.h>
135#include <linux/psi.h> 142#include <linux/psi.h>
136#include "sched.h" 143#include "sched.h"
137 144
@@ -140,9 +147,9 @@ static int psi_bug __read_mostly;
140DEFINE_STATIC_KEY_FALSE(psi_disabled); 147DEFINE_STATIC_KEY_FALSE(psi_disabled);
141 148
142#ifdef CONFIG_PSI_DEFAULT_DISABLED 149#ifdef CONFIG_PSI_DEFAULT_DISABLED
143bool psi_enable; 150static bool psi_enable;
144#else 151#else
145bool psi_enable = true; 152static bool psi_enable = true;
146#endif 153#endif
147static int __init setup_psi(char *str) 154static int __init setup_psi(char *str)
148{ 155{
@@ -156,16 +163,21 @@ __setup("psi=", setup_psi);
156#define EXP_60s 1981 /* 1/exp(2s/60s) */ 163#define EXP_60s 1981 /* 1/exp(2s/60s) */
157#define EXP_300s 2034 /* 1/exp(2s/300s) */ 164#define EXP_300s 2034 /* 1/exp(2s/300s) */
158 165
166/* PSI trigger definitions */
167#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
168#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
169#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
170
159/* Sampling frequency in nanoseconds */ 171/* Sampling frequency in nanoseconds */
160static u64 psi_period __read_mostly; 172static u64 psi_period __read_mostly;
161 173
162/* System-level pressure and stall tracking */ 174/* System-level pressure and stall tracking */
163static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); 175static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
164static struct psi_group psi_system = { 176struct psi_group psi_system = {
165 .pcpu = &system_group_pcpu, 177 .pcpu = &system_group_pcpu,
166}; 178};
167 179
168static void psi_update_work(struct work_struct *work); 180static void psi_avgs_work(struct work_struct *work);
169 181
170static void group_init(struct psi_group *group) 182static void group_init(struct psi_group *group)
171{ 183{
@@ -173,9 +185,20 @@ static void group_init(struct psi_group *group)
173 185
174 for_each_possible_cpu(cpu) 186 for_each_possible_cpu(cpu)
175 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); 187 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
176 group->next_update = sched_clock() + psi_period; 188 group->avg_next_update = sched_clock() + psi_period;
177 INIT_DELAYED_WORK(&group->clock_work, psi_update_work); 189 INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
178 mutex_init(&group->stat_lock); 190 mutex_init(&group->avgs_lock);
191 /* Init trigger-related members */
192 atomic_set(&group->poll_scheduled, 0);
193 mutex_init(&group->trigger_lock);
194 INIT_LIST_HEAD(&group->triggers);
195 memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
196 group->poll_states = 0;
197 group->poll_min_period = U32_MAX;
198 memset(group->polling_total, 0, sizeof(group->polling_total));
199 group->polling_next_update = ULLONG_MAX;
200 group->polling_until = 0;
201 rcu_assign_pointer(group->poll_kworker, NULL);
179} 202}
180 203
181void __init psi_init(void) 204void __init psi_init(void)
@@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
210 } 233 }
211} 234}
212 235
213static void get_recent_times(struct psi_group *group, int cpu, u32 *times) 236static void get_recent_times(struct psi_group *group, int cpu,
237 enum psi_aggregators aggregator, u32 *times,
238 u32 *pchanged_states)
214{ 239{
215 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); 240 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
216 unsigned int tasks[NR_PSI_TASK_COUNTS];
217 u64 now, state_start; 241 u64 now, state_start;
242 enum psi_states s;
218 unsigned int seq; 243 unsigned int seq;
219 int s; 244 u32 state_mask;
245
246 *pchanged_states = 0;
220 247
221 /* Snapshot a coherent view of the CPU state */ 248 /* Snapshot a coherent view of the CPU state */
222 do { 249 do {
223 seq = read_seqcount_begin(&groupc->seq); 250 seq = read_seqcount_begin(&groupc->seq);
224 now = cpu_clock(cpu); 251 now = cpu_clock(cpu);
225 memcpy(times, groupc->times, sizeof(groupc->times)); 252 memcpy(times, groupc->times, sizeof(groupc->times));
226 memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); 253 state_mask = groupc->state_mask;
227 state_start = groupc->state_start; 254 state_start = groupc->state_start;
228 } while (read_seqcount_retry(&groupc->seq, seq)); 255 } while (read_seqcount_retry(&groupc->seq, seq));
229 256
@@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
239 * (u32) and our reported pressure close to what's 266 * (u32) and our reported pressure close to what's
240 * actually happening. 267 * actually happening.
241 */ 268 */
242 if (test_state(tasks, s)) 269 if (state_mask & (1 << s))
243 times[s] += now - state_start; 270 times[s] += now - state_start;
244 271
245 delta = times[s] - groupc->times_prev[s]; 272 delta = times[s] - groupc->times_prev[aggregator][s];
246 groupc->times_prev[s] = times[s]; 273 groupc->times_prev[aggregator][s] = times[s];
247 274
248 times[s] = delta; 275 times[s] = delta;
276 if (delta)
277 *pchanged_states |= (1 << s);
249 } 278 }
250} 279}
251 280
@@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
269 avg[2] = calc_load(avg[2], EXP_300s, pct); 298 avg[2] = calc_load(avg[2], EXP_300s, pct);
270} 299}
271 300
272static bool update_stats(struct psi_group *group) 301static void collect_percpu_times(struct psi_group *group,
302 enum psi_aggregators aggregator,
303 u32 *pchanged_states)
273{ 304{
274 u64 deltas[NR_PSI_STATES - 1] = { 0, }; 305 u64 deltas[NR_PSI_STATES - 1] = { 0, };
275 unsigned long missed_periods = 0;
276 unsigned long nonidle_total = 0; 306 unsigned long nonidle_total = 0;
277 u64 now, expires, period; 307 u32 changed_states = 0;
278 int cpu; 308 int cpu;
279 int s; 309 int s;
280 310
281 mutex_lock(&group->stat_lock);
282
283 /* 311 /*
284 * Collect the per-cpu time buckets and average them into a 312 * Collect the per-cpu time buckets and average them into a
285 * single time sample that is normalized to wallclock time. 313 * single time sample that is normalized to wallclock time.
@@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group)
291 for_each_possible_cpu(cpu) { 319 for_each_possible_cpu(cpu) {
292 u32 times[NR_PSI_STATES]; 320 u32 times[NR_PSI_STATES];
293 u32 nonidle; 321 u32 nonidle;
322 u32 cpu_changed_states;
294 323
295 get_recent_times(group, cpu, times); 324 get_recent_times(group, cpu, aggregator, times,
325 &cpu_changed_states);
326 changed_states |= cpu_changed_states;
296 327
297 nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); 328 nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
298 nonidle_total += nonidle; 329 nonidle_total += nonidle;
@@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group)
315 346
316 /* total= */ 347 /* total= */
317 for (s = 0; s < NR_PSI_STATES - 1; s++) 348 for (s = 0; s < NR_PSI_STATES - 1; s++)
318 group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); 349 group->total[aggregator][s] +=
350 div_u64(deltas[s], max(nonidle_total, 1UL));
351
352 if (pchanged_states)
353 *pchanged_states = changed_states;
354}
355
356static u64 update_averages(struct psi_group *group, u64 now)
357{
358 unsigned long missed_periods = 0;
359 u64 expires, period;
360 u64 avg_next_update;
361 int s;
319 362
320 /* avgX= */ 363 /* avgX= */
321 now = sched_clock(); 364 expires = group->avg_next_update;
322 expires = group->next_update;
323 if (now < expires)
324 goto out;
325 if (now - expires >= psi_period) 365 if (now - expires >= psi_period)
326 missed_periods = div_u64(now - expires, psi_period); 366 missed_periods = div_u64(now - expires, psi_period);
327 367
@@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group)
332 * But the deltas we sample out of the per-cpu buckets above 372 * But the deltas we sample out of the per-cpu buckets above
333 * are based on the actual time elapsing between clock ticks. 373 * are based on the actual time elapsing between clock ticks.
334 */ 374 */
335 group->next_update = expires + ((1 + missed_periods) * psi_period); 375 avg_next_update = expires + ((1 + missed_periods) * psi_period);
336 period = now - (group->last_update + (missed_periods * psi_period)); 376 period = now - (group->avg_last_update + (missed_periods * psi_period));
337 group->last_update = now; 377 group->avg_last_update = now;
338 378
339 for (s = 0; s < NR_PSI_STATES - 1; s++) { 379 for (s = 0; s < NR_PSI_STATES - 1; s++) {
340 u32 sample; 380 u32 sample;
341 381
342 sample = group->total[s] - group->total_prev[s]; 382 sample = group->total[PSI_AVGS][s] - group->avg_total[s];
343 /* 383 /*
344 * Due to the lockless sampling of the time buckets, 384 * Due to the lockless sampling of the time buckets,
345 * recorded time deltas can slip into the next period, 385 * recorded time deltas can slip into the next period,
@@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group)
359 */ 399 */
360 if (sample > period) 400 if (sample > period)
361 sample = period; 401 sample = period;
362 group->total_prev[s] += sample; 402 group->avg_total[s] += sample;
363 calc_avgs(group->avg[s], missed_periods, sample, period); 403 calc_avgs(group->avg[s], missed_periods, sample, period);
364 } 404 }
365out: 405
366 mutex_unlock(&group->stat_lock); 406 return avg_next_update;
367 return nonidle_total;
368} 407}
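The missed_periods handling above lets the averager catch up in a single pass after the 2s work item has been idle. A standalone sketch of the same bookkeeping with hypothetical timestamps (2s period, last expiry at 10s, previous update at 9s, work running again at 17s):

#include <stdio.h>

int main(void)
{
	/* hypothetical timestamps, all in ns */
	unsigned long long psi_period  = 2000000000ULL;		/* 2s averaging period */
	unsigned long long expires     = 10000000000ULL;	/* expiry that was missed */
	unsigned long long last_update = 9000000000ULL;		/* previous update time */
	unsigned long long now         = 17000000000ULL;	/* work item finally runs */

	unsigned long long missed = (now - expires) / psi_period;
	unsigned long long next   = expires + (1 + missed) * psi_period;
	unsigned long long period = now - (last_update + missed * psi_period);

	/* samples are later clamped to 'period', so the catch-up can never over-report */
	printf("missed_periods=%llu next_update=%llus sampling_period=%llums\n",
	       missed, next / 1000000000ULL, period / 1000000ULL);
	return 0;
}

This prints missed_periods=3, a next update aligned to the 2s grid at 18s, and a 2s sampling period for the final catch-up sample.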
369 408
370static void psi_update_work(struct work_struct *work) 409static void psi_avgs_work(struct work_struct *work)
371{ 410{
372 struct delayed_work *dwork; 411 struct delayed_work *dwork;
373 struct psi_group *group; 412 struct psi_group *group;
413 u32 changed_states;
374 bool nonidle; 414 bool nonidle;
415 u64 now;
375 416
376 dwork = to_delayed_work(work); 417 dwork = to_delayed_work(work);
377 group = container_of(dwork, struct psi_group, clock_work); 418 group = container_of(dwork, struct psi_group, avgs_work);
419
420 mutex_lock(&group->avgs_lock);
378 421
422 now = sched_clock();
423
424 collect_percpu_times(group, PSI_AVGS, &changed_states);
425 nonidle = changed_states & (1 << PSI_NONIDLE);
379 /* 426 /*
380 * If there is task activity, periodically fold the per-cpu 427 * If there is task activity, periodically fold the per-cpu
381 * times and feed samples into the running averages. If things 428 * times and feed samples into the running averages. If things
@@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work)
383 * Once restarted, we'll catch up the running averages in one 430 * Once restarted, we'll catch up the running averages in one
384 * go - see calc_avgs() and missed_periods. 431 * go - see calc_avgs() and missed_periods.
385 */ 432 */
386 433 if (now >= group->avg_next_update)
387 nonidle = update_stats(group); 434 group->avg_next_update = update_averages(group, now);
388 435
389 if (nonidle) { 436 if (nonidle) {
390 unsigned long delay = 0; 437 schedule_delayed_work(dwork, nsecs_to_jiffies(
391 u64 now; 438 group->avg_next_update - now) + 1);
439 }
440
441 mutex_unlock(&group->avgs_lock);
442}
443
444/* Trigger tracking window manipulations */
445static void window_reset(struct psi_window *win, u64 now, u64 value,
446 u64 prev_growth)
447{
448 win->start_time = now;
449 win->start_value = value;
450 win->prev_growth = prev_growth;
451}
452
453/*
454 * PSI growth tracking window update and growth calculation routine.
455 *
456 * This approximates a sliding tracking window by interpolating
457 * partially elapsed windows using historical growth data from the
458 * previous intervals. This minimizes memory requirements (by not storing
459 * all the intermediate values in the previous window) and simplifies
460 * the calculations. It works well because PSI signal changes only in
461 * positive direction and over relatively small window sizes the growth
462 * is close to linear.
463 */
464static u64 window_update(struct psi_window *win, u64 now, u64 value)
465{
466 u64 elapsed;
467 u64 growth;
468
469 elapsed = now - win->start_time;
470 growth = value - win->start_value;
471 /*
472 * After each tracking window passes win->start_value and
473 * win->start_time get reset and win->prev_growth stores
474 * the average per-window growth of the previous window.
475 * win->prev_growth is then used to interpolate additional
476 * growth from the previous window assuming it was linear.
477 */
478 if (elapsed > win->size)
479 window_reset(win, now, value, growth);
480 else {
481 u32 remaining;
482
483 remaining = win->size - elapsed;
484 growth += div_u64(win->prev_growth * remaining, win->size);
485 }
486
487 return growth;
488}
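window_update() above estimates growth over a sliding window by pro-rating the previous window's growth instead of storing intermediate samples. A standalone sketch of that interpolation with made-up stall times (1s window, half elapsed):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t win_size    = 1000000000ULL;	/* 1s tracking window */
	uint64_t prev_growth = 80000000ULL;	/* 80ms of stall in the previous window */
	uint64_t elapsed     = 500000000ULL;	/* current window is half done */
	uint64_t growth      = 30000000ULL;	/* 30ms of stall so far in this window */

	/* Same interpolation as window_update(): add the pro-rated share of the
	 * previous window that the sliding window still overlaps. */
	uint64_t remaining = win_size - elapsed;
	growth += prev_growth * remaining / win_size;

	printf("estimated growth over the sliding window: %llums\n",
	       (unsigned long long)(growth / 1000000ULL));
	return 0;
}

The estimate is 30ms + half of 80ms = 70ms, which is what gets compared against the trigger threshold.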
489
490static void init_triggers(struct psi_group *group, u64 now)
491{
492 struct psi_trigger *t;
493
494 list_for_each_entry(t, &group->triggers, node)
495 window_reset(&t->win, now,
496 group->total[PSI_POLL][t->state], 0);
497 memcpy(group->polling_total, group->total[PSI_POLL],
498 sizeof(group->polling_total));
499 group->polling_next_update = now + group->poll_min_period;
500}
501
502static u64 update_triggers(struct psi_group *group, u64 now)
503{
504 struct psi_trigger *t;
505 bool new_stall = false;
506 u64 *total = group->total[PSI_POLL];
507
508 /*
509 * On subsequent updates, calculate growth deltas and let
510 * watchers know when their specified thresholds are exceeded.
511 */
512 list_for_each_entry(t, &group->triggers, node) {
513 u64 growth;
514
515 /* Check for stall activity */
516 if (group->polling_total[t->state] == total[t->state])
517 continue;
518
519 /*
520 * Multiple triggers might be looking at the same state,
521 * remember to update group->polling_total[] once we've
522 * been through all of them. Also remember to extend the
523 * polling time if we see new stall activity.
524 */
525 new_stall = true;
526
527 /* Calculate growth since last update */
528 growth = window_update(&t->win, now, total[t->state]);
529 if (growth < t->threshold)
530 continue;
531
532 /* Limit event signaling to once per window */
533 if (now < t->last_event_time + t->win.size)
534 continue;
535
536 /* Generate an event */
537 if (cmpxchg(&t->event, 0, 1) == 0)
538 wake_up_interruptible(&t->event_wait);
539 t->last_event_time = now;
540 }
541
542 if (new_stall)
543 memcpy(group->polling_total, total,
544 sizeof(group->polling_total));
545
546 return now + group->poll_min_period;
547}
548
549/*
550 * Schedule polling if it's not already scheduled. It's safe to call even from
551 * hotpath because even though kthread_queue_delayed_work takes worker->lock
552 * spinlock that spinlock is never contended due to poll_scheduled atomic
553 * preventing such competition.
554 */
555static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
556{
557 struct kthread_worker *kworker;
558
559 /* Do not reschedule if already scheduled */
560 if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
561 return;
562
563 rcu_read_lock();
392 564
393 now = sched_clock(); 565 kworker = rcu_dereference(group->poll_kworker);
394 if (group->next_update > now) 566 /*
395 delay = nsecs_to_jiffies(group->next_update - now) + 1; 567 * kworker might be NULL in case psi_trigger_destroy races with
396 schedule_delayed_work(dwork, delay); 568 * psi_task_change (hotpath) which can't use locks
569 */
570 if (likely(kworker))
571 kthread_queue_delayed_work(kworker, &group->poll_work, delay);
572 else
573 atomic_set(&group->poll_scheduled, 0);
574
575 rcu_read_unlock();
576}
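The poll_scheduled atomic is a schedule-at-most-once guard, so the hot psi_task_change() path never has to touch the kworker lock while polling is already pending. The same pattern in isolation, as a standalone sketch with C11 atomics (the kernel code additionally protects the kworker pointer with RCU):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int poll_scheduled;

/* Returns 1 if the caller won the right to queue the work, 0 otherwise. */
static int try_schedule(void)
{
	int expected = 0;
	return atomic_compare_exchange_strong(&poll_scheduled, &expected, 1);
}

/* The worker clears the flag when it starts, re-arming the guard. */
static void worker_start(void)
{
	atomic_store(&poll_scheduled, 0);
}

int main(void)
{
	printf("first caller queues: %d\n", try_schedule());	/* 1 */
	printf("second caller skips: %d\n", try_schedule());	/* 0 */
	worker_start();
	printf("after worker ran:    %d\n", try_schedule());	/* 1 again */
	return 0;
}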
577
578static void psi_poll_work(struct kthread_work *work)
579{
580 struct kthread_delayed_work *dwork;
581 struct psi_group *group;
582 u32 changed_states;
583 u64 now;
584
585 dwork = container_of(work, struct kthread_delayed_work, work);
586 group = container_of(dwork, struct psi_group, poll_work);
587
588 atomic_set(&group->poll_scheduled, 0);
589
590 mutex_lock(&group->trigger_lock);
591
592 now = sched_clock();
593
594 collect_percpu_times(group, PSI_POLL, &changed_states);
595
596 if (changed_states & group->poll_states) {
597 /* Initialize trigger windows when entering polling mode */
598 if (now > group->polling_until)
599 init_triggers(group, now);
600
601 /*
602 * Keep the monitor active for at least the duration of the
603 * minimum tracking window as long as monitor states are
604 * changing.
605 */
606 group->polling_until = now +
607 group->poll_min_period * UPDATES_PER_WINDOW;
608 }
609
610 if (now > group->polling_until) {
611 group->polling_next_update = ULLONG_MAX;
612 goto out;
397 } 613 }
614
615 if (now >= group->polling_next_update)
616 group->polling_next_update = update_triggers(group, now);
617
618 psi_schedule_poll_work(group,
619 nsecs_to_jiffies(group->polling_next_update - now) + 1);
620
621out:
622 mutex_unlock(&group->trigger_lock);
398} 623}
399 624
400static void record_times(struct psi_group_cpu *groupc, int cpu, 625static void record_times(struct psi_group_cpu *groupc, int cpu,
@@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
407 delta = now - groupc->state_start; 632 delta = now - groupc->state_start;
408 groupc->state_start = now; 633 groupc->state_start = now;
409 634
410 if (test_state(groupc->tasks, PSI_IO_SOME)) { 635 if (groupc->state_mask & (1 << PSI_IO_SOME)) {
411 groupc->times[PSI_IO_SOME] += delta; 636 groupc->times[PSI_IO_SOME] += delta;
412 if (test_state(groupc->tasks, PSI_IO_FULL)) 637 if (groupc->state_mask & (1 << PSI_IO_FULL))
413 groupc->times[PSI_IO_FULL] += delta; 638 groupc->times[PSI_IO_FULL] += delta;
414 } 639 }
415 640
416 if (test_state(groupc->tasks, PSI_MEM_SOME)) { 641 if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
417 groupc->times[PSI_MEM_SOME] += delta; 642 groupc->times[PSI_MEM_SOME] += delta;
418 if (test_state(groupc->tasks, PSI_MEM_FULL)) 643 if (groupc->state_mask & (1 << PSI_MEM_FULL))
419 groupc->times[PSI_MEM_FULL] += delta; 644 groupc->times[PSI_MEM_FULL] += delta;
420 else if (memstall_tick) { 645 else if (memstall_tick) {
421 u32 sample; 646 u32 sample;
@@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
436 } 661 }
437 } 662 }
438 663
439 if (test_state(groupc->tasks, PSI_CPU_SOME)) 664 if (groupc->state_mask & (1 << PSI_CPU_SOME))
440 groupc->times[PSI_CPU_SOME] += delta; 665 groupc->times[PSI_CPU_SOME] += delta;
441 666
442 if (test_state(groupc->tasks, PSI_NONIDLE)) 667 if (groupc->state_mask & (1 << PSI_NONIDLE))
443 groupc->times[PSI_NONIDLE] += delta; 668 groupc->times[PSI_NONIDLE] += delta;
444} 669}
445 670
446static void psi_group_change(struct psi_group *group, int cpu, 671static u32 psi_group_change(struct psi_group *group, int cpu,
447 unsigned int clear, unsigned int set) 672 unsigned int clear, unsigned int set)
448{ 673{
449 struct psi_group_cpu *groupc; 674 struct psi_group_cpu *groupc;
450 unsigned int t, m; 675 unsigned int t, m;
676 enum psi_states s;
677 u32 state_mask = 0;
451 678
452 groupc = per_cpu_ptr(group->pcpu, cpu); 679 groupc = per_cpu_ptr(group->pcpu, cpu);
453 680
@@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
480 if (set & (1 << t)) 707 if (set & (1 << t))
481 groupc->tasks[t]++; 708 groupc->tasks[t]++;
482 709
710 /* Calculate state mask representing active states */
711 for (s = 0; s < NR_PSI_STATES; s++) {
712 if (test_state(groupc->tasks, s))
713 state_mask |= (1 << s);
714 }
715 groupc->state_mask = state_mask;
716
483 write_seqcount_end(&groupc->seq); 717 write_seqcount_end(&groupc->seq);
718
719 return state_mask;
484} 720}
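Caching state_mask at change time means record_times() and the per-CPU sampling above can test active states with a single bit test instead of re-deriving them from the task counts. A standalone sketch of the mask construction (the toy predicate stands in for test_state()):

#include <stdio.h>

enum { PSI_IO_SOME, PSI_IO_FULL, PSI_MEM_SOME, PSI_MEM_FULL,
       PSI_CPU_SOME, PSI_NONIDLE, NR_PSI_STATES };

/* hypothetical per-state results standing in for test_state(groupc->tasks, s) */
static int toy_test_state(const int *active, int s)
{
	return active[s];
}

int main(void)
{
	int active[NR_PSI_STATES] = { 1, 0, 1, 1, 0, 1 };
	unsigned int state_mask = 0;

	for (int s = 0; s < NR_PSI_STATES; s++)
		if (toy_test_state(active, s))
			state_mask |= 1u << s;

	/* later readers just test bits, e.g. "is some memory stall active?" */
	printf("mask=0x%x mem_some=%d\n", state_mask,
	       !!(state_mask & (1u << PSI_MEM_SOME)));
	return 0;
}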
485 721
486static struct psi_group *iterate_groups(struct task_struct *task, void **iter) 722static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set)
537 */ 773 */
538 if (unlikely((clear & TSK_RUNNING) && 774 if (unlikely((clear & TSK_RUNNING) &&
539 (task->flags & PF_WQ_WORKER) && 775 (task->flags & PF_WQ_WORKER) &&
540 wq_worker_last_func(task) == psi_update_work)) 776 wq_worker_last_func(task) == psi_avgs_work))
541 wake_clock = false; 777 wake_clock = false;
542 778
543 while ((group = iterate_groups(task, &iter))) { 779 while ((group = iterate_groups(task, &iter))) {
544 psi_group_change(group, cpu, clear, set); 780 u32 state_mask = psi_group_change(group, cpu, clear, set);
545 if (wake_clock && !delayed_work_pending(&group->clock_work)) 781
546 schedule_delayed_work(&group->clock_work, PSI_FREQ); 782 if (state_mask & group->poll_states)
783 psi_schedule_poll_work(group, 1);
784
785 if (wake_clock && !delayed_work_pending(&group->avgs_work))
786 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
547 } 787 }
548} 788}
549 789
@@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup)
640 if (static_branch_likely(&psi_disabled)) 880 if (static_branch_likely(&psi_disabled))
641 return; 881 return;
642 882
643 cancel_delayed_work_sync(&cgroup->psi.clock_work); 883 cancel_delayed_work_sync(&cgroup->psi.avgs_work);
644 free_percpu(cgroup->psi.pcpu); 884 free_percpu(cgroup->psi.pcpu);
885 /* All triggers must be removed by now */
886 WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
645} 887}
646 888
647/** 889/**
@@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
697int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) 939int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
698{ 940{
699 int full; 941 int full;
942 u64 now;
700 943
701 if (static_branch_likely(&psi_disabled)) 944 if (static_branch_likely(&psi_disabled))
702 return -EOPNOTSUPP; 945 return -EOPNOTSUPP;
703 946
704 update_stats(group); 947 /* Update averages before reporting them */
948 mutex_lock(&group->avgs_lock);
949 now = sched_clock();
950 collect_percpu_times(group, PSI_AVGS, NULL);
951 if (now >= group->avg_next_update)
952 group->avg_next_update = update_averages(group, now);
953 mutex_unlock(&group->avgs_lock);
705 954
706 for (full = 0; full < 2 - (res == PSI_CPU); full++) { 955 for (full = 0; full < 2 - (res == PSI_CPU); full++) {
707 unsigned long avg[3]; 956 unsigned long avg[3];
@@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
710 959
711 for (w = 0; w < 3; w++) 960 for (w = 0; w < 3; w++)
712 avg[w] = group->avg[res * 2 + full][w]; 961 avg[w] = group->avg[res * 2 + full][w];
713 total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); 962 total = div_u64(group->total[PSI_AVGS][res * 2 + full],
963 NSEC_PER_USEC);
714 964
715 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", 965 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
716 full ? "full" : "some", 966 full ? "full" : "some",
@@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
753 return single_open(file, psi_cpu_show, NULL); 1003 return single_open(file, psi_cpu_show, NULL);
754} 1004}
755 1005
1006struct psi_trigger *psi_trigger_create(struct psi_group *group,
1007 char *buf, size_t nbytes, enum psi_res res)
1008{
1009 struct psi_trigger *t;
1010 enum psi_states state;
1011 u32 threshold_us;
1012 u32 window_us;
1013
1014 if (static_branch_likely(&psi_disabled))
1015 return ERR_PTR(-EOPNOTSUPP);
1016
1017 if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
1018 state = PSI_IO_SOME + res * 2;
1019 else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
1020 state = PSI_IO_FULL + res * 2;
1021 else
1022 return ERR_PTR(-EINVAL);
1023
1024 if (state >= PSI_NONIDLE)
1025 return ERR_PTR(-EINVAL);
1026
1027 if (window_us < WINDOW_MIN_US ||
1028 window_us > WINDOW_MAX_US)
1029 return ERR_PTR(-EINVAL);
1030
1031 /* Check threshold */
1032 if (threshold_us == 0 || threshold_us > window_us)
1033 return ERR_PTR(-EINVAL);
1034
1035 t = kmalloc(sizeof(*t), GFP_KERNEL);
1036 if (!t)
1037 return ERR_PTR(-ENOMEM);
1038
1039 t->group = group;
1040 t->state = state;
1041 t->threshold = threshold_us * NSEC_PER_USEC;
1042 t->win.size = window_us * NSEC_PER_USEC;
1043 window_reset(&t->win, 0, 0, 0);
1044
1045 t->event = 0;
1046 t->last_event_time = 0;
1047 init_waitqueue_head(&t->event_wait);
1048 kref_init(&t->refcount);
1049
1050 mutex_lock(&group->trigger_lock);
1051
1052 if (!rcu_access_pointer(group->poll_kworker)) {
1053 struct sched_param param = {
1054 .sched_priority = MAX_RT_PRIO - 1,
1055 };
1056 struct kthread_worker *kworker;
1057
1058 kworker = kthread_create_worker(0, "psimon");
1059 if (IS_ERR(kworker)) {
1060 kfree(t);
1061 mutex_unlock(&group->trigger_lock);
1062 return ERR_CAST(kworker);
1063 }
1064 sched_setscheduler(kworker->task, SCHED_FIFO, &param);
1065 kthread_init_delayed_work(&group->poll_work,
1066 psi_poll_work);
1067 rcu_assign_pointer(group->poll_kworker, kworker);
1068 }
1069
1070 list_add(&t->node, &group->triggers);
1071 group->poll_min_period = min(group->poll_min_period,
1072 div_u64(t->win.size, UPDATES_PER_WINDOW));
1073 group->nr_triggers[t->state]++;
1074 group->poll_states |= (1 << t->state);
1075
1076 mutex_unlock(&group->trigger_lock);
1077
1078 return t;
1079}
1080
1081static void psi_trigger_destroy(struct kref *ref)
1082{
1083 struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
1084 struct psi_group *group = t->group;
1085 struct kthread_worker *kworker_to_destroy = NULL;
1086
1087 if (static_branch_likely(&psi_disabled))
1088 return;
1089
1090 /*
 1091 * Wake up waiters so they stop polling. This can happen if the
 1092 * cgroup is deleted from under a polling process.
1093 */
1094 wake_up_interruptible(&t->event_wait);
1095
1096 mutex_lock(&group->trigger_lock);
1097
1098 if (!list_empty(&t->node)) {
1099 struct psi_trigger *tmp;
1100 u64 period = ULLONG_MAX;
1101
1102 list_del(&t->node);
1103 group->nr_triggers[t->state]--;
1104 if (!group->nr_triggers[t->state])
1105 group->poll_states &= ~(1 << t->state);
1106 /* reset min update period for the remaining triggers */
1107 list_for_each_entry(tmp, &group->triggers, node)
1108 period = min(period, div_u64(tmp->win.size,
1109 UPDATES_PER_WINDOW));
1110 group->poll_min_period = period;
1111 /* Destroy poll_kworker when the last trigger is destroyed */
1112 if (group->poll_states == 0) {
1113 group->polling_until = 0;
1114 kworker_to_destroy = rcu_dereference_protected(
1115 group->poll_kworker,
1116 lockdep_is_held(&group->trigger_lock));
1117 rcu_assign_pointer(group->poll_kworker, NULL);
1118 }
1119 }
1120
1121 mutex_unlock(&group->trigger_lock);
1122
1123 /*
 1124 * Wait for the RCU read-side critical sections on both *trigger_ptr
 1125 * (from psi_trigger_replace) and poll_kworker to complete before
 1126 * destroying the trigger and, optionally, the poll_kworker.
1127 */
1128 synchronize_rcu();
1129 /*
1130 * Destroy the kworker after releasing trigger_lock to prevent a
1131 * deadlock while waiting for psi_poll_work to acquire trigger_lock
1132 */
1133 if (kworker_to_destroy) {
1134 kthread_cancel_delayed_work_sync(&group->poll_work);
1135 kthread_destroy_worker(kworker_to_destroy);
1136 }
1137 kfree(t);
1138}
1139
1140void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
1141{
1142 struct psi_trigger *old = *trigger_ptr;
1143
1144 if (static_branch_likely(&psi_disabled))
1145 return;
1146
1147 rcu_assign_pointer(*trigger_ptr, new);
1148 if (old)
1149 kref_put(&old->refcount, psi_trigger_destroy);
1150}
1151
1152__poll_t psi_trigger_poll(void **trigger_ptr,
1153 struct file *file, poll_table *wait)
1154{
1155 __poll_t ret = DEFAULT_POLLMASK;
1156 struct psi_trigger *t;
1157
1158 if (static_branch_likely(&psi_disabled))
1159 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1160
1161 rcu_read_lock();
1162
1163 t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
1164 if (!t) {
1165 rcu_read_unlock();
1166 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1167 }
1168 kref_get(&t->refcount);
1169
1170 rcu_read_unlock();
1171
1172 poll_wait(file, &t->event_wait, wait);
1173
1174 if (cmpxchg(&t->event, 1, 0) == 1)
1175 ret |= EPOLLPRI;
1176
1177 kref_put(&t->refcount, psi_trigger_destroy);
1178
1179 return ret;
1180}
1181
1182static ssize_t psi_write(struct file *file, const char __user *user_buf,
1183 size_t nbytes, enum psi_res res)
1184{
1185 char buf[32];
1186 size_t buf_size;
1187 struct seq_file *seq;
1188 struct psi_trigger *new;
1189
1190 if (static_branch_likely(&psi_disabled))
1191 return -EOPNOTSUPP;
1192
1193 buf_size = min(nbytes, (sizeof(buf) - 1));
1194 if (copy_from_user(buf, user_buf, buf_size))
1195 return -EFAULT;
1196
1197 buf[buf_size - 1] = '\0';
1198
1199 new = psi_trigger_create(&psi_system, buf, nbytes, res);
1200 if (IS_ERR(new))
1201 return PTR_ERR(new);
1202
1203 seq = file->private_data;
1204 /* Take seq->lock to protect seq->private from concurrent writes */
1205 mutex_lock(&seq->lock);
1206 psi_trigger_replace(&seq->private, new);
1207 mutex_unlock(&seq->lock);
1208
1209 return nbytes;
1210}
1211
1212static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
1213 size_t nbytes, loff_t *ppos)
1214{
1215 return psi_write(file, user_buf, nbytes, PSI_IO);
1216}
1217
1218static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
1219 size_t nbytes, loff_t *ppos)
1220{
1221 return psi_write(file, user_buf, nbytes, PSI_MEM);
1222}
1223
1224static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
1225 size_t nbytes, loff_t *ppos)
1226{
1227 return psi_write(file, user_buf, nbytes, PSI_CPU);
1228}
1229
1230static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
1231{
1232 struct seq_file *seq = file->private_data;
1233
1234 return psi_trigger_poll(&seq->private, file, wait);
1235}
1236
1237static int psi_fop_release(struct inode *inode, struct file *file)
1238{
1239 struct seq_file *seq = file->private_data;
1240
1241 psi_trigger_replace(&seq->private, NULL);
1242 return single_release(inode, file);
1243}
1244
756static const struct file_operations psi_io_fops = { 1245static const struct file_operations psi_io_fops = {
757 .open = psi_io_open, 1246 .open = psi_io_open,
758 .read = seq_read, 1247 .read = seq_read,
759 .llseek = seq_lseek, 1248 .llseek = seq_lseek,
760 .release = single_release, 1249 .write = psi_io_write,
1250 .poll = psi_fop_poll,
1251 .release = psi_fop_release,
761}; 1252};
762 1253
763static const struct file_operations psi_memory_fops = { 1254static const struct file_operations psi_memory_fops = {
764 .open = psi_memory_open, 1255 .open = psi_memory_open,
765 .read = seq_read, 1256 .read = seq_read,
766 .llseek = seq_lseek, 1257 .llseek = seq_lseek,
767 .release = single_release, 1258 .write = psi_memory_write,
1259 .poll = psi_fop_poll,
1260 .release = psi_fop_release,
768}; 1261};
769 1262
770static const struct file_operations psi_cpu_fops = { 1263static const struct file_operations psi_cpu_fops = {
771 .open = psi_cpu_open, 1264 .open = psi_cpu_open,
772 .read = seq_read, 1265 .read = seq_read,
773 .llseek = seq_lseek, 1266 .llseek = seq_lseek,
774 .release = single_release, 1267 .write = psi_cpu_write,
1268 .poll = psi_fop_poll,
1269 .release = psi_fop_release,
775}; 1270};
776 1271
777static int __init psi_proc_init(void) 1272static int __init psi_proc_init(void)
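
The new write and poll handlers above accept trigger strings of the form
"some <threshold_us> <window_us>" or "full <threshold_us> <window_us>" and
assert EPOLLPRI once the accumulated stall in the window crosses the
threshold. A minimal userspace sketch of consuming that interface follows;
it assumes the system-wide files registered by psi_proc_init() show up as
/proc/pressure/memory, and the numbers are purely illustrative.

/* Watch for >=150ms of "some" memory stall in any 1s window. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000";
	struct pollfd pfd;
	int fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0)
		return 1;
	if (write(fd, trig, strlen(trig) + 1) < 0)
		return 1;

	pfd.fd = fd;
	pfd.events = POLLPRI;
	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			break;
		if (pfd.revents & POLLPRI)
			printf("memory pressure event\n");
	}
	close(fd);
	return 0;
}

Closing the file drops the trigger: psi_fop_release() replaces it with NULL,
and the final kref_put() tears down the poll kworker once no triggers remain.
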
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1e6b909dca36..a532558a5176 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1614static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1614static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1615{ 1615{
1616 if (!task_running(rq, p) && 1616 if (!task_running(rq, p) &&
1617 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1617 cpumask_test_cpu(cpu, p->cpus_ptr))
1618 return 1; 1618 return 1;
1619 1619
1620 return 0; 1620 return 0;
@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1751 * Also make sure that it wasn't scheduled on its rq. 1751 * Also make sure that it wasn't scheduled on its rq.
1752 */ 1752 */
1753 if (unlikely(task_rq(task) != rq || 1753 if (unlikely(task_rq(task) != rq ||
1754 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || 1754 !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
1755 task_running(rq, task) || 1755 task_running(rq, task) ||
1756 !rt_task(task) || 1756 !rt_task(task) ||
1757 !task_on_rq_queued(task))) { 1757 !task_on_rq_queued(task))) {
@@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = {
2400 .switched_to = switched_to_rt, 2400 .switched_to = switched_to_rt,
2401 2401
2402 .update_curr = update_curr_rt, 2402 .update_curr = update_curr_rt,
2403
2404#ifdef CONFIG_UCLAMP_TASK
2405 .uclamp_enabled = 1,
2406#endif
2403}; 2407};
2404 2408
2405#ifdef CONFIG_RT_GROUP_SCHED 2409#ifdef CONFIG_RT_GROUP_SCHED
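
The rt.c hunks above are part of switching readers from the embedded
cpus_allowed mask to the cpus_ptr pointer. A caller-side sketch of that
migration (can_run_on() is a made-up wrapper, not part of this patch):

/* Sketch of the accessor migration; can_run_on() is illustrative only. */
static bool can_run_on(struct task_struct *p, int cpu)
{
	/* Before this series: cpumask_test_cpu(cpu, &p->cpus_allowed) */
	return cpumask_test_cpu(cpu, p->cpus_ptr);
}
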
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index a26473674fb7..c529706bed11 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2/* Generated by Documentation/scheduler/sched-pelt; do not modify. */ 2/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
3 3
4static const u32 runnable_avg_yN_inv[] = { 4static const u32 runnable_avg_yN_inv[] __maybe_unused = {
5 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 5 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
6 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 6 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
7 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, 7 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1ada0be..802b1f3405f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
96extern void calc_global_load_tick(struct rq *this_rq); 96extern void calc_global_load_tick(struct rq *this_rq);
97extern long calc_load_fold_active(struct rq *this_rq, long adjust); 97extern long calc_load_fold_active(struct rq *this_rq, long adjust);
98 98
99#ifdef CONFIG_SMP
100extern void cpu_load_update_active(struct rq *this_rq);
101#else
102static inline void cpu_load_update_active(struct rq *this_rq) { }
103#endif
104
105/* 99/*
106 * Helpers for converting nanosecond timing to jiffy resolution 100 * Helpers for converting nanosecond timing to jiffy resolution
107 */ 101 */
@@ -344,8 +338,10 @@ struct cfs_bandwidth {
344 u64 runtime_expires; 338 u64 runtime_expires;
345 int expires_seq; 339 int expires_seq;
346 340
347 short idle; 341 u8 idle;
348 short period_active; 342 u8 period_active;
343 u8 distribute_running;
344 u8 slack_started;
349 struct hrtimer period_timer; 345 struct hrtimer period_timer;
350 struct hrtimer slack_timer; 346 struct hrtimer slack_timer;
351 struct list_head throttled_cfs_rq; 347 struct list_head throttled_cfs_rq;
@@ -354,8 +350,6 @@ struct cfs_bandwidth {
354 int nr_periods; 350 int nr_periods;
355 int nr_throttled; 351 int nr_throttled;
356 u64 throttled_time; 352 u64 throttled_time;
357
358 bool distribute_running;
359#endif 353#endif
360}; 354};
361 355
@@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
797#endif 791#endif
798#endif /* CONFIG_SMP */ 792#endif /* CONFIG_SMP */
799 793
794#ifdef CONFIG_UCLAMP_TASK
795/*
796 * struct uclamp_bucket - Utilization clamp bucket
797 * @value: utilization clamp value for tasks on this clamp bucket
798 * @tasks: number of RUNNABLE tasks on this clamp bucket
799 *
800 * Keep track of how many tasks are RUNNABLE for a given utilization
801 * clamp value.
802 */
803struct uclamp_bucket {
804 unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
805 unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
806};
807
808/*
809 * struct uclamp_rq - rq's utilization clamp
810 * @value: currently active clamp values for a rq
811 * @bucket: utilization clamp buckets affecting a rq
812 *
813 * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
 814 * A clamp value affects a rq when there is at least one task RUNNABLE
815 * (or actually running) with that value.
816 *
817 * There are up to UCLAMP_CNT possible different clamp values, currently there
818 * are only two: minimum utilization and maximum utilization.
819 *
820 * All utilization clamping values are MAX aggregated, since:
821 * - for util_min: we want to run the CPU at least at the max of the minimum
822 * utilization required by its currently RUNNABLE tasks.
823 * - for util_max: we want to allow the CPU to run up to the max of the
824 * maximum utilization allowed by its currently RUNNABLE tasks.
825 *
826 * Since on each system we expect only a limited number of different
827 * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
828 * the metrics required to compute all the per-rq utilization clamp values.
829 */
830struct uclamp_rq {
831 unsigned int value;
832 struct uclamp_bucket bucket[UCLAMP_BUCKETS];
833};
834#endif /* CONFIG_UCLAMP_TASK */
835
800/* 836/*
801 * This is the main, per-CPU runqueue data structure. 837 * This is the main, per-CPU runqueue data structure.
802 * 838 *
@@ -818,8 +854,6 @@ struct rq {
818 unsigned int nr_preferred_running; 854 unsigned int nr_preferred_running;
819 unsigned int numa_migrate_on; 855 unsigned int numa_migrate_on;
820#endif 856#endif
821 #define CPU_LOAD_IDX_MAX 5
822 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
823#ifdef CONFIG_NO_HZ_COMMON 857#ifdef CONFIG_NO_HZ_COMMON
824#ifdef CONFIG_SMP 858#ifdef CONFIG_SMP
825 unsigned long last_load_update_tick; 859 unsigned long last_load_update_tick;
@@ -830,11 +864,16 @@ struct rq {
830 atomic_t nohz_flags; 864 atomic_t nohz_flags;
831#endif /* CONFIG_NO_HZ_COMMON */ 865#endif /* CONFIG_NO_HZ_COMMON */
832 866
833 /* capture load from *all* tasks on this CPU: */
834 struct load_weight load;
835 unsigned long nr_load_updates; 867 unsigned long nr_load_updates;
836 u64 nr_switches; 868 u64 nr_switches;
837 869
870#ifdef CONFIG_UCLAMP_TASK
871 /* Utilization clamp values based on CPU's RUNNABLE tasks */
872 struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
873 unsigned int uclamp_flags;
874#define UCLAMP_FLAG_IDLE 0x01
875#endif
876
838 struct cfs_rq cfs; 877 struct cfs_rq cfs;
839 struct rt_rq rt; 878 struct rt_rq rt;
840 struct dl_rq dl; 879 struct dl_rq dl;
@@ -1649,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40];
1649struct sched_class { 1688struct sched_class {
1650 const struct sched_class *next; 1689 const struct sched_class *next;
1651 1690
1691#ifdef CONFIG_UCLAMP_TASK
1692 int uclamp_enabled;
1693#endif
1694
1652 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1695 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1653 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1696 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1654 void (*yield_task) (struct rq *rq); 1697 void (*yield_task) (struct rq *rq);
@@ -2222,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
2222static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} 2265static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2223#endif /* CONFIG_CPU_FREQ */ 2266#endif /* CONFIG_CPU_FREQ */
2224 2267
2268#ifdef CONFIG_UCLAMP_TASK
2269unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
2270
2271static __always_inline
2272unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
2273 struct task_struct *p)
2274{
2275 unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
2276 unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
2277
2278 if (p) {
2279 min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
2280 max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
2281 }
2282
2283 /*
2284 * Since CPU's {min,max}_util clamps are MAX aggregated considering
2285 * RUNNABLE tasks with _different_ clamps, we can end up with an
2286 * inversion. Fix it now when the clamps are applied.
2287 */
2288 if (unlikely(min_util >= max_util))
2289 return min_util;
2290
2291 return clamp(util, min_util, max_util);
2292}
2293
2294static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
2295{
2296 return uclamp_util_with(rq, util, NULL);
2297}
2298#else /* CONFIG_UCLAMP_TASK */
2299static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
2300 struct task_struct *p)
2301{
2302 return util;
2303}
2304static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
2305{
2306 return util;
2307}
2308#endif /* CONFIG_UCLAMP_TASK */
2309
2225#ifdef arch_scale_freq_capacity 2310#ifdef arch_scale_freq_capacity
2226# ifndef arch_scale_freq_invariant 2311# ifndef arch_scale_freq_invariant
2227# define arch_scale_freq_invariant() true 2312# define arch_scale_freq_invariant() true
@@ -2237,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu)
2237} 2322}
2238#endif 2323#endif
2239 2324
2240#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2241/** 2325/**
2242 * enum schedutil_type - CPU utilization type 2326 * enum schedutil_type - CPU utilization type
2243 * @FREQUENCY_UTIL: Utilization used to select frequency 2327 * @FREQUENCY_UTIL: Utilization used to select frequency
@@ -2253,15 +2337,11 @@ enum schedutil_type {
2253 ENERGY_UTIL, 2337 ENERGY_UTIL,
2254}; 2338};
2255 2339
2256unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 2340#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2257 unsigned long max, enum schedutil_type type);
2258
2259static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
2260{
2261 unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
2262 2341
2263 return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); 2342unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2264} 2343 unsigned long max, enum schedutil_type type,
2344 struct task_struct *p);
2265 2345
2266static inline unsigned long cpu_bw_dl(struct rq *rq) 2346static inline unsigned long cpu_bw_dl(struct rq *rq)
2267{ 2347{
@@ -2290,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
2290 return READ_ONCE(rq->avg_rt.util_avg); 2370 return READ_ONCE(rq->avg_rt.util_avg);
2291} 2371}
2292#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ 2372#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2293static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) 2373static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2374 unsigned long max, enum schedutil_type type,
2375 struct task_struct *p)
2294{ 2376{
2295 return cfs; 2377 return 0;
2296} 2378}
2297#endif 2379#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2298 2380
2299#ifdef CONFIG_HAVE_SCHED_AVG_IRQ 2381#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
2300static inline unsigned long cpu_util_irq(struct rq *rq) 2382static inline unsigned long cpu_util_irq(struct rq *rq)
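
uclamp_util_with() clamps a CPU utilization value into the MAX-aggregated
[min, max] range of the rq, optionally raised by the task's own effective
clamps, and resolves a min/max inversion in favour of the minimum. A
stand-alone sketch of just that clamping rule, using plain integers rather
than rq/task state:

#include <stdio.h>

static unsigned int clamp_util(unsigned int util,
			       unsigned int min_util, unsigned int max_util)
{
	/* min and max are MAX-aggregated over different tasks, so they
	 * can invert; the minimum wins in that case. */
	if (min_util >= max_util)
		return min_util;
	if (util < min_util)
		return min_util;
	if (util > max_util)
		return max_util;
	return util;
}

int main(void)
{
	printf("%u\n", clamp_util(100, 200, 800));	/* boosted to 200 */
	printf("%u\n", clamp_util(900, 200, 800));	/* capped at 800 */
	printf("%u\n", clamp_util(500, 700, 600));	/* inversion: 700 */
	return 0;
}
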
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..f751ce0b783e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl,
1344 .imbalance_pct = 125, 1344 .imbalance_pct = 125,
1345 1345
1346 .cache_nice_tries = 0, 1346 .cache_nice_tries = 0,
1347 .busy_idx = 0,
1348 .idle_idx = 0,
1349 .newidle_idx = 0,
1350 .wake_idx = 0,
1351 .forkexec_idx = 0,
1352 1347
1353 .flags = 1*SD_LOAD_BALANCE 1348 .flags = 1*SD_LOAD_BALANCE
1354 | 1*SD_BALANCE_NEWIDLE 1349 | 1*SD_BALANCE_NEWIDLE
@@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl,
1400 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 1395 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1401 sd->imbalance_pct = 117; 1396 sd->imbalance_pct = 117;
1402 sd->cache_nice_tries = 1; 1397 sd->cache_nice_tries = 1;
1403 sd->busy_idx = 2;
1404 1398
1405#ifdef CONFIG_NUMA 1399#ifdef CONFIG_NUMA
1406 } else if (sd->flags & SD_NUMA) { 1400 } else if (sd->flags & SD_NUMA) {
1407 sd->cache_nice_tries = 2; 1401 sd->cache_nice_tries = 2;
1408 sd->busy_idx = 3;
1409 sd->idle_idx = 2;
1410 1402
1411 sd->flags &= ~SD_PREFER_SIBLING; 1403 sd->flags &= ~SD_PREFER_SIBLING;
1412 sd->flags |= SD_SERIALIZE; 1404 sd->flags |= SD_SERIALIZE;
@@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl,
1419#endif 1411#endif
1420 } else { 1412 } else {
1421 sd->cache_nice_tries = 1; 1413 sd->cache_nice_tries = 1;
1422 sd->busy_idx = 2;
1423 sd->idle_idx = 1;
1424 } 1414 }
1425 1415
1426 /* 1416 /*
@@ -1884,10 +1874,10 @@ static struct sched_domain_topology_level
1884 unsigned long cap; 1874 unsigned long cap;
1885 1875
1886 /* Is there any asymmetry? */ 1876 /* Is there any asymmetry? */
1887 cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); 1877 cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
1888 1878
1889 for_each_cpu(i, cpu_map) { 1879 for_each_cpu(i, cpu_map) {
1890 if (arch_scale_cpu_capacity(NULL, i) != cap) { 1880 if (arch_scale_cpu_capacity(i) != cap) {
1891 asym = true; 1881 asym = true;
1892 break; 1882 break;
1893 } 1883 }
@@ -1902,7 +1892,7 @@ static struct sched_domain_topology_level
1902 * to everyone. 1892 * to everyone.
1903 */ 1893 */
1904 for_each_cpu(i, cpu_map) { 1894 for_each_cpu(i, cpu_map) {
1905 unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); 1895 unsigned long max_capacity = arch_scale_cpu_capacity(i);
1906 int tl_id = 0; 1896 int tl_id = 0;
1907 1897
1908 for_each_sd_topology(tl) { 1898 for_each_sd_topology(tl) {
@@ -1912,7 +1902,7 @@ static struct sched_domain_topology_level
1912 for_each_cpu_and(j, tl->mask(i), cpu_map) { 1902 for_each_cpu_and(j, tl->mask(i), cpu_map) {
1913 unsigned long capacity; 1903 unsigned long capacity;
1914 1904
1915 capacity = arch_scale_cpu_capacity(NULL, j); 1905 capacity = arch_scale_cpu_capacity(j);
1916 1906
1917 if (capacity <= max_capacity) 1907 if (capacity <= max_capacity)
1918 continue; 1908 continue;
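
The topology.c hunks track the arch_scale_cpu_capacity() signature change:
the sched_domain argument (always NULL at these call sites) is gone and the
helper now takes only a CPU number. A call-site sketch (cpu_is_big() is a
made-up helper for illustration):

static bool cpu_is_big(int cpu, unsigned long threshold)
{
	/* Before this series: arch_scale_cpu_capacity(NULL, cpu) */
	return arch_scale_cpu_capacity(cpu) > threshold;
}
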
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 6eb1f8efd221..c1e566a114ca 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Generic waiting primitives. 3 * Generic waiting primitives.
3 * 4 *
@@ -117,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
117 bookmark.func = NULL; 118 bookmark.func = NULL;
118 INIT_LIST_HEAD(&bookmark.entry); 119 INIT_LIST_HEAD(&bookmark.entry);
119 120
120 spin_lock_irqsave(&wq_head->lock, flags); 121 do {
121 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
122 spin_unlock_irqrestore(&wq_head->lock, flags);
123
124 while (bookmark.flags & WQ_FLAG_BOOKMARK) {
125 spin_lock_irqsave(&wq_head->lock, flags); 122 spin_lock_irqsave(&wq_head->lock, flags);
126 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, 123 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
127 wake_flags, key, &bookmark); 124 wake_flags, key, &bookmark);
128 spin_unlock_irqrestore(&wq_head->lock, flags); 125 spin_unlock_irqrestore(&wq_head->lock, flags);
129 } 126 } while (bookmark.flags & WQ_FLAG_BOOKMARK);
130} 127}
131 128
132/** 129/**
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index c67c6d24adc2..45eba18a2898 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * The implementation of the wait_bit*() and related waiting APIs: 3 * The implementation of the wait_bit*() and related waiting APIs:
3 */ 4 */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 811b4a86cdf6..dba52a7db5e8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -609,7 +609,7 @@ static void seccomp_send_sigsys(int syscall, int reason)
609{ 609{
610 struct kernel_siginfo info; 610 struct kernel_siginfo info;
611 seccomp_init_siginfo(&info, syscall, reason); 611 seccomp_init_siginfo(&info, syscall, reason);
612 force_sig_info(SIGSYS, &info, current); 612 force_sig_info(&info);
613} 613}
614#endif /* CONFIG_SECCOMP_FILTER */ 614#endif /* CONFIG_SECCOMP_FILTER */
615 615
diff --git a/kernel/signal.c b/kernel/signal.c
index 62f9aea4a15a..dabe100d2091 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/signal.c 3 * linux/kernel/signal.c
3 * 4 *
@@ -44,6 +45,7 @@
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/livepatch.h> 46#include <linux/livepatch.h>
46#include <linux/cgroup.h> 47#include <linux/cgroup.h>
48#include <linux/audit.h>
47 49
48#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
49#include <trace/events/signal.h> 51#include <trace/events/signal.h>
@@ -53,7 +55,6 @@
53#include <asm/unistd.h> 55#include <asm/unistd.h>
54#include <asm/siginfo.h> 56#include <asm/siginfo.h>
55#include <asm/cacheflush.h> 57#include <asm/cacheflush.h>
56#include "audit.h" /* audit_signal_info() */
57 58
58/* 59/*
59 * SLAB caches for signal bits. 60 * SLAB caches for signal bits.
@@ -840,6 +841,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
840 */ 841 */
841 if (!sid || sid == task_session(current)) 842 if (!sid || sid == task_session(current))
842 break; 843 break;
844 /* fall through */
843 default: 845 default:
844 return -EPERM; 846 return -EPERM;
845 } 847 }
@@ -1055,29 +1057,8 @@ static inline bool legacy_queue(struct sigpending *signals, int sig)
1055 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1057 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1056} 1058}
1057 1059
1058#ifdef CONFIG_USER_NS
1059static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t)
1060{
1061 if (current_user_ns() == task_cred_xxx(t, user_ns))
1062 return;
1063
1064 if (SI_FROMKERNEL(info))
1065 return;
1066
1067 rcu_read_lock();
1068 info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
1069 make_kuid(current_user_ns(), info->si_uid));
1070 rcu_read_unlock();
1071}
1072#else
1073static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t)
1074{
1075 return;
1076}
1077#endif
1078
1079static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, 1060static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
1080 enum pid_type type, int from_ancestor_ns) 1061 enum pid_type type, bool force)
1081{ 1062{
1082 struct sigpending *pending; 1063 struct sigpending *pending;
1083 struct sigqueue *q; 1064 struct sigqueue *q;
@@ -1087,8 +1068,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
1087 assert_spin_locked(&t->sighand->siglock); 1068 assert_spin_locked(&t->sighand->siglock);
1088 1069
1089 result = TRACE_SIGNAL_IGNORED; 1070 result = TRACE_SIGNAL_IGNORED;
1090 if (!prepare_signal(sig, t, 1071 if (!prepare_signal(sig, t, force))
1091 from_ancestor_ns || (info == SEND_SIG_PRIV)))
1092 goto ret; 1072 goto ret;
1093 1073
1094 pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; 1074 pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
@@ -1133,7 +1113,11 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
1133 q->info.si_code = SI_USER; 1113 q->info.si_code = SI_USER;
1134 q->info.si_pid = task_tgid_nr_ns(current, 1114 q->info.si_pid = task_tgid_nr_ns(current,
1135 task_active_pid_ns(t)); 1115 task_active_pid_ns(t));
1136 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 1116 rcu_read_lock();
1117 q->info.si_uid =
1118 from_kuid_munged(task_cred_xxx(t, user_ns),
1119 current_uid());
1120 rcu_read_unlock();
1137 break; 1121 break;
1138 case (unsigned long) SEND_SIG_PRIV: 1122 case (unsigned long) SEND_SIG_PRIV:
1139 clear_siginfo(&q->info); 1123 clear_siginfo(&q->info);
@@ -1145,30 +1129,24 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
1145 break; 1129 break;
1146 default: 1130 default:
1147 copy_siginfo(&q->info, info); 1131 copy_siginfo(&q->info, info);
1148 if (from_ancestor_ns)
1149 q->info.si_pid = 0;
1150 break; 1132 break;
1151 } 1133 }
1152 1134 } else if (!is_si_special(info) &&
1153 userns_fixup_signal_uid(&q->info, t); 1135 sig >= SIGRTMIN && info->si_code != SI_USER) {
1154 1136 /*
1155 } else if (!is_si_special(info)) { 1137 * Queue overflow, abort. We may abort if the
1156 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1138 * signal was rt and sent by user using something
1157 /* 1139 * other than kill().
1158 * Queue overflow, abort. We may abort if the 1140 */
1159 * signal was rt and sent by user using something 1141 result = TRACE_SIGNAL_OVERFLOW_FAIL;
1160 * other than kill(). 1142 ret = -EAGAIN;
1161 */ 1143 goto ret;
1162 result = TRACE_SIGNAL_OVERFLOW_FAIL; 1144 } else {
1163 ret = -EAGAIN; 1145 /*
1164 goto ret; 1146 * This is a silent loss of information. We still
1165 } else { 1147 * send the signal, but the *info bits are lost.
1166 /* 1148 */
1167 * This is a silent loss of information. We still 1149 result = TRACE_SIGNAL_LOSE_INFO;
1168 * send the signal, but the *info bits are lost.
1169 */
1170 result = TRACE_SIGNAL_LOSE_INFO;
1171 }
1172 } 1150 }
1173 1151
1174out_set: 1152out_set:
@@ -1195,17 +1173,62 @@ ret:
1195 return ret; 1173 return ret;
1196} 1174}
1197 1175
1176static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
1177{
1178 bool ret = false;
1179 switch (siginfo_layout(info->si_signo, info->si_code)) {
1180 case SIL_KILL:
1181 case SIL_CHLD:
1182 case SIL_RT:
1183 ret = true;
1184 break;
1185 case SIL_TIMER:
1186 case SIL_POLL:
1187 case SIL_FAULT:
1188 case SIL_FAULT_MCEERR:
1189 case SIL_FAULT_BNDERR:
1190 case SIL_FAULT_PKUERR:
1191 case SIL_SYS:
1192 ret = false;
1193 break;
1194 }
1195 return ret;
1196}
1197
1198static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, 1198static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
1199 enum pid_type type) 1199 enum pid_type type)
1200{ 1200{
1201 int from_ancestor_ns = 0; 1201 /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
1202 bool force = false;
1202 1203
1203#ifdef CONFIG_PID_NS 1204 if (info == SEND_SIG_NOINFO) {
1204 from_ancestor_ns = si_fromuser(info) && 1205 /* Force if sent from an ancestor pid namespace */
1205 !task_pid_nr_ns(current, task_active_pid_ns(t)); 1206 force = !task_pid_nr_ns(current, task_active_pid_ns(t));
1206#endif 1207 } else if (info == SEND_SIG_PRIV) {
1208 /* Don't ignore kernel generated signals */
1209 force = true;
1210 } else if (has_si_pid_and_uid(info)) {
 1211 /* SIGKILL and SIGSTOP are special or have ids */
1212 struct user_namespace *t_user_ns;
1213
1214 rcu_read_lock();
1215 t_user_ns = task_cred_xxx(t, user_ns);
1216 if (current_user_ns() != t_user_ns) {
1217 kuid_t uid = make_kuid(current_user_ns(), info->si_uid);
1218 info->si_uid = from_kuid_munged(t_user_ns, uid);
1219 }
1220 rcu_read_unlock();
1207 1221
1208 return __send_signal(sig, info, t, type, from_ancestor_ns); 1222 /* A kernel generated signal? */
1223 force = (info->si_code == SI_KERNEL);
1224
1225 /* From an ancestor pid namespace? */
1226 if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
1227 info->si_pid = 0;
1228 force = true;
1229 }
1230 }
1231 return __send_signal(sig, info, t, type, force);
1209} 1232}
1210 1233
1211static void print_fatal_signal(int signr) 1234static void print_fatal_signal(int signr)
@@ -1272,12 +1295,13 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
1272 * We don't want to have recursive SIGSEGV's etc, for example, 1295 * We don't want to have recursive SIGSEGV's etc, for example,
1273 * that is why we also clear SIGNAL_UNKILLABLE. 1296 * that is why we also clear SIGNAL_UNKILLABLE.
1274 */ 1297 */
1275int 1298static int
1276force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) 1299force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
1277{ 1300{
1278 unsigned long int flags; 1301 unsigned long int flags;
1279 int ret, blocked, ignored; 1302 int ret, blocked, ignored;
1280 struct k_sigaction *action; 1303 struct k_sigaction *action;
1304 int sig = info->si_signo;
1281 1305
1282 spin_lock_irqsave(&t->sighand->siglock, flags); 1306 spin_lock_irqsave(&t->sighand->siglock, flags);
1283 action = &t->sighand->action[sig-1]; 1307 action = &t->sighand->action[sig-1];
@@ -1302,6 +1326,11 @@ force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t)
1302 return ret; 1326 return ret;
1303} 1327}
1304 1328
1329int force_sig_info(struct kernel_siginfo *info)
1330{
1331 return force_sig_info_to_task(info, current);
1332}
1333
1305/* 1334/*
1306 * Nuke all other threads in the group. 1335 * Nuke all other threads in the group.
1307 */ 1336 */
@@ -1438,13 +1467,44 @@ static inline bool kill_as_cred_perm(const struct cred *cred,
1438 uid_eq(cred->uid, pcred->uid); 1467 uid_eq(cred->uid, pcred->uid);
1439} 1468}
1440 1469
1441/* like kill_pid_info(), but doesn't use uid/euid of "current" */ 1470/*
1442int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, 1471 * The usb asyncio usage of siginfo is wrong. The glibc support
1443 const struct cred *cred) 1472 * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
1473 * AKA after the generic fields:
1474 * kernel_pid_t si_pid;
1475 * kernel_uid32_t si_uid;
1476 * sigval_t si_value;
1477 *
1478 * Unfortunately when usb generates SI_ASYNCIO it assumes the layout
1479 * after the generic fields is:
1480 * void __user *si_addr;
1481 *
1482 * This is a practical problem when there is a 64bit big endian kernel
1483 * and a 32bit userspace. As the 32bit address will encoded in the low
1484 * 32bits of the pointer. Those low 32bits will be stored at higher
1485 * address than appear in a 32 bit pointer. So userspace will not
1486 * see the address it was expecting for it's completions.
1487 *
1488 * There is nothing in the encoding that can allow
1489 * copy_siginfo_to_user32 to detect this confusion of formats, so
1490 * handle this by requiring the caller of kill_pid_usb_asyncio to
1491 * notice when this situration takes place and to store the 32bit
1492 * pointer in sival_int, instead of sival_addr of the sigval_t addr
1493 * parameter.
1494 */
1495int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
1496 struct pid *pid, const struct cred *cred)
1444{ 1497{
1445 int ret = -EINVAL; 1498 struct kernel_siginfo info;
1446 struct task_struct *p; 1499 struct task_struct *p;
1447 unsigned long flags; 1500 unsigned long flags;
1501 int ret = -EINVAL;
1502
1503 clear_siginfo(&info);
1504 info.si_signo = sig;
1505 info.si_errno = errno;
1506 info.si_code = SI_ASYNCIO;
1507 *((sigval_t *)&info.si_pid) = addr;
1448 1508
1449 if (!valid_signal(sig)) 1509 if (!valid_signal(sig))
1450 return ret; 1510 return ret;
@@ -1455,17 +1515,17 @@ int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid,
1455 ret = -ESRCH; 1515 ret = -ESRCH;
1456 goto out_unlock; 1516 goto out_unlock;
1457 } 1517 }
1458 if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { 1518 if (!kill_as_cred_perm(cred, p)) {
1459 ret = -EPERM; 1519 ret = -EPERM;
1460 goto out_unlock; 1520 goto out_unlock;
1461 } 1521 }
1462 ret = security_task_kill(p, info, sig, cred); 1522 ret = security_task_kill(p, &info, sig, cred);
1463 if (ret) 1523 if (ret)
1464 goto out_unlock; 1524 goto out_unlock;
1465 1525
1466 if (sig) { 1526 if (sig) {
1467 if (lock_task_sighand(p, &flags)) { 1527 if (lock_task_sighand(p, &flags)) {
1468 ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); 1528 ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false);
1469 unlock_task_sighand(p, &flags); 1529 unlock_task_sighand(p, &flags);
1470 } else 1530 } else
1471 ret = -ESRCH; 1531 ret = -ESRCH;
@@ -1474,7 +1534,7 @@ out_unlock:
1474 rcu_read_unlock(); 1534 rcu_read_unlock();
1475 return ret; 1535 return ret;
1476} 1536}
1477EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); 1537EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);
1478 1538
1479/* 1539/*
1480 * kill_something_info() interprets pid in interesting ways just like kill(2). 1540 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1550,9 +1610,17 @@ send_sig(int sig, struct task_struct *p, int priv)
1550} 1610}
1551EXPORT_SYMBOL(send_sig); 1611EXPORT_SYMBOL(send_sig);
1552 1612
1553void force_sig(int sig, struct task_struct *p) 1613void force_sig(int sig)
1554{ 1614{
1555 force_sig_info(sig, SEND_SIG_PRIV, p); 1615 struct kernel_siginfo info;
1616
1617 clear_siginfo(&info);
1618 info.si_signo = sig;
1619 info.si_errno = 0;
1620 info.si_code = SI_KERNEL;
1621 info.si_pid = 0;
1622 info.si_uid = 0;
1623 force_sig_info(&info);
1556} 1624}
1557EXPORT_SYMBOL(force_sig); 1625EXPORT_SYMBOL(force_sig);
1558 1626
@@ -1562,18 +1630,20 @@ EXPORT_SYMBOL(force_sig);
1562 * the problem was already a SIGSEGV, we'll want to 1630 * the problem was already a SIGSEGV, we'll want to
1563 * make sure we don't even try to deliver the signal.. 1631 * make sure we don't even try to deliver the signal..
1564 */ 1632 */
1565void force_sigsegv(int sig, struct task_struct *p) 1633void force_sigsegv(int sig)
1566{ 1634{
1635 struct task_struct *p = current;
1636
1567 if (sig == SIGSEGV) { 1637 if (sig == SIGSEGV) {
1568 unsigned long flags; 1638 unsigned long flags;
1569 spin_lock_irqsave(&p->sighand->siglock, flags); 1639 spin_lock_irqsave(&p->sighand->siglock, flags);
1570 p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; 1640 p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
1571 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1641 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1572 } 1642 }
1573 force_sig(SIGSEGV, p); 1643 force_sig(SIGSEGV);
1574} 1644}
1575 1645
1576int force_sig_fault(int sig, int code, void __user *addr 1646int force_sig_fault_to_task(int sig, int code, void __user *addr
1577 ___ARCH_SI_TRAPNO(int trapno) 1647 ___ARCH_SI_TRAPNO(int trapno)
1578 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) 1648 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
1579 , struct task_struct *t) 1649 , struct task_struct *t)
@@ -1593,7 +1663,16 @@ int force_sig_fault(int sig, int code, void __user *addr
1593 info.si_flags = flags; 1663 info.si_flags = flags;
1594 info.si_isr = isr; 1664 info.si_isr = isr;
1595#endif 1665#endif
1596 return force_sig_info(info.si_signo, &info, t); 1666 return force_sig_info_to_task(&info, t);
1667}
1668
1669int force_sig_fault(int sig, int code, void __user *addr
1670 ___ARCH_SI_TRAPNO(int trapno)
1671 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
1672{
1673 return force_sig_fault_to_task(sig, code, addr
1674 ___ARCH_SI_TRAPNO(trapno)
1675 ___ARCH_SI_IA64(imm, flags, isr), current);
1597} 1676}
1598 1677
1599int send_sig_fault(int sig, int code, void __user *addr 1678int send_sig_fault(int sig, int code, void __user *addr
@@ -1619,7 +1698,7 @@ int send_sig_fault(int sig, int code, void __user *addr
1619 return send_sig_info(info.si_signo, &info, t); 1698 return send_sig_info(info.si_signo, &info, t);
1620} 1699}
1621 1700
1622int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) 1701int force_sig_mceerr(int code, void __user *addr, short lsb)
1623{ 1702{
1624 struct kernel_siginfo info; 1703 struct kernel_siginfo info;
1625 1704
@@ -1630,7 +1709,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct
1630 info.si_code = code; 1709 info.si_code = code;
1631 info.si_addr = addr; 1710 info.si_addr = addr;
1632 info.si_addr_lsb = lsb; 1711 info.si_addr_lsb = lsb;
1633 return force_sig_info(info.si_signo, &info, t); 1712 return force_sig_info(&info);
1634} 1713}
1635 1714
1636int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) 1715int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
@@ -1659,7 +1738,7 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
1659 info.si_addr = addr; 1738 info.si_addr = addr;
1660 info.si_lower = lower; 1739 info.si_lower = lower;
1661 info.si_upper = upper; 1740 info.si_upper = upper;
1662 return force_sig_info(info.si_signo, &info, current); 1741 return force_sig_info(&info);
1663} 1742}
1664 1743
1665#ifdef SEGV_PKUERR 1744#ifdef SEGV_PKUERR
@@ -1673,7 +1752,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey)
1673 info.si_code = SEGV_PKUERR; 1752 info.si_code = SEGV_PKUERR;
1674 info.si_addr = addr; 1753 info.si_addr = addr;
1675 info.si_pkey = pkey; 1754 info.si_pkey = pkey;
1676 return force_sig_info(info.si_signo, &info, current); 1755 return force_sig_info(&info);
1677} 1756}
1678#endif 1757#endif
1679 1758
@@ -1689,7 +1768,7 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr)
1689 info.si_errno = errno; 1768 info.si_errno = errno;
1690 info.si_code = TRAP_HWBKPT; 1769 info.si_code = TRAP_HWBKPT;
1691 info.si_addr = addr; 1770 info.si_addr = addr;
1692 return force_sig_info(info.si_signo, &info, current); 1771 return force_sig_info(&info);
1693} 1772}
1694 1773
1695int kill_pgrp(struct pid *pid, int sig, int priv) 1774int kill_pgrp(struct pid *pid, int sig, int priv)
@@ -1802,6 +1881,14 @@ ret:
1802 return ret; 1881 return ret;
1803} 1882}
1804 1883
1884static void do_notify_pidfd(struct task_struct *task)
1885{
1886 struct pid *pid;
1887
1888 pid = task_pid(task);
1889 wake_up_all(&pid->wait_pidfd);
1890}
1891
1805/* 1892/*
1806 * Let a parent know about the death of a child. 1893 * Let a parent know about the death of a child.
1807 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1894 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1825,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1825 BUG_ON(!tsk->ptrace && 1912 BUG_ON(!tsk->ptrace &&
1826 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1913 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1827 1914
1915 /* Wake up all pidfd waiters */
1916 do_notify_pidfd(tsk);
1917
1828 if (sig != SIGCHLD) { 1918 if (sig != SIGCHLD) {
1829 /* 1919 /*
1830 * This is only possible if parent == real_parent. 1920 * This is only possible if parent == real_parent.
@@ -2112,6 +2202,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
2112 preempt_enable_no_resched(); 2202 preempt_enable_no_resched();
2113 cgroup_enter_frozen(); 2203 cgroup_enter_frozen();
2114 freezable_schedule(); 2204 freezable_schedule();
2205 cgroup_leave_frozen(true);
2115 } else { 2206 } else {
2116 /* 2207 /*
2117 * By the time we got the lock, our tracer went away. 2208 * By the time we got the lock, our tracer went away.
@@ -2482,6 +2573,8 @@ relock:
2482 if (signal_group_exit(signal)) { 2573 if (signal_group_exit(signal)) {
2483 ksig->info.si_signo = signr = SIGKILL; 2574 ksig->info.si_signo = signr = SIGKILL;
2484 sigdelset(&current->pending.signal, SIGKILL); 2575 sigdelset(&current->pending.signal, SIGKILL);
2576 trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
2577 &sighand->action[SIGKILL - 1]);
2485 recalc_sigpending(); 2578 recalc_sigpending();
2486 goto fatal; 2579 goto fatal;
2487 } 2580 }
@@ -2671,7 +2764,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
2671void signal_setup_done(int failed, struct ksignal *ksig, int stepping) 2764void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2672{ 2765{
2673 if (failed) 2766 if (failed)
2674 force_sigsegv(ksig->sig, current); 2767 force_sigsegv(ksig->sig);
2675 else 2768 else
2676 signal_delivered(ksig, stepping); 2769 signal_delivered(ksig, stepping);
2677} 2770}
@@ -2907,7 +3000,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask);
2907 * This is useful for syscalls such as ppoll, pselect, io_pgetevents and 3000 * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
2908 * epoll_pwait where a new sigmask is passed in from userland for the syscalls. 3001 * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
2909 */ 3002 */
2910void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) 3003void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
3004 bool interrupted)
2911{ 3005{
2912 3006
2913 if (!usigmask) 3007 if (!usigmask)
@@ -2917,7 +3011,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
2917 * Restoring sigmask here can lead to delivering signals that the above 3011 * Restoring sigmask here can lead to delivering signals that the above
2918 * syscalls are intended to block because of the sigmask passed in. 3012 * syscalls are intended to block because of the sigmask passed in.
2919 */ 3013 */
2920 if (signal_pending(current)) { 3014 if (interrupted) {
2921 current->saved_sigmask = *sigsaved; 3015 current->saved_sigmask = *sigsaved;
2922 set_restore_sigmask(); 3016 set_restore_sigmask();
2923 return; 3017 return;
@@ -3616,12 +3710,11 @@ static struct pid *pidfd_to_pid(const struct file *file)
3616} 3710}
3617 3711
3618/** 3712/**
3619 * sys_pidfd_send_signal - send a signal to a process through a task file 3713 * sys_pidfd_send_signal - Signal a process through a pidfd
3620 * descriptor 3714 * @pidfd: file descriptor of the process
3621 * @pidfd: the file descriptor of the process 3715 * @sig: signal to send
3622 * @sig: signal to be sent 3716 * @info: signal info
3623 * @info: the signal info 3717 * @flags: future flags
3624 * @flags: future flags to be passed
3625 * 3718 *
3626 * The syscall currently only signals via PIDTYPE_PID which covers 3719 * The syscall currently only signals via PIDTYPE_PID which covers
 3627 * kill(<positive-pid>, <signal>). It does not signal threads or process 3720
@@ -4472,6 +4565,28 @@ static inline void siginfo_buildtime_checks(void)
4472 CHECK_OFFSET(si_syscall); 4565 CHECK_OFFSET(si_syscall);
4473 CHECK_OFFSET(si_arch); 4566 CHECK_OFFSET(si_arch);
4474#undef CHECK_OFFSET 4567#undef CHECK_OFFSET
4568
4569 /* usb asyncio */
4570 BUILD_BUG_ON(offsetof(struct siginfo, si_pid) !=
4571 offsetof(struct siginfo, si_addr));
4572 if (sizeof(int) == sizeof(void __user *)) {
4573 BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) !=
4574 sizeof(void __user *));
4575 } else {
4576 BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) +
4577 sizeof_field(struct siginfo, si_uid)) !=
4578 sizeof(void __user *));
4579 BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) !=
4580 offsetof(struct siginfo, si_uid));
4581 }
4582#ifdef CONFIG_COMPAT
4583 BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) !=
4584 offsetof(struct compat_siginfo, si_addr));
4585 BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
4586 sizeof(compat_uptr_t));
4587 BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
4588 sizeof_field(struct siginfo, si_pid));
4589#endif
4475} 4590}
4476 4591
4477void __init signals_init(void) 4592void __init signals_init(void)
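
Much of the signal.c churn follows from force_sig_info() taking a single
kernel_siginfo argument and always acting on current, with the signal number
read from si_signo. A caller-side sketch of the new convention
(deliver_fault() and its field values are illustrative only, not from a real
fault path):

static void deliver_fault(void __user *addr)
{
	struct kernel_siginfo info;

	clear_siginfo(&info);
	info.si_signo = SIGSEGV;
	info.si_errno = 0;
	info.si_code  = SEGV_MAPERR;
	info.si_addr  = addr;

	/* Before this series: force_sig_info(SIGSEGV, &info, current); */
	force_sig_info(&info);
}

Callers that need a different target task go through helpers such as
force_sig_fault_to_task() instead.
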
diff --git a/kernel/smp.c b/kernel/smp.c
index f4cf1b0bb3b8..616d4d114847 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Generic helpers for smp ipi calls 3 * Generic helpers for smp ipi calls
3 * 4 *
@@ -33,7 +34,7 @@ struct call_function_data {
33 cpumask_var_t cpumask_ipi; 34 cpumask_var_t cpumask_ipi;
34}; 35};
35 36
36static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); 37static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
37 38
38static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); 39static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
39 40
@@ -486,13 +487,11 @@ EXPORT_SYMBOL(smp_call_function_many);
486 * You must not call this function with disabled interrupts or from a 487 * You must not call this function with disabled interrupts or from a
487 * hardware interrupt handler or from a bottom half handler. 488 * hardware interrupt handler or from a bottom half handler.
488 */ 489 */
489int smp_call_function(smp_call_func_t func, void *info, int wait) 490void smp_call_function(smp_call_func_t func, void *info, int wait)
490{ 491{
491 preempt_disable(); 492 preempt_disable();
492 smp_call_function_many(cpu_online_mask, func, info, wait); 493 smp_call_function_many(cpu_online_mask, func, info, wait);
493 preempt_enable(); 494 preempt_enable();
494
495 return 0;
496} 495}
497EXPORT_SYMBOL(smp_call_function); 496EXPORT_SYMBOL(smp_call_function);
498 497
@@ -593,18 +592,16 @@ void __init smp_init(void)
593 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead 592 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
594 * of local_irq_disable/enable(). 593 * of local_irq_disable/enable().
595 */ 594 */
596int on_each_cpu(void (*func) (void *info), void *info, int wait) 595void on_each_cpu(void (*func) (void *info), void *info, int wait)
597{ 596{
598 unsigned long flags; 597 unsigned long flags;
599 int ret = 0;
600 598
601 preempt_disable(); 599 preempt_disable();
602 ret = smp_call_function(func, info, wait); 600 smp_call_function(func, info, wait);
603 local_irq_save(flags); 601 local_irq_save(flags);
604 func(info); 602 func(info);
605 local_irq_restore(flags); 603 local_irq_restore(flags);
606 preempt_enable(); 604 preempt_enable();
607 return ret;
608} 605}
609EXPORT_SYMBOL(on_each_cpu); 606EXPORT_SYMBOL(on_each_cpu);
610 607
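
smp_call_function() and on_each_cpu() used to return a hard-coded 0; they now
return void, so callers simply drop the value. A sketch of an adapted caller
(flush_one() and flush_all() are made-up names for illustration):

static void flush_one(void *info)
{
	/* per-CPU work runs here on every online CPU, and locally */
}

static void flush_all(void)
{
	/* Before this series: int ret = on_each_cpu(flush_one, NULL, 1); */
	on_each_cpu(flush_one, NULL, 1);
}
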
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index c230c2dd48e1..2efe1e206167 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Common SMP CPU bringup/teardown functions 3 * Common SMP CPU bringup/teardown functions
3 */ 4 */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c3382378d94..0427a86743a4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -1,10 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/softirq.c 3 * linux/kernel/softirq.c
3 * 4 *
4 * Copyright (C) 1992 Linus Torvalds 5 * Copyright (C) 1992 Linus Torvalds
5 * 6 *
6 * Distribute under GPLv2.
7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 7 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 */ 8 */
10 9
@@ -650,7 +649,7 @@ static int takeover_tasklets(unsigned int cpu)
650 /* Find end, append list for that CPU. */ 649 /* Find end, append list for that CPU. */
651 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 650 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
652 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; 651 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
653 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); 652 __this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
654 per_cpu(tasklet_vec, cpu).head = NULL; 653 per_cpu(tasklet_vec, cpu).head = NULL;
655 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 654 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
656 } 655 }
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 27bafc1e271e..e6a02b274b73 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/stacktrace.c 3 * kernel/stacktrace.c
3 * 4 *
@@ -206,7 +207,7 @@ int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
206 207
207 ret = arch_stack_walk_reliable(consume_entry, &c, tsk); 208 ret = arch_stack_walk_reliable(consume_entry, &c, tsk);
208 put_task_stack(tsk); 209 put_task_stack(tsk);
209 return ret; 210 return ret ? ret : c.len;
210} 211}
211#endif 212#endif
212 213
@@ -227,7 +228,7 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
227 }; 228 };
228 229
229 /* Trace user stack if not a kernel thread */ 230 /* Trace user stack if not a kernel thread */
230 if (!current->mm) 231 if (current->flags & PF_KTHREAD)
231 return 0; 232 return 0;
232 233
233 arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); 234 arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
@@ -254,14 +255,6 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
254 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); 255 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
255} 256}
256 257
257__weak int
258save_stack_trace_tsk_reliable(struct task_struct *tsk,
259 struct stack_trace *trace)
260{
261 WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
262 return -ENOSYS;
263}
264
265/** 258/**
266 * stack_trace_save - Save a stack trace into a storage array 259 * stack_trace_save - Save a stack trace into a storage array
267 * @store: Pointer to storage array 260 * @store: Pointer to storage array
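
stack_trace_save_tsk_reliable() now returns the number of saved entries on
success rather than 0, and the user-stack variant tests PF_KTHREAD instead of
current->mm. A sketch of a caller using the returned count (dump_reliable()
is a made-up helper):

static void dump_reliable(struct task_struct *tsk)
{
	unsigned long entries[32];
	int nr;

	nr = stack_trace_save_tsk_reliable(tsk, entries, ARRAY_SIZE(entries));
	if (nr < 0)
		return;		/* the stack could not be walked reliably */
	stack_trace_print(entries, nr, 0);
}
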
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 7231fb5953fc..b4f83f7bdf86 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * kernel/stop_machine.c 3 * kernel/stop_machine.c
3 * 4 *
@@ -5,8 +6,6 @@
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au 6 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH 7 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org> 8 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
10 */ 9 */
11#include <linux/completion.h> 10#include <linux/completion.h>
12#include <linux/cpu.h> 11#include <linux/cpu.h>
@@ -178,12 +177,18 @@ static void ack_state(struct multi_stop_data *msdata)
178 set_state(msdata, msdata->state + 1); 177 set_state(msdata, msdata->state + 1);
179} 178}
180 179
180void __weak stop_machine_yield(const struct cpumask *cpumask)
181{
182 cpu_relax();
183}
184
181/* This is the cpu_stop function which stops the CPU. */ 185/* This is the cpu_stop function which stops the CPU. */
182static int multi_cpu_stop(void *data) 186static int multi_cpu_stop(void *data)
183{ 187{
184 struct multi_stop_data *msdata = data; 188 struct multi_stop_data *msdata = data;
185 enum multi_stop_state curstate = MULTI_STOP_NONE; 189 enum multi_stop_state curstate = MULTI_STOP_NONE;
186 int cpu = smp_processor_id(), err = 0; 190 int cpu = smp_processor_id(), err = 0;
191 const struct cpumask *cpumask;
187 unsigned long flags; 192 unsigned long flags;
188 bool is_active; 193 bool is_active;
189 194
@@ -193,15 +198,18 @@ static int multi_cpu_stop(void *data)
193 */ 198 */
194 local_save_flags(flags); 199 local_save_flags(flags);
195 200
196 if (!msdata->active_cpus) 201 if (!msdata->active_cpus) {
197 is_active = cpu == cpumask_first(cpu_online_mask); 202 cpumask = cpu_online_mask;
198 else 203 is_active = cpu == cpumask_first(cpumask);
199 is_active = cpumask_test_cpu(cpu, msdata->active_cpus); 204 } else {
205 cpumask = msdata->active_cpus;
206 is_active = cpumask_test_cpu(cpu, cpumask);
207 }
200 208
201 /* Simple state machine */ 209 /* Simple state machine */
202 do { 210 do {
203 /* Chill out and ensure we re-read multi_stop_state. */ 211 /* Chill out and ensure we re-read multi_stop_state. */
204 cpu_relax_yield(); 212 stop_machine_yield(cpumask);
205 if (msdata->state != curstate) { 213 if (msdata->state != curstate) {
206 curstate = msdata->state; 214 curstate = msdata->state;
207 switch (curstate) { 215 switch (curstate) {
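
The new __weak stop_machine_yield() hook above lets an architecture replace the plain cpu_relax() spin with a directed yield toward the CPUs taking part in the stop-machine operation, which multi_cpu_stop() now passes down. A hedged sketch of what an arch override could look like; smp_yield_to() is an invented placeholder for an arch- or hypervisor-specific yield hint, not a real API.

	void stop_machine_yield(const struct cpumask *cpumask)
	{
		int next = cpumask_next(smp_processor_id(), cpumask);

		if (next >= nr_cpu_ids)
			next = cpumask_first(cpumask);
		smp_yield_to(next);	/* invented: hint "run that CPU instead" */
	}
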
diff --git a/kernel/sys.c b/kernel/sys.c
index bdbfe8d37418..2969304c29fe 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1882,13 +1882,14 @@ exit_err:
1882} 1882}
1883 1883
1884/* 1884/*
1885 * Check arithmetic relations of passed addresses.
1886 *
1885 * WARNING: we don't require any capability here so be very careful 1887 * WARNING: we don't require any capability here so be very careful
1886 * in what is allowed for modification from userspace. 1888 * in what is allowed for modification from userspace.
1887 */ 1889 */
1888static int validate_prctl_map(struct prctl_mm_map *prctl_map) 1890static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
1889{ 1891{
1890 unsigned long mmap_max_addr = TASK_SIZE; 1892 unsigned long mmap_max_addr = TASK_SIZE;
1891 struct mm_struct *mm = current->mm;
1892 int error = -EINVAL, i; 1893 int error = -EINVAL, i;
1893 1894
1894 static const unsigned char offsets[] = { 1895 static const unsigned char offsets[] = {
@@ -1949,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
1949 prctl_map->start_data)) 1950 prctl_map->start_data))
1950 goto out; 1951 goto out;
1951 1952
1952 /*
1953 * Someone is trying to cheat the auxv vector.
1954 */
1955 if (prctl_map->auxv_size) {
1956 if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
1957 goto out;
1958 }
1959
1960 /*
1961 * Finally, make sure the caller has the rights to
1962 * change /proc/pid/exe link: only local sys admin should
1963 * be allowed to.
1964 */
1965 if (prctl_map->exe_fd != (u32)-1) {
1966 if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
1967 goto out;
1968 }
1969
1970 error = 0; 1953 error = 0;
1971out: 1954out:
1972 return error; 1955 return error;
@@ -1993,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
1993 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) 1976 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
1994 return -EFAULT; 1977 return -EFAULT;
1995 1978
1996 error = validate_prctl_map(&prctl_map); 1979 error = validate_prctl_map_addr(&prctl_map);
1997 if (error) 1980 if (error)
1998 return error; 1981 return error;
1999 1982
2000 if (prctl_map.auxv_size) { 1983 if (prctl_map.auxv_size) {
1984 /*
1985 * Someone is trying to cheat the auxv vector.
1986 */
1987 if (!prctl_map.auxv ||
1988 prctl_map.auxv_size > sizeof(mm->saved_auxv))
1989 return -EINVAL;
1990
2001 memset(user_auxv, 0, sizeof(user_auxv)); 1991 memset(user_auxv, 0, sizeof(user_auxv));
2002 if (copy_from_user(user_auxv, 1992 if (copy_from_user(user_auxv,
2003 (const void __user *)prctl_map.auxv, 1993 (const void __user *)prctl_map.auxv,
@@ -2010,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
2010 } 2000 }
2011 2001
2012 if (prctl_map.exe_fd != (u32)-1) { 2002 if (prctl_map.exe_fd != (u32)-1) {
2003 /*
2004 * Make sure the caller has the rights to
2005 * change /proc/pid/exe link: only local sys admin should
2006 * be allowed to.
2007 */
2008 if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
2009 return -EINVAL;
2010
2013 error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); 2011 error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
2014 if (error) 2012 if (error)
2015 return error; 2013 return error;
@@ -2097,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr,
2097 unsigned long arg4, unsigned long arg5) 2095 unsigned long arg4, unsigned long arg5)
2098{ 2096{
2099 struct mm_struct *mm = current->mm; 2097 struct mm_struct *mm = current->mm;
2100 struct prctl_mm_map prctl_map; 2098 struct prctl_mm_map prctl_map = {
2099 .auxv = NULL,
2100 .auxv_size = 0,
2101 .exe_fd = -1,
2102 };
2101 struct vm_area_struct *vma; 2103 struct vm_area_struct *vma;
2102 int error; 2104 int error;
2103 2105
@@ -2125,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr,
2125 2127
2126 error = -EINVAL; 2128 error = -EINVAL;
2127 2129
2128 down_write(&mm->mmap_sem); 2130 /*
2131	 * arg_lock protects concurrent updates of arg boundaries; we need
2132 * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr
2133 * validation.
2134 */
2135 down_read(&mm->mmap_sem);
2129 vma = find_vma(mm, addr); 2136 vma = find_vma(mm, addr);
2130 2137
2138 spin_lock(&mm->arg_lock);
2131 prctl_map.start_code = mm->start_code; 2139 prctl_map.start_code = mm->start_code;
2132 prctl_map.end_code = mm->end_code; 2140 prctl_map.end_code = mm->end_code;
2133 prctl_map.start_data = mm->start_data; 2141 prctl_map.start_data = mm->start_data;
@@ -2139,9 +2147,6 @@ static int prctl_set_mm(int opt, unsigned long addr,
2139 prctl_map.arg_end = mm->arg_end; 2147 prctl_map.arg_end = mm->arg_end;
2140 prctl_map.env_start = mm->env_start; 2148 prctl_map.env_start = mm->env_start;
2141 prctl_map.env_end = mm->env_end; 2149 prctl_map.env_end = mm->env_end;
2142 prctl_map.auxv = NULL;
2143 prctl_map.auxv_size = 0;
2144 prctl_map.exe_fd = -1;
2145 2150
2146 switch (opt) { 2151 switch (opt) {
2147 case PR_SET_MM_START_CODE: 2152 case PR_SET_MM_START_CODE:
@@ -2181,7 +2186,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
2181 goto out; 2186 goto out;
2182 } 2187 }
2183 2188
2184 error = validate_prctl_map(&prctl_map); 2189 error = validate_prctl_map_addr(&prctl_map);
2185 if (error) 2190 if (error)
2186 goto out; 2191 goto out;
2187 2192
@@ -2218,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
2218 2223
2219 error = 0; 2224 error = 0;
2220out: 2225out:
2221 up_write(&mm->mmap_sem); 2226 spin_unlock(&mm->arg_lock);
2227 up_read(&mm->mmap_sem);
2222 return error; 2228 return error;
2223} 2229}
2224 2230
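
After this split, validate_prctl_map_addr() only checks the arithmetic relations of the passed addresses, while the auxv size and the CAP_SYS_ADMIN check for exe_fd happen in prctl_set_mm_map(), and prctl_set_mm() takes mmap_sem for read plus the new arg_lock. A hedged userspace sketch of the PR_SET_MM_MAP path (available when the kernel has checkpoint/restore support; the wrapper name is invented):

	#include <sys/prctl.h>
	#include <linux/prctl.h>

	static int install_mm_map(const struct prctl_mm_map *map)
	{
		/* Address ordering is validated by validate_prctl_map_addr();
		 * auxv and exe_fd permissions are checked afterwards. */
		return prctl(PR_SET_MM, PR_SET_MM_MAP, (unsigned long)map,
			     sizeof(*map), 0);
	}
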
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 4d9ae5ea6caf..34b76895b81e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -137,6 +137,8 @@ COND_SYSCALL(capset);
137/* kernel/exit.c */ 137/* kernel/exit.c */
138 138
139/* kernel/fork.c */ 139/* kernel/fork.c */
140/* __ARCH_WANT_SYS_CLONE3 */
141COND_SYSCALL(clone3);
140 142
141/* kernel/futex.c */ 143/* kernel/futex.c */
142COND_SYSCALL(futex); 144COND_SYSCALL(futex);
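
COND_SYSCALL(clone3) provides a weak stub returning -ENOSYS so that architectures which do not select __ARCH_WANT_SYS_CLONE3 still link. Roughly, and only as a simplified sketch (the generic macro lives in include/linux/syscalls.h and some architectures override it):

	asmlinkage long __weak sys_clone3(void)
	{
		return sys_ni_syscall();
	}
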
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba158f61aab4..1c1ad1e14f21 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * sysctl.c: General linux system control interface 3 * sysctl.c: General linux system control interface
3 * 4 *
@@ -229,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
229#endif 230#endif
230static int proc_dopipe_max_size(struct ctl_table *table, int write, 231static int proc_dopipe_max_size(struct ctl_table *table, int write,
231 void __user *buffer, size_t *lenp, loff_t *ppos); 232 void __user *buffer, size_t *lenp, loff_t *ppos);
232#ifdef CONFIG_BPF_SYSCALL
233static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
234 void __user *buffer, size_t *lenp,
235 loff_t *ppos);
236#endif
237 233
238#ifdef CONFIG_MAGIC_SYSRQ 234#ifdef CONFIG_MAGIC_SYSRQ
239/* Note: sysrq code uses its own private copy */ 235/* Note: sysrq code uses its own private copy */
@@ -456,6 +452,22 @@ static struct ctl_table kern_table[] = {
456 .mode = 0644, 452 .mode = 0644,
457 .proc_handler = sched_rr_handler, 453 .proc_handler = sched_rr_handler,
458 }, 454 },
455#ifdef CONFIG_UCLAMP_TASK
456 {
457 .procname = "sched_util_clamp_min",
458 .data = &sysctl_sched_uclamp_util_min,
459 .maxlen = sizeof(unsigned int),
460 .mode = 0644,
461 .proc_handler = sysctl_sched_uclamp_handler,
462 },
463 {
464 .procname = "sched_util_clamp_max",
465 .data = &sysctl_sched_uclamp_util_max,
466 .maxlen = sizeof(unsigned int),
467 .mode = 0644,
468 .proc_handler = sysctl_sched_uclamp_handler,
469 },
470#endif
459#ifdef CONFIG_SCHED_AUTOGROUP 471#ifdef CONFIG_SCHED_AUTOGROUP
460 { 472 {
461 .procname = "sched_autogroup_enabled", 473 .procname = "sched_autogroup_enabled",
@@ -1252,12 +1264,10 @@ static struct ctl_table kern_table[] = {
1252 }, 1264 },
1253 { 1265 {
1254 .procname = "bpf_stats_enabled", 1266 .procname = "bpf_stats_enabled",
1255 .data = &sysctl_bpf_stats_enabled, 1267 .data = &bpf_stats_enabled_key.key,
1256 .maxlen = sizeof(sysctl_bpf_stats_enabled), 1268 .maxlen = sizeof(bpf_stats_enabled_key),
1257 .mode = 0644, 1269 .mode = 0644,
1258 .proc_handler = proc_dointvec_minmax_bpf_stats, 1270 .proc_handler = proc_do_static_key,
1259 .extra1 = &zero,
1260 .extra2 = &one,
1261 }, 1271 },
1262#endif 1272#endif
1263#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) 1273#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
@@ -2886,8 +2896,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2886 if (neg) 2896 if (neg)
2887 continue; 2897 continue;
2888 val = convmul * val / convdiv; 2898 val = convmul * val / convdiv;
2889 if ((min && val < *min) || (max && val > *max)) 2899 if ((min && val < *min) || (max && val > *max)) {
2890 continue; 2900 err = -EINVAL;
2901 break;
2902 }
2891 *i = val; 2903 *i = val;
2892 } else { 2904 } else {
2893 val = convdiv * (*i) / convmul; 2905 val = convdiv * (*i) / convmul;
@@ -3170,17 +3182,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3170 3182
3171 if (write) { 3183 if (write) {
3172 char *kbuf, *p; 3184 char *kbuf, *p;
3185 size_t skipped = 0;
3173 3186
3174 if (left > PAGE_SIZE - 1) 3187 if (left > PAGE_SIZE - 1) {
3175 left = PAGE_SIZE - 1; 3188 left = PAGE_SIZE - 1;
3189 /* How much of the buffer we'll skip this pass */
3190 skipped = *lenp - left;
3191 }
3176 3192
3177 p = kbuf = memdup_user_nul(buffer, left); 3193 p = kbuf = memdup_user_nul(buffer, left);
3178 if (IS_ERR(kbuf)) 3194 if (IS_ERR(kbuf))
3179 return PTR_ERR(kbuf); 3195 return PTR_ERR(kbuf);
3180 3196
3181 tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), 3197 tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
3182 sizeof(unsigned long),
3183 GFP_KERNEL);
3184 if (!tmp_bitmap) { 3198 if (!tmp_bitmap) {
3185 kfree(kbuf); 3199 kfree(kbuf);
3186 return -ENOMEM; 3200 return -ENOMEM;
@@ -3189,9 +3203,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3189 while (!err && left) { 3203 while (!err && left) {
3190 unsigned long val_a, val_b; 3204 unsigned long val_a, val_b;
3191 bool neg; 3205 bool neg;
3206 size_t saved_left;
3192 3207
3208 /* In case we stop parsing mid-number, we can reset */
3209 saved_left = left;
3193 err = proc_get_long(&p, &left, &val_a, &neg, tr_a, 3210 err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
3194 sizeof(tr_a), &c); 3211 sizeof(tr_a), &c);
3212 /*
3213 * If we consumed the entirety of a truncated buffer or
3214 * only one char is left (may be a "-"), then stop here,
3215 * reset, & come back for more.
3216 */
3217 if ((left <= 1) && skipped) {
3218 left = saved_left;
3219 break;
3220 }
3221
3195 if (err) 3222 if (err)
3196 break; 3223 break;
3197 if (val_a >= bitmap_len || neg) { 3224 if (val_a >= bitmap_len || neg) {
@@ -3209,6 +3236,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3209 err = proc_get_long(&p, &left, &val_b, 3236 err = proc_get_long(&p, &left, &val_b,
3210 &neg, tr_b, sizeof(tr_b), 3237 &neg, tr_b, sizeof(tr_b),
3211 &c); 3238 &c);
3239 /*
3240				 * If we consumed all of a truncated buffer,
3241				 * then stop here, reset, & come back for more.
3242 */
3243 if (!left && skipped) {
3244 left = saved_left;
3245 break;
3246 }
3247
3212 if (err) 3248 if (err)
3213 break; 3249 break;
3214 if (val_b >= bitmap_len || neg || 3250 if (val_b >= bitmap_len || neg ||
@@ -3227,6 +3263,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3227 proc_skip_char(&p, &left, '\n'); 3263 proc_skip_char(&p, &left, '\n');
3228 } 3264 }
3229 kfree(kbuf); 3265 kfree(kbuf);
3266 left += skipped;
3230 } else { 3267 } else {
3231 unsigned long bit_a, bit_b = 0; 3268 unsigned long bit_a, bit_b = 0;
3232 3269
@@ -3271,7 +3308,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3271 *ppos += *lenp; 3308 *ppos += *lenp;
3272 } 3309 }
3273 3310
3274 kfree(tmp_bitmap); 3311 bitmap_free(tmp_bitmap);
3275 return err; 3312 return err;
3276} 3313}
3277 3314
@@ -3346,26 +3383,35 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3346 3383
3347#endif /* CONFIG_PROC_SYSCTL */ 3384#endif /* CONFIG_PROC_SYSCTL */
3348 3385
3349#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) 3386#if defined(CONFIG_SYSCTL)
3350static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, 3387int proc_do_static_key(struct ctl_table *table, int write,
3351 void __user *buffer, size_t *lenp, 3388 void __user *buffer, size_t *lenp,
3352 loff_t *ppos) 3389 loff_t *ppos)
3353{ 3390{
3354 int ret, bpf_stats = *(int *)table->data; 3391 struct static_key *key = (struct static_key *)table->data;
3355 struct ctl_table tmp = *table; 3392 static DEFINE_MUTEX(static_key_mutex);
3393 int val, ret;
3394 struct ctl_table tmp = {
3395 .data = &val,
3396 .maxlen = sizeof(val),
3397 .mode = table->mode,
3398 .extra1 = &zero,
3399 .extra2 = &one,
3400 };
3356 3401
3357 if (write && !capable(CAP_SYS_ADMIN)) 3402 if (write && !capable(CAP_SYS_ADMIN))
3358 return -EPERM; 3403 return -EPERM;
3359 3404
3360 tmp.data = &bpf_stats; 3405 mutex_lock(&static_key_mutex);
3406 val = static_key_enabled(key);
3361 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 3407 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
3362 if (write && !ret) { 3408 if (write && !ret) {
3363 *(int *)table->data = bpf_stats; 3409 if (val)
3364 if (bpf_stats) 3410 static_key_enable(key);
3365 static_branch_enable(&bpf_stats_enabled_key);
3366 else 3411 else
3367 static_branch_disable(&bpf_stats_enabled_key); 3412 static_key_disable(key);
3368 } 3413 }
3414 mutex_unlock(&static_key_mutex);
3369 return ret; 3415 return ret;
3370} 3416}
3371#endif 3417#endif
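
proc_do_static_key() generalizes the old bpf_stats-only handler: a write of 0 or 1 disables or enables the static key under a local mutex, and any key can now be exposed by pointing a ctl_table entry's .data at it, as the bpf_stats_enabled entry above does. A hedged sketch of reusing it for another key; the key name and procname are invented for illustration.

	static DEFINE_STATIC_KEY_FALSE(example_feature_key);	/* invented */

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_feature_enabled",
			.data		= &example_feature_key.key,
			.maxlen		= sizeof(example_feature_key),
			.mode		= 0644,
			.proc_handler	= proc_do_static_key,
		},
		{ }
	};
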
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 5f852b8f59f7..13a0f2e6ebc2 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -1,19 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * taskstats.c - Export per-task statistics to userland 3 * taskstats.c - Export per-task statistics to userland
3 * 4 *
4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
5 * (C) Balbir Singh, IBM Corp. 2006 6 * (C) Balbir Singh, IBM Corp. 2006
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */ 7 */
18 8
19#include <linux/kernel.h> 9#include <linux/kernel.h>
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 7bca480151b0..76c997fdbc9d 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -1,17 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * test_kprobes.c - simple sanity test for *probes 3 * test_kprobes.c - simple sanity test for *probes
3 * 4 *
4 * Copyright IBM Corp. 2008 5 * Copyright IBM Corp. 2008
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it would be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14 * the GNU General Public License for more details.
15 */ 6 */
16 7
17#define pr_fmt(fmt) "Kprobe smoke test: " fmt 8#define pr_fmt(fmt) "Kprobe smoke test: " fmt
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e2c038d6c13c..fcc42353f125 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# Timer subsystem related configuration options 3# Timer subsystem related configuration options
3# 4#
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f1e46f338a9c..1867044800bb 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -16,5 +16,6 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
16endif 16endif
17obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o 17obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
18obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o 18obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
19obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
19obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 20obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
20obj-$(CONFIG_TEST_UDELAY) += test_udelay.o 21obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0519a8805aab..57518efc3810 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -233,7 +233,6 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
233/** 233/**
234 * alarmtimer_suspend - Suspend time callback 234 * alarmtimer_suspend - Suspend time callback
235 * @dev: unused 235 * @dev: unused
236 * @state: unused
237 * 236 *
238 * When we are going into suspend, we look through the bases 237 * When we are going into suspend, we look through the bases
239 * to see which is the soonest timer to expire. We then 238 * to see which is the soonest timer to expire. We then
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 3bcc19ceb073..fff5f64981c6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -105,12 +105,12 @@ static DEFINE_SPINLOCK(watchdog_lock);
105static int watchdog_running; 105static int watchdog_running;
106static atomic_t watchdog_reset_pending; 106static atomic_t watchdog_reset_pending;
107 107
108static void inline clocksource_watchdog_lock(unsigned long *flags) 108static inline void clocksource_watchdog_lock(unsigned long *flags)
109{ 109{
110 spin_lock_irqsave(&watchdog_lock, *flags); 110 spin_lock_irqsave(&watchdog_lock, *flags);
111} 111}
112 112
113static void inline clocksource_watchdog_unlock(unsigned long *flags) 113static inline void clocksource_watchdog_unlock(unsigned long *flags)
114{ 114{
115 spin_unlock_irqrestore(&watchdog_lock, *flags); 115 spin_unlock_irqrestore(&watchdog_lock, *flags);
116} 116}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 41dfff23c1f9..5ee77f1a8a92 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -30,7 +30,6 @@
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/interrupt.h> 31#include <linux/interrupt.h>
32#include <linux/tick.h> 32#include <linux/tick.h>
33#include <linux/seq_file.h>
34#include <linux/err.h> 33#include <linux/err.h>
35#include <linux/debugobjects.h> 34#include <linux/debugobjects.h>
36#include <linux/sched/signal.h> 35#include <linux/sched/signal.h>
@@ -1115,9 +1114,10 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1115 * @timer: hrtimer to stop 1114 * @timer: hrtimer to stop
1116 * 1115 *
1117 * Returns: 1116 * Returns:
1118 * 0 when the timer was not active 1117 *
1119 * 1 when the timer was active 1118 * * 0 when the timer was not active
1120 * -1 when the timer is currently executing the callback function and 1119 * * 1 when the timer was active
1120 * * -1 when the timer is currently executing the callback function and
1121 * cannot be stopped 1121 * cannot be stopped
1122 */ 1122 */
1123int hrtimer_try_to_cancel(struct hrtimer *timer) 1123int hrtimer_try_to_cancel(struct hrtimer *timer)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index ac5555e25733..65eb796610dc 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -43,6 +43,7 @@ static u64 tick_length_base;
43#define MAX_TICKADJ 500LL /* usecs */ 43#define MAX_TICKADJ 500LL /* usecs */
44#define MAX_TICKADJ_SCALED \ 44#define MAX_TICKADJ_SCALED \
45 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 45 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
46#define MAX_TAI_OFFSET 100000
46 47
47/* 48/*
48 * phase-lock loop variables 49 * phase-lock loop variables
@@ -691,7 +692,8 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc,
691 time_constant = max(time_constant, 0l); 692 time_constant = max(time_constant, 0l);
692 } 693 }
693 694
694 if (txc->modes & ADJ_TAI && txc->constant > 0) 695 if (txc->modes & ADJ_TAI &&
696 txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET)
695 *time_tai = txc->constant; 697 *time_tai = txc->constant;
696 698
697 if (txc->modes & ADJ_OFFSET) 699 if (txc->modes & ADJ_OFFSET)
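
The ADJ_TAI path now bounds the requested TAI offset to [0, MAX_TAI_OFFSET] instead of merely requiring a positive value; out-of-range requests are ignored rather than applied. A hedged userspace sketch of setting the offset (header and wrapper name are illustrative):

	#include <sys/timex.h>

	static int set_tai_offset(long tai_offset)
	{
		struct timex tx = {
			.modes    = ADJ_TAI,
			.constant = tai_offset,	/* applied only if 0..100000 */
		};

		return adjtimex(&tx);
	}
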
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 29176635991f..d7f2d91acdac 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -980,23 +980,16 @@ retry_delete:
980 */ 980 */
981static void itimer_delete(struct k_itimer *timer) 981static void itimer_delete(struct k_itimer *timer)
982{ 982{
983 unsigned long flags;
984
985retry_delete: 983retry_delete:
986 spin_lock_irqsave(&timer->it_lock, flags); 984 spin_lock_irq(&timer->it_lock);
987 985
988 if (timer_delete_hook(timer) == TIMER_RETRY) { 986 if (timer_delete_hook(timer) == TIMER_RETRY) {
989 unlock_timer(timer, flags); 987 spin_unlock_irq(&timer->it_lock);
990 goto retry_delete; 988 goto retry_delete;
991 } 989 }
992 list_del(&timer->list); 990 list_del(&timer->list);
993 /*
994 * This keeps any tasks waiting on the spin lock from thinking
995 * they got something (see the lock code above).
996 */
997 timer->it_signal = NULL;
998 991
999 unlock_timer(timer, flags); 992 spin_unlock_irq(&timer->it_lock);
1000 release_posix_timer(timer, IT_ID_SET); 993 release_posix_timer(timer, IT_ID_SET);
1001} 994}
1002 995
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4ee1a3428ae..be9707f68024 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
782 */ 782 */
783 if (!ts->tick_stopped) { 783 if (!ts->tick_stopped) {
784 calc_load_nohz_start(); 784 calc_load_nohz_start();
785 cpu_load_update_nohz_start();
786 quiet_vmstat(); 785 quiet_vmstat();
787 786
788 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 787 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
829{ 828{
830 /* Update jiffies first */ 829 /* Update jiffies first */
831 tick_do_update_jiffies64(now); 830 tick_do_update_jiffies64(now);
832 cpu_load_update_nohz_stop();
833 831
834 /* 832 /*
835 * Clear the timer idle flag, so we avoid IPIs on remote queueing and 833 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7f7d6914ddd5..5c54ca632d08 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -251,6 +251,10 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
251 if (tv) { 251 if (tv) {
252 if (compat_get_timeval(&user_tv, tv)) 252 if (compat_get_timeval(&user_tv, tv))
253 return -EFAULT; 253 return -EFAULT;
254
255 if (!timeval_valid(&user_tv))
256 return -EINVAL;
257
254 new_ts.tv_sec = user_tv.tv_sec; 258 new_ts.tv_sec = user_tv.tv_sec;
255 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; 259 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
256 } 260 }
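
The compat settimeofday() path now rejects invalid timevals before converting, matching the native syscall. For reference, timeval_valid() boils down to the following (a simplified sketch, not the exact header text):

	static inline bool timeval_valid(const struct timeval *tv)
	{
		/* no dates before the epoch, no out-of-range microseconds */
		return tv->tv_sec >= 0 &&
		       tv->tv_usec >= 0 && tv->tv_usec < USEC_PER_SEC;
	}
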
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 85f5912d8f70..d911c8470149 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -808,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
808 struct timekeeper *tk = &tk_core.timekeeper; 808 struct timekeeper *tk = &tk_core.timekeeper;
809 unsigned int seq; 809 unsigned int seq;
810 ktime_t base, *offset = offsets[offs]; 810 ktime_t base, *offset = offsets[offs];
811 u64 nsecs;
811 812
812 WARN_ON(timekeeping_suspended); 813 WARN_ON(timekeeping_suspended);
813 814
814 do { 815 do {
815 seq = read_seqcount_begin(&tk_core.seq); 816 seq = read_seqcount_begin(&tk_core.seq);
816 base = ktime_add(tk->tkr_mono.base, *offset); 817 base = ktime_add(tk->tkr_mono.base, *offset);
818 nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
817 819
818 } while (read_seqcount_retry(&tk_core.seq, seq)); 820 } while (read_seqcount_retry(&tk_core.seq, seq));
819 821
820 return base; 822 return ktime_add_ns(base, nsecs);
821
822} 823}
823EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); 824EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
824 825
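
The hunk above fixes ktime_get_coarse_with_offset(): previously it returned only the base time and dropped the nanoseconds accumulated since the last timekeeping update, so coarse reads could lag by the sub-second amount held in xtime_nsec. In effect the result changes from

	tk->tkr_mono.base + *offset

to

	(tk->tkr_mono.base + *offset) + (tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift)

with both values sampled inside the same tk_core.seq read section.
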
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 98ba50dcb1b2..acb326f5f50a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -282,23 +282,6 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
282 SEQ_printf(m, "\n"); 282 SEQ_printf(m, "\n");
283} 283}
284 284
285static int timer_list_show(struct seq_file *m, void *v)
286{
287 struct timer_list_iter *iter = v;
288
289 if (iter->cpu == -1 && !iter->second_pass)
290 timer_list_header(m, iter->now);
291 else if (!iter->second_pass)
292 print_cpu(m, iter->cpu, iter->now);
293#ifdef CONFIG_GENERIC_CLOCKEVENTS
294 else if (iter->cpu == -1 && iter->second_pass)
295 timer_list_show_tickdevices_header(m);
296 else
297 print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
298#endif
299 return 0;
300}
301
302void sysrq_timer_list_show(void) 285void sysrq_timer_list_show(void)
303{ 286{
304 u64 now = ktime_to_ns(ktime_get()); 287 u64 now = ktime_to_ns(ktime_get());
@@ -317,6 +300,24 @@ void sysrq_timer_list_show(void)
317 return; 300 return;
318} 301}
319 302
303#ifdef CONFIG_PROC_FS
304static int timer_list_show(struct seq_file *m, void *v)
305{
306 struct timer_list_iter *iter = v;
307
308 if (iter->cpu == -1 && !iter->second_pass)
309 timer_list_header(m, iter->now);
310 else if (!iter->second_pass)
311 print_cpu(m, iter->cpu, iter->now);
312#ifdef CONFIG_GENERIC_CLOCKEVENTS
313 else if (iter->cpu == -1 && iter->second_pass)
314 timer_list_show_tickdevices_header(m);
315 else
316 print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
317#endif
318 return 0;
319}
320
320static void *move_iter(struct timer_list_iter *iter, loff_t offset) 321static void *move_iter(struct timer_list_iter *iter, loff_t offset)
321{ 322{
322 for (; offset; offset--) { 323 for (; offset; offset--) {
@@ -376,3 +377,4 @@ static int __init init_timer_list_procfs(void)
376 return 0; 377 return 0;
377} 378}
378__initcall(init_timer_list_procfs); 379__initcall(init_timer_list_procfs);
380#endif
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
new file mode 100644
index 000000000000..8cf3596a4ce6
--- /dev/null
+++ b/kernel/time/vsyscall.c
@@ -0,0 +1,129 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright 2019 ARM Ltd.
4 *
5 * Generic implementation of update_vsyscall and update_vsyscall_tz.
6 *
7 * Based on the x86 specific implementation.
8 */
9
10#include <linux/hrtimer.h>
11#include <linux/timekeeper_internal.h>
12#include <vdso/datapage.h>
13#include <vdso/helpers.h>
14#include <vdso/vsyscall.h>
15
16static inline void update_vdso_data(struct vdso_data *vdata,
17 struct timekeeper *tk)
18{
19 struct vdso_timestamp *vdso_ts;
20 u64 nsec;
21
22 vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
23 vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
24 vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
25 vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
26 vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
27 vdata[CS_RAW].mask = tk->tkr_raw.mask;
28 vdata[CS_RAW].mult = tk->tkr_raw.mult;
29 vdata[CS_RAW].shift = tk->tkr_raw.shift;
30
31 /* CLOCK_REALTIME */
32 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
33 vdso_ts->sec = tk->xtime_sec;
34 vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
35
36 /* CLOCK_MONOTONIC */
37 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
38 vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
39
40 nsec = tk->tkr_mono.xtime_nsec;
41 nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
42 while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
43 nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
44 vdso_ts->sec++;
45 }
46 vdso_ts->nsec = nsec;
47
48 /* CLOCK_MONOTONIC_RAW */
49 vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
50 vdso_ts->sec = tk->raw_sec;
51 vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
52
53 /* CLOCK_BOOTTIME */
54 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
55 vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
56 nsec = tk->tkr_mono.xtime_nsec;
57 nsec += ((u64)(tk->wall_to_monotonic.tv_nsec +
58 ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift);
59 while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
60 nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
61 vdso_ts->sec++;
62 }
63 vdso_ts->nsec = nsec;
64
65 /* CLOCK_TAI */
66 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
67 vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
68 vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
69
70 /*
71 * Read without the seqlock held by clock_getres().
72 * Note: No need to have a second copy.
73 */
74 WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
75}
76
77void update_vsyscall(struct timekeeper *tk)
78{
79 struct vdso_data *vdata = __arch_get_k_vdso_data();
80 struct vdso_timestamp *vdso_ts;
81 u64 nsec;
82
83 if (__arch_update_vdso_data()) {
84 /*
85 * Some architectures might want to skip the update of the
86 * data page.
87 */
88 return;
89 }
90
91 /* copy vsyscall data */
92 vdso_write_begin(vdata);
93
94 vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk);
95 vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk);
96
97 /* CLOCK_REALTIME_COARSE */
98 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
99 vdso_ts->sec = tk->xtime_sec;
100 vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
101
102 /* CLOCK_MONOTONIC_COARSE */
103 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
104 vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
105 nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
106 nsec = nsec + tk->wall_to_monotonic.tv_nsec;
107 vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
108
109 if (__arch_use_vsyscall(vdata))
110 update_vdso_data(vdata, tk);
111
112 __arch_update_vsyscall(vdata, tk);
113
114 vdso_write_end(vdata);
115
116 __arch_sync_vdso_data(vdata);
117}
118
119void update_vsyscall_tz(void)
120{
121 struct vdso_data *vdata = __arch_get_k_vdso_data();
122
123 if (__arch_use_vsyscall(vdata)) {
124 vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
125 vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
126 }
127
128 __arch_sync_vdso_data(vdata);
129}
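
The generic update_vsyscall() above relies on a small set of per-architecture hooks: __arch_get_k_vdso_data(), __arch_update_vdso_data(), __arch_get_clock_mode(), __arch_use_vsyscall(), __arch_update_vsyscall() and __arch_sync_vdso_data(). A hedged sketch of minimal hook implementations an architecture might provide; the bodies and the vdso_data symbol below are illustrative only, not copied from any in-tree port.

	static __always_inline struct vdso_data *__arch_get_k_vdso_data(void)
	{
		return vdso_data;	/* kernel-side view of the vDSO data page */
	}

	static __always_inline bool __arch_update_vdso_data(void)
	{
		return false;		/* false: do update the data page */
	}

	static __always_inline int __arch_get_clock_mode(struct timekeeper *tk)
	{
		return 0;		/* arch-defined clocksource mode */
	}

	static __always_inline bool __arch_use_vsyscall(struct vdso_data *vdata)
	{
		return true;		/* vDSO time reads are usable */
	}

	static __always_inline void __arch_update_vsyscall(struct vdso_data *vdata,
							   struct timekeeper *tk) { }

	static __always_inline void __arch_sync_vdso_data(struct vdso_data *vdata) { }
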
diff --git a/kernel/torture.c b/kernel/torture.c
index 17b2be9bde12..a8d9bdfba7c3 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -570,6 +570,7 @@ static void torture_shutdown_cleanup(void)
570static struct task_struct *stutter_task; 570static struct task_struct *stutter_task;
571static int stutter_pause_test; 571static int stutter_pause_test;
572static int stutter; 572static int stutter;
573static int stutter_gap;
573 574
574/* 575/*
575 * Block until the stutter interval ends. This must be called periodically 576 * Block until the stutter interval ends. This must be called periodically
@@ -578,10 +579,12 @@ static int stutter;
578bool stutter_wait(const char *title) 579bool stutter_wait(const char *title)
579{ 580{
580 int spt; 581 int spt;
582 bool ret = false;
581 583
582 cond_resched_tasks_rcu_qs(); 584 cond_resched_tasks_rcu_qs();
583 spt = READ_ONCE(stutter_pause_test); 585 spt = READ_ONCE(stutter_pause_test);
584 for (; spt; spt = READ_ONCE(stutter_pause_test)) { 586 for (; spt; spt = READ_ONCE(stutter_pause_test)) {
587 ret = true;
585 if (spt == 1) { 588 if (spt == 1) {
586 schedule_timeout_interruptible(1); 589 schedule_timeout_interruptible(1);
587 } else if (spt == 2) { 590 } else if (spt == 2) {
@@ -592,7 +595,7 @@ bool stutter_wait(const char *title)
592 } 595 }
593 torture_shutdown_absorb(title); 596 torture_shutdown_absorb(title);
594 } 597 }
595 return !!spt; 598 return ret;
596} 599}
597EXPORT_SYMBOL_GPL(stutter_wait); 600EXPORT_SYMBOL_GPL(stutter_wait);
598 601
@@ -602,17 +605,24 @@ EXPORT_SYMBOL_GPL(stutter_wait);
602 */ 605 */
603static int torture_stutter(void *arg) 606static int torture_stutter(void *arg)
604{ 607{
608 int wtime;
609
605 VERBOSE_TOROUT_STRING("torture_stutter task started"); 610 VERBOSE_TOROUT_STRING("torture_stutter task started");
606 do { 611 do {
607 if (!torture_must_stop() && stutter > 1) { 612 if (!torture_must_stop() && stutter > 1) {
608 WRITE_ONCE(stutter_pause_test, 1); 613 wtime = stutter;
609 schedule_timeout_interruptible(stutter - 1); 614 if (stutter > HZ + 1) {
615 WRITE_ONCE(stutter_pause_test, 1);
616 wtime = stutter - HZ - 1;
617 schedule_timeout_interruptible(wtime);
618 wtime = HZ + 1;
619 }
610 WRITE_ONCE(stutter_pause_test, 2); 620 WRITE_ONCE(stutter_pause_test, 2);
611 schedule_timeout_interruptible(1); 621 schedule_timeout_interruptible(wtime);
612 } 622 }
613 WRITE_ONCE(stutter_pause_test, 0); 623 WRITE_ONCE(stutter_pause_test, 0);
614 if (!torture_must_stop()) 624 if (!torture_must_stop())
615 schedule_timeout_interruptible(stutter); 625 schedule_timeout_interruptible(stutter_gap);
616 torture_shutdown_absorb("torture_stutter"); 626 torture_shutdown_absorb("torture_stutter");
617 } while (!torture_must_stop()); 627 } while (!torture_must_stop());
618 torture_kthread_stopping("torture_stutter"); 628 torture_kthread_stopping("torture_stutter");
@@ -622,9 +632,10 @@ static int torture_stutter(void *arg)
622/* 632/*
623 * Initialize and kick off the torture_stutter kthread. 633 * Initialize and kick off the torture_stutter kthread.
624 */ 634 */
625int torture_stutter_init(const int s) 635int torture_stutter_init(const int s, const int sgap)
626{ 636{
627 stutter = s; 637 stutter = s;
638 stutter_gap = sgap;
628 return torture_create_kthread(torture_stutter, NULL, stutter_task); 639 return torture_create_kthread(torture_stutter, NULL, stutter_task);
629} 640}
630EXPORT_SYMBOL_GPL(torture_stutter_init); 641EXPORT_SYMBOL_GPL(torture_stutter_init);
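
torture_stutter_init() now takes a second argument giving the gap between stutter periods, so existing callers need a matching update. A hedged caller-side sketch; the variable names and the choice of passing the same value twice are illustrative, not taken from any in-tree torture test.

	/* old: firsterr = torture_stutter_init(stutter * HZ); */
	firsterr = torture_stutter_init(stutter * HZ, stutter * HZ);
	if (firsterr)
		goto unwind;
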
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d965cef6c77..564e5fdb025f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# Architectures that offer an FUNCTION_TRACER implementation should 3# Architectures that offer an FUNCTION_TRACER implementation should
3# select HAVE_FUNCTION_TRACER: 4# select HAVE_FUNCTION_TRACER:
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e1c6d79fb4cc..2d6e93ab0478 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -512,8 +512,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
512 dir = debugfs_lookup(buts->name, blk_debugfs_root); 512 dir = debugfs_lookup(buts->name, blk_debugfs_root);
513 if (!dir) 513 if (!dir)
514 bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); 514 bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
515 if (!dir)
516 goto err;
517 515
518 bt->dev = dev; 516 bt->dev = dev;
519 atomic_set(&bt->dropped, 0); 517 atomic_set(&bt->dropped, 0);
@@ -522,12 +520,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
522 ret = -EIO; 520 ret = -EIO;
523 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, 521 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
524 &blk_dropped_fops); 522 &blk_dropped_fops);
525 if (!bt->dropped_file)
526 goto err;
527 523
528 bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); 524 bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
529 if (!bt->msg_file)
530 goto err;
531 525
532 bt->rchan = relay_open("trace", dir, buts->buf_size, 526 bt->rchan = relay_open("trace", dir, buts->buf_size,
533 buts->buf_nr, &blk_relay_callbacks, bt); 527 buts->buf_nr, &blk_relay_callbacks, bt);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b496ffdf5f36..ca1255d14576 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -19,6 +19,9 @@
19#include "trace_probe.h" 19#include "trace_probe.h"
20#include "trace.h" 20#include "trace.h"
21 21
22#define bpf_event_rcu_dereference(p) \
23 rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
24
22#ifdef CONFIG_MODULES 25#ifdef CONFIG_MODULES
23struct bpf_trace_module { 26struct bpf_trace_module {
24 struct module *module; 27 struct module *module;
@@ -410,8 +413,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
410 .arg4_type = ARG_CONST_SIZE, 413 .arg4_type = ARG_CONST_SIZE,
411}; 414};
412 415
413static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
414
415static __always_inline u64 416static __always_inline u64
416__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, 417__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
417 u64 flags, struct perf_sample_data *sd) 418 u64 flags, struct perf_sample_data *sd)
@@ -442,24 +443,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
442 return perf_event_output(event, sd, regs); 443 return perf_event_output(event, sd, regs);
443} 444}
444 445
446/*
447 * Support executing tracepoints in normal, irq, and nmi context that each call
448 * bpf_perf_event_output
449 */
450struct bpf_trace_sample_data {
451 struct perf_sample_data sds[3];
452};
453
454static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
455static DEFINE_PER_CPU(int, bpf_trace_nest_level);
445BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, 456BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
446 u64, flags, void *, data, u64, size) 457 u64, flags, void *, data, u64, size)
447{ 458{
448 struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); 459 struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
460 int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
449 struct perf_raw_record raw = { 461 struct perf_raw_record raw = {
450 .frag = { 462 .frag = {
451 .size = size, 463 .size = size,
452 .data = data, 464 .data = data,
453 }, 465 },
454 }; 466 };
467 struct perf_sample_data *sd;
468 int err;
455 469
456 if (unlikely(flags & ~(BPF_F_INDEX_MASK))) 470 if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
457 return -EINVAL; 471 err = -EBUSY;
472 goto out;
473 }
474
475 sd = &sds->sds[nest_level - 1];
476
477 if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
478 err = -EINVAL;
479 goto out;
480 }
458 481
459 perf_sample_data_init(sd, 0, 0); 482 perf_sample_data_init(sd, 0, 0);
460 sd->raw = &raw; 483 sd->raw = &raw;
461 484
462 return __bpf_perf_event_output(regs, map, flags, sd); 485 err = __bpf_perf_event_output(regs, map, flags, sd);
486
487out:
488 this_cpu_dec(bpf_trace_nest_level);
489 return err;
463} 490}
464 491
465static const struct bpf_func_proto bpf_perf_event_output_proto = { 492static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -567,6 +594,69 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
567 .arg3_type = ARG_ANYTHING, 594 .arg3_type = ARG_ANYTHING,
568}; 595};
569 596
597struct send_signal_irq_work {
598 struct irq_work irq_work;
599 struct task_struct *task;
600 u32 sig;
601};
602
603static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
604
605static void do_bpf_send_signal(struct irq_work *entry)
606{
607 struct send_signal_irq_work *work;
608
609 work = container_of(entry, struct send_signal_irq_work, irq_work);
610 group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
611}
612
613BPF_CALL_1(bpf_send_signal, u32, sig)
614{
615 struct send_signal_irq_work *work = NULL;
616
617 /* Similar to bpf_probe_write_user, task needs to be
618 * in a sound condition and kernel memory access be
619 * permitted in order to send signal to the current
620 * task.
621 */
622 if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
623 return -EPERM;
624 if (unlikely(uaccess_kernel()))
625 return -EPERM;
626 if (unlikely(!nmi_uaccess_okay()))
627 return -EPERM;
628
629 if (in_nmi()) {
630 /* Do an early check on signal validity. Otherwise,
631 * the error is lost in deferred irq_work.
632 */
633 if (unlikely(!valid_signal(sig)))
634 return -EINVAL;
635
636 work = this_cpu_ptr(&send_signal_work);
637 if (work->irq_work.flags & IRQ_WORK_BUSY)
638 return -EBUSY;
639
640 /* Add the current task, which is the target of sending signal,
641 * to the irq_work. The current task may change when queued
642 * irq works get executed.
643 */
644 work->task = current;
645 work->sig = sig;
646 irq_work_queue(&work->irq_work);
647 return 0;
648 }
649
650 return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
651}
652
653static const struct bpf_func_proto bpf_send_signal_proto = {
654 .func = bpf_send_signal,
655 .gpl_only = false,
656 .ret_type = RET_INTEGER,
657 .arg1_type = ARG_ANYTHING,
658};
659
570static const struct bpf_func_proto * 660static const struct bpf_func_proto *
571tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 661tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
572{ 662{
@@ -617,6 +707,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
617 case BPF_FUNC_get_current_cgroup_id: 707 case BPF_FUNC_get_current_cgroup_id:
618 return &bpf_get_current_cgroup_id_proto; 708 return &bpf_get_current_cgroup_id_proto;
619#endif 709#endif
710 case BPF_FUNC_send_signal:
711 return &bpf_send_signal_proto;
620 default: 712 default:
621 return NULL; 713 return NULL;
622 } 714 }
@@ -822,16 +914,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
822/* 914/*
823 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp 915 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
824 * to avoid potential recursive reuse issue when/if tracepoints are added 916 * to avoid potential recursive reuse issue when/if tracepoints are added
825 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack 917 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
918 *
919 * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
920 * in normal, irq, and nmi context.
826 */ 921 */
827static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); 922struct bpf_raw_tp_regs {
923 struct pt_regs regs[3];
924};
925static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
926static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
927static struct pt_regs *get_bpf_raw_tp_regs(void)
928{
929 struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
930 int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
931
932 if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
933 this_cpu_dec(bpf_raw_tp_nest_level);
934 return ERR_PTR(-EBUSY);
935 }
936
937 return &tp_regs->regs[nest_level - 1];
938}
939
940static void put_bpf_raw_tp_regs(void)
941{
942 this_cpu_dec(bpf_raw_tp_nest_level);
943}
944
828BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, 945BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
829 struct bpf_map *, map, u64, flags, void *, data, u64, size) 946 struct bpf_map *, map, u64, flags, void *, data, u64, size)
830{ 947{
831 struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); 948 struct pt_regs *regs = get_bpf_raw_tp_regs();
949 int ret;
950
951 if (IS_ERR(regs))
952 return PTR_ERR(regs);
832 953
833 perf_fetch_caller_regs(regs); 954 perf_fetch_caller_regs(regs);
834 return ____bpf_perf_event_output(regs, map, flags, data, size); 955 ret = ____bpf_perf_event_output(regs, map, flags, data, size);
956
957 put_bpf_raw_tp_regs();
958 return ret;
835} 959}
836 960
837static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { 961static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
@@ -848,12 +972,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
848BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, 972BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
849 struct bpf_map *, map, u64, flags) 973 struct bpf_map *, map, u64, flags)
850{ 974{
851 struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); 975 struct pt_regs *regs = get_bpf_raw_tp_regs();
976 int ret;
977
978 if (IS_ERR(regs))
979 return PTR_ERR(regs);
852 980
853 perf_fetch_caller_regs(regs); 981 perf_fetch_caller_regs(regs);
854 /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ 982 /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
855 return bpf_get_stackid((unsigned long) regs, (unsigned long) map, 983 ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
856 flags, 0, 0); 984 flags, 0, 0);
985 put_bpf_raw_tp_regs();
986 return ret;
857} 987}
858 988
859static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { 989static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
@@ -868,11 +998,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
868BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, 998BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
869 void *, buf, u32, size, u64, flags) 999 void *, buf, u32, size, u64, flags)
870{ 1000{
871 struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); 1001 struct pt_regs *regs = get_bpf_raw_tp_regs();
1002 int ret;
1003
1004 if (IS_ERR(regs))
1005 return PTR_ERR(regs);
872 1006
873 perf_fetch_caller_regs(regs); 1007 perf_fetch_caller_regs(regs);
874 return bpf_get_stack((unsigned long) regs, (unsigned long) buf, 1008 ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
875 (unsigned long) size, flags, 0); 1009 (unsigned long) size, flags, 0);
1010 put_bpf_raw_tp_regs();
1011 return ret;
876} 1012}
877 1013
878static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { 1014static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
@@ -1034,7 +1170,7 @@ static DEFINE_MUTEX(bpf_event_mutex);
1034int perf_event_attach_bpf_prog(struct perf_event *event, 1170int perf_event_attach_bpf_prog(struct perf_event *event,
1035 struct bpf_prog *prog) 1171 struct bpf_prog *prog)
1036{ 1172{
1037 struct bpf_prog_array __rcu *old_array; 1173 struct bpf_prog_array *old_array;
1038 struct bpf_prog_array *new_array; 1174 struct bpf_prog_array *new_array;
1039 int ret = -EEXIST; 1175 int ret = -EEXIST;
1040 1176
@@ -1052,7 +1188,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
1052 if (event->prog) 1188 if (event->prog)
1053 goto unlock; 1189 goto unlock;
1054 1190
1055 old_array = event->tp_event->prog_array; 1191 old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
1056 if (old_array && 1192 if (old_array &&
1057 bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { 1193 bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
1058 ret = -E2BIG; 1194 ret = -E2BIG;
@@ -1075,7 +1211,7 @@ unlock:
1075 1211
1076void perf_event_detach_bpf_prog(struct perf_event *event) 1212void perf_event_detach_bpf_prog(struct perf_event *event)
1077{ 1213{
1078 struct bpf_prog_array __rcu *old_array; 1214 struct bpf_prog_array *old_array;
1079 struct bpf_prog_array *new_array; 1215 struct bpf_prog_array *new_array;
1080 int ret; 1216 int ret;
1081 1217
@@ -1084,7 +1220,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
1084 if (!event->prog) 1220 if (!event->prog)
1085 goto unlock; 1221 goto unlock;
1086 1222
1087 old_array = event->tp_event->prog_array; 1223 old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
1088 ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); 1224 ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
1089 if (ret == -ENOENT) 1225 if (ret == -ENOENT)
1090 goto unlock; 1226 goto unlock;
@@ -1106,6 +1242,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
1106{ 1242{
1107 struct perf_event_query_bpf __user *uquery = info; 1243 struct perf_event_query_bpf __user *uquery = info;
1108 struct perf_event_query_bpf query = {}; 1244 struct perf_event_query_bpf query = {};
1245 struct bpf_prog_array *progs;
1109 u32 *ids, prog_cnt, ids_len; 1246 u32 *ids, prog_cnt, ids_len;
1110 int ret; 1247 int ret;
1111 1248
@@ -1130,10 +1267,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
1130 */ 1267 */
1131 1268
1132 mutex_lock(&bpf_event_mutex); 1269 mutex_lock(&bpf_event_mutex);
1133 ret = bpf_prog_array_copy_info(event->tp_event->prog_array, 1270 progs = bpf_event_rcu_dereference(event->tp_event->prog_array);
1134 ids, 1271 ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt);
1135 ids_len,
1136 &prog_cnt);
1137 mutex_unlock(&bpf_event_mutex); 1272 mutex_unlock(&bpf_event_mutex);
1138 1273
1139 if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || 1274 if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
@@ -1296,8 +1431,23 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
1296 return err; 1431 return err;
1297} 1432}
1298 1433
1434static int __init send_signal_irq_work_init(void)
1435{
1436 int cpu;
1437 struct send_signal_irq_work *work;
1438
1439 for_each_possible_cpu(cpu) {
1440 work = per_cpu_ptr(&send_signal_work, cpu);
1441 init_irq_work(&work->irq_work, do_bpf_send_signal);
1442 }
1443 return 0;
1444}
1445
1446subsys_initcall(send_signal_irq_work_init);
1447
1299#ifdef CONFIG_MODULES 1448#ifdef CONFIG_MODULES
1300int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) 1449static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
1450 void *module)
1301{ 1451{
1302 struct bpf_trace_module *btm, *tmp; 1452 struct bpf_trace_module *btm, *tmp;
1303 struct module *mod = module; 1453 struct module *mod = module;
@@ -1336,7 +1486,7 @@ static struct notifier_block bpf_module_nb = {
1336 .notifier_call = bpf_event_notify, 1486 .notifier_call = bpf_event_notify,
1337}; 1487};
1338 1488
1339int __init bpf_event_init(void) 1489static int __init bpf_event_init(void)
1340{ 1490{
1341 register_module_notifier(&bpf_module_nb); 1491 register_module_notifier(&bpf_module_nb);
1342 return 0; 1492 return 0;
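
Besides the nesting protection for bpf_perf_event_output() and the raw-tracepoint pt_regs, this diff adds the bpf_send_signal() helper, which signals the current task and defers delivery through the per-CPU irq_work when called from NMI context. A hedged BPF-side usage sketch; the include, section name, and signal number are illustrative.

	#include <linux/bpf.h>
	#include "bpf_helpers.h"	/* selftests-style helper declarations */

	SEC("tracepoint/syscalls/sys_enter_kill")
	int probe(void *ctx)
	{
		bpf_send_signal(10);	/* SIGUSR1 on most architectures */
		return 0;
	}

	char _license[] SEC("license") = "GPL";
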
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b920358dd8f7..576c41644e77 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -70,12 +70,8 @@
70#define INIT_OPS_HASH(opsname) \ 70#define INIT_OPS_HASH(opsname) \
71 .func_hash = &opsname.local_hash, \ 71 .func_hash = &opsname.local_hash, \
72 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), 72 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
73#define ASSIGN_OPS_HASH(opsname, val) \
74 .func_hash = val, \
75 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
76#else 73#else
77#define INIT_OPS_HASH(opsname) 74#define INIT_OPS_HASH(opsname)
78#define ASSIGN_OPS_HASH(opsname, val)
79#endif 75#endif
80 76
81enum { 77enum {
@@ -2939,14 +2935,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2939 p = &pg->records[i]; 2935 p = &pg->records[i];
2940 p->flags = rec_flags; 2936 p->flags = rec_flags;
2941 2937
2942#ifndef CC_USING_NOP_MCOUNT
2943 /* 2938 /*
2944 * Do the initial record conversion from mcount jump 2939 * Do the initial record conversion from mcount jump
2945 * to the NOP instructions. 2940 * to the NOP instructions.
2946 */ 2941 */
2947 if (!ftrace_code_disable(mod, p)) 2942 if (!__is_defined(CC_USING_NOP_MCOUNT) &&
2943 !ftrace_code_disable(mod, p))
2948 break; 2944 break;
2949#endif
2950 2945
2951 update_cnt++; 2946 update_cnt++;
2952 } 2947 }
@@ -3880,7 +3875,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
3880static bool module_exists(const char *module) 3875static bool module_exists(const char *module)
3881{ 3876{
3882 /* All modules have the symbol __this_module */ 3877 /* All modules have the symbol __this_module */
3883 const char this_mod[] = "__this_module"; 3878 static const char this_mod[] = "__this_module";
3884 char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; 3879 char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
3885 unsigned long val; 3880 unsigned long val;
3886 int n; 3881 int n;
@@ -4225,10 +4220,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
4225 struct ftrace_func_entry *entry; 4220 struct ftrace_func_entry *entry;
4226 struct ftrace_func_map *map; 4221 struct ftrace_func_map *map;
4227 struct hlist_head *hhd; 4222 struct hlist_head *hhd;
4228 int size = 1 << mapper->hash.size_bits; 4223 int size, i;
4229 int i; 4224
4225 if (!mapper)
4226 return;
4230 4227
4231 if (free_func && mapper->hash.count) { 4228 if (free_func && mapper->hash.count) {
4229 size = 1 << mapper->hash.size_bits;
4232 for (i = 0; i < size; i++) { 4230 for (i = 0; i < size; i++) {
4233 hhd = &mapper->hash.buckets[i]; 4231 hhd = &mapper->hash.buckets[i];
4234 hlist_for_each_entry(entry, hhd, hlist) { 4232 hlist_for_each_entry(entry, hhd, hlist) {
@@ -6265,6 +6263,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
6265 preempt_disable_notrace(); 6263 preempt_disable_notrace();
6266 6264
6267 do_for_each_ftrace_op(op, ftrace_ops_list) { 6265 do_for_each_ftrace_op(op, ftrace_ops_list) {
6266 /* Stub functions don't need to be called nor tested */
6267 if (op->flags & FTRACE_OPS_FL_STUB)
6268 continue;
6268 /* 6269 /*
6269 * Check the following for each ops before calling their func: 6270 * Check the following for each ops before calling their func:
6270 * if RCU flag is set, then rcu_is_watching() must be true 6271 * if RCU flag is set, then rcu_is_watching() must be true
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4ee8d8aa3d0f..05b0b3139ebc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4979,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
4979 cnt = data->cnt + (nested ? 27 : 0); 4979 cnt = data->cnt + (nested ? 27 : 0);
4980 4980
4981 /* Multiply cnt by ~e, to make some unique increment */ 4981 /* Multiply cnt by ~e, to make some unique increment */
4982 size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); 4982 size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
4983 4983
4984 len = size + sizeof(struct rb_item); 4984 len = size + sizeof(struct rb_item);
4985 4985
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index ffba6789c0e2..0564f6db0561 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -362,7 +362,7 @@ static void ring_buffer_producer(void)
362 hit--; /* make it non zero */ 362 hit--; /* make it non zero */
363 } 363 }
364 364
365 /* Caculate the average time in nanosecs */ 365 /* Calculate the average time in nanosecs */
366 avg = NSEC_PER_MSEC / (hit + missed); 366 avg = NSEC_PER_MSEC / (hit + missed);
367 trace_printk("%ld ns per entry\n", avg); 367 trace_printk("%ld ns per entry\n", avg);
368 } 368 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ec439999f387..c90c687cf950 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1727,6 +1727,10 @@ static __init int init_trace_selftests(void)
1727 pr_info("Running postponed tracer tests:\n"); 1727 pr_info("Running postponed tracer tests:\n");
1728 1728
1729 list_for_each_entry_safe(p, n, &postponed_selftests, list) { 1729 list_for_each_entry_safe(p, n, &postponed_selftests, list) {
1730 /* This loop can take minutes when sanitizers are enabled, so
 1731 * let's make sure we allow RCU processing.
1732 */
1733 cond_resched();
1730 ret = run_tracer_selftest(p->type); 1734 ret = run_tracer_selftest(p->type);
1731 /* If the test fails, then warn and remove from available_tracers */ 1735 /* If the test fails, then warn and remove from available_tracers */
1732 if (ret < 0) { 1736 if (ret < 0) {
@@ -3045,6 +3049,7 @@ void trace_printk_init_buffers(void)
3045 if (global_trace.trace_buffer.buffer) 3049 if (global_trace.trace_buffer.buffer)
3046 tracing_start_cmdline_record(); 3050 tracing_start_cmdline_record();
3047} 3051}
3052EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
3048 3053
3049void trace_printk_start_comm(void) 3054void trace_printk_start_comm(void)
3050{ 3055{
@@ -3205,6 +3210,7 @@ int trace_array_printk(struct trace_array *tr,
3205 va_end(ap); 3210 va_end(ap);
3206 return ret; 3211 return ret;
3207} 3212}
3213EXPORT_SYMBOL_GPL(trace_array_printk);
3208 3214
3209__printf(3, 4) 3215__printf(3, 4)
3210int trace_array_printk_buf(struct ring_buffer *buffer, 3216int trace_array_printk_buf(struct ring_buffer *buffer,
@@ -3483,33 +3489,68 @@ static void s_stop(struct seq_file *m, void *p)
3483} 3489}
3484 3490
3485static void 3491static void
3492get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total,
3493 unsigned long *entries, int cpu)
3494{
3495 unsigned long count;
3496
3497 count = ring_buffer_entries_cpu(buf->buffer, cpu);
3498 /*
3499 * If this buffer has skipped entries, then we hold all
3500 * entries for the trace and we need to ignore the
3501 * ones before the time stamp.
3502 */
3503 if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
3504 count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
3505 /* total is the same as the entries */
3506 *total = count;
3507 } else
3508 *total = count +
3509 ring_buffer_overrun_cpu(buf->buffer, cpu);
3510 *entries = count;
3511}
3512
3513static void
3486get_total_entries(struct trace_buffer *buf, 3514get_total_entries(struct trace_buffer *buf,
3487 unsigned long *total, unsigned long *entries) 3515 unsigned long *total, unsigned long *entries)
3488{ 3516{
3489 unsigned long count; 3517 unsigned long t, e;
3490 int cpu; 3518 int cpu;
3491 3519
3492 *total = 0; 3520 *total = 0;
3493 *entries = 0; 3521 *entries = 0;
3494 3522
3495 for_each_tracing_cpu(cpu) { 3523 for_each_tracing_cpu(cpu) {
3496 count = ring_buffer_entries_cpu(buf->buffer, cpu); 3524 get_total_entries_cpu(buf, &t, &e, cpu);
3497 /* 3525 *total += t;
3498 * If this buffer has skipped entries, then we hold all 3526 *entries += e;
3499 * entries for the trace and we need to ignore the
3500 * ones before the time stamp.
3501 */
3502 if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
3503 count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
3504 /* total is the same as the entries */
3505 *total += count;
3506 } else
3507 *total += count +
3508 ring_buffer_overrun_cpu(buf->buffer, cpu);
3509 *entries += count;
3510 } 3527 }
3511} 3528}
3512 3529
3530unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu)
3531{
3532 unsigned long total, entries;
3533
3534 if (!tr)
3535 tr = &global_trace;
3536
3537 get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu);
3538
3539 return entries;
3540}
3541
3542unsigned long trace_total_entries(struct trace_array *tr)
3543{
3544 unsigned long total, entries;
3545
3546 if (!tr)
3547 tr = &global_trace;
3548
3549 get_total_entries(&tr->trace_buffer, &total, &entries);
3550
3551 return entries;
3552}
3553
3513static void print_lat_help_header(struct seq_file *m) 3554static void print_lat_help_header(struct seq_file *m)
3514{ 3555{
3515 seq_puts(m, "# _------=> CPU# \n" 3556 seq_puts(m, "# _------=> CPU# \n"
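
The new trace_total_entries_cpu()/trace_total_entries() helpers above give other kernel code a way to read entry counts without walking the ring buffer itself. A minimal sketch of a caller, assuming the top-level trace array is wanted (NULL selects it, as the helpers show) and that trace.h plus the printk helpers are already in scope; report_trace_usage() is a made-up name:

	/* Sketch only: query the global trace array's entry counts. */
	static void report_trace_usage(void)
	{
		unsigned long all  = trace_total_entries(NULL);
		unsigned long cpu0 = trace_total_entries_cpu(NULL, 0);

		pr_info("trace buffer: %lu entries total, %lu on CPU 0\n",
			all, cpu0);
	}
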
@@ -3548,25 +3589,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
3548 unsigned int flags) 3589 unsigned int flags)
3549{ 3590{
3550 bool tgid = flags & TRACE_ITER_RECORD_TGID; 3591 bool tgid = flags & TRACE_ITER_RECORD_TGID;
3551 const char tgid_space[] = " "; 3592 const char *space = " ";
3552 const char space[] = " "; 3593 int prec = tgid ? 10 : 2;
3553 3594
3554 print_event_info(buf, m); 3595 print_event_info(buf, m);
3555 3596
3556 seq_printf(m, "# %s _-----=> irqs-off\n", 3597 seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
3557 tgid ? tgid_space : space); 3598 seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
3558 seq_printf(m, "# %s / _----=> need-resched\n", 3599 seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
3559 tgid ? tgid_space : space); 3600 seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
3560 seq_printf(m, "# %s| / _---=> hardirq/softirq\n", 3601 seq_printf(m, "# %.*s||| / delay\n", prec, space);
3561 tgid ? tgid_space : space); 3602 seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
3562 seq_printf(m, "# %s|| / _--=> preempt-depth\n", 3603 seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
3563 tgid ? tgid_space : space);
3564 seq_printf(m, "# %s||| / delay\n",
3565 tgid ? tgid_space : space);
3566 seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
3567 tgid ? " TGID " : space);
3568 seq_printf(m, "# | | %s | |||| | |\n",
3569 tgid ? " | " : space);
3570} 3604}
3571 3605
3572void 3606void
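
The header rewrite above leans on printf string precision: "%.*s" emits at most prec characters of its string argument, so one long padding string covers both the 10-column TGID layout and the 2-column default. A small stand-alone illustration of the idiom (user-space C, names invented):

	#include <stdio.h>

	int main(void)
	{
		const char *space = "          ";	/* longer than either width */
		int tgid = 1;
		int prec = tgid ? 10 : 2;

		/* prints exactly 'prec' leading spaces */
		printf("# %.*s _-----=> irqs-off\n", prec, space);
		return 0;
	}
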
@@ -4692,6 +4726,7 @@ static const char readme_msg[] =
4692 " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" 4726 " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
4693 " current_tracer\t- function and latency tracers\n" 4727 " current_tracer\t- function and latency tracers\n"
4694 " available_tracers\t- list of configured tracers for current_tracer\n" 4728 " available_tracers\t- list of configured tracers for current_tracer\n"
4729 " error_log\t- error log for failed commands (that support it)\n"
4695 " buffer_size_kb\t- view and modify size of per cpu buffer\n" 4730 " buffer_size_kb\t- view and modify size of per cpu buffer\n"
4696 " buffer_total_size_kb - view total size of all cpu buffers\n\n" 4731 " buffer_total_size_kb - view total size of all cpu buffers\n\n"
4697 " trace_clock\t\t-change the clock used to order events\n" 4732 " trace_clock\t\t-change the clock used to order events\n"
@@ -4712,7 +4747,7 @@ static const char readme_msg[] =
4712 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" 4747 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
4713 "\t\t\t Remove sub-buffer with rmdir\n" 4748 "\t\t\t Remove sub-buffer with rmdir\n"
4714 " trace_options\t\t- Set format or modify how tracing happens\n" 4749 " trace_options\t\t- Set format or modify how tracing happens\n"
4715 "\t\t\t Disable an option by adding a suffix 'no' to the\n" 4750 "\t\t\t Disable an option by prefixing 'no' to the\n"
4716 "\t\t\t option name\n" 4751 "\t\t\t option name\n"
4717 " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" 4752 " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
4718#ifdef CONFIG_DYNAMIC_FTRACE 4753#ifdef CONFIG_DYNAMIC_FTRACE
@@ -6296,13 +6331,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
6296 struct ring_buffer *buffer; 6331 struct ring_buffer *buffer;
6297 struct print_entry *entry; 6332 struct print_entry *entry;
6298 unsigned long irq_flags; 6333 unsigned long irq_flags;
6299 const char faulted[] = "<faulted>";
6300 ssize_t written; 6334 ssize_t written;
6301 int size; 6335 int size;
6302 int len; 6336 int len;
6303 6337
6304/* Used in tracing_mark_raw_write() as well */ 6338/* Used in tracing_mark_raw_write() as well */
6305#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ 6339#define FAULTED_STR "<faulted>"
6340#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */
6306 6341
6307 if (tracing_disabled) 6342 if (tracing_disabled)
6308 return -EINVAL; 6343 return -EINVAL;
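
FAULTED_SIZE works because sizeof() on a string literal counts the terminating NUL; subtracting one leaves the nine visible characters of "<faulted>". A quick stand-alone check of that arithmetic:

	#include <stdio.h>

	#define FAULTED_STR "<faulted>"
	#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1)

	int main(void)
	{
		/* sizeof(FAULTED_STR) == 10, FAULTED_SIZE == 9 */
		printf("%zu %zu\n", sizeof(FAULTED_STR), FAULTED_SIZE);
		return 0;
	}
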
@@ -6334,7 +6369,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
6334 6369
6335 len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); 6370 len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
6336 if (len) { 6371 if (len) {
6337 memcpy(&entry->buf, faulted, FAULTED_SIZE); 6372 memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
6338 cnt = FAULTED_SIZE; 6373 cnt = FAULTED_SIZE;
6339 written = -EFAULT; 6374 written = -EFAULT;
6340 } else 6375 } else
@@ -6375,7 +6410,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
6375 struct ring_buffer_event *event; 6410 struct ring_buffer_event *event;
6376 struct ring_buffer *buffer; 6411 struct ring_buffer *buffer;
6377 struct raw_data_entry *entry; 6412 struct raw_data_entry *entry;
6378 const char faulted[] = "<faulted>";
6379 unsigned long irq_flags; 6413 unsigned long irq_flags;
6380 ssize_t written; 6414 ssize_t written;
6381 int size; 6415 int size;
@@ -6415,7 +6449,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
6415 len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); 6449 len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
6416 if (len) { 6450 if (len) {
6417 entry->id = -1; 6451 entry->id = -1;
6418 memcpy(&entry->buf, faulted, FAULTED_SIZE); 6452 memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
6419 written = -EFAULT; 6453 written = -EFAULT;
6420 } else 6454 } else
6421 written = cnt; 6455 written = cnt;
@@ -6685,11 +6719,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
6685 break; 6719 break;
6686 } 6720 }
6687#endif 6721#endif
6688 if (!tr->allocated_snapshot) { 6722 if (tr->allocated_snapshot)
6723 ret = resize_buffer_duplicate_size(&tr->max_buffer,
6724 &tr->trace_buffer, iter->cpu_file);
6725 else
6689 ret = tracing_alloc_snapshot_instance(tr); 6726 ret = tracing_alloc_snapshot_instance(tr);
6690 if (ret < 0) 6727 if (ret < 0)
6691 break; 6728 break;
6692 }
6693 local_irq_disable(); 6729 local_irq_disable();
6694 /* Now, we're going to swap */ 6730 /* Now, we're going to swap */
6695 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) 6731 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
@@ -6868,6 +6904,250 @@ static const struct file_operations snapshot_raw_fops = {
6868 6904
6869#endif /* CONFIG_TRACER_SNAPSHOT */ 6905#endif /* CONFIG_TRACER_SNAPSHOT */
6870 6906
6907#define TRACING_LOG_ERRS_MAX 8
6908#define TRACING_LOG_LOC_MAX 128
6909
6910#define CMD_PREFIX " Command: "
6911
6912struct err_info {
6913 const char **errs; /* ptr to loc-specific array of err strings */
6914 u8 type; /* index into errs -> specific err string */
6915 u8 pos; /* MAX_FILTER_STR_VAL = 256 */
6916 u64 ts;
6917};
6918
6919struct tracing_log_err {
6920 struct list_head list;
6921 struct err_info info;
6922 char loc[TRACING_LOG_LOC_MAX]; /* err location */
6923 char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
6924};
6925
6926static DEFINE_MUTEX(tracing_err_log_lock);
6927
6928static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
6929{
6930 struct tracing_log_err *err;
6931
6932 if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
6933 err = kzalloc(sizeof(*err), GFP_KERNEL);
6934 if (!err)
6935 err = ERR_PTR(-ENOMEM);
6936 tr->n_err_log_entries++;
6937
6938 return err;
6939 }
6940
6941 err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
6942 list_del(&err->list);
6943
6944 return err;
6945}
6946
6947/**
6948 * err_pos - find the position of a string within a command for error careting
6949 * @cmd: The tracing command that caused the error
6950 * @str: The string to position the caret at within @cmd
6951 *
 6952 * Finds the position of the first occurrence of @str within @cmd. The
6953 * return value can be passed to tracing_log_err() for caret placement
6954 * within @cmd.
6955 *
 6956 * Returns the index within @cmd of the first occurrence of @str or 0
6957 * if @str was not found.
6958 */
6959unsigned int err_pos(char *cmd, const char *str)
6960{
6961 char *found;
6962
6963 if (WARN_ON(!strlen(cmd)))
6964 return 0;
6965
6966 found = strstr(cmd, str);
6967 if (found)
6968 return found - cmd;
6969
6970 return 0;
6971}
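
Concretely, err_pos() just returns a byte offset that later becomes the caret column. A hedged sketch with an invented hist command (demo_caret_pos() is not part of the diff):

	/* Sketch: locate an offending token for caret placement. */
	static unsigned int demo_caret_pos(void)
	{
		char cmd[] = "keys=common_pid:onmax(bogus)";

		/* "bogus" starts at offset 22; 0 would mean "not found" */
		return err_pos(cmd, "bogus");
	}
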
6972
6973/**
6974 * tracing_log_err - write an error to the tracing error log
6975 * @tr: The associated trace array for the error (NULL for top level array)
6976 * @loc: A string describing where the error occurred
6977 * @cmd: The tracing command that caused the error
6978 * @errs: The array of loc-specific static error strings
6979 * @type: The index into errs[], which produces the specific static err string
6980 * @pos: The position the caret should be placed in the cmd
6981 *
6982 * Writes an error into tracing/error_log of the form:
6983 *
6984 * <loc>: error: <text>
6985 * Command: <cmd>
6986 * ^
6987 *
6988 * tracing/error_log is a small log file containing the last
6989 * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated
6990 * unless there has been a tracing error, and the error log can be
6991 * cleared and have its memory freed by writing the empty string in
 6992 * truncation mode to it, i.e. echo > tracing/error_log.
6993 *
 6994 * NOTE: the @errs array and the @type param are used to
6995 * produce a static error string - this string is not copied and saved
6996 * when the error is logged - only a pointer to it is saved. See
6997 * existing callers for examples of how static strings are typically
6998 * defined for use with tracing_log_err().
6999 */
7000void tracing_log_err(struct trace_array *tr,
7001 const char *loc, const char *cmd,
7002 const char **errs, u8 type, u8 pos)
7003{
7004 struct tracing_log_err *err;
7005
7006 if (!tr)
7007 tr = &global_trace;
7008
7009 mutex_lock(&tracing_err_log_lock);
7010 err = get_tracing_log_err(tr);
7011 if (PTR_ERR(err) == -ENOMEM) {
7012 mutex_unlock(&tracing_err_log_lock);
7013 return;
7014 }
7015
7016 snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
 7017 snprintf(err->cmd, MAX_FILTER_STR_VAL, "\n" CMD_PREFIX "%s\n", cmd);
7018
7019 err->info.errs = errs;
7020 err->info.type = type;
7021 err->info.pos = pos;
7022 err->info.ts = local_clock();
7023
7024 list_add_tail(&err->list, &tr->err_log);
7025 mutex_unlock(&tracing_err_log_lock);
7026}
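
Callers keep their error strings static and select one with @type; the filter and hist changes later in this series build those arrays with a C() x-macro, along the lines of this hedged sketch (DEMO_ERRORS, demo_errs, demo_log and the strings are invented):

	#define DEMO_ERRORS				\
		C(NONE,      "No error"),		\
		C(NOT_FOUND, "Couldn't find field")

	#undef C
	#define C(a, b)	DEMO_ERR_##a
	enum { DEMO_ERRORS };

	#undef C
	#define C(a, b)	b
	static const char *demo_errs[] = { DEMO_ERRORS };

	/* Logs "<loc>: error: Couldn't find field", the command and a caret. */
	static void demo_log(struct trace_array *tr, char *cmd, const char *field)
	{
		tracing_log_err(tr, "demo: system.event", cmd, demo_errs,
				DEMO_ERR_NOT_FOUND, err_pos(cmd, field));
	}
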
7027
7028static void clear_tracing_err_log(struct trace_array *tr)
7029{
7030 struct tracing_log_err *err, *next;
7031
7032 mutex_lock(&tracing_err_log_lock);
7033 list_for_each_entry_safe(err, next, &tr->err_log, list) {
7034 list_del(&err->list);
7035 kfree(err);
7036 }
7037
7038 tr->n_err_log_entries = 0;
7039 mutex_unlock(&tracing_err_log_lock);
7040}
7041
7042static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos)
7043{
7044 struct trace_array *tr = m->private;
7045
7046 mutex_lock(&tracing_err_log_lock);
7047
7048 return seq_list_start(&tr->err_log, *pos);
7049}
7050
7051static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos)
7052{
7053 struct trace_array *tr = m->private;
7054
7055 return seq_list_next(v, &tr->err_log, pos);
7056}
7057
7058static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
7059{
7060 mutex_unlock(&tracing_err_log_lock);
7061}
7062
7063static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
7064{
7065 u8 i;
7066
7067 for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
7068 seq_putc(m, ' ');
7069 for (i = 0; i < pos; i++)
7070 seq_putc(m, ' ');
7071 seq_puts(m, "^\n");
7072}
7073
7074static int tracing_err_log_seq_show(struct seq_file *m, void *v)
7075{
7076 struct tracing_log_err *err = v;
7077
7078 if (err) {
7079 const char *err_text = err->info.errs[err->info.type];
7080 u64 sec = err->info.ts;
7081 u32 nsec;
7082
7083 nsec = do_div(sec, NSEC_PER_SEC);
7084 seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000,
7085 err->loc, err_text);
7086 seq_printf(m, "%s", err->cmd);
7087 tracing_err_log_show_pos(m, err->info.pos);
7088 }
7089
7090 return 0;
7091}
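
The timestamp handling above is the usual 64-bit split: do_div() leaves the quotient (seconds) in its first argument and returns the remainder (nanoseconds), which is then printed as microseconds. A hedged sketch of the same arithmetic with a made-up value:

	/* Sketch: format a local_clock() value the way error_log does. */
	static void demo_stamp(struct seq_file *m)
	{
		u64 sec = 1234567891234ULL;		/* example timestamp in ns */
		u32 nsec = do_div(sec, NSEC_PER_SEC);	/* sec = 1234, nsec = 567891234 */

		seq_printf(m, "[%5llu.%06u] ", sec, nsec / 1000);	/* "[ 1234.567891] " */
	}
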
7092
7093static const struct seq_operations tracing_err_log_seq_ops = {
7094 .start = tracing_err_log_seq_start,
7095 .next = tracing_err_log_seq_next,
7096 .stop = tracing_err_log_seq_stop,
7097 .show = tracing_err_log_seq_show
7098};
7099
7100static int tracing_err_log_open(struct inode *inode, struct file *file)
7101{
7102 struct trace_array *tr = inode->i_private;
7103 int ret = 0;
7104
7105 if (trace_array_get(tr) < 0)
7106 return -ENODEV;
7107
7108 /* If this file was opened for write, then erase contents */
7109 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
7110 clear_tracing_err_log(tr);
7111
7112 if (file->f_mode & FMODE_READ) {
7113 ret = seq_open(file, &tracing_err_log_seq_ops);
7114 if (!ret) {
7115 struct seq_file *m = file->private_data;
7116 m->private = tr;
7117 } else {
7118 trace_array_put(tr);
7119 }
7120 }
7121 return ret;
7122}
7123
7124static ssize_t tracing_err_log_write(struct file *file,
7125 const char __user *buffer,
7126 size_t count, loff_t *ppos)
7127{
7128 return count;
7129}
7130
7131static int tracing_err_log_release(struct inode *inode, struct file *file)
7132{
7133 struct trace_array *tr = inode->i_private;
7134
7135 trace_array_put(tr);
7136
7137 if (file->f_mode & FMODE_READ)
7138 seq_release(inode, file);
7139
7140 return 0;
7141}
7142
7143static const struct file_operations tracing_err_log_fops = {
7144 .open = tracing_err_log_open,
7145 .write = tracing_err_log_write,
7146 .read = seq_read,
7147 .llseek = seq_lseek,
7148 .release = tracing_err_log_release,
7149};
7150
6871static int tracing_buffers_open(struct inode *inode, struct file *filp) 7151static int tracing_buffers_open(struct inode *inode, struct file *filp)
6872{ 7152{
6873 struct trace_array *tr = inode->i_private; 7153 struct trace_array *tr = inode->i_private;
@@ -7926,7 +8206,7 @@ static const struct file_operations buffer_percent_fops = {
7926 .llseek = default_llseek, 8206 .llseek = default_llseek,
7927}; 8207};
7928 8208
7929struct dentry *trace_instance_dir; 8209static struct dentry *trace_instance_dir;
7930 8210
7931static void 8211static void
7932init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); 8212init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
@@ -8033,7 +8313,7 @@ static void update_tracer_options(struct trace_array *tr)
8033 mutex_unlock(&trace_types_lock); 8313 mutex_unlock(&trace_types_lock);
8034} 8314}
8035 8315
8036static int instance_mkdir(const char *name) 8316struct trace_array *trace_array_create(const char *name)
8037{ 8317{
8038 struct trace_array *tr; 8318 struct trace_array *tr;
8039 int ret; 8319 int ret;
@@ -8072,6 +8352,7 @@ static int instance_mkdir(const char *name)
8072 INIT_LIST_HEAD(&tr->systems); 8352 INIT_LIST_HEAD(&tr->systems);
8073 INIT_LIST_HEAD(&tr->events); 8353 INIT_LIST_HEAD(&tr->events);
8074 INIT_LIST_HEAD(&tr->hist_vars); 8354 INIT_LIST_HEAD(&tr->hist_vars);
8355 INIT_LIST_HEAD(&tr->err_log);
8075 8356
8076 if (allocate_trace_buffers(tr, trace_buf_size) < 0) 8357 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
8077 goto out_free_tr; 8358 goto out_free_tr;
@@ -8097,7 +8378,7 @@ static int instance_mkdir(const char *name)
8097 mutex_unlock(&trace_types_lock); 8378 mutex_unlock(&trace_types_lock);
8098 mutex_unlock(&event_mutex); 8379 mutex_unlock(&event_mutex);
8099 8380
8100 return 0; 8381 return tr;
8101 8382
8102 out_free_tr: 8383 out_free_tr:
8103 free_trace_buffers(tr); 8384 free_trace_buffers(tr);
@@ -8109,33 +8390,21 @@ static int instance_mkdir(const char *name)
8109 mutex_unlock(&trace_types_lock); 8390 mutex_unlock(&trace_types_lock);
8110 mutex_unlock(&event_mutex); 8391 mutex_unlock(&event_mutex);
8111 8392
8112 return ret; 8393 return ERR_PTR(ret);
8394}
8395EXPORT_SYMBOL_GPL(trace_array_create);
8113 8396
8397static int instance_mkdir(const char *name)
8398{
8399 return PTR_ERR_OR_ZERO(trace_array_create(name));
8114} 8400}
8115 8401
8116static int instance_rmdir(const char *name) 8402static int __remove_instance(struct trace_array *tr)
8117{ 8403{
8118 struct trace_array *tr;
8119 int found = 0;
8120 int ret;
8121 int i; 8404 int i;
8122 8405
8123 mutex_lock(&event_mutex);
8124 mutex_lock(&trace_types_lock);
8125
8126 ret = -ENODEV;
8127 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
8128 if (tr->name && strcmp(tr->name, name) == 0) {
8129 found = 1;
8130 break;
8131 }
8132 }
8133 if (!found)
8134 goto out_unlock;
8135
8136 ret = -EBUSY;
8137 if (tr->ref || (tr->current_trace && tr->current_trace->ref)) 8406 if (tr->ref || (tr->current_trace && tr->current_trace->ref))
8138 goto out_unlock; 8407 return -EBUSY;
8139 8408
8140 list_del(&tr->list); 8409 list_del(&tr->list);
8141 8410
@@ -8161,10 +8430,46 @@ static int instance_rmdir(const char *name)
8161 free_cpumask_var(tr->tracing_cpumask); 8430 free_cpumask_var(tr->tracing_cpumask);
8162 kfree(tr->name); 8431 kfree(tr->name);
8163 kfree(tr); 8432 kfree(tr);
8433 tr = NULL;
8164 8434
8165 ret = 0; 8435 return 0;
8436}
8437
8438int trace_array_destroy(struct trace_array *tr)
8439{
8440 int ret;
8441
8442 if (!tr)
8443 return -EINVAL;
8444
8445 mutex_lock(&event_mutex);
8446 mutex_lock(&trace_types_lock);
8447
8448 ret = __remove_instance(tr);
8449
8450 mutex_unlock(&trace_types_lock);
8451 mutex_unlock(&event_mutex);
8452
8453 return ret;
8454}
8455EXPORT_SYMBOL_GPL(trace_array_destroy);
8456
8457static int instance_rmdir(const char *name)
8458{
8459 struct trace_array *tr;
8460 int ret;
8461
8462 mutex_lock(&event_mutex);
8463 mutex_lock(&trace_types_lock);
8464
8465 ret = -ENODEV;
8466 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
8467 if (tr->name && strcmp(tr->name, name) == 0) {
8468 ret = __remove_instance(tr);
8469 break;
8470 }
8471 }
8166 8472
8167 out_unlock:
8168 mutex_unlock(&trace_types_lock); 8473 mutex_unlock(&trace_types_lock);
8169 mutex_unlock(&event_mutex); 8474 mutex_unlock(&event_mutex);
8170 8475
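
With trace_array_create(), trace_array_destroy() and trace_array_printk() exported, a module can drive its own tracing instance without going through instances/ in tracefs. A minimal hedged sketch (module and message names invented; which header carries the new prototypes is an assumption, adjust to wherever they are declared):

	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <linux/err.h>
	#include <linux/trace.h>	/* assumed home of the new prototypes */

	static struct trace_array *demo_tr;

	static int __init demo_init(void)
	{
		demo_tr = trace_array_create("demo_instance");
		if (IS_ERR(demo_tr))
			return PTR_ERR(demo_tr);

		trace_array_printk(demo_tr, _THIS_IP_, "demo instance created\n");
		return 0;
	}

	static void __exit demo_exit(void)
	{
		trace_array_destroy(demo_tr);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
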
@@ -8254,6 +8559,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
8254 tr, &snapshot_fops); 8559 tr, &snapshot_fops);
8255#endif 8560#endif
8256 8561
8562 trace_create_file("error_log", 0644, d_tracer,
8563 tr, &tracing_err_log_fops);
8564
8257 for_each_tracing_cpu(cpu) 8565 for_each_tracing_cpu(cpu)
8258 tracing_init_tracefs_percpu(tr, cpu); 8566 tracing_init_tracefs_percpu(tr, cpu);
8259 8567
@@ -8310,10 +8618,6 @@ struct dentry *tracing_init_dentry(void)
8310 */ 8618 */
8311 tr->dir = debugfs_create_automount("tracing", NULL, 8619 tr->dir = debugfs_create_automount("tracing", NULL,
8312 trace_automount, NULL); 8620 trace_automount, NULL);
8313 if (!tr->dir) {
8314 pr_warn_once("Could not create debugfs directory 'tracing'\n");
8315 return ERR_PTR(-ENOMEM);
8316 }
8317 8621
8318 return NULL; 8622 return NULL;
8319} 8623}
@@ -8616,12 +8920,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
8616 8920
8617 cnt++; 8921 cnt++;
8618 8922
8619 /* reset all but tr, trace, and overruns */ 8923 trace_iterator_reset(&iter);
8620 memset(&iter.seq, 0,
8621 sizeof(struct trace_iterator) -
8622 offsetof(struct trace_iterator, seq));
8623 iter.iter_flags |= TRACE_FILE_LAT_FMT; 8924 iter.iter_flags |= TRACE_FILE_LAT_FMT;
8624 iter.pos = -1;
8625 8925
8626 if (trace_find_next_entry_inc(&iter) != NULL) { 8926 if (trace_find_next_entry_inc(&iter) != NULL) {
8627 int ret; 8927 int ret;
@@ -8839,6 +9139,7 @@ __init static int tracer_alloc_buffers(void)
8839 INIT_LIST_HEAD(&global_trace.systems); 9139 INIT_LIST_HEAD(&global_trace.systems);
8840 INIT_LIST_HEAD(&global_trace.events); 9140 INIT_LIST_HEAD(&global_trace.events);
8841 INIT_LIST_HEAD(&global_trace.hist_vars); 9141 INIT_LIST_HEAD(&global_trace.hist_vars);
9142 INIT_LIST_HEAD(&global_trace.err_log);
8842 list_add(&global_trace.list, &ftrace_trace_arrays); 9143 list_add(&global_trace.list, &ftrace_trace_arrays);
8843 9144
8844 apply_trace_boot_options(); 9145 apply_trace_boot_options();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 639047b259d7..005f08629b8b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -15,7 +15,6 @@
15#include <linux/trace_seq.h> 15#include <linux/trace_seq.h>
16#include <linux/trace_events.h> 16#include <linux/trace_events.h>
17#include <linux/compiler.h> 17#include <linux/compiler.h>
18#include <linux/trace_seq.h>
19#include <linux/glob.h> 18#include <linux/glob.h>
20 19
21#ifdef CONFIG_FTRACE_SYSCALLS 20#ifdef CONFIG_FTRACE_SYSCALLS
@@ -293,11 +292,13 @@ struct trace_array {
293 int nr_topts; 292 int nr_topts;
294 bool clear_trace; 293 bool clear_trace;
295 int buffer_percent; 294 int buffer_percent;
295 unsigned int n_err_log_entries;
296 struct tracer *current_trace; 296 struct tracer *current_trace;
297 unsigned int trace_flags; 297 unsigned int trace_flags;
298 unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; 298 unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
299 unsigned int flags; 299 unsigned int flags;
300 raw_spinlock_t start_lock; 300 raw_spinlock_t start_lock;
301 struct list_head err_log;
301 struct dentry *dir; 302 struct dentry *dir;
302 struct dentry *options; 303 struct dentry *options;
303 struct dentry *percpu_dir; 304 struct dentry *percpu_dir;
@@ -719,6 +720,9 @@ void trace_init_global_iter(struct trace_iterator *iter);
719 720
720void tracing_iter_reset(struct trace_iterator *iter, int cpu); 721void tracing_iter_reset(struct trace_iterator *iter, int cpu);
721 722
723unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu);
724unsigned long trace_total_entries(struct trace_array *tr);
725
722void trace_function(struct trace_array *tr, 726void trace_function(struct trace_array *tr,
723 unsigned long ip, 727 unsigned long ip,
724 unsigned long parent_ip, 728 unsigned long parent_ip,
@@ -1545,7 +1549,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
1545extern void print_subsystem_event_filter(struct event_subsystem *system, 1549extern void print_subsystem_event_filter(struct event_subsystem *system,
1546 struct trace_seq *s); 1550 struct trace_seq *s);
1547extern int filter_assign_type(const char *type); 1551extern int filter_assign_type(const char *type);
1548extern int create_event_filter(struct trace_event_call *call, 1552extern int create_event_filter(struct trace_array *tr,
1553 struct trace_event_call *call,
1549 char *filter_str, bool set_str, 1554 char *filter_str, bool set_str,
1550 struct event_filter **filterp); 1555 struct event_filter **filterp);
1551extern void free_event_filter(struct event_filter *filter); 1556extern void free_event_filter(struct event_filter *filter);
@@ -1876,6 +1881,11 @@ extern ssize_t trace_parse_run_command(struct file *file,
1876 const char __user *buffer, size_t count, loff_t *ppos, 1881 const char __user *buffer, size_t count, loff_t *ppos,
1877 int (*createfn)(int, char**)); 1882 int (*createfn)(int, char**));
1878 1883
1884extern unsigned int err_pos(char *cmd, const char *str);
1885extern void tracing_log_err(struct trace_array *tr,
1886 const char *loc, const char *cmd,
1887 const char **errs, u8 type, u8 pos);
1888
1879/* 1889/*
1880 * Normal trace_printk() and friends allocates special buffers 1890 * Normal trace_printk() and friends allocates special buffers
1881 * to do the manipulation, as well as saves the print formats 1891 * to do the manipulation, as well as saves the print formats
@@ -1956,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
1956 1966
1957extern struct trace_iterator *tracepoint_print_iter; 1967extern struct trace_iterator *tracepoint_print_iter;
1958 1968
1969/*
1970 * Reset the state of the trace_iterator so that it can read consumed data.
1971 * Normally, the trace_iterator is used for reading the data when it is not
1972 * consumed, and must retain state.
1973 */
1974static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
1975{
1976 const size_t offset = offsetof(struct trace_iterator, seq);
1977
1978 /*
1979 * Keep gcc from complaining about overwriting more than just one
1980 * member in the structure.
1981 */
1982 memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset);
1983
1984 iter->pos = -1;
1985}
1986
1959#endif /* _LINUX_KERNEL_TRACE_H */ 1987#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5b3b0c3c8a47..0ce3db67f556 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
832 832
833 return ret; 833 return ret;
834} 834}
835EXPORT_SYMBOL_GPL(ftrace_set_clr_event);
835 836
836/** 837/**
837 * trace_set_clr_event - enable or disable an event 838 * trace_set_clr_event - enable or disable an event
@@ -1318,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1318 char buf[32]; 1319 char buf[32];
1319 int len; 1320 int len;
1320 1321
1321 if (*ppos)
1322 return 0;
1323
1324 if (unlikely(!id)) 1322 if (unlikely(!id))
1325 return -ENODEV; 1323 return -ENODEV;
1326 1324
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 05a66493a164..5079d1db3754 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -66,7 +66,8 @@ static const char * ops[] = { OPS };
66 C(INVALID_FILTER, "Meaningless filter expression"), \ 66 C(INVALID_FILTER, "Meaningless filter expression"), \
67 C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ 67 C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
68 C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ 68 C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \
69 C(NO_FILTER, "No filter found"), 69 C(ERRNO, "Error"), \
70 C(NO_FILTER, "No filter found")
70 71
71#undef C 72#undef C
72#define C(a, b) FILT_ERR_##a 73#define C(a, b) FILT_ERR_##a
@@ -76,7 +77,7 @@ enum { ERRORS };
76#undef C 77#undef C
77#define C(a, b) b 78#define C(a, b) b
78 79
79static char *err_text[] = { ERRORS }; 80static const char *err_text[] = { ERRORS };
80 81
81/* Called after a '!' character but "!=" and "!~" are not "not"s */ 82/* Called after a '!' character but "!=" and "!~" are not "not"s */
82static bool is_not(const char *str) 83static bool is_not(const char *str)
@@ -427,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
427 op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); 428 op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL);
428 if (!op_stack) 429 if (!op_stack)
429 return ERR_PTR(-ENOMEM); 430 return ERR_PTR(-ENOMEM);
430 prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); 431 prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL);
431 if (!prog_stack) { 432 if (!prog_stack) {
432 parse_error(pe, -ENOMEM, 0); 433 parse_error(pe, -ENOMEM, 0);
433 goto out_free; 434 goto out_free;
@@ -578,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
578out_free: 579out_free:
579 kfree(op_stack); 580 kfree(op_stack);
580 kfree(inverts); 581 kfree(inverts);
581 kfree(prog_stack); 582 if (prog_stack) {
583 for (i = 0; prog_stack[i].pred; i++)
584 kfree(prog_stack[i].pred);
585 kfree(prog_stack);
586 }
582 return ERR_PTR(ret); 587 return ERR_PTR(ret);
583} 588}
584 589
@@ -919,7 +924,8 @@ static void remove_filter_string(struct event_filter *filter)
919 filter->filter_string = NULL; 924 filter->filter_string = NULL;
920} 925}
921 926
922static void append_filter_err(struct filter_parse_error *pe, 927static void append_filter_err(struct trace_array *tr,
928 struct filter_parse_error *pe,
923 struct event_filter *filter) 929 struct event_filter *filter)
924{ 930{
925 struct trace_seq *s; 931 struct trace_seq *s;
@@ -947,8 +953,14 @@ static void append_filter_err(struct filter_parse_error *pe,
947 if (pe->lasterr > 0) { 953 if (pe->lasterr > 0) {
948 trace_seq_printf(s, "\n%*s", pos, "^"); 954 trace_seq_printf(s, "\n%*s", pos, "^");
949 trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); 955 trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]);
956 tracing_log_err(tr, "event filter parse error",
957 filter->filter_string, err_text,
958 pe->lasterr, pe->lasterr_pos);
950 } else { 959 } else {
951 trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); 960 trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr);
961 tracing_log_err(tr, "event filter parse error",
962 filter->filter_string, err_text,
963 FILT_ERR_ERRNO, 0);
952 } 964 }
953 trace_seq_putc(s, 0); 965 trace_seq_putc(s, 0);
954 buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); 966 buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL);
@@ -1214,30 +1226,30 @@ static int parse_pred(const char *str, void *data,
1214 * (perf doesn't use it) and grab everything. 1226 * (perf doesn't use it) and grab everything.
1215 */ 1227 */
1216 if (strcmp(field->name, "ip") != 0) { 1228 if (strcmp(field->name, "ip") != 0) {
1217 parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); 1229 parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
1218 goto err_free; 1230 goto err_free;
1219 } 1231 }
1220 pred->fn = filter_pred_none; 1232 pred->fn = filter_pred_none;
1221 1233
1222 /* 1234 /*
1223 * Quotes are not required, but if they exist then we need 1235 * Quotes are not required, but if they exist then we need
1224 * to read them till we hit a matching one. 1236 * to read them till we hit a matching one.
1225 */ 1237 */
1226 if (str[i] == '\'' || str[i] == '"') 1238 if (str[i] == '\'' || str[i] == '"')
1227 q = str[i]; 1239 q = str[i];
1228 else 1240 else
1229 q = 0; 1241 q = 0;
1230 1242
1231 for (i++; str[i]; i++) { 1243 for (i++; str[i]; i++) {
1232 if (q && str[i] == q) 1244 if (q && str[i] == q)
1233 break; 1245 break;
1234 if (!q && (str[i] == ')' || str[i] == '&' || 1246 if (!q && (str[i] == ')' || str[i] == '&' ||
1235 str[i] == '|')) 1247 str[i] == '|'))
1236 break; 1248 break;
1237 } 1249 }
1238 /* Skip quotes */ 1250 /* Skip quotes */
1239 if (q) 1251 if (q)
1240 s++; 1252 s++;
1241 len = i - s; 1253 len = i - s;
1242 if (len >= MAX_FILTER_STR_VAL) { 1254 if (len >= MAX_FILTER_STR_VAL) {
1243 parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); 1255 parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
@@ -1600,7 +1612,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
1600 if (err) { 1612 if (err) {
1601 filter_disable(file); 1613 filter_disable(file);
1602 parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1614 parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1603 append_filter_err(pe, filter); 1615 append_filter_err(tr, pe, filter);
1604 } else 1616 } else
1605 event_set_filtered_flag(file); 1617 event_set_filtered_flag(file);
1606 1618
@@ -1712,7 +1724,8 @@ static void create_filter_finish(struct filter_parse_error *pe)
1712 * information if @set_str is %true and the caller is responsible for 1724 * information if @set_str is %true and the caller is responsible for
1713 * freeing it. 1725 * freeing it.
1714 */ 1726 */
1715static int create_filter(struct trace_event_call *call, 1727static int create_filter(struct trace_array *tr,
1728 struct trace_event_call *call,
1716 char *filter_string, bool set_str, 1729 char *filter_string, bool set_str,
1717 struct event_filter **filterp) 1730 struct event_filter **filterp)
1718{ 1731{
@@ -1729,17 +1742,18 @@ static int create_filter(struct trace_event_call *call,
1729 1742
1730 err = process_preds(call, filter_string, *filterp, pe); 1743 err = process_preds(call, filter_string, *filterp, pe);
1731 if (err && set_str) 1744 if (err && set_str)
1732 append_filter_err(pe, *filterp); 1745 append_filter_err(tr, pe, *filterp);
1733 create_filter_finish(pe); 1746 create_filter_finish(pe);
1734 1747
1735 return err; 1748 return err;
1736} 1749}
1737 1750
1738int create_event_filter(struct trace_event_call *call, 1751int create_event_filter(struct trace_array *tr,
1752 struct trace_event_call *call,
1739 char *filter_str, bool set_str, 1753 char *filter_str, bool set_str,
1740 struct event_filter **filterp) 1754 struct event_filter **filterp)
1741{ 1755{
1742 return create_filter(call, filter_str, set_str, filterp); 1756 return create_filter(tr, call, filter_str, set_str, filterp);
1743} 1757}
1744 1758
1745/** 1759/**
@@ -1766,7 +1780,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
1766 kfree((*filterp)->filter_string); 1780 kfree((*filterp)->filter_string);
1767 (*filterp)->filter_string = NULL; 1781 (*filterp)->filter_string = NULL;
1768 } else { 1782 } else {
1769 append_filter_err(pe, *filterp); 1783 append_filter_err(tr, pe, *filterp);
1770 } 1784 }
1771 } 1785 }
1772 create_filter_finish(pe); 1786 create_filter_finish(pe);
@@ -1797,7 +1811,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
1797 return 0; 1811 return 0;
1798 } 1812 }
1799 1813
1800 err = create_filter(call, filter_string, true, &filter); 1814 err = create_filter(file->tr, call, filter_string, true, &filter);
1801 1815
1802 /* 1816 /*
1803 * Always swap the call filter with the new filter 1817 * Always swap the call filter with the new filter
@@ -2053,7 +2067,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
2053 if (event->filter) 2067 if (event->filter)
2054 goto out_unlock; 2068 goto out_unlock;
2055 2069
2056 err = create_filter(call, filter_str, false, &filter); 2070 err = create_filter(NULL, call, filter_str, false, &filter);
2057 if (err) 2071 if (err)
2058 goto free_filter; 2072 goto free_filter;
2059 2073
@@ -2202,8 +2216,8 @@ static __init int ftrace_test_event_filter(void)
2202 struct test_filter_data_t *d = &test_filter_data[i]; 2216 struct test_filter_data_t *d = &test_filter_data[i];
2203 int err; 2217 int err;
2204 2218
2205 err = create_filter(&event_ftrace_test_filter, d->filter, 2219 err = create_filter(NULL, &event_ftrace_test_filter,
2206 false, &filter); 2220 d->filter, false, &filter);
2207 if (err) { 2221 if (err) {
2208 printk(KERN_INFO 2222 printk(KERN_INFO
2209 "Failed to get filter for '%s', err %d\n", 2223 "Failed to get filter for '%s', err %d\n",
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index a1d20421f4b0..ca6b0dff60c5 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -22,6 +22,57 @@
22 22
23#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ 23#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
24 24
25#define ERRORS \
26 C(NONE, "No error"), \
27 C(DUPLICATE_VAR, "Variable already defined"), \
28 C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \
29 C(TOO_MANY_VARS, "Too many variables defined"), \
30 C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \
31 C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \
32 C(TRIGGER_EEXIST, "Hist trigger already exists"), \
33 C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \
34 C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \
35 C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \
36 C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \
37 C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \
38 C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \
39 C(EVENT_FILE_NOT_FOUND, "Event file not found"), \
40 C(HIST_NOT_FOUND, "Matching event histogram not found"), \
41 C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \
42 C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \
43 C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \
44 C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \
45 C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \
46 C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \
47 C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \
48 C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \
49 C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \
50 C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \
51 C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \
52 C(TOO_MANY_PARAMS, "Too many action params"), \
53 C(PARAM_NOT_FOUND, "Couldn't find param"), \
54 C(INVALID_PARAM, "Invalid action param"), \
55 C(ACTION_NOT_FOUND, "No action found"), \
56 C(NO_SAVE_PARAMS, "No params found for save()"), \
57 C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \
58 C(ACTION_MISMATCH, "Handler doesn't support action"), \
59 C(NO_CLOSING_PAREN, "No closing paren found"), \
60 C(SUBSYS_NOT_FOUND, "Missing subsystem"), \
61 C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \
62 C(INVALID_REF_KEY, "Using variable references in keys not supported"), \
63 C(VAR_NOT_FOUND, "Couldn't find variable"), \
64 C(FIELD_NOT_FOUND, "Couldn't find field"),
65
66#undef C
67#define C(a, b) HIST_ERR_##a
68
69enum { ERRORS };
70
71#undef C
72#define C(a, b) b
73
74static const char *err_text[] = { ERRORS };
75
25struct hist_field; 76struct hist_field;
26 77
27typedef u64 (*hist_field_fn_t) (struct hist_field *field, 78typedef u64 (*hist_field_fn_t) (struct hist_field *field,
@@ -535,62 +586,49 @@ static struct track_data *track_data_alloc(unsigned int key_len,
535 return data; 586 return data;
536} 587}
537 588
538static char last_hist_cmd[MAX_FILTER_STR_VAL]; 589static char last_cmd[MAX_FILTER_STR_VAL];
539static char hist_err_str[MAX_FILTER_STR_VAL]; 590static char last_cmd_loc[MAX_FILTER_STR_VAL];
540 591
541static void last_cmd_set(char *str) 592static int errpos(char *str)
542{ 593{
543 if (!str) 594 return err_pos(last_cmd, str);
544 return;
545
546 strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
547} 595}
548 596
549static void hist_err(char *str, char *var) 597static void last_cmd_set(struct trace_event_file *file, char *str)
550{ 598{
551 int maxlen = MAX_FILTER_STR_VAL - 1; 599 const char *system = NULL, *name = NULL;
600 struct trace_event_call *call;
552 601
553 if (!str) 602 if (!str)
554 return; 603 return;
555 604
556 if (strlen(hist_err_str)) 605 strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
557 return;
558 606
559 if (!var) 607 if (file) {
560 var = ""; 608 call = file->event_call;
561 609
562 if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) 610 system = call->class->system;
563 return; 611 if (system) {
612 name = trace_event_name(call);
613 if (!name)
614 system = NULL;
615 }
616 }
564 617
565 strcat(hist_err_str, str); 618 if (system)
566 strcat(hist_err_str, var); 619 snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
567} 620}
568 621
569static void hist_err_event(char *str, char *system, char *event, char *var) 622static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
570{ 623{
571 char err[MAX_FILTER_STR_VAL]; 624 tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
572 625 err_type, err_pos);
573 if (system && var)
574 snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
575 else if (system)
576 snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
577 else
578 strscpy(err, var, MAX_FILTER_STR_VAL);
579
580 hist_err(str, err);
581} 626}
582 627
583static void hist_err_clear(void) 628static void hist_err_clear(void)
584{ 629{
585 hist_err_str[0] = '\0'; 630 last_cmd[0] = '\0';
586} 631 last_cmd_loc[0] = '\0';
587
588static bool have_hist_err(void)
589{
590 if (strlen(hist_err_str))
591 return true;
592
593 return false;
594} 632}
595 633
596struct synth_trace_event { 634struct synth_trace_event {
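
After this rewrite a hist trigger failure ends up in the shared error_log: last_cmd_set() stores the command plus a "hist:system:event" location, errpos() finds the caret column inside that stored command, and hist_err() hands both to tracing_log_err(). A hedged sketch of the reporting path for a missing field (demo_report_bad_field() is invented; the field string must appear in the recorded command for the caret to line up):

	static void demo_report_bad_field(struct trace_event_file *file,
					  char *trigger, char *field_name)
	{
		last_cmd_set(file, trigger);	/* remember cmd + "hist:sys:event" loc */
		hist_err(file->tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
	}

In tracing/error_log this surfaces roughly as "hist:sched:sched_switch: error: Couldn't find field", followed by the command and a caret under the offending name, per the tracing_log_err() kernel-doc earlier in this diff.
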
@@ -1719,7 +1757,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr,
1719 1757
1720 if (find_var_field(var_hist_data, var_name)) { 1758 if (find_var_field(var_hist_data, var_name)) {
1721 if (found) { 1759 if (found) {
1722 hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); 1760 hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name));
1723 return NULL; 1761 return NULL;
1724 } 1762 }
1725 1763
@@ -1770,7 +1808,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name)
1770 hist_field = find_file_var(file, var_name); 1808 hist_field = find_file_var(file, var_name);
1771 if (hist_field) { 1809 if (hist_field) {
1772 if (found) { 1810 if (found) {
1773 hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); 1811 hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE,
1812 errpos(var_name));
1774 return ERR_PTR(-EINVAL); 1813 return ERR_PTR(-EINVAL);
1775 } 1814 }
1776 1815
@@ -1815,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field,
1815 struct hist_elt_data *elt_data; 1854 struct hist_elt_data *elt_data;
1816 u64 var_val = 0; 1855 u64 var_val = 0;
1817 1856
1857 if (WARN_ON_ONCE(!elt))
1858 return var_val;
1859
1818 elt_data = elt->private_data; 1860 elt_data = elt->private_data;
1819 var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; 1861 var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
1820 1862
@@ -2002,11 +2044,11 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
2002 attrs->n_actions++; 2044 attrs->n_actions++;
2003 ret = 0; 2045 ret = 0;
2004 } 2046 }
2005
2006 return ret; 2047 return ret;
2007} 2048}
2008 2049
2009static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) 2050static int parse_assignment(struct trace_array *tr,
2051 char *str, struct hist_trigger_attrs *attrs)
2010{ 2052{
2011 int ret = 0; 2053 int ret = 0;
2012 2054
@@ -2062,7 +2104,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
2062 char *assignment; 2104 char *assignment;
2063 2105
2064 if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { 2106 if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
2065 hist_err("Too many variables defined: ", str); 2107 hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str));
2066 ret = -EINVAL; 2108 ret = -EINVAL;
2067 goto out; 2109 goto out;
2068 } 2110 }
@@ -2079,7 +2121,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
2079 return ret; 2121 return ret;
2080} 2122}
2081 2123
2082static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) 2124static struct hist_trigger_attrs *
2125parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
2083{ 2126{
2084 struct hist_trigger_attrs *attrs; 2127 struct hist_trigger_attrs *attrs;
2085 int ret = 0; 2128 int ret = 0;
@@ -2092,7 +2135,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
2092 char *str = strsep(&trigger_str, ":"); 2135 char *str = strsep(&trigger_str, ":");
2093 2136
2094 if (strchr(str, '=')) { 2137 if (strchr(str, '=')) {
2095 ret = parse_assignment(str, attrs); 2138 ret = parse_assignment(tr, str, attrs);
2096 if (ret) 2139 if (ret)
2097 goto free; 2140 goto free;
2098 } else if (strcmp(str, "pause") == 0) 2141 } else if (strcmp(str, "pause") == 0)
@@ -2648,6 +2691,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
2648 char *var_name) 2691 char *var_name)
2649{ 2692{
2650 struct hist_field *var_field = NULL, *ref_field = NULL; 2693 struct hist_field *var_field = NULL, *ref_field = NULL;
2694 struct trace_array *tr = hist_data->event_file->tr;
2651 2695
2652 if (!is_var_ref(var_name)) 2696 if (!is_var_ref(var_name))
2653 return NULL; 2697 return NULL;
@@ -2660,8 +2704,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
2660 system, event_name); 2704 system, event_name);
2661 2705
2662 if (!ref_field) 2706 if (!ref_field)
2663 hist_err_event("Couldn't find variable: $", 2707 hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name));
2664 system, event_name, var_name);
2665 2708
2666 return ref_field; 2709 return ref_field;
2667} 2710}
@@ -2672,6 +2715,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
2672{ 2715{
2673 struct ftrace_event_field *field = NULL; 2716 struct ftrace_event_field *field = NULL;
2674 char *field_name, *modifier, *str; 2717 char *field_name, *modifier, *str;
2718 struct trace_array *tr = file->tr;
2675 2719
2676 modifier = str = kstrdup(field_str, GFP_KERNEL); 2720 modifier = str = kstrdup(field_str, GFP_KERNEL);
2677 if (!modifier) 2721 if (!modifier)
@@ -2695,7 +2739,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
2695 else if (strcmp(modifier, "usecs") == 0) 2739 else if (strcmp(modifier, "usecs") == 0)
2696 *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; 2740 *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
2697 else { 2741 else {
2698 hist_err("Invalid field modifier: ", modifier); 2742 hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
2699 field = ERR_PTR(-EINVAL); 2743 field = ERR_PTR(-EINVAL);
2700 goto out; 2744 goto out;
2701 } 2745 }
@@ -2711,7 +2755,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
2711 else { 2755 else {
2712 field = trace_find_event_field(file->event_call, field_name); 2756 field = trace_find_event_field(file->event_call, field_name);
2713 if (!field || !field->size) { 2757 if (!field || !field->size) {
2714 hist_err("Couldn't find field: ", field_name); 2758 hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
2715 field = ERR_PTR(-EINVAL); 2759 field = ERR_PTR(-EINVAL);
2716 goto out; 2760 goto out;
2717 } 2761 }
@@ -2773,7 +2817,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
2773 2817
2774 s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); 2818 s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
2775 if (!s) { 2819 if (!s) {
2776 hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); 2820 hist_field = parse_var_ref(hist_data, ref_system,
2821 ref_event, ref_var);
2777 if (hist_field) { 2822 if (hist_field) {
2778 if (var_name) { 2823 if (var_name) {
2779 hist_field = create_alias(hist_data, hist_field, var_name); 2824 hist_field = create_alias(hist_data, hist_field, var_name);
@@ -2822,7 +2867,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
2822 /* we support only -(xxx) i.e. explicit parens required */ 2867 /* we support only -(xxx) i.e. explicit parens required */
2823 2868
2824 if (level > 3) { 2869 if (level > 3) {
2825 hist_err("Too many subexpressions (3 max): ", str); 2870 hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
2826 ret = -EINVAL; 2871 ret = -EINVAL;
2827 goto free; 2872 goto free;
2828 } 2873 }
@@ -2877,7 +2922,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
2877 return ERR_PTR(ret); 2922 return ERR_PTR(ret);
2878} 2923}
2879 2924
2880static int check_expr_operands(struct hist_field *operand1, 2925static int check_expr_operands(struct trace_array *tr,
2926 struct hist_field *operand1,
2881 struct hist_field *operand2) 2927 struct hist_field *operand2)
2882{ 2928{
2883 unsigned long operand1_flags = operand1->flags; 2929 unsigned long operand1_flags = operand1->flags;
@@ -2905,7 +2951,7 @@ static int check_expr_operands(struct hist_field *operand1,
2905 2951
2906 if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != 2952 if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
2907 (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { 2953 (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
2908 hist_err("Timestamp units in expression don't match", NULL); 2954 hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0);
2909 return -EINVAL; 2955 return -EINVAL;
2910 } 2956 }
2911 2957
@@ -2923,7 +2969,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
2923 char *sep, *operand1_str; 2969 char *sep, *operand1_str;
2924 2970
2925 if (level > 3) { 2971 if (level > 3) {
2926 hist_err("Too many subexpressions (3 max): ", str); 2972 hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
2927 return ERR_PTR(-EINVAL); 2973 return ERR_PTR(-EINVAL);
2928 } 2974 }
2929 2975
@@ -2968,7 +3014,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
2968 goto free; 3014 goto free;
2969 } 3015 }
2970 3016
2971 ret = check_expr_operands(operand1, operand2); 3017 ret = check_expr_operands(file->tr, operand1, operand2);
2972 if (ret) 3018 if (ret)
2973 goto free; 3019 goto free;
2974 3020
@@ -3161,16 +3207,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
3161 int ret; 3207 int ret;
3162 3208
3163 if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { 3209 if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
3164 hist_err_event("trace action: Too many field variables defined: ", 3210 hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name));
3165 subsys_name, event_name, field_name);
3166 return ERR_PTR(-EINVAL); 3211 return ERR_PTR(-EINVAL);
3167 } 3212 }
3168 3213
3169 file = event_file(tr, subsys_name, event_name); 3214 file = event_file(tr, subsys_name, event_name);
3170 3215
3171 if (IS_ERR(file)) { 3216 if (IS_ERR(file)) {
3172 hist_err_event("trace action: Event file not found: ", 3217 hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name));
3173 subsys_name, event_name, field_name);
3174 ret = PTR_ERR(file); 3218 ret = PTR_ERR(file);
3175 return ERR_PTR(ret); 3219 return ERR_PTR(ret);
3176 } 3220 }
@@ -3183,8 +3227,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
3183 */ 3227 */
3184 hist_data = find_compatible_hist(target_hist_data, file); 3228 hist_data = find_compatible_hist(target_hist_data, file);
3185 if (!hist_data) { 3229 if (!hist_data) {
3186 hist_err_event("trace action: Matching event histogram not found: ", 3230 hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name));
3187 subsys_name, event_name, field_name);
3188 return ERR_PTR(-EINVAL); 3231 return ERR_PTR(-EINVAL);
3189 } 3232 }
3190 3233
@@ -3245,8 +3288,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
3245 kfree(cmd); 3288 kfree(cmd);
3246 kfree(var_hist->cmd); 3289 kfree(var_hist->cmd);
3247 kfree(var_hist); 3290 kfree(var_hist);
3248 hist_err_event("trace action: Couldn't create histogram for field: ", 3291 hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name));
3249 subsys_name, event_name, field_name);
3250 return ERR_PTR(ret); 3292 return ERR_PTR(ret);
3251 } 3293 }
3252 3294
@@ -3258,8 +3300,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
3258 if (IS_ERR_OR_NULL(event_var)) { 3300 if (IS_ERR_OR_NULL(event_var)) {
3259 kfree(var_hist->cmd); 3301 kfree(var_hist->cmd);
3260 kfree(var_hist); 3302 kfree(var_hist);
3261 hist_err_event("trace action: Couldn't find synthetic variable: ", 3303 hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name));
3262 subsys_name, event_name, field_name);
3263 return ERR_PTR(-EINVAL); 3304 return ERR_PTR(-EINVAL);
3264 } 3305 }
3265 3306
@@ -3392,25 +3433,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
3392{ 3433{
3393 struct hist_field *val = NULL, *var = NULL; 3434 struct hist_field *val = NULL, *var = NULL;
3394 unsigned long flags = HIST_FIELD_FL_VAR; 3435 unsigned long flags = HIST_FIELD_FL_VAR;
3436 struct trace_array *tr = file->tr;
3395 struct field_var *field_var; 3437 struct field_var *field_var;
3396 int ret = 0; 3438 int ret = 0;
3397 3439
3398 if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { 3440 if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
3399 hist_err("Too many field variables defined: ", field_name); 3441 hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name));
3400 ret = -EINVAL; 3442 ret = -EINVAL;
3401 goto err; 3443 goto err;
3402 } 3444 }
3403 3445
3404 val = parse_atom(hist_data, file, field_name, &flags, NULL); 3446 val = parse_atom(hist_data, file, field_name, &flags, NULL);
3405 if (IS_ERR(val)) { 3447 if (IS_ERR(val)) {
3406 hist_err("Couldn't parse field variable: ", field_name); 3448 hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name));
3407 ret = PTR_ERR(val); 3449 ret = PTR_ERR(val);
3408 goto err; 3450 goto err;
3409 } 3451 }
3410 3452
3411 var = create_var(hist_data, file, field_name, val->size, val->type); 3453 var = create_var(hist_data, file, field_name, val->size, val->type);
3412 if (IS_ERR(var)) { 3454 if (IS_ERR(var)) {
3413 hist_err("Couldn't create or find variable: ", field_name); 3455 hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name));
3414 kfree(val); 3456 kfree(val);
3415 ret = PTR_ERR(var); 3457 ret = PTR_ERR(var);
3416 goto err; 3458 goto err;
@@ -3543,14 +3585,20 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
3543 struct track_data *track_data = tr->cond_snapshot->cond_data; 3585 struct track_data *track_data = tr->cond_snapshot->cond_data;
3544 struct hist_elt_data *elt_data, *track_elt_data; 3586 struct hist_elt_data *elt_data, *track_elt_data;
3545 struct snapshot_context *context = cond_data; 3587 struct snapshot_context *context = cond_data;
3588 struct action_data *action;
3546 u64 track_val; 3589 u64 track_val;
3547 3590
3548 if (!track_data) 3591 if (!track_data)
3549 return false; 3592 return false;
3550 3593
3594 action = track_data->action_data;
3595
3551 track_val = get_track_val(track_data->hist_data, context->elt, 3596 track_val = get_track_val(track_data->hist_data, context->elt,
3552 track_data->action_data); 3597 track_data->action_data);
3553 3598
3599 if (!action->track_data.check_val(track_data->track_val, track_val))
3600 return false;
3601
3554 track_data->track_val = track_val; 3602 track_data->track_val = track_val;
3555 memcpy(track_data->key, context->key, track_data->key_len); 3603 memcpy(track_data->key, context->key, track_data->key_len);
3556 3604
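
cond_snapshot_update() now consults the handler's check_val() callback before accepting a new tracked value, so a conditional snapshot is refreshed only when the onmax or onchange test passes. The two predicates it selects between look roughly like this (a plain-C sketch; the u64-style signatures are assumed from how the diff calls them):

#include <stdbool.h>
#include <stdint.h>

static bool check_track_val_max(uint64_t track_val, uint64_t var_val)
{
        return var_val > track_val;     /* onmax: only a new maximum counts */
}

static bool check_track_val_changed(uint64_t track_val, uint64_t var_val)
{
        return var_val != track_val;    /* onchange: any different value counts */
}

int main(void)
{
        return check_track_val_max(10, 12) &&
               check_track_val_changed(10, 12) ? 0 : 1;
}
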
@@ -3737,19 +3785,20 @@ static int track_data_create(struct hist_trigger_data *hist_data,
3737{ 3785{
3738 struct hist_field *var_field, *ref_field, *track_var = NULL; 3786 struct hist_field *var_field, *ref_field, *track_var = NULL;
3739 struct trace_event_file *file = hist_data->event_file; 3787 struct trace_event_file *file = hist_data->event_file;
3788 struct trace_array *tr = file->tr;
3740 char *track_data_var_str; 3789 char *track_data_var_str;
3741 int ret = 0; 3790 int ret = 0;
3742 3791
3743 track_data_var_str = data->track_data.var_str; 3792 track_data_var_str = data->track_data.var_str;
3744 if (track_data_var_str[0] != '$') { 3793 if (track_data_var_str[0] != '$') {
3745 hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); 3794 hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str));
3746 return -EINVAL; 3795 return -EINVAL;
3747 } 3796 }
3748 track_data_var_str++; 3797 track_data_var_str++;
3749 3798
3750 var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); 3799 var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str);
3751 if (!var_field) { 3800 if (!var_field) {
3752 hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); 3801 hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str));
3753 return -EINVAL; 3802 return -EINVAL;
3754 } 3803 }
3755 3804
@@ -3762,7 +3811,7 @@ static int track_data_create(struct hist_trigger_data *hist_data,
3762 if (data->handler == HANDLER_ONMAX) 3811 if (data->handler == HANDLER_ONMAX)
3763 track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); 3812 track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64");
3764 if (IS_ERR(track_var)) { 3813 if (IS_ERR(track_var)) {
3765 hist_err("Couldn't create onmax variable: ", "__max"); 3814 hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0);
3766 ret = PTR_ERR(track_var); 3815 ret = PTR_ERR(track_var);
3767 goto out; 3816 goto out;
3768 } 3817 }
@@ -3770,7 +3819,7 @@ static int track_data_create(struct hist_trigger_data *hist_data,
3770 if (data->handler == HANDLER_ONCHANGE) 3819 if (data->handler == HANDLER_ONCHANGE)
3771 track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); 3820 track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64");
3772 if (IS_ERR(track_var)) { 3821 if (IS_ERR(track_var)) {
3773 hist_err("Couldn't create onchange variable: ", "__change"); 3822 hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0);
3774 ret = PTR_ERR(track_var); 3823 ret = PTR_ERR(track_var);
3775 goto out; 3824 goto out;
3776 } 3825 }
@@ -3781,7 +3830,8 @@ static int track_data_create(struct hist_trigger_data *hist_data,
3781 return ret; 3830 return ret;
3782} 3831}
3783 3832
3784static int parse_action_params(char *params, struct action_data *data) 3833static int parse_action_params(struct trace_array *tr, char *params,
3834 struct action_data *data)
3785{ 3835{
3786 char *param, *saved_param; 3836 char *param, *saved_param;
3787 bool first_param = true; 3837 bool first_param = true;
@@ -3789,20 +3839,20 @@ static int parse_action_params(char *params, struct action_data *data)
3789 3839
3790 while (params) { 3840 while (params) {
3791 if (data->n_params >= SYNTH_FIELDS_MAX) { 3841 if (data->n_params >= SYNTH_FIELDS_MAX) {
3792 hist_err("Too many action params", ""); 3842 hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0);
3793 goto out; 3843 goto out;
3794 } 3844 }
3795 3845
3796 param = strsep(&params, ","); 3846 param = strsep(&params, ",");
3797 if (!param) { 3847 if (!param) {
3798 hist_err("No action param found", ""); 3848 hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0);
3799 ret = -EINVAL; 3849 ret = -EINVAL;
3800 goto out; 3850 goto out;
3801 } 3851 }
3802 3852
3803 param = strstrip(param); 3853 param = strstrip(param);
3804 if (strlen(param) < 2) { 3854 if (strlen(param) < 2) {
3805 hist_err("Invalid action param: ", param); 3855 hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param));
3806 ret = -EINVAL; 3856 ret = -EINVAL;
3807 goto out; 3857 goto out;
3808 } 3858 }
@@ -3826,7 +3876,7 @@ static int parse_action_params(char *params, struct action_data *data)
3826 return ret; 3876 return ret;
3827} 3877}
3828 3878
3829static int action_parse(char *str, struct action_data *data, 3879static int action_parse(struct trace_array *tr, char *str, struct action_data *data,
3830 enum handler_id handler) 3880 enum handler_id handler)
3831{ 3881{
3832 char *action_name; 3882 char *action_name;
@@ -3834,14 +3884,14 @@ static int action_parse(char *str, struct action_data *data,
3834 3884
3835 strsep(&str, "."); 3885 strsep(&str, ".");
3836 if (!str) { 3886 if (!str) {
3837 hist_err("action parsing: No action found", ""); 3887 hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0);
3838 ret = -EINVAL; 3888 ret = -EINVAL;
3839 goto out; 3889 goto out;
3840 } 3890 }
3841 3891
3842 action_name = strsep(&str, "("); 3892 action_name = strsep(&str, "(");
3843 if (!action_name || !str) { 3893 if (!action_name || !str) {
3844 hist_err("action parsing: No action found", ""); 3894 hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0);
3845 ret = -EINVAL; 3895 ret = -EINVAL;
3846 goto out; 3896 goto out;
3847 } 3897 }
@@ -3850,12 +3900,12 @@ static int action_parse(char *str, struct action_data *data,
3850 char *params = strsep(&str, ")"); 3900 char *params = strsep(&str, ")");
3851 3901
3852 if (!params) { 3902 if (!params) {
3853 hist_err("action parsing: No params found for %s", "save"); 3903 hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0);
3854 ret = -EINVAL; 3904 ret = -EINVAL;
3855 goto out; 3905 goto out;
3856 } 3906 }
3857 3907
3858 ret = parse_action_params(params, data); 3908 ret = parse_action_params(tr, params, data);
3859 if (ret) 3909 if (ret)
3860 goto out; 3910 goto out;
3861 3911
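
action_parse() splits a clause of the form handler.action(params) with strsep(), and the new HIST_ERR_* codes name exactly which piece was missing. A self-contained userspace sketch of that split (the trigger string below is only an example):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
        char buf[] = "onmax($lat).save(comm,pid)";
        char *str = buf, *action_name, *params;

        strsep(&str, ".");                      /* drop "onmax($lat)" */
        if (!str)
                return 1;                       /* HIST_ERR_ACTION_NOT_FOUND */

        action_name = strsep(&str, "(");        /* "save" */
        if (!action_name || !str)
                return 1;                       /* HIST_ERR_ACTION_NOT_FOUND */

        params = strsep(&str, ")");             /* "comm,pid" */
        if (!params)
                return 1;                       /* HIST_ERR_NO_SAVE_PARAMS */

        printf("action=%s params=%s\n", action_name, params);
        return 0;
}
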
@@ -3864,7 +3914,7 @@ static int action_parse(char *str, struct action_data *data,
3864 else if (handler == HANDLER_ONCHANGE) 3914 else if (handler == HANDLER_ONCHANGE)
3865 data->track_data.check_val = check_track_val_changed; 3915 data->track_data.check_val = check_track_val_changed;
3866 else { 3916 else {
3867 hist_err("action parsing: Handler doesn't support action: ", action_name); 3917 hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name));
3868 ret = -EINVAL; 3918 ret = -EINVAL;
3869 goto out; 3919 goto out;
3870 } 3920 }
@@ -3876,7 +3926,7 @@ static int action_parse(char *str, struct action_data *data,
3876 char *params = strsep(&str, ")"); 3926 char *params = strsep(&str, ")");
3877 3927
3878 if (!str) { 3928 if (!str) {
3879 hist_err("action parsing: No closing paren found: %s", params); 3929 hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params));
3880 ret = -EINVAL; 3930 ret = -EINVAL;
3881 goto out; 3931 goto out;
3882 } 3932 }
@@ -3886,7 +3936,7 @@ static int action_parse(char *str, struct action_data *data,
3886 else if (handler == HANDLER_ONCHANGE) 3936 else if (handler == HANDLER_ONCHANGE)
3887 data->track_data.check_val = check_track_val_changed; 3937 data->track_data.check_val = check_track_val_changed;
3888 else { 3938 else {
3889 hist_err("action parsing: Handler doesn't support action: ", action_name); 3939 hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name));
3890 ret = -EINVAL; 3940 ret = -EINVAL;
3891 goto out; 3941 goto out;
3892 } 3942 }
@@ -3901,7 +3951,7 @@ static int action_parse(char *str, struct action_data *data,
3901 data->use_trace_keyword = true; 3951 data->use_trace_keyword = true;
3902 3952
3903 if (params) { 3953 if (params) {
3904 ret = parse_action_params(params, data); 3954 ret = parse_action_params(tr, params, data);
3905 if (ret) 3955 if (ret)
3906 goto out; 3956 goto out;
3907 } 3957 }
@@ -3954,7 +4004,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data,
3954 goto free; 4004 goto free;
3955 } 4005 }
3956 4006
3957 ret = action_parse(str, data, handler); 4007 ret = action_parse(hist_data->event_file->tr, str, data, handler);
3958 if (ret) 4008 if (ret)
3959 goto free; 4009 goto free;
3960 out: 4010 out:
@@ -4024,6 +4074,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data,
4024 struct action_data *data, 4074 struct action_data *data,
4025 char *system, char *event, char *var) 4075 char *system, char *event, char *var)
4026{ 4076{
4077 struct trace_array *tr = hist_data->event_file->tr;
4027 struct hist_field *hist_field; 4078 struct hist_field *hist_field;
4028 4079
4029 var++; /* skip '$' */ 4080 var++; /* skip '$' */
@@ -4039,7 +4090,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data,
4039 } 4090 }
4040 4091
4041 if (!hist_field) 4092 if (!hist_field)
4042 hist_err_event("trace action: Couldn't find param: $", system, event, var); 4093 hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var));
4043 4094
4044 return hist_field; 4095 return hist_field;
4045} 4096}
@@ -4097,6 +4148,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
4097static int trace_action_create(struct hist_trigger_data *hist_data, 4148static int trace_action_create(struct hist_trigger_data *hist_data,
4098 struct action_data *data) 4149 struct action_data *data)
4099{ 4150{
4151 struct trace_array *tr = hist_data->event_file->tr;
4100 char *event_name, *param, *system = NULL; 4152 char *event_name, *param, *system = NULL;
4101 struct hist_field *hist_field, *var_ref; 4153 struct hist_field *hist_field, *var_ref;
4102 unsigned int i, var_ref_idx; 4154 unsigned int i, var_ref_idx;
@@ -4114,7 +4166,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
4114 4166
4115 event = find_synth_event(synth_event_name); 4167 event = find_synth_event(synth_event_name);
4116 if (!event) { 4168 if (!event) {
4117 hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); 4169 hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name));
4118 return -EINVAL; 4170 return -EINVAL;
4119 } 4171 }
4120 4172
@@ -4175,15 +4227,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
4175 continue; 4227 continue;
4176 } 4228 }
4177 4229
4178 hist_err_event("trace action: Param type doesn't match synthetic event field type: ", 4230 hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param));
4179 system, event_name, param);
4180 kfree(p); 4231 kfree(p);
4181 ret = -EINVAL; 4232 ret = -EINVAL;
4182 goto err; 4233 goto err;
4183 } 4234 }
4184 4235
4185 if (field_pos != event->n_fields) { 4236 if (field_pos != event->n_fields) {
4186 hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); 4237 hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name));
4187 ret = -EINVAL; 4238 ret = -EINVAL;
4188 goto err; 4239 goto err;
4189 } 4240 }
@@ -4202,6 +4253,7 @@ static int action_create(struct hist_trigger_data *hist_data,
4202 struct action_data *data) 4253 struct action_data *data)
4203{ 4254{
4204 struct trace_event_file *file = hist_data->event_file; 4255 struct trace_event_file *file = hist_data->event_file;
4256 struct trace_array *tr = file->tr;
4205 struct track_data *track_data; 4257 struct track_data *track_data;
4206 struct field_var *field_var; 4258 struct field_var *field_var;
4207 unsigned int i; 4259 unsigned int i;
@@ -4229,7 +4281,7 @@ static int action_create(struct hist_trigger_data *hist_data,
4229 if (data->action == ACTION_SAVE) { 4281 if (data->action == ACTION_SAVE) {
4230 if (hist_data->n_save_vars) { 4282 if (hist_data->n_save_vars) {
4231 ret = -EEXIST; 4283 ret = -EEXIST;
4232 hist_err("save action: Can't have more than one save() action per hist", ""); 4284 hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0);
4233 goto out; 4285 goto out;
4234 } 4286 }
4235 4287
@@ -4242,7 +4294,8 @@ static int action_create(struct hist_trigger_data *hist_data,
4242 4294
4243 field_var = create_target_field_var(hist_data, NULL, NULL, param); 4295 field_var = create_target_field_var(hist_data, NULL, NULL, param);
4244 if (IS_ERR(field_var)) { 4296 if (IS_ERR(field_var)) {
4245 hist_err("save action: Couldn't create field variable: ", param); 4297 hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL,
4298 errpos(param));
4246 ret = PTR_ERR(field_var); 4299 ret = PTR_ERR(field_var);
4247 kfree(param); 4300 kfree(param);
4248 goto out; 4301 goto out;
@@ -4276,19 +4329,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
4276 4329
4277 match_event = strsep(&str, ")"); 4330 match_event = strsep(&str, ")");
4278 if (!match_event || !str) { 4331 if (!match_event || !str) {
4279 hist_err("onmatch: Missing closing paren: ", match_event); 4332 hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event));
4280 goto free; 4333 goto free;
4281 } 4334 }
4282 4335
4283 match_event_system = strsep(&match_event, "."); 4336 match_event_system = strsep(&match_event, ".");
4284 if (!match_event) { 4337 if (!match_event) {
4285 hist_err("onmatch: Missing subsystem for match event: ", match_event_system); 4338 hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system));
4286 goto free; 4339 goto free;
4287 } 4340 }
4288 4341
4289 if (IS_ERR(event_file(tr, match_event_system, match_event))) { 4342 if (IS_ERR(event_file(tr, match_event_system, match_event))) {
4290 hist_err_event("onmatch: Invalid subsystem or event name: ", 4343 hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event));
4291 match_event_system, match_event, NULL);
4292 goto free; 4344 goto free;
4293 } 4345 }
4294 4346
@@ -4304,7 +4356,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
4304 goto free; 4356 goto free;
4305 } 4357 }
4306 4358
4307 ret = action_parse(str, data, HANDLER_ONMATCH); 4359 ret = action_parse(tr, str, data, HANDLER_ONMATCH);
4308 if (ret) 4360 if (ret)
4309 goto free; 4361 goto free;
4310 out: 4362 out:
@@ -4373,13 +4425,14 @@ static int create_var_field(struct hist_trigger_data *hist_data,
4373 struct trace_event_file *file, 4425 struct trace_event_file *file,
4374 char *var_name, char *expr_str) 4426 char *var_name, char *expr_str)
4375{ 4427{
4428 struct trace_array *tr = hist_data->event_file->tr;
4376 unsigned long flags = 0; 4429 unsigned long flags = 0;
4377 4430
4378 if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) 4431 if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
4379 return -EINVAL; 4432 return -EINVAL;
4380 4433
4381 if (find_var(hist_data, file, var_name) && !hist_data->remove) { 4434 if (find_var(hist_data, file, var_name) && !hist_data->remove) {
4382 hist_err("Variable already defined: ", var_name); 4435 hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name));
4383 return -EINVAL; 4436 return -EINVAL;
4384 } 4437 }
4385 4438
@@ -4436,8 +4489,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
4436 struct trace_event_file *file, 4489 struct trace_event_file *file,
4437 char *field_str) 4490 char *field_str)
4438{ 4491{
4492 struct trace_array *tr = hist_data->event_file->tr;
4439 struct hist_field *hist_field = NULL; 4493 struct hist_field *hist_field = NULL;
4440
4441 unsigned long flags = 0; 4494 unsigned long flags = 0;
4442 unsigned int key_size; 4495 unsigned int key_size;
4443 int ret = 0; 4496 int ret = 0;
@@ -4459,8 +4512,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
4459 goto out; 4512 goto out;
4460 } 4513 }
4461 4514
4462 if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { 4515 if (field_has_hist_vars(hist_field, 0)) {
4463 hist_err("Using variable references as keys not supported: ", field_str); 4516 hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str));
4464 destroy_hist_field(hist_field, 0); 4517 destroy_hist_field(hist_field, 0);
4465 ret = -EINVAL; 4518 ret = -EINVAL;
4466 goto out; 4519 goto out;
@@ -4561,6 +4614,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data)
4561 4614
4562static int parse_var_defs(struct hist_trigger_data *hist_data) 4615static int parse_var_defs(struct hist_trigger_data *hist_data)
4563{ 4616{
4617 struct trace_array *tr = hist_data->event_file->tr;
4564 char *s, *str, *var_name, *field_str; 4618 char *s, *str, *var_name, *field_str;
4565 unsigned int i, j, n_vars = 0; 4619 unsigned int i, j, n_vars = 0;
4566 int ret = 0; 4620 int ret = 0;
@@ -4574,13 +4628,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)
4574 4628
4575 var_name = strsep(&field_str, "="); 4629 var_name = strsep(&field_str, "=");
4576 if (!var_name || !field_str) { 4630 if (!var_name || !field_str) {
4577 hist_err("Malformed assignment: ", var_name); 4631 hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT,
4632 errpos(var_name));
4578 ret = -EINVAL; 4633 ret = -EINVAL;
4579 goto free; 4634 goto free;
4580 } 4635 }
4581 4636
4582 if (n_vars == TRACING_MAP_VARS_MAX) { 4637 if (n_vars == TRACING_MAP_VARS_MAX) {
4583 hist_err("Too many variables defined: ", var_name); 4638 hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name));
4584 ret = -EINVAL; 4639 ret = -EINVAL;
4585 goto free; 4640 goto free;
4586 } 4641 }
@@ -5431,11 +5486,6 @@ static int hist_show(struct seq_file *m, void *v)
5431 hist_trigger_show(m, data, n++); 5486 hist_trigger_show(m, data, n++);
5432 } 5487 }
5433 5488
5434 if (have_hist_err()) {
5435 seq_printf(m, "\nERROR: %s\n", hist_err_str);
5436 seq_printf(m, " Last command: %s\n", last_hist_cmd);
5437 }
5438
5439 out_unlock: 5489 out_unlock:
5440 mutex_unlock(&event_mutex); 5490 mutex_unlock(&event_mutex);
5441 5491
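
With this hunk, hist_show() stops appending the old global hist_err_str to the hist file; failures are instead reported through the instance's error log, which prints the saved command with a caret under the recorded position. A rough sketch of that rendering (the exact error_log format is not reproduced here):

#include <stdio.h>

static void demo_render_err(const char *cmd, const char *msg, unsigned int pos)
{
        printf("  Command: %s\n", cmd);
        printf("  %*s^\n", (int)pos + 9, "");   /* "  Command: " is 11 columns */
        printf("ERROR: %s\n", msg);
}

int main(void)
{
        /* "bad" starts at column 14 of the command */
        demo_render_err("hist:keys=pid:bad=expr", "Variable already defined", 14);
        return 0;
}
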
@@ -5800,6 +5850,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5800{ 5850{
5801 struct hist_trigger_data *hist_data = data->private_data; 5851 struct hist_trigger_data *hist_data = data->private_data;
5802 struct event_trigger_data *test, *named_data = NULL; 5852 struct event_trigger_data *test, *named_data = NULL;
5853 struct trace_array *tr = file->tr;
5803 int ret = 0; 5854 int ret = 0;
5804 5855
5805 if (hist_data->attrs->name) { 5856 if (hist_data->attrs->name) {
@@ -5807,7 +5858,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5807 if (named_data) { 5858 if (named_data) {
5808 if (!hist_trigger_match(data, named_data, named_data, 5859 if (!hist_trigger_match(data, named_data, named_data,
5809 true)) { 5860 true)) {
5810 hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); 5861 hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name));
5811 ret = -EINVAL; 5862 ret = -EINVAL;
5812 goto out; 5863 goto out;
5813 } 5864 }
@@ -5828,7 +5879,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5828 else if (hist_data->attrs->clear) 5879 else if (hist_data->attrs->clear)
5829 hist_clear(test); 5880 hist_clear(test);
5830 else { 5881 else {
5831 hist_err("Hist trigger already exists", NULL); 5882 hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0);
5832 ret = -EEXIST; 5883 ret = -EEXIST;
5833 } 5884 }
5834 goto out; 5885 goto out;
@@ -5836,7 +5887,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5836 } 5887 }
5837 new: 5888 new:
5838 if (hist_data->attrs->cont || hist_data->attrs->clear) { 5889 if (hist_data->attrs->cont || hist_data->attrs->clear) {
5839 hist_err("Can't clear or continue a nonexistent hist trigger", NULL); 5890 hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0);
5840 ret = -ENOENT; 5891 ret = -ENOENT;
5841 goto out; 5892 goto out;
5842 } 5893 }
@@ -5861,7 +5912,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5861 5912
5862 ret = tracing_set_clock(file->tr, hist_data->attrs->clock); 5913 ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
5863 if (ret) { 5914 if (ret) {
5864 hist_err("Couldn't set trace_clock: ", clock); 5915 hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock));
5865 goto out; 5916 goto out;
5866 } 5917 }
5867 5918
@@ -6037,8 +6088,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
6037 lockdep_assert_held(&event_mutex); 6088 lockdep_assert_held(&event_mutex);
6038 6089
6039 if (glob && strlen(glob)) { 6090 if (glob && strlen(glob)) {
6040 last_cmd_set(param);
6041 hist_err_clear(); 6091 hist_err_clear();
6092 last_cmd_set(file, param);
6042 } 6093 }
6043 6094
6044 if (!param) 6095 if (!param)
@@ -6079,7 +6130,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
6079 trigger = strstrip(trigger); 6130 trigger = strstrip(trigger);
6080 } 6131 }
6081 6132
6082 attrs = parse_hist_trigger_attrs(trigger); 6133 attrs = parse_hist_trigger_attrs(file->tr, trigger);
6083 if (IS_ERR(attrs)) 6134 if (IS_ERR(attrs))
6084 return PTR_ERR(attrs); 6135 return PTR_ERR(attrs);
6085 6136
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index cd12ecb66eb9..2a2912cb4533 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str,
731 goto out; 731 goto out;
732 732
733 /* The filter is for the 'trigger' event, not the triggered event */ 733 /* The filter is for the 'trigger' event, not the triggered event */
734 ret = create_event_filter(file->event_call, filter_str, false, &filter); 734 ret = create_event_filter(file->tr, file->event_call,
735 filter_str, false, &filter);
735 /* 736 /*
736 * If create_event_filter() fails, filter still needs to be freed. 737 * If create_event_filter() fails, filter still needs to be freed.
737 * Which the calling code will do with data->filter. 738 * Which the calling code will do with data->filter.
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 1e6db9cbe4dc..fa95139445b2 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
277 * of this thread, than stop migrating for the duration 277 * of this thread, than stop migrating for the duration
278 * of the current test. 278 * of the current test.
279 */ 279 */
280 if (!cpumask_equal(current_mask, &current->cpus_allowed)) 280 if (!cpumask_equal(current_mask, current->cpus_ptr))
281 goto disable; 281 goto disable;
282 282
283 get_online_cpus(); 283 get_online_cpus();
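
The hwlat check moves from the embedded cpus_allowed mask to the task's cpus_ptr, which callers are expected to compare through. The struct below is a deliberately simplified stand-in, only to show the shape of the access, not the real task_struct layout or cpumask API:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long cpumask_demo_t;           /* one word of CPU bits */

struct task_demo {
        cpumask_demo_t cpus_mask;               /* the storage */
        const cpumask_demo_t *cpus_ptr;         /* what readers dereference */
};

static bool cpumask_equal_demo(const cpumask_demo_t *a, const cpumask_demo_t *b)
{
        return *a == *b;
}

int main(void)
{
        struct task_demo t = { .cpus_mask = 0xf };
        cpumask_demo_t current_mask = 0xf;

        t.cpus_ptr = &t.cpus_mask;              /* normally points at cpus_mask */
        printf("%d\n", cpumask_equal_demo(&current_mask, t.cpus_ptr));
        return 0;
}

The indirection lets the allowed mask be redirected temporarily without rewriting every reader, which is presumably why only the reader side changes here.
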
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 810d78a8d14c..cca65044c14c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -17,36 +17,28 @@
17#include "trace.h" 17#include "trace.h"
18#include "trace_output.h" 18#include "trace_output.h"
19 19
20static void ftrace_dump_buf(int skip_lines, long cpu_file) 20static struct trace_iterator iter;
21static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
22
23static void ftrace_dump_buf(int skip_entries, long cpu_file)
21{ 24{
22 /* use static because iter can be a bit big for the stack */
23 static struct trace_iterator iter;
24 static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
25 struct trace_array *tr; 25 struct trace_array *tr;
26 unsigned int old_userobj; 26 unsigned int old_userobj;
27 int cnt = 0, cpu; 27 int cnt = 0, cpu;
28 28
29 trace_init_global_iter(&iter);
30 iter.buffer_iter = buffer_iter;
31 tr = iter.tr; 29 tr = iter.tr;
32 30
33 for_each_tracing_cpu(cpu) {
34 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
35 }
36
37 old_userobj = tr->trace_flags; 31 old_userobj = tr->trace_flags;
38 32
39 /* don't look at user memory in panic mode */ 33 /* don't look at user memory in panic mode */
40 tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 34 tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
41 35
42 kdb_printf("Dumping ftrace buffer:\n"); 36 kdb_printf("Dumping ftrace buffer:\n");
37 if (skip_entries)
38 kdb_printf("(skipping %d entries)\n", skip_entries);
43 39
44 /* reset all but tr, trace, and overruns */ 40 trace_iterator_reset(&iter);
45 memset(&iter.seq, 0,
46 sizeof(struct trace_iterator) -
47 offsetof(struct trace_iterator, seq));
48 iter.iter_flags |= TRACE_FILE_LAT_FMT; 41 iter.iter_flags |= TRACE_FILE_LAT_FMT;
49 iter.pos = -1;
50 42
51 if (cpu_file == RING_BUFFER_ALL_CPUS) { 43 if (cpu_file == RING_BUFFER_ALL_CPUS) {
52 for_each_tracing_cpu(cpu) { 44 for_each_tracing_cpu(cpu) {
@@ -70,11 +62,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
70 kdb_printf("---------------------------------\n"); 62 kdb_printf("---------------------------------\n");
71 cnt++; 63 cnt++;
72 64
73 if (!skip_lines) { 65 if (!skip_entries) {
74 print_trace_line(&iter); 66 print_trace_line(&iter);
75 trace_printk_seq(&iter.seq); 67 trace_printk_seq(&iter.seq);
76 } else { 68 } else {
77 skip_lines--; 69 skip_entries--;
78 } 70 }
79 71
80 if (KDB_FLAG(CMD_INTERRUPT)) 72 if (KDB_FLAG(CMD_INTERRUPT))
@@ -90,10 +82,6 @@ out:
90 tr->trace_flags = old_userobj; 82 tr->trace_flags = old_userobj;
91 83
92 for_each_tracing_cpu(cpu) { 84 for_each_tracing_cpu(cpu) {
93 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
94 }
95
96 for_each_tracing_cpu(cpu) {
97 if (iter.buffer_iter[cpu]) { 85 if (iter.buffer_iter[cpu]) {
98 ring_buffer_read_finish(iter.buffer_iter[cpu]); 86 ring_buffer_read_finish(iter.buffer_iter[cpu]);
99 iter.buffer_iter[cpu] = NULL; 87 iter.buffer_iter[cpu] = NULL;
@@ -106,17 +94,19 @@ out:
106 */ 94 */
107static int kdb_ftdump(int argc, const char **argv) 95static int kdb_ftdump(int argc, const char **argv)
108{ 96{
109 int skip_lines = 0; 97 int skip_entries = 0;
110 long cpu_file; 98 long cpu_file;
111 char *cp; 99 char *cp;
100 int cnt;
101 int cpu;
112 102
113 if (argc > 2) 103 if (argc > 2)
114 return KDB_ARGCOUNT; 104 return KDB_ARGCOUNT;
115 105
116 if (argc) { 106 if (argc) {
117 skip_lines = simple_strtol(argv[1], &cp, 0); 107 skip_entries = simple_strtol(argv[1], &cp, 0);
118 if (*cp) 108 if (*cp)
119 skip_lines = 0; 109 skip_entries = 0;
120 } 110 }
121 111
122 if (argc == 2) { 112 if (argc == 2) {
@@ -129,7 +119,29 @@ static int kdb_ftdump(int argc, const char **argv)
129 } 119 }
130 120
131 kdb_trap_printk++; 121 kdb_trap_printk++;
132 ftrace_dump_buf(skip_lines, cpu_file); 122
123 trace_init_global_iter(&iter);
124 iter.buffer_iter = buffer_iter;
125
126 for_each_tracing_cpu(cpu) {
127 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
128 }
129
130 /* A negative skip_entries means skip all but the last entries */
131 if (skip_entries < 0) {
132 if (cpu_file == RING_BUFFER_ALL_CPUS)
133 cnt = trace_total_entries(NULL);
134 else
135 cnt = trace_total_entries_cpu(NULL, cpu_file);
136 skip_entries = max(cnt + skip_entries, 0);
137 }
138
139 ftrace_dump_buf(skip_entries, cpu_file);
140
141 for_each_tracing_cpu(cpu) {
142 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
143 }
144
133 kdb_trap_printk--; 145 kdb_trap_printk--;
134 146
135 return 0; 147 return 0;
@@ -137,8 +149,9 @@ static int kdb_ftdump(int argc, const char **argv)
137 149
138static __init int kdb_ftrace_register(void) 150static __init int kdb_ftrace_register(void)
139{ 151{
140 kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", 152 kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]",
141 "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); 153 "Dump ftrace log; -skip dumps last #entries", 0,
154 KDB_ENABLE_ALWAYS_SAFE);
142 return 0; 155 return 0;
143} 156}
144 157
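
kdb_ftdump() now accepts a negative count meaning skip all but the last N entries: the skip becomes the total entry count plus the (negative) argument, clamped at zero. The arithmetic in isolation:

#include <stdio.h>

static int resolve_skip(int total_entries, int skip_entries)
{
        if (skip_entries < 0) {
                skip_entries += total_entries;
                if (skip_entries < 0)
                        skip_entries = 0;       /* max(cnt + skip_entries, 0) */
        }
        return skip_entries;
}

int main(void)
{
        printf("%d\n", resolve_skip(1000, -50));  /* skip 950, dump the last 50 */
        printf("%d\n", resolve_skip(30, -50));    /* fewer entries than asked: 0 */
        return 0;
}
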
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5d5129b05df7..7d736248a070 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -441,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
441 else 441 else
442 ret = register_kprobe(&tk->rp.kp); 442 ret = register_kprobe(&tk->rp.kp);
443 443
444 if (ret == 0) { 444 if (ret == 0)
445 tk->tp.flags |= TP_FLAG_REGISTERED; 445 tk->tp.flags |= TP_FLAG_REGISTERED;
446 } else if (ret == -EILSEQ) {
447 pr_warn("Probing address(0x%p) is not an instruction boundary.\n",
448 tk->rp.kp.addr);
449 ret = -EINVAL;
450 }
451 return ret; 446 return ret;
452} 447}
453 448
@@ -591,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[])
591 * Type of args: 586 * Type of args:
592 * FETCHARG:TYPE : use TYPE instead of unsigned long. 587 * FETCHARG:TYPE : use TYPE instead of unsigned long.
593 */ 588 */
594 struct trace_kprobe *tk; 589 struct trace_kprobe *tk = NULL;
595 int i, len, ret = 0; 590 int i, len, ret = 0;
596 bool is_return = false; 591 bool is_return = false;
597 char *symbol = NULL, *tmp = NULL; 592 char *symbol = NULL, *tmp = NULL;
@@ -615,44 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[])
615 if (argc < 2) 610 if (argc < 2)
616 return -ECANCELED; 611 return -ECANCELED;
617 612
613 trace_probe_log_init("trace_kprobe", argc, argv);
614
618 event = strchr(&argv[0][1], ':'); 615 event = strchr(&argv[0][1], ':');
619 if (event) 616 if (event)
620 event++; 617 event++;
621 618
622 if (isdigit(argv[0][1])) { 619 if (isdigit(argv[0][1])) {
623 if (!is_return) { 620 if (!is_return) {
624 pr_info("Maxactive is not for kprobe"); 621 trace_probe_log_err(1, MAXACT_NO_KPROBE);
625 return -EINVAL; 622 goto parse_error;
626 } 623 }
627 if (event) 624 if (event)
628 len = event - &argv[0][1] - 1; 625 len = event - &argv[0][1] - 1;
629 else 626 else
630 len = strlen(&argv[0][1]); 627 len = strlen(&argv[0][1]);
631 if (len > MAX_EVENT_NAME_LEN - 1) 628 if (len > MAX_EVENT_NAME_LEN - 1) {
632 return -E2BIG; 629 trace_probe_log_err(1, BAD_MAXACT);
630 goto parse_error;
631 }
633 memcpy(buf, &argv[0][1], len); 632 memcpy(buf, &argv[0][1], len);
634 buf[len] = '\0'; 633 buf[len] = '\0';
635 ret = kstrtouint(buf, 0, &maxactive); 634 ret = kstrtouint(buf, 0, &maxactive);
636 if (ret || !maxactive) { 635 if (ret || !maxactive) {
637 pr_info("Invalid maxactive number\n"); 636 trace_probe_log_err(1, BAD_MAXACT);
638 return ret; 637 goto parse_error;
639 } 638 }
640 /* kretprobes instances are iterated over via a list. The 639 /* kretprobes instances are iterated over via a list. The
641 * maximum should stay reasonable. 640 * maximum should stay reasonable.
642 */ 641 */
643 if (maxactive > KRETPROBE_MAXACTIVE_MAX) { 642 if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
644 pr_info("Maxactive is too big (%d > %d).\n", 643 trace_probe_log_err(1, MAXACT_TOO_BIG);
645 maxactive, KRETPROBE_MAXACTIVE_MAX); 644 goto parse_error;
646 return -E2BIG;
647 } 645 }
648 } 646 }
649 647
650 /* try to parse an address. if that fails, try to read the 648 /* try to parse an address. if that fails, try to read the
651 * input as a symbol. */ 649 * input as a symbol. */
652 if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { 650 if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
651 trace_probe_log_set_index(1);
653 /* Check whether uprobe event specified */ 652 /* Check whether uprobe event specified */
654 if (strchr(argv[1], '/') && strchr(argv[1], ':')) 653 if (strchr(argv[1], '/') && strchr(argv[1], ':')) {
655 return -ECANCELED; 654 ret = -ECANCELED;
655 goto error;
656 }
656 /* a symbol specified */ 657 /* a symbol specified */
657 symbol = kstrdup(argv[1], GFP_KERNEL); 658 symbol = kstrdup(argv[1], GFP_KERNEL);
658 if (!symbol) 659 if (!symbol)
@@ -660,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[])
660 /* TODO: support .init module functions */ 661 /* TODO: support .init module functions */
661 ret = traceprobe_split_symbol_offset(symbol, &offset); 662 ret = traceprobe_split_symbol_offset(symbol, &offset);
662 if (ret || offset < 0 || offset > UINT_MAX) { 663 if (ret || offset < 0 || offset > UINT_MAX) {
663 pr_info("Failed to parse either an address or a symbol.\n"); 664 trace_probe_log_err(0, BAD_PROBE_ADDR);
664 goto out; 665 goto parse_error;
665 } 666 }
666 if (kprobe_on_func_entry(NULL, symbol, offset)) 667 if (kprobe_on_func_entry(NULL, symbol, offset))
667 flags |= TPARG_FL_FENTRY; 668 flags |= TPARG_FL_FENTRY;
668 if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { 669 if (offset && is_return && !(flags & TPARG_FL_FENTRY)) {
669 pr_info("Given offset is not valid for return probe.\n"); 670 trace_probe_log_err(0, BAD_RETPROBE);
670 ret = -EINVAL; 671 goto parse_error;
671 goto out;
672 } 672 }
673 } 673 }
674 argc -= 2; argv += 2;
675 674
675 trace_probe_log_set_index(0);
676 if (event) { 676 if (event) {
677 ret = traceprobe_parse_event_name(&event, &group, buf); 677 ret = traceprobe_parse_event_name(&event, &group, buf,
678 event - argv[0]);
678 if (ret) 679 if (ret)
679 goto out; 680 goto parse_error;
680 } else { 681 } else {
681 /* Make a new event name */ 682 /* Make a new event name */
682 if (symbol) 683 if (symbol)
@@ -691,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[])
691 692
692 /* setup a probe */ 693 /* setup a probe */
693 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, 694 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
694 argc, is_return); 695 argc - 2, is_return);
695 if (IS_ERR(tk)) { 696 if (IS_ERR(tk)) {
696 ret = PTR_ERR(tk); 697 ret = PTR_ERR(tk);
697 /* This must return -ENOMEM otherwise there is a bug */ 698 /* This must return -ENOMEM, else there is a bug */
698 WARN_ON_ONCE(ret != -ENOMEM); 699 WARN_ON_ONCE(ret != -ENOMEM);
699 goto out; 700 goto out; /* We know tk is not allocated */
700 } 701 }
702 argc -= 2; argv += 2;
701 703
702 /* parse arguments */ 704 /* parse arguments */
703 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 705 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
@@ -707,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[])
707 goto error; 709 goto error;
708 } 710 }
709 711
712 trace_probe_log_set_index(i + 2);
710 ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); 713 ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags);
711 kfree(tmp); 714 kfree(tmp);
712 if (ret) 715 if (ret)
713 goto error; 716 goto error; /* This can be -ENOMEM */
714 } 717 }
715 718
716 ret = register_trace_kprobe(tk); 719 ret = register_trace_kprobe(tk);
717 if (ret) 720 if (ret) {
721 trace_probe_log_set_index(1);
722 if (ret == -EILSEQ)
723 trace_probe_log_err(0, BAD_INSN_BNDRY);
724 else if (ret == -ENOENT)
725 trace_probe_log_err(0, BAD_PROBE_ADDR);
726 else if (ret != -ENOMEM)
727 trace_probe_log_err(0, FAIL_REG_PROBE);
718 goto error; 728 goto error;
729 }
730
719out: 731out:
732 trace_probe_log_clear();
720 kfree(symbol); 733 kfree(symbol);
721 return ret; 734 return ret;
722 735
736parse_error:
737 ret = -EINVAL;
723error: 738error:
724 free_trace_kprobe(tk); 739 free_trace_kprobe(tk);
725 goto out; 740 goto out;
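
register_trace_kprobe() failures are now mapped onto log codes at the call site instead of the old single pr_warn() for -EILSEQ inside __register_trace_kprobe(). A sketch of that mapping, with the code names quoted as strings for the demo:

#include <errno.h>
#include <stdio.h>

static const char *map_register_error(int err)
{
        switch (err) {
        case -EILSEQ:
                return "BAD_INSN_BNDRY";        /* not an instruction boundary */
        case -ENOENT:
                return "BAD_PROBE_ADDR";        /* symbol or address does not exist */
        case -ENOMEM:
                return NULL;                    /* allocation failure is not logged */
        default:
                return "FAIL_REG_PROBE";
        }
}

int main(void)
{
        const char *s = map_register_error(-ENOMEM);

        printf("-EILSEQ -> %s\n", map_register_error(-EILSEQ));
        printf("-ENOENT -> %s\n", map_register_error(-ENOENT));
        printf("-ENOMEM -> %s\n", s ? s : "(no log entry)");
        return 0;
}
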
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 54373d93e251..ba751f993c3b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1057 1057
1058 trace_seq_puts(s, "<stack trace>\n"); 1058 trace_seq_puts(s, "<stack trace>\n");
1059 1059
1060 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { 1060 for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) {
1061 1061
1062 if (trace_seq_has_overflowed(s)) 1062 if (trace_seq_has_overflowed(s))
1063 break; 1063 break;
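
The trace_stack_print() change is an ordering fix: test p < end before dereferencing *p, so a stack record that lacks the ULONG_MAX terminator never reads one element past the buffer. The same pattern in isolation:

#include <limits.h>
#include <stdio.h>

int main(void)
{
        /* a record that is not terminated by ULONG_MAX inside the buffer */
        unsigned long caller[3] = { 0x1, 0x2, 0x3 };
        unsigned long *p, *end = caller + 3;

        /* bounds first: with the old order, *p would be read once p == end */
        for (p = caller; p && p < end && *p != ULONG_MAX; p++)
                printf("%#lx\n", *p);
        return 0;
}
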
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8f8411e7835f..a347faced959 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -13,6 +13,11 @@
13 13
14#include "trace_probe.h" 14#include "trace_probe.h"
15 15
16#undef C
17#define C(a, b) b
18
19static const char *trace_probe_err_text[] = { ERRORS };
20
16static const char *reserved_field_names[] = { 21static const char *reserved_field_names[] = {
17 "common_type", 22 "common_type",
18 "common_flags", 23 "common_flags",
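
trace_probe.c redefines C() to pick the message half of each ERRORS entry, while the header side can redefine it to build a matching enum: the usual x-macro trick, so the list is written once. A self-contained illustration of the pattern (the TP_ERR_ prefix and the two sample entries are assumptions of this demo):

#include <stdio.h>

#define ERRORS \
        C(FILE_NOT_FOUND, "Failed to find the given file"), \
        C(BAD_MAXACT, "Invalid maxactive number"),

#undef C
#define C(a, b) TP_ERR_##a
enum { ERRORS };                /* TP_ERR_FILE_NOT_FOUND, TP_ERR_BAD_MAXACT */

#undef C
#define C(a, b) b
static const char *demo_err_text[] = { ERRORS };

int main(void)
{
        printf("%s\n", demo_err_text[TP_ERR_BAD_MAXACT]);
        return 0;
}

Because both expansions come from the same ERRORS list, the enum indices and the text table can never drift apart.
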
@@ -133,6 +138,60 @@ fail:
133 return NULL; 138 return NULL;
134} 139}
135 140
141static struct trace_probe_log trace_probe_log;
142
143void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
144{
145 trace_probe_log.subsystem = subsystem;
146 trace_probe_log.argc = argc;
147 trace_probe_log.argv = argv;
148 trace_probe_log.index = 0;
149}
150
151void trace_probe_log_clear(void)
152{
153 memset(&trace_probe_log, 0, sizeof(trace_probe_log));
154}
155
156void trace_probe_log_set_index(int index)
157{
158 trace_probe_log.index = index;
159}
160
161void __trace_probe_log_err(int offset, int err_type)
162{
163 char *command, *p;
164 int i, len = 0, pos = 0;
165
166 if (!trace_probe_log.argv)
167 return;
168
 169 /* Recalculate the length and allocate buffer */
170 for (i = 0; i < trace_probe_log.argc; i++) {
171 if (i == trace_probe_log.index)
172 pos = len;
173 len += strlen(trace_probe_log.argv[i]) + 1;
174 }
175 command = kzalloc(len, GFP_KERNEL);
176 if (!command)
177 return;
178
179 /* And make a command string from argv array */
180 p = command;
181 for (i = 0; i < trace_probe_log.argc; i++) {
182 len = strlen(trace_probe_log.argv[i]);
183 strcpy(p, trace_probe_log.argv[i]);
184 p[len] = ' ';
185 p += len + 1;
186 }
187 *(p - 1) = '\0';
188
189 tracing_log_err(NULL, trace_probe_log.subsystem, command,
190 trace_probe_err_text, err_type, pos + offset);
191
192 kfree(command);
193}
194
136/* Split symbol and offset. */ 195/* Split symbol and offset. */
137int traceprobe_split_symbol_offset(char *symbol, long *offset) 196int traceprobe_split_symbol_offset(char *symbol, long *offset)
138{ 197{
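
__trace_probe_log_err() rebuilds the command from the saved argv and reports a single column: the lengths of the arguments before the indexed one, plus one joining space each, plus the offset inside that argument. The position arithmetic in isolation (the argument strings are just examples):

#include <stdio.h>
#include <string.h>

static int log_pos(int argc, const char **argv, int index, int offset)
{
        int i, pos = 0;

        for (i = 0; i < index && i < argc; i++)
                pos += strlen(argv[i]) + 1;     /* +1 for the joining space */
        return pos + offset;
}

int main(void)
{
        const char *argv[] = { "p:myprobe", "do_sys_open", "dfd=%badreg" };

        /* error inside argv[2], 4 characters in ("%badreg") */
        printf("caret column %d\n", log_pos(3, argv, 2, 4));
        return 0;
}
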
@@ -156,7 +215,7 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)
156 215
 157/* @buf must have MAX_EVENT_NAME_LEN size */ 216/* @buf must have MAX_EVENT_NAME_LEN size */
158int traceprobe_parse_event_name(const char **pevent, const char **pgroup, 217int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
159 char *buf) 218 char *buf, int offset)
160{ 219{
161 const char *slash, *event = *pevent; 220 const char *slash, *event = *pevent;
162 int len; 221 int len;
@@ -164,32 +223,33 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
164 slash = strchr(event, '/'); 223 slash = strchr(event, '/');
165 if (slash) { 224 if (slash) {
166 if (slash == event) { 225 if (slash == event) {
167 pr_info("Group name is not specified\n"); 226 trace_probe_log_err(offset, NO_GROUP_NAME);
168 return -EINVAL; 227 return -EINVAL;
169 } 228 }
170 if (slash - event + 1 > MAX_EVENT_NAME_LEN) { 229 if (slash - event + 1 > MAX_EVENT_NAME_LEN) {
171 pr_info("Group name is too long\n"); 230 trace_probe_log_err(offset, GROUP_TOO_LONG);
172 return -E2BIG; 231 return -EINVAL;
173 } 232 }
174 strlcpy(buf, event, slash - event + 1); 233 strlcpy(buf, event, slash - event + 1);
175 if (!is_good_name(buf)) { 234 if (!is_good_name(buf)) {
176 pr_info("Group name must follow the same rules as C identifiers\n"); 235 trace_probe_log_err(offset, BAD_GROUP_NAME);
177 return -EINVAL; 236 return -EINVAL;
178 } 237 }
179 *pgroup = buf; 238 *pgroup = buf;
180 *pevent = slash + 1; 239 *pevent = slash + 1;
240 offset += slash - event + 1;
181 event = *pevent; 241 event = *pevent;
182 } 242 }
183 len = strlen(event); 243 len = strlen(event);
184 if (len == 0) { 244 if (len == 0) {
185 pr_info("Event name is not specified\n"); 245 trace_probe_log_err(offset, NO_EVENT_NAME);
186 return -EINVAL; 246 return -EINVAL;
187 } else if (len > MAX_EVENT_NAME_LEN) { 247 } else if (len > MAX_EVENT_NAME_LEN) {
188 pr_info("Event name is too long\n"); 248 trace_probe_log_err(offset, EVENT_TOO_LONG);
189 return -E2BIG; 249 return -EINVAL;
190 } 250 }
191 if (!is_good_name(event)) { 251 if (!is_good_name(event)) {
192 pr_info("Event name must follow the same rules as C identifiers\n"); 252 trace_probe_log_err(offset, BAD_EVENT_NAME);
193 return -EINVAL; 253 return -EINVAL;
194 } 254 }
195 return 0; 255 return 0;
@@ -198,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
198#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 258#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
199 259
200static int parse_probe_vars(char *arg, const struct fetch_type *t, 260static int parse_probe_vars(char *arg, const struct fetch_type *t,
201 struct fetch_insn *code, unsigned int flags) 261 struct fetch_insn *code, unsigned int flags, int offs)
202{ 262{
203 unsigned long param; 263 unsigned long param;
204 int ret = 0; 264 int ret = 0;
205 int len; 265 int len;
206 266
207 if (strcmp(arg, "retval") == 0) { 267 if (strcmp(arg, "retval") == 0) {
208 if (flags & TPARG_FL_RETURN) 268 if (flags & TPARG_FL_RETURN) {
209 code->op = FETCH_OP_RETVAL; 269 code->op = FETCH_OP_RETVAL;
210 else 270 } else {
271 trace_probe_log_err(offs, RETVAL_ON_PROBE);
211 ret = -EINVAL; 272 ret = -EINVAL;
273 }
212 } else if ((len = str_has_prefix(arg, "stack"))) { 274 } else if ((len = str_has_prefix(arg, "stack"))) {
213 if (arg[len] == '\0') { 275 if (arg[len] == '\0') {
214 code->op = FETCH_OP_STACKP; 276 code->op = FETCH_OP_STACKP;
215 } else if (isdigit(arg[len])) { 277 } else if (isdigit(arg[len])) {
216 ret = kstrtoul(arg + len, 10, &param); 278 ret = kstrtoul(arg + len, 10, &param);
217 if (ret || ((flags & TPARG_FL_KERNEL) && 279 if (ret) {
218 param > PARAM_MAX_STACK)) 280 goto inval_var;
281 } else if ((flags & TPARG_FL_KERNEL) &&
282 param > PARAM_MAX_STACK) {
283 trace_probe_log_err(offs, BAD_STACK_NUM);
219 ret = -EINVAL; 284 ret = -EINVAL;
220 else { 285 } else {
221 code->op = FETCH_OP_STACK; 286 code->op = FETCH_OP_STACK;
222 code->param = (unsigned int)param; 287 code->param = (unsigned int)param;
223 } 288 }
224 } else 289 } else
225 ret = -EINVAL; 290 goto inval_var;
226 } else if (strcmp(arg, "comm") == 0) { 291 } else if (strcmp(arg, "comm") == 0) {
227 code->op = FETCH_OP_COMM; 292 code->op = FETCH_OP_COMM;
228#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API 293#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
229 } else if (((flags & TPARG_FL_MASK) == 294 } else if (((flags & TPARG_FL_MASK) ==
230 (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && 295 (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) &&
231 (len = str_has_prefix(arg, "arg"))) { 296 (len = str_has_prefix(arg, "arg"))) {
232 if (!isdigit(arg[len]))
233 return -EINVAL;
234 ret = kstrtoul(arg + len, 10, &param); 297 ret = kstrtoul(arg + len, 10, &param);
235 if (ret || !param || param > PARAM_MAX_STACK) 298 if (ret) {
299 goto inval_var;
300 } else if (!param || param > PARAM_MAX_STACK) {
301 trace_probe_log_err(offs, BAD_ARG_NUM);
236 return -EINVAL; 302 return -EINVAL;
303 }
237 code->op = FETCH_OP_ARG; 304 code->op = FETCH_OP_ARG;
238 code->param = (unsigned int)param - 1; 305 code->param = (unsigned int)param - 1;
239#endif 306#endif
240 } else 307 } else
241 ret = -EINVAL; 308 goto inval_var;
242 309
243 return ret; 310 return ret;
311
312inval_var:
313 trace_probe_log_err(offs, BAD_VAR);
314 return -EINVAL;
244} 315}
245 316
246/* Recursive argument parser */ 317/* Recursive argument parser */
247static int 318static int
248parse_probe_arg(char *arg, const struct fetch_type *type, 319parse_probe_arg(char *arg, const struct fetch_type *type,
249 struct fetch_insn **pcode, struct fetch_insn *end, 320 struct fetch_insn **pcode, struct fetch_insn *end,
250 unsigned int flags) 321 unsigned int flags, int offs)
251{ 322{
252 struct fetch_insn *code = *pcode; 323 struct fetch_insn *code = *pcode;
253 unsigned long param; 324 unsigned long param;
@@ -257,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
257 328
258 switch (arg[0]) { 329 switch (arg[0]) {
259 case '$': 330 case '$':
260 ret = parse_probe_vars(arg + 1, type, code, flags); 331 ret = parse_probe_vars(arg + 1, type, code, flags, offs);
261 break; 332 break;
262 333
263 case '%': /* named register */ 334 case '%': /* named register */
@@ -266,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
266 code->op = FETCH_OP_REG; 337 code->op = FETCH_OP_REG;
267 code->param = (unsigned int)ret; 338 code->param = (unsigned int)ret;
268 ret = 0; 339 ret = 0;
269 } 340 } else
341 trace_probe_log_err(offs, BAD_REG_NAME);
270 break; 342 break;
271 343
272 case '@': /* memory, file-offset or symbol */ 344 case '@': /* memory, file-offset or symbol */
273 if (isdigit(arg[1])) { 345 if (isdigit(arg[1])) {
274 ret = kstrtoul(arg + 1, 0, &param); 346 ret = kstrtoul(arg + 1, 0, &param);
275 if (ret) 347 if (ret) {
348 trace_probe_log_err(offs, BAD_MEM_ADDR);
276 break; 349 break;
350 }
277 /* load address */ 351 /* load address */
278 code->op = FETCH_OP_IMM; 352 code->op = FETCH_OP_IMM;
279 code->immediate = param; 353 code->immediate = param;
280 } else if (arg[1] == '+') { 354 } else if (arg[1] == '+') {
281 /* kprobes don't support file offsets */ 355 /* kprobes don't support file offsets */
282 if (flags & TPARG_FL_KERNEL) 356 if (flags & TPARG_FL_KERNEL) {
357 trace_probe_log_err(offs, FILE_ON_KPROBE);
283 return -EINVAL; 358 return -EINVAL;
284 359 }
285 ret = kstrtol(arg + 2, 0, &offset); 360 ret = kstrtol(arg + 2, 0, &offset);
286 if (ret) 361 if (ret) {
362 trace_probe_log_err(offs, BAD_FILE_OFFS);
287 break; 363 break;
364 }
288 365
289 code->op = FETCH_OP_FOFFS; 366 code->op = FETCH_OP_FOFFS;
290 code->immediate = (unsigned long)offset; // imm64? 367 code->immediate = (unsigned long)offset; // imm64?
291 } else { 368 } else {
292 /* uprobes don't support symbols */ 369 /* uprobes don't support symbols */
293 if (!(flags & TPARG_FL_KERNEL)) 370 if (!(flags & TPARG_FL_KERNEL)) {
371 trace_probe_log_err(offs, SYM_ON_UPROBE);
294 return -EINVAL; 372 return -EINVAL;
295 373 }
296 /* Preserve symbol for updating */ 374 /* Preserve symbol for updating */
297 code->op = FETCH_NOP_SYMBOL; 375 code->op = FETCH_NOP_SYMBOL;
298 code->data = kstrdup(arg + 1, GFP_KERNEL); 376 code->data = kstrdup(arg + 1, GFP_KERNEL);
299 if (!code->data) 377 if (!code->data)
300 return -ENOMEM; 378 return -ENOMEM;
301 if (++code == end) 379 if (++code == end) {
302 return -E2BIG; 380 trace_probe_log_err(offs, TOO_MANY_OPS);
303 381 return -EINVAL;
382 }
304 code->op = FETCH_OP_IMM; 383 code->op = FETCH_OP_IMM;
305 code->immediate = 0; 384 code->immediate = 0;
306 } 385 }
307 /* These are fetching from memory */ 386 /* These are fetching from memory */
308 if (++code == end) 387 if (++code == end) {
309 return -E2BIG; 388 trace_probe_log_err(offs, TOO_MANY_OPS);
389 return -EINVAL;
390 }
310 *pcode = code; 391 *pcode = code;
311 code->op = FETCH_OP_DEREF; 392 code->op = FETCH_OP_DEREF;
312 code->offset = offset; 393 code->offset = offset;
@@ -317,28 +398,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
317 /* fall through */ 398 /* fall through */
318 case '-': 399 case '-':
319 tmp = strchr(arg, '('); 400 tmp = strchr(arg, '(');
320 if (!tmp) 401 if (!tmp) {
402 trace_probe_log_err(offs, DEREF_NEED_BRACE);
321 return -EINVAL; 403 return -EINVAL;
322 404 }
323 *tmp = '\0'; 405 *tmp = '\0';
324 ret = kstrtol(arg, 0, &offset); 406 ret = kstrtol(arg, 0, &offset);
325 if (ret) 407 if (ret) {
408 trace_probe_log_err(offs, BAD_DEREF_OFFS);
326 break; 409 break;
327 410 }
411 offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
328 arg = tmp + 1; 412 arg = tmp + 1;
329 tmp = strrchr(arg, ')'); 413 tmp = strrchr(arg, ')');
330 414 if (!tmp) {
331 if (tmp) { 415 trace_probe_log_err(offs + strlen(arg),
416 DEREF_OPEN_BRACE);
417 return -EINVAL;
418 } else {
332 const struct fetch_type *t2 = find_fetch_type(NULL); 419 const struct fetch_type *t2 = find_fetch_type(NULL);
333 420
334 *tmp = '\0'; 421 *tmp = '\0';
335 ret = parse_probe_arg(arg, t2, &code, end, flags); 422 ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
336 if (ret) 423 if (ret)
337 break; 424 break;
338 if (code->op == FETCH_OP_COMM) 425 if (code->op == FETCH_OP_COMM) {
426 trace_probe_log_err(offs, COMM_CANT_DEREF);
339 return -EINVAL; 427 return -EINVAL;
340 if (++code == end) 428 }
341 return -E2BIG; 429 if (++code == end) {
430 trace_probe_log_err(offs, TOO_MANY_OPS);
431 return -EINVAL;
432 }
342 *pcode = code; 433 *pcode = code;
343 434
344 code->op = FETCH_OP_DEREF; 435 code->op = FETCH_OP_DEREF;
@@ -348,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
348 } 439 }
349 if (!ret && code->op == FETCH_OP_NOP) { 440 if (!ret && code->op == FETCH_OP_NOP) {
350 /* Parsed, but do not find fetch method */ 441 /* Parsed, but do not find fetch method */
442 trace_probe_log_err(offs, BAD_FETCH_ARG);
351 ret = -EINVAL; 443 ret = -EINVAL;
352 } 444 }
353 return ret; 445 return ret;
@@ -379,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf,
379 return -EINVAL; 471 return -EINVAL;
380 code++; 472 code++;
381 if (code->op != FETCH_OP_NOP) 473 if (code->op != FETCH_OP_NOP)
382 return -E2BIG; 474 return -EINVAL;
383 *pcode = code; 475 *pcode = code;
384 476
385 code->op = FETCH_OP_MOD_BF; 477 code->op = FETCH_OP_MOD_BF;
@@ -392,44 +484,66 @@ static int __parse_bitfield_probe_arg(const char *bf,
392 484
393/* String length checking wrapper */ 485/* String length checking wrapper */
394static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, 486static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
395 struct probe_arg *parg, unsigned int flags) 487 struct probe_arg *parg, unsigned int flags, int offset)
396{ 488{
397 struct fetch_insn *code, *scode, *tmp = NULL; 489 struct fetch_insn *code, *scode, *tmp = NULL;
398 char *t, *t2; 490 char *t, *t2, *t3;
399 int ret, len; 491 int ret, len;
400 492
401 if (strlen(arg) > MAX_ARGSTR_LEN) { 493 len = strlen(arg);
402 pr_info("Argument is too long.: %s\n", arg); 494 if (len > MAX_ARGSTR_LEN) {
403 return -ENOSPC; 495 trace_probe_log_err(offset, ARG_TOO_LONG);
496 return -EINVAL;
497 } else if (len == 0) {
498 trace_probe_log_err(offset, NO_ARG_BODY);
499 return -EINVAL;
404 } 500 }
501
405 parg->comm = kstrdup(arg, GFP_KERNEL); 502 parg->comm = kstrdup(arg, GFP_KERNEL);
406 if (!parg->comm) { 503 if (!parg->comm)
407 pr_info("Failed to allocate memory for command '%s'.\n", arg);
408 return -ENOMEM; 504 return -ENOMEM;
409 } 505
410 t = strchr(arg, ':'); 506 t = strchr(arg, ':');
411 if (t) { 507 if (t) {
412 *t = '\0'; 508 *t = '\0';
413 t2 = strchr(++t, '['); 509 t2 = strchr(++t, '[');
414 if (t2) { 510 if (t2) {
415 *t2 = '\0'; 511 *t2++ = '\0';
416 parg->count = simple_strtoul(t2 + 1, &t2, 0); 512 t3 = strchr(t2, ']');
417 if (strcmp(t2, "]") || parg->count == 0) 513 if (!t3) {
514 offset += t2 + strlen(t2) - arg;
515 trace_probe_log_err(offset,
516 ARRAY_NO_CLOSE);
517 return -EINVAL;
518 } else if (t3[1] != '\0') {
519 trace_probe_log_err(offset + t3 + 1 - arg,
520 BAD_ARRAY_SUFFIX);
418 return -EINVAL; 521 return -EINVAL;
419 if (parg->count > MAX_ARRAY_LEN) 522 }
420 return -E2BIG; 523 *t3 = '\0';
524 if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
525 trace_probe_log_err(offset + t2 - arg,
526 BAD_ARRAY_NUM);
527 return -EINVAL;
528 }
529 if (parg->count > MAX_ARRAY_LEN) {
530 trace_probe_log_err(offset + t2 - arg,
531 ARRAY_TOO_BIG);
532 return -EINVAL;
533 }
421 } 534 }
422 } 535 }
423 /* 536
 424 * The default type of $comm should be "string", and it can't be 537 /* Since $comm cannot be dereferenced, we can find $comm by strcmp */
425 * dereferenced. 538 if (strcmp(arg, "$comm") == 0) {
426 */ 539 /* The type of $comm must be "string", and not an array. */
427 if (!t && strcmp(arg, "$comm") == 0) 540 if (parg->count || (t && strcmp(t, "string")))
541 return -EINVAL;
428 parg->type = find_fetch_type("string"); 542 parg->type = find_fetch_type("string");
429 else 543 } else
430 parg->type = find_fetch_type(t); 544 parg->type = find_fetch_type(t);
431 if (!parg->type) { 545 if (!parg->type) {
432 pr_info("Unsupported type: %s\n", t); 546 trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
433 return -EINVAL; 547 return -EINVAL;
434 } 548 }
435 parg->offset = *size; 549 parg->offset = *size;
@@ -444,13 +558,13 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
444 parg->count); 558 parg->count);
445 } 559 }
446 560
447 code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); 561 code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
448 if (!code) 562 if (!code)
449 return -ENOMEM; 563 return -ENOMEM;
450 code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; 564 code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
451 565
452 ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], 566 ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
453 flags); 567 flags, offset);
454 if (ret) 568 if (ret)
455 goto fail; 569 goto fail;
456 570
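
The fetch_insn array allocation switches from kzalloc(sizeof(*code) * FETCH_INSN_MAX, ...) to kcalloc(FETCH_INSN_MAX, sizeof(*code), ...), the array-allocation form that checks the count times size multiplication for overflow. The userspace counterpart of the same idiom (the struct is a stand-in, not the kernel's fetch_insn; 16 mirrors the FETCH_INSN_MAX defined in trace_probe.h):

#include <stdlib.h>

struct fetch_insn_demo {
        int op;
        long val;
};

int main(void)
{
        /* calloc(), like kcalloc(), fails cleanly if count * size would wrap */
        struct fetch_insn_demo *code = calloc(16, sizeof(*code));

        if (!code)
                return 1;
        /* ... build the program, then shrink-copy it as the kernel code does ... */
        free(code);
        return 0;
}
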
@@ -458,7 +572,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
458 if (!strcmp(parg->type->name, "string")) { 572 if (!strcmp(parg->type->name, "string")) {
459 if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && 573 if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM &&
460 code->op != FETCH_OP_COMM) { 574 code->op != FETCH_OP_COMM) {
461 pr_info("string only accepts memory or address.\n"); 575 trace_probe_log_err(offset + (t ? (t - arg) : 0),
576 BAD_STRING);
462 ret = -EINVAL; 577 ret = -EINVAL;
463 goto fail; 578 goto fail;
464 } 579 }
@@ -470,7 +585,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
470 */ 585 */
471 code++; 586 code++;
472 if (code->op != FETCH_OP_NOP) { 587 if (code->op != FETCH_OP_NOP) {
473 ret = -E2BIG; 588 trace_probe_log_err(offset, TOO_MANY_OPS);
589 ret = -EINVAL;
474 goto fail; 590 goto fail;
475 } 591 }
476 } 592 }
@@ -483,7 +599,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
483 } else { 599 } else {
484 code++; 600 code++;
485 if (code->op != FETCH_OP_NOP) { 601 if (code->op != FETCH_OP_NOP) {
486 ret = -E2BIG; 602 trace_probe_log_err(offset, TOO_MANY_OPS);
603 ret = -EINVAL;
487 goto fail; 604 goto fail;
488 } 605 }
489 code->op = FETCH_OP_ST_RAW; 606 code->op = FETCH_OP_ST_RAW;
@@ -493,20 +610,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
493 /* Modify operation */ 610 /* Modify operation */
494 if (t != NULL) { 611 if (t != NULL) {
495 ret = __parse_bitfield_probe_arg(t, parg->type, &code); 612 ret = __parse_bitfield_probe_arg(t, parg->type, &code);
496 if (ret) 613 if (ret) {
614 trace_probe_log_err(offset + t - arg, BAD_BITFIELD);
497 goto fail; 615 goto fail;
616 }
498 } 617 }
499 /* Loop(Array) operation */ 618 /* Loop(Array) operation */
500 if (parg->count) { 619 if (parg->count) {
501 if (scode->op != FETCH_OP_ST_MEM && 620 if (scode->op != FETCH_OP_ST_MEM &&
502 scode->op != FETCH_OP_ST_STRING) { 621 scode->op != FETCH_OP_ST_STRING) {
503 pr_info("array only accepts memory or address\n"); 622 trace_probe_log_err(offset + (t ? (t - arg) : 0),
623 BAD_STRING);
504 ret = -EINVAL; 624 ret = -EINVAL;
505 goto fail; 625 goto fail;
506 } 626 }
507 code++; 627 code++;
508 if (code->op != FETCH_OP_NOP) { 628 if (code->op != FETCH_OP_NOP) {
509 ret = -E2BIG; 629 trace_probe_log_err(offset, TOO_MANY_OPS);
630 ret = -EINVAL;
510 goto fail; 631 goto fail;
511 } 632 }
512 code->op = FETCH_OP_LP_ARRAY; 633 code->op = FETCH_OP_LP_ARRAY;
@@ -516,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
516 code->op = FETCH_OP_END; 637 code->op = FETCH_OP_END;
517 638
518 /* Shrink down the code buffer */ 639 /* Shrink down the code buffer */
519 parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); 640 parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
520 if (!parg->code) 641 if (!parg->code)
521 ret = -ENOMEM; 642 ret = -ENOMEM;
522 else 643 else
@@ -555,15 +676,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
555{ 676{
556 struct probe_arg *parg = &tp->args[i]; 677 struct probe_arg *parg = &tp->args[i];
557 char *body; 678 char *body;
558 int ret;
559 679
560 /* Increment count for freeing args in error case */ 680 /* Increment count for freeing args in error case */
561 tp->nr_args++; 681 tp->nr_args++;
562 682
563 body = strchr(arg, '='); 683 body = strchr(arg, '=');
564 if (body) { 684 if (body) {
565 if (body - arg > MAX_ARG_NAME_LEN || body == arg) 685 if (body - arg > MAX_ARG_NAME_LEN) {
686 trace_probe_log_err(0, ARG_NAME_TOO_LONG);
687 return -EINVAL;
688 } else if (body == arg) {
689 trace_probe_log_err(0, NO_ARG_NAME);
566 return -EINVAL; 690 return -EINVAL;
691 }
567 parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); 692 parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL);
568 body++; 693 body++;
569 } else { 694 } else {
@@ -575,22 +700,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
575 return -ENOMEM; 700 return -ENOMEM;
576 701
577 if (!is_good_name(parg->name)) { 702 if (!is_good_name(parg->name)) {
578 pr_info("Invalid argument[%d] name: %s\n", 703 trace_probe_log_err(0, BAD_ARG_NAME);
579 i, parg->name);
580 return -EINVAL; 704 return -EINVAL;
581 } 705 }
582
583 if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { 706 if (traceprobe_conflict_field_name(parg->name, tp->args, i)) {
584 pr_info("Argument[%d]: '%s' conflicts with another field.\n", 707 trace_probe_log_err(0, USED_ARG_NAME);
585 i, parg->name);
586 return -EINVAL; 708 return -EINVAL;
587 } 709 }
588
589 /* Parse fetch argument */ 710 /* Parse fetch argument */
590 ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); 711 return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags,
591 if (ret) 712 body - arg);
592 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
593 return ret;
594} 713}
595 714
596void traceprobe_free_probe_arg(struct probe_arg *arg) 715void traceprobe_free_probe_arg(struct probe_arg *arg)
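traceprobe_parse_probe_arg() now hands traceprobe_parse_probe_arg_body() the offset of the fetch expression within the full NAME=EXPR string (body - arg), so errors raised while parsing the expression can be positioned against the whole argument. A small userspace sketch of that offset arithmetic (the caret printing stands in for the kernel's error_log formatting and is purely illustrative):

    #include <stdio.h>
    #include <string.h>

    /* Print the argument and a caret under the failing position. */
    static void report_at(const char *arg, int offset, const char *msg)
    {
            printf("error: %s\n  %s\n  %*s^\n", msg, arg, offset, "");
    }

    int main(void)
    {
            const char *arg = "myname=+8(%di):u64";
            const char *body = strchr(arg, '=');

            /* the expression starts after '='; shift expression-relative errors by that much */
            int expr_off = body ? (int)(body - arg) + 1 : 0;

            report_at(arg, expr_off, "bad fetch expression (example)");
            return 0;
    }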
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 2177c206de15..f9a8c632188b 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -124,6 +124,7 @@ struct fetch_insn {
124 124
125/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ 125/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */
126#define FETCH_INSN_MAX 16 126#define FETCH_INSN_MAX 16
127#define FETCH_TOKEN_COMM (-ECOMM)
127 128
128/* Fetch type information table */ 129/* Fetch type information table */
129struct fetch_type { 130struct fetch_type {
@@ -280,8 +281,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg);
280extern void traceprobe_free_probe_arg(struct probe_arg *arg); 281extern void traceprobe_free_probe_arg(struct probe_arg *arg);
281 282
282extern int traceprobe_split_symbol_offset(char *symbol, long *offset); 283extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
283extern int traceprobe_parse_event_name(const char **pevent, 284int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
284 const char **pgroup, char *buf); 285 char *buf, int offset);
285 286
286extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); 287extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return);
287 288
@@ -298,3 +299,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
298#endif 299#endif
299extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, 300extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
300 size_t offset, struct trace_probe *tp); 301 size_t offset, struct trace_probe *tp);
302
303#undef ERRORS
304#define ERRORS \
305 C(FILE_NOT_FOUND, "Failed to find the given file"), \
306 C(NO_REGULAR_FILE, "Not a regular file"), \
307 C(BAD_REFCNT, "Invalid reference counter offset"), \
308 C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \
309 C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \
310 C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \
311 C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \
312 C(BAD_MAXACT, "Invalid maxactive number"), \
313 C(MAXACT_TOO_BIG, "Maxactive is too big"), \
314 C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \
 315 C(BAD_RETPROBE, "Retprobe address must be a function entry"), \
316 C(NO_GROUP_NAME, "Group name is not specified"), \
317 C(GROUP_TOO_LONG, "Group name is too long"), \
318 C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \
319 C(NO_EVENT_NAME, "Event name is not specified"), \
320 C(EVENT_TOO_LONG, "Event name is too long"), \
321 C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \
322 C(RETVAL_ON_PROBE, "$retval is not available on probe"), \
323 C(BAD_STACK_NUM, "Invalid stack number"), \
324 C(BAD_ARG_NUM, "Invalid argument number"), \
 325 C(BAD_VAR, "Invalid $-variable specified"), \
326 C(BAD_REG_NAME, "Invalid register name"), \
327 C(BAD_MEM_ADDR, "Invalid memory address"), \
328 C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \
329 C(BAD_FILE_OFFS, "Invalid file offset value"), \
330 C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \
 331 C(TOO_MANY_OPS, "Dereference is nested too deeply"), \
332 C(DEREF_NEED_BRACE, "Dereference needs a brace"), \
333 C(BAD_DEREF_OFFS, "Invalid dereference offset"), \
334 C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \
335 C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \
336 C(BAD_FETCH_ARG, "Invalid fetch argument"), \
337 C(ARRAY_NO_CLOSE, "Array is not closed"), \
338 C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \
339 C(BAD_ARRAY_NUM, "Invalid array size"), \
340 C(ARRAY_TOO_BIG, "Array number is too big"), \
341 C(BAD_TYPE, "Unknown type is specified"), \
342 C(BAD_STRING, "String accepts only memory argument"), \
343 C(BAD_BITFIELD, "Invalid bitfield"), \
344 C(ARG_NAME_TOO_LONG, "Argument name is too long"), \
345 C(NO_ARG_NAME, "Argument name is not specified"), \
346 C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \
347 C(USED_ARG_NAME, "This argument name is already used"), \
348 C(ARG_TOO_LONG, "Argument expression is too long"), \
349 C(NO_ARG_BODY, "No argument expression"), \
350 C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\
351 C(FAIL_REG_PROBE, "Failed to register probe event"),
352
353#undef C
354#define C(a, b) TP_ERR_##a
355
356/* Define TP_ERR_ */
357enum { ERRORS };
358
359/* Error text is defined in trace_probe.c */
360
361struct trace_probe_log {
362 const char *subsystem;
363 const char **argv;
364 int argc;
365 int index;
366};
367
368void trace_probe_log_init(const char *subsystem, int argc, const char **argv);
369void trace_probe_log_set_index(int index);
370void trace_probe_log_clear(void);
371void __trace_probe_log_err(int offset, int err);
372
373#define trace_probe_log_err(offs, err) \
374 __trace_probe_log_err(offs, TP_ERR_##err)
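The ERRORS list above is an X-macro: the header expands each C(a, b) entry as TP_ERR_##a to build the enum, and trace_probe.c re-expands the same list with C(a, b) defined as b to build the matching message table (hence the "Error text is defined in trace_probe.c" note). A stand-alone sketch of the pattern using a short subset of the entries:

    #include <stdio.h>

    #define ERRORS                                                  \
            C(FILE_NOT_FOUND, "Failed to find the given file"),     \
            C(BAD_REFCNT,     "Invalid reference counter offset"),  \
            C(NO_GROUP_NAME,  "Group name is not specified"),

    /* First expansion: the error codes. */
    #undef C
    #define C(a, b) TP_ERR_##a
    enum { ERRORS };

    /* Second expansion: a parallel table of error strings, indexed by the enum. */
    #undef C
    #define C(a, b) b
    static const char *err_text[] = { ERRORS };

    int main(void)
    {
            printf("%d: %s\n", TP_ERR_BAD_REFCNT, err_text[TP_ERR_BAD_REFCNT]);
            return 0;
    }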
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 4737bb8c07a3..c30c61f12ddd 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -88,7 +88,7 @@ stage3:
88 /* 3rd stage: store value to buffer */ 88 /* 3rd stage: store value to buffer */
89 if (unlikely(!dest)) { 89 if (unlikely(!dest)) {
90 if (code->op == FETCH_OP_ST_STRING) { 90 if (code->op == FETCH_OP_ST_STRING) {
91 ret += fetch_store_strlen(val + code->offset); 91 ret = fetch_store_strlen(val + code->offset);
92 code++; 92 code++;
93 goto array; 93 goto array;
94 } else 94 } else
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 9d402e7fc949..69ee8ef12cee 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace,
792 /* check the trace buffer */ 792 /* check the trace buffer */
793 ret = trace_test_buffer(&tr->trace_buffer, &count); 793 ret = trace_test_buffer(&tr->trace_buffer, &count);
794 794
795 trace->reset(tr); 795 /* Need to also simulate the tr->reset to remove this fgraph_ops */
796 tracing_stop_cmdline_record();
797 unregister_ftrace_graph(&fgraph_ops);
798
796 tracing_start(); 799 tracing_start();
797 800
798 if (!ret && !count) { 801 if (!ret && !count) {
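Because the function_graph selftest registers its own fgraph_ops (earlier in this function, outside the quoted hunk), dropping the trace->reset(tr) call means the test has to undo that registration by hand, which is what the two added calls do. A hedged kernel-style sketch of the pairing (graph_entry_cb/graph_return_cb are illustrative callbacks, not real symbols):

    /* Teardown mirrors the test's own setup when trace->reset() is not called. */
    static struct fgraph_ops fgraph_ops = {
            .entryfunc      = graph_entry_cb,       /* illustrative */
            .retfunc        = graph_return_cb,      /* illustrative */
    };

    static int graph_selftest_body(struct trace_array *tr)
    {
            int ret = register_ftrace_graph(&fgraph_ops);

            if (ret)
                    return ret;
            /* ... run the traced workload and check the ring buffer ... */
            tracing_stop_cmdline_record();          /* what trace->reset() would have done */
            unregister_ftrace_graph(&fgraph_ops);   /* undo the registration above */
            return 0;
    }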
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index be78d99ee6bc..7860e3f59fad 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
156 if (unlikely(!maxlen)) 156 if (unlikely(!maxlen))
157 return -ENOMEM; 157 return -ENOMEM;
158 158
159 ret = strncpy_from_user(dst, src, maxlen); 159 if (addr == FETCH_TOKEN_COMM)
160 ret = strlcpy(dst, current->comm, maxlen);
161 else
162 ret = strncpy_from_user(dst, src, maxlen);
160 if (ret >= 0) { 163 if (ret >= 0) {
161 if (ret == maxlen) 164 if (ret == maxlen)
162 dst[ret - 1] = '\0'; 165 dst[ret - 1] = '\0';
@@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr)
180 int len; 183 int len;
181 void __user *vaddr = (void __force __user *) addr; 184 void __user *vaddr = (void __force __user *) addr;
182 185
183 len = strnlen_user(vaddr, MAX_STRING_SIZE); 186 if (addr == FETCH_TOKEN_COMM)
187 len = strlen(current->comm) + 1;
188 else
189 len = strnlen_user(vaddr, MAX_STRING_SIZE);
184 190
185 return (len > MAX_STRING_SIZE) ? 0 : len; 191 return (len > MAX_STRING_SIZE) ? 0 : len;
186} 192}
@@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
220 case FETCH_OP_IMM: 226 case FETCH_OP_IMM:
221 val = code->immediate; 227 val = code->immediate;
222 break; 228 break;
229 case FETCH_OP_COMM:
230 val = FETCH_TOKEN_COMM;
231 break;
223 case FETCH_OP_FOFFS: 232 case FETCH_OP_FOFFS:
224 val = translate_user_vaddr(code->immediate); 233 val = translate_user_vaddr(code->immediate);
225 break; 234 break;
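For uprobes, $comm cannot be read by dereferencing a user address, so process_fetch_insn() emits the sentinel FETCH_TOKEN_COMM (-ECOMM used as an impossible address) and the string fetchers above branch on it to copy current->comm instead of touching user memory. A stand-alone userspace sketch of the sentinel-address idea (the names and the fake "user read" are illustrative):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define TOKEN_COMM ((unsigned long)-ECOMM)      /* sentinel, never a valid address */

    static const char *task_comm = "bash";          /* stand-in for current->comm */

    /* Length that would be stored for "addr", mimicking fetch_store_strlen(). */
    static size_t probe_strlen(unsigned long addr)
    {
            if (addr == TOKEN_COMM)
                    return strlen(task_comm) + 1;   /* taken from the task, not memory */
            return strlen((const char *)addr) + 1;  /* stands in for a user-space read */
    }

    int main(void)
    {
            const char *path = "/usr/bin/example";

            printf("comm len: %zu\n", probe_strlen(TOKEN_COMM));
            printf("addr len: %zu\n", probe_strlen((unsigned long)path));
            return 0;
    }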
@@ -417,8 +426,6 @@ end:
417/* 426/*
418 * Argument syntax: 427 * Argument syntax:
419 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] 428 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
420 *
421 * - Remove uprobe: -:[GRP/]EVENT
422 */ 429 */
423static int trace_uprobe_create(int argc, const char **argv) 430static int trace_uprobe_create(int argc, const char **argv)
424{ 431{
@@ -434,10 +441,17 @@ static int trace_uprobe_create(int argc, const char **argv)
434 ret = 0; 441 ret = 0;
435 ref_ctr_offset = 0; 442 ref_ctr_offset = 0;
436 443
437 /* argc must be >= 1 */ 444 switch (argv[0][0]) {
438 if (argv[0][0] == 'r') 445 case 'r':
439 is_return = true; 446 is_return = true;
440 else if (argv[0][0] != 'p' || argc < 2) 447 break;
448 case 'p':
449 break;
450 default:
451 return -ECANCELED;
452 }
453
454 if (argc < 2)
441 return -ECANCELED; 455 return -ECANCELED;
442 456
443 if (argv[0][1] == ':') 457 if (argv[0][1] == ':')
@@ -457,13 +471,19 @@ static int trace_uprobe_create(int argc, const char **argv)
457 return -ECANCELED; 471 return -ECANCELED;
458 } 472 }
459 473
474 trace_probe_log_init("trace_uprobe", argc, argv);
475 trace_probe_log_set_index(1); /* filename is the 2nd argument */
476
460 *arg++ = '\0'; 477 *arg++ = '\0';
461 ret = kern_path(filename, LOOKUP_FOLLOW, &path); 478 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
462 if (ret) { 479 if (ret) {
480 trace_probe_log_err(0, FILE_NOT_FOUND);
463 kfree(filename); 481 kfree(filename);
482 trace_probe_log_clear();
464 return ret; 483 return ret;
465 } 484 }
466 if (!d_is_reg(path.dentry)) { 485 if (!d_is_reg(path.dentry)) {
486 trace_probe_log_err(0, NO_REGULAR_FILE);
467 ret = -EINVAL; 487 ret = -EINVAL;
468 goto fail_address_parse; 488 goto fail_address_parse;
469 } 489 }
@@ -472,9 +492,16 @@ static int trace_uprobe_create(int argc, const char **argv)
472 rctr = strchr(arg, '('); 492 rctr = strchr(arg, '(');
473 if (rctr) { 493 if (rctr) {
474 rctr_end = strchr(rctr, ')'); 494 rctr_end = strchr(rctr, ')');
475 if (rctr > rctr_end || *(rctr_end + 1) != 0) { 495 if (!rctr_end) {
476 ret = -EINVAL; 496 ret = -EINVAL;
477 pr_info("Invalid reference counter offset.\n"); 497 rctr_end = rctr + strlen(rctr);
498 trace_probe_log_err(rctr_end - filename,
499 REFCNT_OPEN_BRACE);
500 goto fail_address_parse;
501 } else if (rctr_end[1] != '\0') {
502 ret = -EINVAL;
503 trace_probe_log_err(rctr_end + 1 - filename,
504 BAD_REFCNT_SUFFIX);
478 goto fail_address_parse; 505 goto fail_address_parse;
479 } 506 }
480 507
@@ -482,22 +509,23 @@ static int trace_uprobe_create(int argc, const char **argv)
482 *rctr_end = '\0'; 509 *rctr_end = '\0';
483 ret = kstrtoul(rctr, 0, &ref_ctr_offset); 510 ret = kstrtoul(rctr, 0, &ref_ctr_offset);
484 if (ret) { 511 if (ret) {
485 pr_info("Invalid reference counter offset.\n"); 512 trace_probe_log_err(rctr - filename, BAD_REFCNT);
486 goto fail_address_parse; 513 goto fail_address_parse;
487 } 514 }
488 } 515 }
489 516
490 /* Parse uprobe offset. */ 517 /* Parse uprobe offset. */
491 ret = kstrtoul(arg, 0, &offset); 518 ret = kstrtoul(arg, 0, &offset);
492 if (ret) 519 if (ret) {
520 trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS);
493 goto fail_address_parse; 521 goto fail_address_parse;
494 522 }
495 argc -= 2;
496 argv += 2;
497 523
498 /* setup a probe */ 524 /* setup a probe */
525 trace_probe_log_set_index(0);
499 if (event) { 526 if (event) {
500 ret = traceprobe_parse_event_name(&event, &group, buf); 527 ret = traceprobe_parse_event_name(&event, &group, buf,
528 event - argv[0]);
501 if (ret) 529 if (ret)
502 goto fail_address_parse; 530 goto fail_address_parse;
503 } else { 531 } else {
@@ -519,6 +547,9 @@ static int trace_uprobe_create(int argc, const char **argv)
519 kfree(tail); 547 kfree(tail);
520 } 548 }
521 549
550 argc -= 2;
551 argv += 2;
552
522 tu = alloc_trace_uprobe(group, event, argc, is_return); 553 tu = alloc_trace_uprobe(group, event, argc, is_return);
523 if (IS_ERR(tu)) { 554 if (IS_ERR(tu)) {
524 ret = PTR_ERR(tu); 555 ret = PTR_ERR(tu);
@@ -539,6 +570,7 @@ static int trace_uprobe_create(int argc, const char **argv)
539 goto error; 570 goto error;
540 } 571 }
541 572
573 trace_probe_log_set_index(i + 2);
542 ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, 574 ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp,
543 is_return ? TPARG_FL_RETURN : 0); 575 is_return ? TPARG_FL_RETURN : 0);
544 kfree(tmp); 576 kfree(tmp);
@@ -547,20 +579,20 @@ static int trace_uprobe_create(int argc, const char **argv)
547 } 579 }
548 580
549 ret = register_trace_uprobe(tu); 581 ret = register_trace_uprobe(tu);
550 if (ret) 582 if (!ret)
551 goto error; 583 goto out;
552 return 0;
553 584
554error: 585error:
555 free_trace_uprobe(tu); 586 free_trace_uprobe(tu);
587out:
588 trace_probe_log_clear();
556 return ret; 589 return ret;
557 590
558fail_address_parse: 591fail_address_parse:
592 trace_probe_log_clear();
559 path_put(&path); 593 path_put(&path);
560 kfree(filename); 594 kfree(filename);
561 595
562 pr_info("Failed to parse address or file.\n");
563
564 return ret; 596 return ret;
565} 597}
566 598
@@ -1304,7 +1336,7 @@ static inline void init_trace_event_call(struct trace_uprobe *tu,
1304 call->event.funcs = &uprobe_funcs; 1336 call->event.funcs = &uprobe_funcs;
1305 call->class->define_fields = uprobe_event_define_fields; 1337 call->class->define_fields = uprobe_event_define_fields;
1306 1338
1307 call->flags = TRACE_EVENT_FL_UPROBE; 1339 call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY;
1308 call->class->reg = trace_uprobe_register; 1340 call->class->reg = trace_uprobe_register;
1309 call->data = tu; 1341 call->data = tu;
1310} 1342}
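Taken together, the trace_uprobe_create() changes follow the calling convention declared in trace_probe.h: trace_probe_log_init() records the argv being parsed, trace_probe_log_set_index() points subsequent errors at the argument currently being handled, trace_probe_log_err() reports a position plus an ERRORS entry, and trace_probe_log_clear() runs on every exit path. A condensed kernel-style sketch of that lifecycle (parse_target()/parse_arg() are hypothetical helpers, not real functions):

    static int example_probe_create(int argc, const char **argv)
    {
            int i, ret;

            trace_probe_log_init("trace_uprobe", argc, argv);

            trace_probe_log_set_index(1);           /* errors now point into argv[1] */
            ret = parse_target(argv[1]);            /* hypothetical */
            if (ret) {
                    trace_probe_log_err(0, FILE_NOT_FOUND);
                    goto out;
            }

            for (i = 2; i < argc; i++) {
                    trace_probe_log_set_index(i);   /* per-argument error position */
                    ret = parse_arg(argv[i]);       /* hypothetical */
                    if (ret)
                            goto out;
            }
    out:
            trace_probe_log_clear();                /* always paired with _init() */
            return ret;
    }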
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 46f2ab1e08a9..df3ade14ccbd 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -1,19 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Copyright (C) 2008-2014 Mathieu Desnoyers 3 * Copyright (C) 2008-2014 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */ 4 */
18#include <linux/module.h> 5#include <linux/module.h>
19#include <linux/mutex.h> 6#include <linux/mutex.h>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 370724b45391..7be3e7530841 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -1,19 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * tsacct.c - System accounting over taskstats interface 3 * tsacct.c - System accounting over taskstats interface
3 * 4 *
4 * Copyright (C) Jay Lan, <jlan@sgi.com> 5 * Copyright (C) Jay Lan, <jlan@sgi.com>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */ 6 */
18 7
19#include <linux/kernel.h> 8#include <linux/kernel.h>
diff --git a/kernel/ucount.c b/kernel/ucount.c
index f48d1b6376a4..feb128c7b5d9 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -1,9 +1,4 @@
1/* 1// SPDX-License-Identifier: GPL-2.0-only
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License as
4 * published by the Free Software Foundation, version 2 of the
5 * License.
6 */
7 2
8#include <linux/stat.h> 3#include <linux/stat.h>
9#include <linux/sysctl.h> 4#include <linux/sysctl.h>
diff --git a/kernel/umh.c b/kernel/umh.c
index d937cbad903a..7f255b5a8845 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * umh - the kernel usermode helper 3 * umh - the kernel usermode helper
3 */ 4 */
diff --git a/kernel/up.c b/kernel/up.c
index ff536f9cc8a2..862b460ab97a 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Uniprocessor-only support functions. The counterpart to kernel/smp.c 3 * Uniprocessor-only support functions. The counterpart to kernel/smp.c
3 */ 4 */
@@ -34,14 +35,13 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
34} 35}
35EXPORT_SYMBOL(smp_call_function_single_async); 36EXPORT_SYMBOL(smp_call_function_single_async);
36 37
37int on_each_cpu(smp_call_func_t func, void *info, int wait) 38void on_each_cpu(smp_call_func_t func, void *info, int wait)
38{ 39{
39 unsigned long flags; 40 unsigned long flags;
40 41
41 local_irq_save(flags); 42 local_irq_save(flags);
42 func(info); 43 func(info);
43 local_irq_restore(flags); 44 local_irq_restore(flags);
44 return 0;
45} 45}
46EXPORT_SYMBOL(on_each_cpu); 46EXPORT_SYMBOL(on_each_cpu);
47 47
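on_each_cpu() now returns void here, since the UP implementation could only ever return 0; any caller that still checks the result is dead code and needs updating. A hedged kernel-style sketch of an adjusted caller (flush_local_cache() is illustrative, not a real symbol):

    static void flush_local_cache(void *info)
    {
            /* per-CPU work; on_each_cpu() handles IRQ save/restore on UP */
    }

    static void flush_all_caches(void)
    {
            /*
             * Before: ret = on_each_cpu(...) plus an error check that could
             * never fire.  After the change, simply call it:
             */
            on_each_cpu(flush_local_cache, NULL, 1);        /* wait for completion */
    }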
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 9586b670a5b2..870ecd7c63ed 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1 2
2#include <linux/user-return-notifier.h> 3#include <linux/user-return-notifier.h>
3#include <linux/percpu.h> 4#include <linux/percpu.h>
diff --git a/kernel/user.c b/kernel/user.c
index 0df9b1640b2a..5235d7f49982 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * The "user cache". 3 * The "user cache".
3 * 4 *
@@ -62,9 +63,9 @@ struct user_namespace init_user_ns = {
62 .ns.ops = &userns_operations, 63 .ns.ops = &userns_operations,
63#endif 64#endif
64 .flags = USERNS_INIT_FLAGS, 65 .flags = USERNS_INIT_FLAGS,
65#ifdef CONFIG_PERSISTENT_KEYRINGS 66#ifdef CONFIG_KEYS
66 .persistent_keyring_register_sem = 67 .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
67 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), 68 .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
68#endif 69#endif
69}; 70};
70EXPORT_SYMBOL_GPL(init_user_ns); 71EXPORT_SYMBOL_GPL(init_user_ns);
@@ -140,8 +141,6 @@ static void free_user(struct user_struct *up, unsigned long flags)
140{ 141{
141 uid_hash_remove(up); 142 uid_hash_remove(up);
142 spin_unlock_irqrestore(&uidhash_lock, flags); 143 spin_unlock_irqrestore(&uidhash_lock, flags);
143 key_put(up->uid_keyring);
144 key_put(up->session_keyring);
145 kmem_cache_free(uid_cachep, up); 144 kmem_cache_free(uid_cachep, up);
146} 145}
147 146
@@ -185,7 +184,7 @@ struct user_struct *alloc_uid(kuid_t uid)
185 if (!up) { 184 if (!up) {
186 new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); 185 new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
187 if (!new) 186 if (!new)
188 goto out_unlock; 187 return NULL;
189 188
190 new->uid = uid; 189 new->uid = uid;
191 refcount_set(&new->__count, 1); 190 refcount_set(&new->__count, 1);
@@ -199,8 +198,6 @@ struct user_struct *alloc_uid(kuid_t uid)
199 spin_lock_irq(&uidhash_lock); 198 spin_lock_irq(&uidhash_lock);
200 up = uid_hash_find(uid, hashent); 199 up = uid_hash_find(uid, hashent);
201 if (up) { 200 if (up) {
202 key_put(new->uid_keyring);
203 key_put(new->session_keyring);
204 kmem_cache_free(uid_cachep, new); 201 kmem_cache_free(uid_cachep, new);
205 } else { 202 } else {
206 uid_hash_insert(new, hashent); 203 uid_hash_insert(new, hashent);
@@ -210,9 +207,6 @@ struct user_struct *alloc_uid(kuid_t uid)
210 } 207 }
211 208
212 return up; 209 return up;
213
214out_unlock:
215 return NULL;
216} 210}
217 211
218static int __init uid_cache_init(void) 212static int __init uid_cache_init(void)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 923414a246e9..8eadadc478f9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1,9 +1,4 @@
1/* 1// SPDX-License-Identifier: GPL-2.0-only
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License as
4 * published by the Free Software Foundation, version 2 of the
5 * License.
6 */
7 2
8#include <linux/export.h> 3#include <linux/export.h>
9#include <linux/nsproxy.h> 4#include <linux/nsproxy.h>
@@ -133,8 +128,9 @@ int create_user_ns(struct cred *new)
133 ns->flags = parent_ns->flags; 128 ns->flags = parent_ns->flags;
134 mutex_unlock(&userns_state_mutex); 129 mutex_unlock(&userns_state_mutex);
135 130
136#ifdef CONFIG_PERSISTENT_KEYRINGS 131#ifdef CONFIG_KEYS
137 init_rwsem(&ns->persistent_keyring_register_sem); 132 INIT_LIST_HEAD(&ns->keyring_name_list);
133 init_rwsem(&ns->keyring_sem);
138#endif 134#endif
139 ret = -ENOMEM; 135 ret = -ENOMEM;
140 if (!setup_userns_sysctls(ns)) 136 if (!setup_userns_sysctls(ns))
@@ -196,9 +192,7 @@ static void free_user_ns(struct work_struct *work)
196 kfree(ns->projid_map.reverse); 192 kfree(ns->projid_map.reverse);
197 } 193 }
198 retire_userns_sysctls(ns); 194 retire_userns_sysctls(ns);
199#ifdef CONFIG_PERSISTENT_KEYRINGS 195 key_free_user_ns(ns);
200 key_put(ns->persistent_keyring_register);
201#endif
202 ns_free_inum(&ns->ns); 196 ns_free_inum(&ns->ns);
203 kmem_cache_free(user_ns_cachep, ns); 197 kmem_cache_free(user_ns_cachep, ns);
204 dec_user_namespaces(ucounts); 198 dec_user_namespaces(ucounts);
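Both user.c and user_namespace.c replace the CONFIG_PERSISTENT_KEYRINGS-only fields with a CONFIG_KEYS-guarded keyring name list plus rwsem: init_user_ns initializes them statically, create_user_ns() does the same at runtime, and teardown is delegated to key_free_user_ns(). A hedged sketch of that static/runtime initializer pairing (struct my_ns is illustrative; the macros are the ones used in the hunks above):

    #include <linux/list.h>
    #include <linux/rwsem.h>

    struct my_ns {
            struct list_head        keyring_name_list;
            struct rw_semaphore     keyring_sem;
    };

    /* Build-time initialization, as done for init_user_ns. */
    static struct my_ns init_my_ns = {
            .keyring_name_list = LIST_HEAD_INIT(init_my_ns.keyring_name_list),
            .keyring_sem       = __RWSEM_INITIALIZER(init_my_ns.keyring_sem),
    };

    /* Run-time initialization, as done in create_user_ns(). */
    static void my_ns_init_keyrings(struct my_ns *ns)
    {
            INIT_LIST_HEAD(&ns->keyring_name_list);
            init_rwsem(&ns->keyring_sem);
    }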
diff --git a/kernel/utsname.c b/kernel/utsname.c
index dcd6be1996fe..f0e491193009 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -1,12 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2004 IBM Corporation 3 * Copyright (C) 2004 IBM Corporation
3 * 4 *
4 * Author: Serge Hallyn <serue@us.ibm.com> 5 * Author: Serge Hallyn <serue@us.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 */ 6 */
11 7
12#include <linux/export.h> 8#include <linux/export.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 258033d62cb3..3732c888a949 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -1,12 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2007 3 * Copyright (C) 2007
3 * 4 *
4 * Author: Eric Biederman <ebiederm@xmision.com> 5 * Author: Eric Biederman <ebiederm@xmision.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 */ 6 */
11 7
12#include <linux/export.h> 8#include <linux/export.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9657315405de..601d61150b65 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/workqueue.c - generic async execution with shared worker pool 3 * kernel/workqueue.c - generic async execution with shared worker pool
3 * 4 *
@@ -3328,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
3328 * 3329 *
3329 * Undo alloc_workqueue_attrs(). 3330 * Undo alloc_workqueue_attrs().
3330 */ 3331 */
3331void free_workqueue_attrs(struct workqueue_attrs *attrs) 3332static void free_workqueue_attrs(struct workqueue_attrs *attrs)
3332{ 3333{
3333 if (attrs) { 3334 if (attrs) {
3334 free_cpumask_var(attrs->cpumask); 3335 free_cpumask_var(attrs->cpumask);
@@ -3338,21 +3339,20 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
3338 3339
3339/** 3340/**
3340 * alloc_workqueue_attrs - allocate a workqueue_attrs 3341 * alloc_workqueue_attrs - allocate a workqueue_attrs
3341 * @gfp_mask: allocation mask to use
3342 * 3342 *
3343 * Allocate a new workqueue_attrs, initialize with default settings and 3343 * Allocate a new workqueue_attrs, initialize with default settings and
3344 * return it. 3344 * return it.
3345 * 3345 *
3346 * Return: The allocated new workqueue_attr on success. %NULL on failure. 3346 * Return: The allocated new workqueue_attr on success. %NULL on failure.
3347 */ 3347 */
3348struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) 3348static struct workqueue_attrs *alloc_workqueue_attrs(void)
3349{ 3349{
3350 struct workqueue_attrs *attrs; 3350 struct workqueue_attrs *attrs;
3351 3351
3352 attrs = kzalloc(sizeof(*attrs), gfp_mask); 3352 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
3353 if (!attrs) 3353 if (!attrs)
3354 goto fail; 3354 goto fail;
3355 if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) 3355 if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
3356 goto fail; 3356 goto fail;
3357 3357
3358 cpumask_copy(attrs->cpumask, cpu_possible_mask); 3358 cpumask_copy(attrs->cpumask, cpu_possible_mask);
@@ -3430,7 +3430,7 @@ static int init_worker_pool(struct worker_pool *pool)
3430 pool->refcnt = 1; 3430 pool->refcnt = 1;
3431 3431
3432 /* shouldn't fail above this point */ 3432 /* shouldn't fail above this point */
3433 pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); 3433 pool->attrs = alloc_workqueue_attrs();
3434 if (!pool->attrs) 3434 if (!pool->attrs)
3435 return -ENOMEM; 3435 return -ENOMEM;
3436 return 0; 3436 return 0;
@@ -3895,8 +3895,8 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
3895 3895
3896 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); 3896 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
3897 3897
3898 new_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3898 new_attrs = alloc_workqueue_attrs();
3899 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3899 tmp_attrs = alloc_workqueue_attrs();
3900 if (!ctx || !new_attrs || !tmp_attrs) 3900 if (!ctx || !new_attrs || !tmp_attrs)
3901 goto out_free; 3901 goto out_free;
3902 3902
@@ -4032,7 +4032,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
4032 * 4032 *
4033 * Return: 0 on success and -errno on failure. 4033 * Return: 0 on success and -errno on failure.
4034 */ 4034 */
4035int apply_workqueue_attrs(struct workqueue_struct *wq, 4035static int apply_workqueue_attrs(struct workqueue_struct *wq,
4036 const struct workqueue_attrs *attrs) 4036 const struct workqueue_attrs *attrs)
4037{ 4037{
4038 int ret; 4038 int ret;
@@ -4043,7 +4043,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
4043 4043
4044 return ret; 4044 return ret;
4045} 4045}
4046EXPORT_SYMBOL_GPL(apply_workqueue_attrs);
4047 4046
4048/** 4047/**
4049 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug 4048 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
@@ -4241,7 +4240,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
4241 return NULL; 4240 return NULL;
4242 4241
4243 if (flags & WQ_UNBOUND) { 4242 if (flags & WQ_UNBOUND) {
4244 wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); 4243 wq->unbound_attrs = alloc_workqueue_attrs();
4245 if (!wq->unbound_attrs) 4244 if (!wq->unbound_attrs)
4246 goto err_free_wq; 4245 goto err_free_wq;
4247 } 4246 }
@@ -5394,7 +5393,7 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
5394 5393
5395 lockdep_assert_held(&wq_pool_mutex); 5394 lockdep_assert_held(&wq_pool_mutex);
5396 5395
5397 attrs = alloc_workqueue_attrs(GFP_KERNEL); 5396 attrs = alloc_workqueue_attrs();
5398 if (!attrs) 5397 if (!attrs)
5399 return NULL; 5398 return NULL;
5400 5399
@@ -5816,7 +5815,7 @@ static void __init wq_numa_init(void)
5816 return; 5815 return;
5817 } 5816 }
5818 5817
5819 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); 5818 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
5820 BUG_ON(!wq_update_unbound_numa_attrs_buf); 5819 BUG_ON(!wq_update_unbound_numa_attrs_buf);
5821 5820
5822 /* 5821 /*
@@ -5891,7 +5890,7 @@ int __init workqueue_init_early(void)
5891 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 5890 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
5892 struct workqueue_attrs *attrs; 5891 struct workqueue_attrs *attrs;
5893 5892
5894 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); 5893 BUG_ON(!(attrs = alloc_workqueue_attrs()));
5895 attrs->nice = std_nice[i]; 5894 attrs->nice = std_nice[i];
5896 unbound_std_wq_attrs[i] = attrs; 5895 unbound_std_wq_attrs[i] = attrs;
5897 5896
@@ -5900,7 +5899,7 @@ int __init workqueue_init_early(void)
5900 * guaranteed by max_active which is enforced by pwqs. 5899 * guaranteed by max_active which is enforced by pwqs.
5901 * Turn off NUMA so that dfl_pwq is used for all nodes. 5900 * Turn off NUMA so that dfl_pwq is used for all nodes.
5902 */ 5901 */
5903 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); 5902 BUG_ON(!(attrs = alloc_workqueue_attrs()));
5904 attrs->nice = std_nice[i]; 5903 attrs->nice = std_nice[i];
5905 attrs->no_numa = true; 5904 attrs->no_numa = true;
5906 ordered_wq_attrs[i] = attrs; 5905 ordered_wq_attrs[i] = attrs;
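alloc_workqueue_attrs() and free_workqueue_attrs() lose their gfp_mask parameter (every caller passed GFP_KERNEL), and together with apply_workqueue_attrs() they become static, so the attrs machinery is now internal to kernel/workqueue.c. A hedged sketch of the internal call pattern after this change (the wrapper function is illustrative):

    /* Only valid inside kernel/workqueue.c now that the helpers are static. */
    static int example_apply_unbound_nice(struct workqueue_struct *wq, int nice)
    {
            struct workqueue_attrs *attrs;
            int ret;

            attrs = alloc_workqueue_attrs();        /* GFP_KERNEL is implied now */
            if (!attrs)
                    return -ENOMEM;

            attrs->nice = nice;
            ret = apply_workqueue_attrs(wq, attrs); /* no longer exported to modules */

            free_workqueue_attrs(attrs);
            return ret;
    }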