author    Ingo Molnar <mingo@kernel.org>  2018-02-06 15:12:31 -0500
committer Ingo Molnar <mingo@kernel.org>  2018-02-06 15:12:31 -0500
commit    82845079160817cc6ac64e5321bbd935e0a47b3a (patch)
tree      0886d1d52428e9db14536cae4b37db896e7c360a /kernel
parent    32e839dda3ba576943365f0f5817ce5c843137dc (diff)
parent    68c5735eaa5e680e701c9a2d1e3c7880bdf5ab66 (diff)
Merge branch 'linus' into sched/urgent, to resolve conflicts
Conflicts:
	arch/arm64/kernel/entry.S
	arch/x86/Kconfig
	include/linux/sched/mm.h
	kernel/fork.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/bpf/Makefile | 2
-rw-r--r--  kernel/bpf/arraymap.c | 49
-rw-r--r--  kernel/bpf/cgroup.c | 15
-rw-r--r--  kernel/bpf/core.c | 409
-rw-r--r--  kernel/bpf/cpumap.c | 31
-rw-r--r--  kernel/bpf/devmap.c | 8
-rw-r--r--  kernel/bpf/disasm.c | 63
-rw-r--r--  kernel/bpf/disasm.h | 29
-rw-r--r--  kernel/bpf/hashtab.c | 103
-rw-r--r--  kernel/bpf/inode.c | 50
-rw-r--r--  kernel/bpf/lpm_trie.c | 98
-rw-r--r--  kernel/bpf/offload.c | 430
-rw-r--r--  kernel/bpf/sockmap.c | 16
-rw-r--r--  kernel/bpf/stackmap.c | 34
-rw-r--r--  kernel/bpf/syscall.c | 214
-rw-r--r--  kernel/bpf/verifier.c | 1451
-rw-r--r--  kernel/cgroup/cgroup.c | 6
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 2
-rw-r--r--  kernel/events/core.c | 15
-rw-r--r--  kernel/fail_function.c | 349
-rw-r--r--  kernel/fork.c | 484
-rw-r--r--  kernel/irq/autoprobe.c | 2
-rw-r--r--  kernel/irq/chip.c | 6
-rw-r--r--  kernel/irq/debug.h | 14
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/kallsyms.c | 46
-rw-r--r--  kernel/livepatch/core.c | 76
-rw-r--r--  kernel/livepatch/transition.c | 116
-rw-r--r--  kernel/livepatch/transition.h | 2
-rw-r--r--  kernel/memremap.c | 174
-rw-r--r--  kernel/module.c | 12
-rw-r--r--  kernel/padata.c | 1
-rw-r--r--  kernel/power/power.h | 3
-rw-r--r--  kernel/printk/printk.c | 219
-rw-r--r--  kernel/ptrace.c | 9
-rw-r--r--  kernel/rcu/update.c | 2
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/resource.c | 29
-rw-r--r--  kernel/sched/autogroup.c | 5
-rw-r--r--  kernel/seccomp.c | 108
-rw-r--r--  kernel/signal.c | 354
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/time/posix-clock.c | 4
-rw-r--r--  kernel/time/posix-timers.c | 2
-rw-r--r--  kernel/trace/Kconfig | 9
-rw-r--r--  kernel/trace/bpf_trace.c | 59
-rw-r--r--  kernel/trace/ftrace.c | 2
-rw-r--r--  kernel/trace/ring_buffer.c | 2
-rw-r--r--  kernel/trace/trace.c | 20
-rw-r--r--  kernel/trace/trace_events.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 61
-rw-r--r--  kernel/trace/trace_probe.h | 12
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 5
-rw-r--r--  kernel/trace/trace_uprobe.c | 2
-rw-r--r--  kernel/workqueue.c | 65
58 files changed, 4082 insertions, 1225 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 172d151d429c..f85ae5dfa474 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e691da0b3bab..a713fd23ec88 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
+ifeq ($(CONFIG_INET),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
 endif
+endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index ab94d304a634..b1f66480135b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,27 +49,35 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 }
 
 /* Called from syscall */
-static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+static int array_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
-	u32 elem_size, index_mask, max_entries;
-	bool unpriv = !capable(CAP_SYS_ADMIN);
-	struct bpf_array *array;
-	u64 array_size, mask64;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size == 0 ||
 	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
 	    (percpu && numa_node != NUMA_NO_NODE))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		/* if value_size is bigger, the user space won't be able to
 		 * access the elements.
 		 */
-		return ERR_PTR(-E2BIG);
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+	int numa_node = bpf_map_attr_numa_node(attr);
+	u32 elem_size, index_mask, max_entries;
+	bool unpriv = !capable(CAP_SYS_ADMIN);
+	struct bpf_array *array;
+	u64 array_size, mask64;
 
 	elem_size = round_up(attr->value_size, 8);
 
@@ -112,12 +120,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.unpriv_array = unpriv;
 
 	/* copy mandatory map attributes */
-	array->map.map_type = attr->map_type;
-	array->map.key_size = attr->key_size;
-	array->map.value_size = attr->value_size;
-	array->map.max_entries = attr->max_entries;
-	array->map.map_flags = attr->map_flags;
-	array->map.numa_node = numa_node;
+	bpf_map_init_from_attr(&array->map, attr);
 	array->elem_size = elem_size;
 
 	if (!percpu)
@@ -327,6 +330,7 @@ static void array_map_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -337,6 +341,7 @@ const struct bpf_map_ops array_map_ops = {
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -345,12 +350,12 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_delete_elem = array_map_delete_elem,
 };
 
-static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
+static int fd_array_map_alloc_check(union bpf_attr *attr)
 {
 	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
-		return ERR_PTR(-EINVAL);
-	return array_map_alloc(attr);
+		return -EINVAL;
+	return array_map_alloc_check(attr);
 }
 
 static void fd_array_map_free(struct bpf_map *map)
@@ -474,7 +479,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
 }
 
 const struct bpf_map_ops prog_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -561,7 +567,8 @@ static void perf_event_fd_array_release(struct bpf_map *map,
 }
 
 const struct bpf_map_ops perf_event_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -592,7 +599,8 @@ static void cgroup_fd_array_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops cgroup_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = cgroup_fd_array_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -610,7 +618,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
 	if (IS_ERR(inner_map_meta))
 		return inner_map_meta;
 
-	map = fd_array_map_alloc(attr);
+	map = array_map_alloc(attr);
 	if (IS_ERR(map)) {
 		bpf_map_meta_free(inner_map_meta);
 		return map;
@@ -673,6 +681,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
 }
 
 const struct bpf_map_ops array_of_maps_map_ops = {
+	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_of_map_alloc,
 	.map_free = array_of_map_free,
 	.map_get_next_key = array_map_get_next_key,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b789ab78d28f..c1c0b60d3f2f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -568,6 +568,8 @@ static bool cgroup_dev_is_valid_access(int off, int size,
 				       enum bpf_access_type type,
 				       struct bpf_insn_access_aux *info)
 {
+	const int size_default = sizeof(__u32);
+
 	if (type == BPF_WRITE)
 		return false;
 
@@ -576,8 +578,17 @@ static bool cgroup_dev_is_valid_access(int off, int size,
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
-		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
+		bpf_ctx_record_field_size(info, size_default);
+		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+			return false;
+		break;
+	default:
+		if (size != size_default)
+			return false;
+	}
 
 	return true;
 }
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7949e8b8f94e..5f35f93dcab2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -94,6 +94,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
 	fp->aux->prog = fp;
+	fp->jit_requested = ebpf_jit_enabled();
 
 	INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
 
@@ -217,30 +218,40 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
 	return 0;
 }
 
-static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
-{
-	return BPF_CLASS(insn->code) == BPF_JMP &&
-	       /* Call and Exit are both special jumps with no
-		* target inside the BPF instruction image.
-		*/
-	       BPF_OP(insn->code) != BPF_CALL &&
-	       BPF_OP(insn->code) != BPF_EXIT;
-}
-
 static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
 {
 	struct bpf_insn *insn = prog->insnsi;
 	u32 i, insn_cnt = prog->len;
+	bool pseudo_call;
+	u8 code;
+	int off;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (!bpf_is_jmp_and_has_target(insn))
+		code = insn->code;
+		if (BPF_CLASS(code) != BPF_JMP)
 			continue;
+		if (BPF_OP(code) == BPF_EXIT)
+			continue;
+		if (BPF_OP(code) == BPF_CALL) {
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				pseudo_call = true;
+			else
+				continue;
+		} else {
+			pseudo_call = false;
+		}
+		off = pseudo_call ? insn->imm : insn->off;
 
 		/* Adjust offset of jmps if we cross boundaries. */
-		if (i < pos && i + insn->off + 1 > pos)
-			insn->off += delta;
-		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
-			insn->off -= delta;
+		if (i < pos && i + off + 1 > pos)
+			off += delta;
+		else if (i > pos + delta && i + off + 1 <= pos + delta)
+			off -= delta;
+
+		if (pseudo_call)
+			insn->imm = off;
+		else
+			insn->off = off;
 	}
 }
 
@@ -289,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 }
 
 #ifdef CONFIG_BPF_JIT
+/* All BPF JIT sysctl knobs here. */
+int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_harden   __read_mostly;
+int bpf_jit_kallsyms __read_mostly;
+
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
 			 unsigned long *symbol_start,
@@ -370,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock);
 static LIST_HEAD(bpf_kallsyms);
 static struct latch_tree_root bpf_tree __cacheline_aligned;
 
-int bpf_jit_kallsyms __read_mostly;
-
 static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
 {
 	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
@@ -552,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	bpf_prog_unlock_free(fp);
 }
 
-int bpf_jit_harden __read_mostly;
-
 static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      const struct bpf_insn *aux,
 			      struct bpf_insn *to_buff)
@@ -711,7 +723,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 	struct bpf_insn *insn;
 	int i, rewritten;
 
-	if (!bpf_jit_blinding_enabled())
+	if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
 		return prog;
 
 	clone = bpf_prog_clone_create(prog, GFP_USER);
@@ -753,13 +765,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 		i += insn_delta;
 	}
 
+	clone->blinded = 1;
 	return clone;
 }
 #endif /* CONFIG_BPF_JIT */
 
 /* Base function for offset calculation. Needs to go into .text section,
  * therefore keeping it non-static as well; will also be used by JITs
- * anyway later on, so do not let the compiler omit it.
+ * anyway later on, so do not let the compiler omit it. This also needs
+ * to go into kallsyms for correlation from e.g. bpftool, so naming
+ * must not change.
  */
 noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
@@ -767,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 }
 EXPORT_SYMBOL_GPL(__bpf_call_base);
 
+/* All UAPI available opcodes. */
+#define BPF_INSN_MAP(INSN_2, INSN_3)	\
+	/* 32 bit ALU operations. */	\
+	/* Register based. */		\
+	INSN_3(ALU, ADD, X),		\
+	INSN_3(ALU, SUB, X),		\
+	INSN_3(ALU, AND, X),		\
+	INSN_3(ALU, OR, X),		\
+	INSN_3(ALU, LSH, X),		\
+	INSN_3(ALU, RSH, X),		\
+	INSN_3(ALU, XOR, X),		\
+	INSN_3(ALU, MUL, X),		\
+	INSN_3(ALU, MOV, X),		\
+	INSN_3(ALU, DIV, X),		\
+	INSN_3(ALU, MOD, X),		\
+	INSN_2(ALU, NEG),		\
+	INSN_3(ALU, END, TO_BE),	\
+	INSN_3(ALU, END, TO_LE),	\
+	/* Immediate based. */		\
+	INSN_3(ALU, ADD, K),		\
+	INSN_3(ALU, SUB, K),		\
+	INSN_3(ALU, AND, K),		\
+	INSN_3(ALU, OR, K),		\
+	INSN_3(ALU, LSH, K),		\
+	INSN_3(ALU, RSH, K),		\
+	INSN_3(ALU, XOR, K),		\
+	INSN_3(ALU, MUL, K),		\
+	INSN_3(ALU, MOV, K),		\
+	INSN_3(ALU, DIV, K),		\
+	INSN_3(ALU, MOD, K),		\
+	/* 64 bit ALU operations. */	\
+	/* Register based. */		\
+	INSN_3(ALU64, ADD, X),		\
+	INSN_3(ALU64, SUB, X),		\
+	INSN_3(ALU64, AND, X),		\
+	INSN_3(ALU64, OR, X),		\
+	INSN_3(ALU64, LSH, X),		\
+	INSN_3(ALU64, RSH, X),		\
+	INSN_3(ALU64, XOR, X),		\
+	INSN_3(ALU64, MUL, X),		\
+	INSN_3(ALU64, MOV, X),		\
+	INSN_3(ALU64, ARSH, X),		\
+	INSN_3(ALU64, DIV, X),		\
+	INSN_3(ALU64, MOD, X),		\
+	INSN_2(ALU64, NEG),		\
+	/* Immediate based. */		\
+	INSN_3(ALU64, ADD, K),		\
+	INSN_3(ALU64, SUB, K),		\
+	INSN_3(ALU64, AND, K),		\
+	INSN_3(ALU64, OR, K),		\
+	INSN_3(ALU64, LSH, K),		\
+	INSN_3(ALU64, RSH, K),		\
+	INSN_3(ALU64, XOR, K),		\
+	INSN_3(ALU64, MUL, K),		\
+	INSN_3(ALU64, MOV, K),		\
+	INSN_3(ALU64, ARSH, K),		\
+	INSN_3(ALU64, DIV, K),		\
+	INSN_3(ALU64, MOD, K),		\
+	/* Call instruction. */		\
+	INSN_2(JMP, CALL),		\
+	/* Exit instruction. */		\
+	INSN_2(JMP, EXIT),		\
+	/* Jump instructions. */	\
+	/* Register based. */		\
+	INSN_3(JMP, JEQ, X),		\
+	INSN_3(JMP, JNE, X),		\
+	INSN_3(JMP, JGT, X),		\
+	INSN_3(JMP, JLT, X),		\
+	INSN_3(JMP, JGE, X),		\
+	INSN_3(JMP, JLE, X),		\
+	INSN_3(JMP, JSGT, X),		\
+	INSN_3(JMP, JSLT, X),		\
+	INSN_3(JMP, JSGE, X),		\
+	INSN_3(JMP, JSLE, X),		\
+	INSN_3(JMP, JSET, X),		\
+	/* Immediate based. */		\
+	INSN_3(JMP, JEQ, K),		\
+	INSN_3(JMP, JNE, K),		\
+	INSN_3(JMP, JGT, K),		\
+	INSN_3(JMP, JLT, K),		\
+	INSN_3(JMP, JGE, K),		\
+	INSN_3(JMP, JLE, K),		\
+	INSN_3(JMP, JSGT, K),		\
+	INSN_3(JMP, JSLT, K),		\
+	INSN_3(JMP, JSGE, K),		\
+	INSN_3(JMP, JSLE, K),		\
+	INSN_3(JMP, JSET, K),		\
+	INSN_2(JMP, JA),		\
+	/* Store instructions. */	\
+	/* Register based. */		\
+	INSN_3(STX, MEM, B),		\
+	INSN_3(STX, MEM, H),		\
+	INSN_3(STX, MEM, W),		\
+	INSN_3(STX, MEM, DW),		\
+	INSN_3(STX, XADD, W),		\
+	INSN_3(STX, XADD, DW),		\
+	/* Immediate based. */		\
+	INSN_3(ST, MEM, B),		\
+	INSN_3(ST, MEM, H),		\
+	INSN_3(ST, MEM, W),		\
+	INSN_3(ST, MEM, DW),		\
+	/* Load instructions. */	\
+	/* Register based. */		\
+	INSN_3(LDX, MEM, B),		\
+	INSN_3(LDX, MEM, H),		\
+	INSN_3(LDX, MEM, W),		\
+	INSN_3(LDX, MEM, DW),		\
+	/* Immediate based. */		\
+	INSN_3(LD, IMM, DW),		\
+	/* Misc (old cBPF carry-over). */	\
+	INSN_3(LD, ABS, B),		\
+	INSN_3(LD, ABS, H),		\
+	INSN_3(LD, ABS, W),		\
+	INSN_3(LD, IND, B),		\
+	INSN_3(LD, IND, H),		\
+	INSN_3(LD, IND, W)
+
+bool bpf_opcode_in_insntable(u8 code)
+{
+#define BPF_INSN_2_TBL(x, y)    [BPF_##x | BPF_##y] = true
+#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
+	static const bool public_insntable[256] = {
+		[0 ... 255] = false,
+		/* Now overwrite non-defaults ... */
+		BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
+	};
+#undef BPF_INSN_3_TBL
+#undef BPF_INSN_2_TBL
+	return public_insntable[code];
+}
+
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 /**
  *	__bpf_prog_run - run eBPF program on a given context
@@ -775,118 +921,21 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
  *
  * Decode and execute eBPF instructions.
  */
-static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
-				    u64 *stack)
+static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 {
 	u64 tmp;
+#define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
+#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
 	static const void *jumptable[256] = {
 		[0 ... 255] = &&default_label,
 		/* Now overwrite non-defaults ... */
-		/* 32 bit ALU operations */
-		[BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
-		[BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
-		[BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
-		[BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
-		[BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
-		[BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
-		[BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
-		[BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
-		[BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
-		[BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
-		[BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
-		[BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
-		[BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
-		[BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
-		[BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
-		[BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
-		[BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
-		[BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
-		[BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
-		[BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
-		[BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
-		[BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
-		[BPF_ALU | BPF_NEG] = &&ALU_NEG,
-		[BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
-		[BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
-		/* 64 bit ALU operations */
-		[BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
-		[BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
-		[BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
-		[BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
-		[BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
-		[BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
-		[BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
-		[BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
-		[BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
-		[BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
-		[BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
-		[BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
-		[BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
-		[BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
-		[BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
-		[BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
-		[BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
-		[BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
-		[BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
-		[BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
-		[BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
-		[BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
-		[BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
-		[BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
-		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
-		/* Call instruction */
-		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+		BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
+		/* Non-UAPI available opcodes. */
+		[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
 		[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
-		/* Jumps */
-		[BPF_JMP | BPF_JA] = &&JMP_JA,
-		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
-		[BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
-		[BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
-		[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
-		[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
-		[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
-		[BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
-		[BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
-		[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
-		[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
-		[BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
-		[BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
-		[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
-		[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
-		[BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
-		[BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
-		[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
-		[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
-		[BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
-		[BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
-		[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
-		[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
-		/* Program return */
-		[BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
-		/* Store instructions */
-		[BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
-		[BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
-		[BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
-		[BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
-		[BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
-		[BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
-		[BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
-		[BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
-		[BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
-		[BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
-		/* Load instructions */
-		[BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
-		[BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
-		[BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
-		[BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
-		[BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
-		[BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
-		[BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
-		[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
-		[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
-		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
-		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
+#undef BPF_INSN_3_LBL
+#undef BPF_INSN_2_LBL
 	u32 tail_call_cnt = 0;
 	void *ptr;
 	int off;
@@ -950,14 +999,10 @@ select_insn:
 		(*(s64 *) &DST) >>= IMM;
 		CONT;
 	ALU64_MOD_X:
-		if (unlikely(SRC == 0))
-			return 0;
 		div64_u64_rem(DST, SRC, &tmp);
 		DST = tmp;
 		CONT;
 	ALU_MOD_X:
-		if (unlikely((u32)SRC == 0))
-			return 0;
 		tmp = (u32) DST;
 		DST = do_div(tmp, (u32) SRC);
 		CONT;
@@ -970,13 +1015,9 @@ select_insn:
 		DST = do_div(tmp, (u32) IMM);
 		CONT;
 	ALU64_DIV_X:
-		if (unlikely(SRC == 0))
-			return 0;
 		DST = div64_u64(DST, SRC);
 		CONT;
 	ALU_DIV_X:
-		if (unlikely((u32)SRC == 0))
-			return 0;
 		tmp = (u32) DST;
 		do_div(tmp, (u32) SRC);
 		DST = (u32) tmp;
@@ -1026,6 +1067,13 @@ select_insn:
 					       BPF_R4, BPF_R5);
 		CONT;
 
+	JMP_CALL_ARGS:
+		BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
+							    BPF_R3, BPF_R4,
+							    BPF_R5,
+							    insn + insn->off + 1);
+		CONT;
+
 	JMP_TAIL_CALL: {
 		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
 		struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -1280,8 +1328,14 @@ load_byte:
 		goto load_byte;
 
 	default_label:
-		/* If we ever reach this, we have a bug somewhere. */
-		WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+		/* If we ever reach this, we have a bug somewhere. Die hard here
+		 * instead of just returning 0; we could be somewhere in a subprog,
+		 * so execution could continue otherwise which we do /not/ want.
+		 *
+		 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
+		 */
+		pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
+		BUG_ON(1);
 		return 0;
 }
 STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
@@ -1298,6 +1352,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
 	return ___bpf_prog_run(regs, insn, stack); \
 }
 
+#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
+#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
+static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
+				      const struct bpf_insn *insn) \
+{ \
+	u64 stack[stack_size / sizeof(u64)]; \
+	u64 regs[MAX_BPF_REG]; \
+\
+	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+	BPF_R1 = r1; \
+	BPF_R2 = r2; \
+	BPF_R3 = r3; \
+	BPF_R4 = r4; \
+	BPF_R5 = r5; \
+	return ___bpf_prog_run(regs, insn, stack); \
+}
+
 #define EVAL1(FN, X) FN(X)
 #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
 #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
@@ -1309,6 +1380,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
 EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
 EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
 
+EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);
+
 #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
 
 static unsigned int (*interpreters[])(const void *ctx,
@@ -1317,11 +1392,33 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
 EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
 EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 };
+#undef PROG_NAME_LIST
+#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
+static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
+				  const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
+#undef PROG_NAME_LIST
+
+void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
+{
+	stack_depth = max_t(u32, stack_depth, 1);
+	insn->off = (s16) insn->imm;
+	insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] -
+		    __bpf_call_base_args;
+	insn->code = BPF_JMP | BPF_CALL_ARGS;
+}
 
 #else
-static unsigned int __bpf_prog_ret0(const void *ctx,
-				    const struct bpf_insn *insn)
+static unsigned int __bpf_prog_ret0_warn(const void *ctx,
+					 const struct bpf_insn *insn)
 {
+	/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
+	 * is not working properly, so warn about it!
+	 */
+	WARN_ON_ONCE(1);
 	return 0;
 }
 #endif
@@ -1329,6 +1426,9 @@ static unsigned int __bpf_prog_ret0(const void *ctx,
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
+	if (fp->kprobe_override)
+		return false;
+
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
@@ -1378,7 +1478,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 
 	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
 #else
-	fp->bpf_func = __bpf_prog_ret0;
+	fp->bpf_func = __bpf_prog_ret0_warn;
 #endif
 
 	/* eBPF JITs can rewrite the program in case constant
@@ -1481,6 +1581,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 	rcu_read_lock();
 	prog = rcu_dereference(progs)->progs;
 	for (; *prog; prog++) {
+		if (*prog == &dummy_bpf_prog.prog)
+			continue;
 		id = (*prog)->aux->id;
 		if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
 			rcu_read_unlock();
@@ -1564,14 +1666,41 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	return 0;
 }
 
+int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+			     __u32 __user *prog_ids, u32 request_cnt,
+			     __u32 __user *prog_cnt)
+{
+	u32 cnt = 0;
+
+	if (array)
+		cnt = bpf_prog_array_length(array);
+
+	if (copy_to_user(prog_cnt, &cnt, sizeof(cnt)))
+		return -EFAULT;
+
+	/* return early if user requested only program count or nothing to copy */
+	if (!request_cnt || !cnt)
+		return 0;
+
+	return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
+	int i;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
-	bpf_jit_free(aux->prog);
+	for (i = 0; i < aux->func_cnt; i++)
+		bpf_jit_free(aux->func[i]);
+	if (aux->func_cnt) {
+		kfree(aux->func);
+		bpf_prog_unlock_free(aux->prog);
+	} else {
+		bpf_jit_free(aux->prog);
+	}
 }
 
 /* Free internal BPF program */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ce5b669003b2..fbfdada6caee 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	if (!cmap)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	cmap->map.map_type = attr->map_type;
-	cmap->map.key_size = attr->key_size;
-	cmap->map.value_size = attr->value_size;
-	cmap->map.max_entries = attr->max_entries;
-	cmap->map.map_flags = attr->map_flags;
-	cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+	bpf_map_init_from_attr(&cmap->map, attr);
 
 	/* Pre-limit array size based on NR_CPUS, not final CPU check */
 	if (cmap->map.max_entries > NR_CPUS) {
@@ -143,7 +137,7 @@ free_cmap:
 	return ERR_PTR(err);
 }
 
-void __cpu_map_queue_destructor(void *ptr)
+static void __cpu_map_queue_destructor(void *ptr)
 {
 	/* The tear-down procedure should have made sure that queue is
 	 * empty. See __cpu_map_entry_replace() and work-queue
@@ -222,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
 	return xdp_pkt;
 }
 
-struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-				  struct xdp_pkt *xdp_pkt)
+static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+					 struct xdp_pkt *xdp_pkt)
 {
 	unsigned int frame_size;
 	void *pkt_data_start;
@@ -337,7 +331,8 @@ static int cpu_map_kthread_run(void *data)
 	return 0;
 }
 
-struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
+						       int map_id)
 {
 	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
 	struct bpf_cpu_map_entry *rcpu;
@@ -395,7 +390,7 @@ free_rcu:
 	return NULL;
 }
 
-void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct rcu_head *rcu)
 {
 	struct bpf_cpu_map_entry *rcpu;
 	int cpu;
@@ -438,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu)
  * cpu_map_kthread_stop, which waits for an RCU graze period before
  * stopping kthread, emptying the queue.
  */
-void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
-			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+				    u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
 {
 	struct bpf_cpu_map_entry *old_rcpu;
 
@@ -451,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 	}
 }
 
-int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static int cpu_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	u32 key_cpu = *(u32 *)key;
@@ -464,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
-			u64 map_flags)
+static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	struct bpf_cpu_map_entry *rcpu;
@@ -502,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return 0;
 }
 
-void cpu_map_free(struct bpf_map *map)
+static void cpu_map_free(struct bpf_map *map)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ebdef54bf7df..565f9ece9115 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (!dtab)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	dtab->map.map_type = attr->map_type;
-	dtab->map.key_size = attr->key_size;
-	dtab->map.value_size = attr->value_size;
-	dtab->map.max_entries = attr->max_entries;
-	dtab->map.map_flags = attr->map_flags;
-	dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+	bpf_map_init_from_attr(&dtab->map, attr);
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index e682850c9715..8740406df2cd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -21,10 +21,39 @@ static const char * const func_id_str[] = {
 };
 #undef __BPF_FUNC_STR_FN
 
-const char *func_id_name(int id)
+static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
+				   const struct bpf_insn *insn,
+				   char *buff, size_t len)
 {
 	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
 
+	if (insn->src_reg != BPF_PSEUDO_CALL &&
+	    insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
+	    func_id_str[insn->imm])
+		return func_id_str[insn->imm];
+
+	if (cbs && cbs->cb_call)
+		return cbs->cb_call(cbs->private_data, insn);
+
+	if (insn->src_reg == BPF_PSEUDO_CALL)
+		snprintf(buff, len, "%+d", insn->imm);
+
+	return buff;
+}
+
+static const char *__func_imm_name(const struct bpf_insn_cbs *cbs,
+				   const struct bpf_insn *insn,
+				   u64 full_imm, char *buff, size_t len)
+{
+	if (cbs && cbs->cb_imm)
+		return cbs->cb_imm(cbs->private_data, insn, full_imm);
+
+	snprintf(buff, len, "0x%llx", (unsigned long long)full_imm);
+	return buff;
+}
+
+const char *func_id_name(int id)
+{
 	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
 		return func_id_str[id];
 	else
@@ -83,7 +112,7 @@ static const char *const bpf_jmp_string[16] = {
 	[BPF_EXIT >> 4] = "exit",
 };
 
-static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+static void print_bpf_end_insn(bpf_insn_print_t verbose,
 			       struct bpf_verifier_env *env,
 			       const struct bpf_insn *insn)
 {
@@ -92,9 +121,12 @@ static void print_bpf_end_insn(bpf_insn_print_cb verbose,
 		insn->imm, insn->dst_reg);
 }
 
-void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
-		    const struct bpf_insn *insn, bool allow_ptr_leaks)
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+		    struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn,
+		    bool allow_ptr_leaks)
 {
+	const bpf_insn_print_t verbose = cbs->cb_print;
 	u8 class = BPF_CLASS(insn->code);
 
 	if (class == BPF_ALU || class == BPF_ALU64) {
@@ -175,12 +207,15 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
 		 */
 		u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
 		bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+		char tmp[64];
 
 		if (map_ptr && !allow_ptr_leaks)
 			imm = 0;
 
-		verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
-			insn->dst_reg, (unsigned long long)imm);
+		verbose(env, "(%02x) r%d = %s\n",
+			insn->code, insn->dst_reg,
+			__func_imm_name(cbs, insn, imm,
+					tmp, sizeof(tmp)));
 	} else {
 		verbose(env, "BUG_ld_%02x\n", insn->code);
 		return;
@@ -189,8 +224,20 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
 		u8 opcode = BPF_OP(insn->code);
 
 		if (opcode == BPF_CALL) {
-			verbose(env, "(%02x) call %s#%d\n", insn->code,
-				func_id_name(insn->imm), insn->imm);
+			char tmp[64];
+
+			if (insn->src_reg == BPF_PSEUDO_CALL) {
+				verbose(env, "(%02x) call pc%s\n",
+					insn->code,
+					__func_get_name(cbs, insn,
+							tmp, sizeof(tmp)));
+			} else {
+				strcpy(tmp, "unknown");
+				verbose(env, "(%02x) call %s#%d\n", insn->code,
+					__func_get_name(cbs, insn,
+							tmp, sizeof(tmp)),
+					insn->imm);
+			}
 		} else if (insn->code == (BPF_JMP | BPF_JA)) {
 			verbose(env, "(%02x) goto pc%+d\n",
 				insn->code, insn->off);
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index 8de977e420b6..266fe8ee542b 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -17,16 +17,35 @@
 #include <linux/bpf.h>
 #include <linux/kernel.h>
 #include <linux/stringify.h>
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <string.h>
+#endif
+
+struct bpf_verifier_env;
 
 extern const char *const bpf_alu_string[16];
 extern const char *const bpf_class_string[8];
 
 const char *func_id_name(int id);
 
-struct bpf_verifier_env;
-typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
-				  const char *, ...);
-void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
-		    const struct bpf_insn *insn, bool allow_ptr_leaks);
+typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
+						const char *, ...);
+typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
+					      const struct bpf_insn *insn);
+typedef const char *(*bpf_insn_print_imm_t)(void *private_data,
+					    const struct bpf_insn *insn,
+					    __u64 full_imm);
+
+struct bpf_insn_cbs {
+	bpf_insn_print_t	cb_print;
+	bpf_insn_revmap_call_t	cb_call;
+	bpf_insn_print_imm_t	cb_imm;
+	void			*private_data;
+};
 
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+		    struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn,
+		    bool allow_ptr_leaks);
 #endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3905d4bc5b80..b76828f23b49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab)
 }
 
 /* Called from syscall */
-static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+static int htab_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
@@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 	int numa_node = bpf_map_attr_numa_node(attr);
-	struct bpf_htab *htab;
-	int err, i;
-	u64 cost;
 
 	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
 		     offsetof(struct htab_elem, hash_node.pprev));
@@ -254,40 +251,68 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	/* LRU implementation is much complicated than other
 	 * maps. Hence, limit to CAP_SYS_ADMIN for now.
 	 */
-		return ERR_PTR(-EPERM);
+		return -EPERM;
 
 	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
 		/* reserved bits should not be used */
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (!lru && percpu_lru)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (lru && !prealloc)
-		return ERR_PTR(-ENOTSUPP);
+		return -ENOTSUPP;
 
 	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
+
+	/* check sanity of attributes.
+	 * value_size == 0 may be allowed in the future to use map as a set
+	 */
+	if (attr->max_entries == 0 || attr->key_size == 0 ||
+	    attr->value_size == 0)
+		return -EINVAL;
+
+	if (attr->key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		return -E2BIG;
+
+	if (attr->value_size >= KMALLOC_MAX_SIZE -
+	    MAX_BPF_STACK - sizeof(struct htab_elem))
+		/* if value_size is bigger, the user space won't be able to
+		 * access the elements via bpf syscall. This check also makes
+		 * sure that the elem_size doesn't overflow and it's
+		 * kmalloc-able later in htab_map_update_elem()
+		 */
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+	/* percpu_lru means each cpu has its own LRU list.
+	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+	 * the map's value itself is percpu. percpu_lru has
+	 * nothing to do with the map's value.
+	 */
+	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	struct bpf_htab *htab;
+	int err, i;
+	u64 cost;
 
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	htab->map.map_type = attr->map_type;
-	htab->map.key_size = attr->key_size;
-	htab->map.value_size = attr->value_size;
-	htab->map.max_entries = attr->max_entries;
-	htab->map.map_flags = attr->map_flags;
-	htab->map.numa_node = numa_node;
-
-	/* check sanity of attributes.
-	 * value_size == 0 may be allowed in the future to use map as a set
-	 */
-	err = -EINVAL;
-	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
-	    htab->map.value_size == 0)
-		goto free_htab;
+	bpf_map_init_from_attr(&htab->map, attr);
 
 	if (percpu_lru) {
 		/* ensure each CPU's lru list has >=1 elements.
@@ -304,22 +329,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	/* hash table size must be power of 2 */
 	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
 
-	err = -E2BIG;
-	if (htab->map.key_size > MAX_BPF_STACK)
-		/* eBPF programs initialize keys on stack, so they cannot be
-		 * larger than max stack size
-		 */
-		goto free_htab;
-
-	if (htab->map.value_size >= KMALLOC_MAX_SIZE -
-	    MAX_BPF_STACK - sizeof(struct htab_elem))
-		/* if value_size is bigger, the user space won't be able to
-		 * access the elements via bpf syscall. This check also makes
-		 * sure that the elem_size doesn't overflow and it's
-		 * kmalloc-able later in htab_map_update_elem()
-		 */
-		goto free_htab;
-
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
 	if (percpu)
@@ -327,6 +336,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	else
 		htab->elem_size += round_up(htab->map.value_size, 8);
 
+	err = -E2BIG;
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
 	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
@@ -1143,6 +1153,7 @@ static void htab_map_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops htab_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1153,6 +1164,7 @@ const struct bpf_map_ops htab_map_ops = {
 };
 
 const struct bpf_map_ops htab_lru_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1236,6 +1248,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
 }
 
 const struct bpf_map_ops htab_percpu_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1245,6 +1258,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 };
 
 const struct bpf_map_ops htab_lru_percpu_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1253,11 +1267,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_delete_elem = htab_lru_map_delete_elem,
 };
 
-static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
+static int fd_htab_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->value_size != sizeof(u32))
-		return ERR_PTR(-EINVAL);
-	return htab_map_alloc(attr);
+		return -EINVAL;
+	return htab_map_alloc_check(attr);
 }
 
 static void fd_htab_map_free(struct bpf_map *map)
@@ -1328,7 +1342,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
1328 if (IS_ERR(inner_map_meta)) 1342 if (IS_ERR(inner_map_meta))
1329 return inner_map_meta; 1343 return inner_map_meta;
1330 1344
1331 map = fd_htab_map_alloc(attr); 1345 map = htab_map_alloc(attr);
1332 if (IS_ERR(map)) { 1346 if (IS_ERR(map)) {
1333 bpf_map_meta_free(inner_map_meta); 1347 bpf_map_meta_free(inner_map_meta);
1334 return map; 1348 return map;
@@ -1372,6 +1386,7 @@ static void htab_of_map_free(struct bpf_map *map)
1372} 1386}
1373 1387
1374const struct bpf_map_ops htab_of_maps_map_ops = { 1388const struct bpf_map_ops htab_of_maps_map_ops = {
1389 .map_alloc_check = fd_htab_map_alloc_check,
1375 .map_alloc = htab_of_map_alloc, 1390 .map_alloc = htab_of_map_alloc,
1376 .map_free = htab_of_map_free, 1391 .map_free = htab_of_map_free,
1377 .map_get_next_key = htab_map_get_next_key, 1392 .map_get_next_key = htab_map_get_next_key,
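
Note on the hashtab hunks above: attribute validation is split out of htab_map_alloc() into htab_map_alloc_check() and exposed through the new .map_alloc_check callback, so the core can vet attributes before committing any memory (the syscall.c hunk further down calls it before deciding whether to dispatch to the offload allocator). Below is a minimal sketch of how a map implementation adopts the split; "example_map" and its fields are illustrative only, while the ops layout and bpf_map_init_from_attr() are the interfaces added in this series.

struct example_map {
	struct bpf_map map;
	/* implementation-specific state would follow */
};

/* Reject bad attributes up front, before any allocation happens. */
static int example_map_alloc_check(union bpf_attr *attr)
{
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->map_flags & ~BPF_F_NUMA_NODE)
		return -EINVAL;
	return 0;
}

/* Called only with attributes that already passed the check above. */
static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	struct example_map *emap;

	emap = kzalloc(sizeof(*emap), GFP_USER);
	if (!emap)
		return ERR_PTR(-ENOMEM);
	bpf_map_init_from_attr(&emap->map, attr);
	return &emap->map;
}

const struct bpf_map_ops example_map_ops = {
	.map_alloc_check	= example_map_alloc_check,
	.map_alloc		= example_map_alloc,
	/* .map_free, .map_get_next_key and the element ops as usual */
};
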
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5bb5e49ef4c3..81e2f6995adb 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,39 +150,29 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
150 return 0; 150 return 0;
151} 151}
152 152
153static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, 153static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
154 umode_t mode, const struct inode_operations *iops) 154 const struct inode_operations *iops)
155{ 155{
156 struct inode *inode; 156 struct inode *dir = dentry->d_parent->d_inode;
157 157 struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
158 inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
159 if (IS_ERR(inode)) 158 if (IS_ERR(inode))
160 return PTR_ERR(inode); 159 return PTR_ERR(inode);
161 160
162 inode->i_op = iops; 161 inode->i_op = iops;
163 inode->i_private = dentry->d_fsdata; 162 inode->i_private = raw;
164 163
165 bpf_dentry_finalize(dentry, inode, dir); 164 bpf_dentry_finalize(dentry, inode, dir);
166 return 0; 165 return 0;
167} 166}
168 167
169static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, 168static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
170 dev_t devt)
171{ 169{
172 enum bpf_type type = MINOR(devt); 170 return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
173 171}
174 if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
175 dentry->d_fsdata == NULL)
176 return -EPERM;
177 172
178 switch (type) { 173static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
179 case BPF_TYPE_PROG: 174{
180 return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); 175 return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
181 case BPF_TYPE_MAP:
182 return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
183 default:
184 return -EPERM;
185 }
186} 176}
187 177
188static struct dentry * 178static struct dentry *
@@ -218,7 +208,6 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry,
218 208
219static const struct inode_operations bpf_dir_iops = { 209static const struct inode_operations bpf_dir_iops = {
220 .lookup = bpf_lookup, 210 .lookup = bpf_lookup,
221 .mknod = bpf_mkobj,
222 .mkdir = bpf_mkdir, 211 .mkdir = bpf_mkdir,
223 .symlink = bpf_symlink, 212 .symlink = bpf_symlink,
224 .rmdir = simple_rmdir, 213 .rmdir = simple_rmdir,
@@ -234,7 +223,6 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
234 struct inode *dir; 223 struct inode *dir;
235 struct path path; 224 struct path path;
236 umode_t mode; 225 umode_t mode;
237 dev_t devt;
238 int ret; 226 int ret;
239 227
240 dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); 228 dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
@@ -242,9 +230,8 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
242 return PTR_ERR(dentry); 230 return PTR_ERR(dentry);
243 231
244 mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); 232 mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
245 devt = MKDEV(UNNAMED_MAJOR, type);
246 233
247 ret = security_path_mknod(&path, dentry, mode, devt); 234 ret = security_path_mknod(&path, dentry, mode, 0);
248 if (ret) 235 if (ret)
249 goto out; 236 goto out;
250 237
@@ -254,9 +241,16 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
254 goto out; 241 goto out;
255 } 242 }
256 243
257 dentry->d_fsdata = raw; 244 switch (type) {
258 ret = vfs_mknod(dir, dentry, mode, devt); 245 case BPF_TYPE_PROG:
259 dentry->d_fsdata = NULL; 246 ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
247 break;
248 case BPF_TYPE_MAP:
249 ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
250 break;
251 default:
252 ret = -EPERM;
253 }
260out: 254out:
261 done_path_create(&path, dentry); 255 done_path_create(&path, dentry);
262 return ret; 256 return ret;
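
For context on the inode.c hunk: the old pinning path abused mknod, smuggling the program/map pointer through dentry->d_fsdata and encoding the object type in a fake device number. vfs_mkobj() instead hands the raw pointer straight to a constructor callback while the parent directory is still locked by kern_path_create(). A rough sketch of that calling convention for some other in-kernel object follows; "my_obj", "my_iops" and my_pin_object() are hypothetical names, while vfs_mkobj() and the path helpers are the real interfaces.

static const struct inode_operations my_iops;	/* object's inode ops */

static int my_mkobj(struct dentry *dentry, umode_t mode, void *arg)
{
	struct inode *dir = dentry->d_parent->d_inode;
	struct inode *inode = new_inode(dir->i_sb);

	if (!inode)
		return -ENOSPC;
	inode->i_mode = mode;
	inode->i_op = &my_iops;
	inode->i_private = arg;		/* object pointer arrives directly */
	d_instantiate(dentry, inode);
	return 0;
}

static int my_pin_object(const char *pathname, struct my_obj *obj)
{
	struct dentry *dentry;
	struct path path;
	int err;

	dentry = kern_path_create(AT_FDCWD, pathname, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	err = vfs_mkobj(dentry, S_IFREG | 0600, my_mkobj, obj);

	done_path_create(&path, dentry);
	return err;
}
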
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 885e45479680..7b469d10d0e9 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
522 return ERR_PTR(-ENOMEM); 522 return ERR_PTR(-ENOMEM);
523 523
524 /* copy mandatory map attributes */ 524 /* copy mandatory map attributes */
525 trie->map.map_type = attr->map_type; 525 bpf_map_init_from_attr(&trie->map, attr);
526 trie->map.key_size = attr->key_size;
527 trie->map.value_size = attr->value_size;
528 trie->map.max_entries = attr->max_entries;
529 trie->map.map_flags = attr->map_flags;
530 trie->map.numa_node = bpf_map_attr_numa_node(attr);
531 trie->data_size = attr->key_size - 526 trie->data_size = attr->key_size -
532 offsetof(struct bpf_lpm_trie_key, data); 527 offsetof(struct bpf_lpm_trie_key, data);
533 trie->max_prefixlen = trie->data_size * 8; 528 trie->max_prefixlen = trie->data_size * 8;
@@ -596,9 +591,96 @@ unlock:
596 raw_spin_unlock(&trie->lock); 591 raw_spin_unlock(&trie->lock);
597} 592}
598 593
599static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) 594static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
600{ 595{
601 return -ENOTSUPP; 596 struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
597 struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
598 struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
599 struct lpm_trie_node **node_stack = NULL;
600 int err = 0, stack_ptr = -1;
601 unsigned int next_bit;
602 size_t matchlen;
603
604 /* The get_next_key follows postorder. For the 4 node example in
605 * the top of this file, the trie_get_next_key() returns the following
606 * one after another:
607 * 192.168.0.0/24
608 * 192.168.1.0/24
609 * 192.168.128.0/24
610 * 192.168.0.0/16
611 *
612 * The idea is to return more specific keys before less specific ones.
613 */
614
615 /* Empty trie */
616 search_root = rcu_dereference(trie->root);
617 if (!search_root)
618 return -ENOENT;
619
620 /* For invalid key, find the leftmost node in the trie */
621 if (!key || key->prefixlen > trie->max_prefixlen)
622 goto find_leftmost;
623
624 node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
625 GFP_ATOMIC | __GFP_NOWARN);
626 if (!node_stack)
627 return -ENOMEM;
628
629 /* Try to find the exact node for the given key */
630 for (node = search_root; node;) {
631 node_stack[++stack_ptr] = node;
632 matchlen = longest_prefix_match(trie, node, key);
633 if (node->prefixlen != matchlen ||
634 node->prefixlen == key->prefixlen)
635 break;
636
637 next_bit = extract_bit(key->data, node->prefixlen);
638 node = rcu_dereference(node->child[next_bit]);
639 }
640 if (!node || node->prefixlen != key->prefixlen ||
641 (node->flags & LPM_TREE_NODE_FLAG_IM))
642 goto find_leftmost;
643
644 /* The node with the exactly-matching key has been found,
645 * find the first node in postorder after the matched node.
646 */
647 node = node_stack[stack_ptr];
648 while (stack_ptr > 0) {
649 parent = node_stack[stack_ptr - 1];
650 if (rcu_dereference(parent->child[0]) == node) {
651 search_root = rcu_dereference(parent->child[1]);
652 if (search_root)
653 goto find_leftmost;
654 }
655 if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
656 next_node = parent;
657 goto do_copy;
658 }
659
660 node = parent;
661 stack_ptr--;
662 }
663
664 /* did not find anything */
665 err = -ENOENT;
666 goto free_stack;
667
668find_leftmost:
669 /* Find the leftmost non-intermediate node, all intermediate nodes
670 * have exact two children, so this function will never return NULL.
671 */
672 for (node = search_root; node;) {
673 if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
674 next_node = node;
675 node = rcu_dereference(node->child[0]);
676 }
677do_copy:
678 next_key->prefixlen = next_node->prefixlen;
679 memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
680 next_node->data, trie->data_size);
681free_stack:
682 kfree(node_stack);
683 return err;
602} 684}
603 685
604const struct bpf_map_ops trie_map_ops = { 686const struct bpf_map_ops trie_map_ops = {
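
trie_get_next_key() used to return -ENOTSUPP; with the implementation above, BPF_MAP_GET_NEXT_KEY walks the trie in postorder, so user space can enumerate an LPM map with the most specific prefixes returned first. A rough user-space sketch of that iteration against an IPv4 trie (created with 4 bytes of key data) using the raw bpf(2) syscall; the map fd is assumed to exist already and error handling is trimmed.

#include <linux/bpf.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct ipv4_lpm_key {
	__u32 prefixlen;
	__u8  data[4];
};

static long bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

static void dump_lpm_trie(int map_fd)
{
	struct ipv4_lpm_key key, next;
	union bpf_attr attr;
	char buf[INET_ADDRSTRLEN];

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = 0;			/* NULL key: start from the leftmost leaf */
	attr.next_key = (__u64)(unsigned long)&next;

	while (!bpf(BPF_MAP_GET_NEXT_KEY, &attr)) {
		inet_ntop(AF_INET, next.data, buf, sizeof(buf));
		printf("%s/%u\n", buf, next.prefixlen);

		key = next;		/* resume after the key just returned */
		attr.key = (__u64)(unsigned long)&key;
	}
}
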
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8455b89d1bbf..c9401075b58c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -16,18 +16,35 @@
16#include <linux/bpf.h> 16#include <linux/bpf.h>
17#include <linux/bpf_verifier.h> 17#include <linux/bpf_verifier.h>
18#include <linux/bug.h> 18#include <linux/bug.h>
19#include <linux/kdev_t.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/netdevice.h> 21#include <linux/netdevice.h>
21#include <linux/printk.h> 22#include <linux/printk.h>
23#include <linux/proc_ns.h>
22#include <linux/rtnetlink.h> 24#include <linux/rtnetlink.h>
25#include <linux/rwsem.h>
23 26
24/* protected by RTNL */ 27/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
28 * of all progs.
29 * RTNL lock cannot be taken when holding this lock.
30 */
31static DECLARE_RWSEM(bpf_devs_lock);
25static LIST_HEAD(bpf_prog_offload_devs); 32static LIST_HEAD(bpf_prog_offload_devs);
33static LIST_HEAD(bpf_map_offload_devs);
34
35static int bpf_dev_offload_check(struct net_device *netdev)
36{
37 if (!netdev)
38 return -EINVAL;
39 if (!netdev->netdev_ops->ndo_bpf)
40 return -EOPNOTSUPP;
41 return 0;
42}
26 43
27int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) 44int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
28{ 45{
29 struct net *net = current->nsproxy->net_ns; 46 struct bpf_prog_offload *offload;
30 struct bpf_dev_offload *offload; 47 int err;
31 48
32 if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && 49 if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
33 attr->prog_type != BPF_PROG_TYPE_XDP) 50 attr->prog_type != BPF_PROG_TYPE_XDP)
@@ -41,34 +58,44 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
41 return -ENOMEM; 58 return -ENOMEM;
42 59
43 offload->prog = prog; 60 offload->prog = prog;
44 init_waitqueue_head(&offload->verifier_done);
45 61
46 rtnl_lock(); 62 offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
47 offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); 63 attr->prog_ifindex);
48 if (!offload->netdev) { 64 err = bpf_dev_offload_check(offload->netdev);
49 rtnl_unlock(); 65 if (err)
50 kfree(offload); 66 goto err_maybe_put;
51 return -EINVAL;
52 }
53 67
68 down_write(&bpf_devs_lock);
69 if (offload->netdev->reg_state != NETREG_REGISTERED) {
70 err = -EINVAL;
71 goto err_unlock;
72 }
54 prog->aux->offload = offload; 73 prog->aux->offload = offload;
55 list_add_tail(&offload->offloads, &bpf_prog_offload_devs); 74 list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
56 rtnl_unlock(); 75 dev_put(offload->netdev);
76 up_write(&bpf_devs_lock);
57 77
58 return 0; 78 return 0;
79err_unlock:
80 up_write(&bpf_devs_lock);
81err_maybe_put:
82 if (offload->netdev)
83 dev_put(offload->netdev);
84 kfree(offload);
85 return err;
59} 86}
60 87
61static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, 88static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
62 struct netdev_bpf *data) 89 struct netdev_bpf *data)
63{ 90{
64 struct net_device *netdev = prog->aux->offload->netdev; 91 struct bpf_prog_offload *offload = prog->aux->offload;
92 struct net_device *netdev;
65 93
66 ASSERT_RTNL(); 94 ASSERT_RTNL();
67 95
68 if (!netdev) 96 if (!offload)
69 return -ENODEV; 97 return -ENODEV;
70 if (!netdev->netdev_ops->ndo_bpf) 98 netdev = offload->netdev;
71 return -EOPNOTSUPP;
72 99
73 data->command = cmd; 100 data->command = cmd;
74 101
@@ -87,62 +114,63 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
87 if (err) 114 if (err)
88 goto exit_unlock; 115 goto exit_unlock;
89 116
90 env->dev_ops = data.verifier.ops; 117 env->prog->aux->offload->dev_ops = data.verifier.ops;
91
92 env->prog->aux->offload->dev_state = true; 118 env->prog->aux->offload->dev_state = true;
93 env->prog->aux->offload->verifier_running = true;
94exit_unlock: 119exit_unlock:
95 rtnl_unlock(); 120 rtnl_unlock();
96 return err; 121 return err;
97} 122}
98 123
124int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
125 int insn_idx, int prev_insn_idx)
126{
127 struct bpf_prog_offload *offload;
128 int ret = -ENODEV;
129
130 down_read(&bpf_devs_lock);
131 offload = env->prog->aux->offload;
132 if (offload)
133 ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
134 up_read(&bpf_devs_lock);
135
136 return ret;
137}
138
99static void __bpf_prog_offload_destroy(struct bpf_prog *prog) 139static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
100{ 140{
101 struct bpf_dev_offload *offload = prog->aux->offload; 141 struct bpf_prog_offload *offload = prog->aux->offload;
102 struct netdev_bpf data = {}; 142 struct netdev_bpf data = {};
103 143
104 /* Caution - if netdev is destroyed before the program, this function
105 * will be called twice.
106 */
107
108 data.offload.prog = prog; 144 data.offload.prog = prog;
109 145
110 if (offload->verifier_running)
111 wait_event(offload->verifier_done, !offload->verifier_running);
112
113 if (offload->dev_state) 146 if (offload->dev_state)
114 WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); 147 WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
115 148
116 offload->dev_state = false; 149 /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
150 bpf_prog_free_id(prog, true);
151
117 list_del_init(&offload->offloads); 152 list_del_init(&offload->offloads);
118 offload->netdev = NULL; 153 kfree(offload);
154 prog->aux->offload = NULL;
119} 155}
120 156
121void bpf_prog_offload_destroy(struct bpf_prog *prog) 157void bpf_prog_offload_destroy(struct bpf_prog *prog)
122{ 158{
123 struct bpf_dev_offload *offload = prog->aux->offload;
124
125 offload->verifier_running = false;
126 wake_up(&offload->verifier_done);
127
128 rtnl_lock(); 159 rtnl_lock();
129 __bpf_prog_offload_destroy(prog); 160 down_write(&bpf_devs_lock);
161 if (prog->aux->offload)
162 __bpf_prog_offload_destroy(prog);
163 up_write(&bpf_devs_lock);
130 rtnl_unlock(); 164 rtnl_unlock();
131
132 kfree(offload);
133} 165}
134 166
135static int bpf_prog_offload_translate(struct bpf_prog *prog) 167static int bpf_prog_offload_translate(struct bpf_prog *prog)
136{ 168{
137 struct bpf_dev_offload *offload = prog->aux->offload;
138 struct netdev_bpf data = {}; 169 struct netdev_bpf data = {};
139 int ret; 170 int ret;
140 171
141 data.offload.prog = prog; 172 data.offload.prog = prog;
142 173
143 offload->verifier_running = false;
144 wake_up(&offload->verifier_done);
145
146 rtnl_lock(); 174 rtnl_lock();
147 ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); 175 ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
148 rtnl_unlock(); 176 rtnl_unlock();
@@ -164,14 +192,323 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
164 return bpf_prog_offload_translate(prog); 192 return bpf_prog_offload_translate(prog);
165} 193}
166 194
195struct ns_get_path_bpf_prog_args {
196 struct bpf_prog *prog;
197 struct bpf_prog_info *info;
198};
199
200static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data)
201{
202 struct ns_get_path_bpf_prog_args *args = private_data;
203 struct bpf_prog_aux *aux = args->prog->aux;
204 struct ns_common *ns;
205 struct net *net;
206
207 rtnl_lock();
208 down_read(&bpf_devs_lock);
209
210 if (aux->offload) {
211 args->info->ifindex = aux->offload->netdev->ifindex;
212 net = dev_net(aux->offload->netdev);
213 get_net(net);
214 ns = &net->ns;
215 } else {
216 args->info->ifindex = 0;
217 ns = NULL;
218 }
219
220 up_read(&bpf_devs_lock);
221 rtnl_unlock();
222
223 return ns;
224}
225
226int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
227 struct bpf_prog *prog)
228{
229 struct ns_get_path_bpf_prog_args args = {
230 .prog = prog,
231 .info = info,
232 };
233 struct bpf_prog_aux *aux = prog->aux;
234 struct inode *ns_inode;
235 struct path ns_path;
236 char __user *uinsns;
237 void *res;
238 u32 ulen;
239
240 res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
241 if (IS_ERR(res)) {
242 if (!info->ifindex)
243 return -ENODEV;
244 return PTR_ERR(res);
245 }
246
247 down_read(&bpf_devs_lock);
248
249 if (!aux->offload) {
250 up_read(&bpf_devs_lock);
251 return -ENODEV;
252 }
253
254 ulen = info->jited_prog_len;
255 info->jited_prog_len = aux->offload->jited_len;
256 if (info->jited_prog_len & ulen) {
257 uinsns = u64_to_user_ptr(info->jited_prog_insns);
258 ulen = min_t(u32, info->jited_prog_len, ulen);
259 if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {
260 up_read(&bpf_devs_lock);
261 return -EFAULT;
262 }
263 }
264
265 up_read(&bpf_devs_lock);
266
267 ns_inode = ns_path.dentry->d_inode;
268 info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
269 info->netns_ino = ns_inode->i_ino;
270 path_put(&ns_path);
271
272 return 0;
273}
274
167const struct bpf_prog_ops bpf_offload_prog_ops = { 275const struct bpf_prog_ops bpf_offload_prog_ops = {
168}; 276};
169 277
278static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
279 enum bpf_netdev_command cmd)
280{
281 struct netdev_bpf data = {};
282 struct net_device *netdev;
283
284 ASSERT_RTNL();
285
286 data.command = cmd;
287 data.offmap = offmap;
288 /* Caller must make sure netdev is valid */
289 netdev = offmap->netdev;
290
291 return netdev->netdev_ops->ndo_bpf(netdev, &data);
292}
293
294struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
295{
296 struct net *net = current->nsproxy->net_ns;
297 struct bpf_offloaded_map *offmap;
298 int err;
299
300 if (!capable(CAP_SYS_ADMIN))
301 return ERR_PTR(-EPERM);
302 if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
303 attr->map_type != BPF_MAP_TYPE_HASH)
304 return ERR_PTR(-EINVAL);
305
306 offmap = kzalloc(sizeof(*offmap), GFP_USER);
307 if (!offmap)
308 return ERR_PTR(-ENOMEM);
309
310 bpf_map_init_from_attr(&offmap->map, attr);
311
312 rtnl_lock();
313 down_write(&bpf_devs_lock);
314 offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
315 err = bpf_dev_offload_check(offmap->netdev);
316 if (err)
317 goto err_unlock;
318
319 err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
320 if (err)
321 goto err_unlock;
322
323 list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
324 up_write(&bpf_devs_lock);
325 rtnl_unlock();
326
327 return &offmap->map;
328
329err_unlock:
330 up_write(&bpf_devs_lock);
331 rtnl_unlock();
332 kfree(offmap);
333 return ERR_PTR(err);
334}
335
336static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
337{
338 WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
339 /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
340 bpf_map_free_id(&offmap->map, true);
341 list_del_init(&offmap->offloads);
342 offmap->netdev = NULL;
343}
344
345void bpf_map_offload_map_free(struct bpf_map *map)
346{
347 struct bpf_offloaded_map *offmap = map_to_offmap(map);
348
349 rtnl_lock();
350 down_write(&bpf_devs_lock);
351 if (offmap->netdev)
352 __bpf_map_offload_destroy(offmap);
353 up_write(&bpf_devs_lock);
354 rtnl_unlock();
355
356 kfree(offmap);
357}
358
359int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
360{
361 struct bpf_offloaded_map *offmap = map_to_offmap(map);
362 int ret = -ENODEV;
363
364 down_read(&bpf_devs_lock);
365 if (offmap->netdev)
366 ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
367 up_read(&bpf_devs_lock);
368
369 return ret;
370}
371
372int bpf_map_offload_update_elem(struct bpf_map *map,
373 void *key, void *value, u64 flags)
374{
375 struct bpf_offloaded_map *offmap = map_to_offmap(map);
376 int ret = -ENODEV;
377
378 if (unlikely(flags > BPF_EXIST))
379 return -EINVAL;
380
381 down_read(&bpf_devs_lock);
382 if (offmap->netdev)
383 ret = offmap->dev_ops->map_update_elem(offmap, key, value,
384 flags);
385 up_read(&bpf_devs_lock);
386
387 return ret;
388}
389
390int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
391{
392 struct bpf_offloaded_map *offmap = map_to_offmap(map);
393 int ret = -ENODEV;
394
395 down_read(&bpf_devs_lock);
396 if (offmap->netdev)
397 ret = offmap->dev_ops->map_delete_elem(offmap, key);
398 up_read(&bpf_devs_lock);
399
400 return ret;
401}
402
403int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
404{
405 struct bpf_offloaded_map *offmap = map_to_offmap(map);
406 int ret = -ENODEV;
407
408 down_read(&bpf_devs_lock);
409 if (offmap->netdev)
410 ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
411 up_read(&bpf_devs_lock);
412
413 return ret;
414}
415
416struct ns_get_path_bpf_map_args {
417 struct bpf_offloaded_map *offmap;
418 struct bpf_map_info *info;
419};
420
421static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
422{
423 struct ns_get_path_bpf_map_args *args = private_data;
424 struct ns_common *ns;
425 struct net *net;
426
427 rtnl_lock();
428 down_read(&bpf_devs_lock);
429
430 if (args->offmap->netdev) {
431 args->info->ifindex = args->offmap->netdev->ifindex;
432 net = dev_net(args->offmap->netdev);
433 get_net(net);
434 ns = &net->ns;
435 } else {
436 args->info->ifindex = 0;
437 ns = NULL;
438 }
439
440 up_read(&bpf_devs_lock);
441 rtnl_unlock();
442
443 return ns;
444}
445
446int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
447{
448 struct ns_get_path_bpf_map_args args = {
449 .offmap = map_to_offmap(map),
450 .info = info,
451 };
452 struct inode *ns_inode;
453 struct path ns_path;
454 void *res;
455
456 res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
457 if (IS_ERR(res)) {
458 if (!info->ifindex)
459 return -ENODEV;
460 return PTR_ERR(res);
461 }
462
463 ns_inode = ns_path.dentry->d_inode;
464 info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
465 info->netns_ino = ns_inode->i_ino;
466 path_put(&ns_path);
467
468 return 0;
469}
470
471bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
472{
473 struct bpf_offloaded_map *offmap;
474 struct bpf_prog_offload *offload;
475 bool ret;
476
477 if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
478 return false;
479
480 down_read(&bpf_devs_lock);
481 offload = prog->aux->offload;
482 offmap = map_to_offmap(map);
483
484 ret = offload && offload->netdev == offmap->netdev;
485 up_read(&bpf_devs_lock);
486
487 return ret;
488}
489
490static void bpf_offload_orphan_all_progs(struct net_device *netdev)
491{
492 struct bpf_prog_offload *offload, *tmp;
493
494 list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
495 if (offload->netdev == netdev)
496 __bpf_prog_offload_destroy(offload->prog);
497}
498
499static void bpf_offload_orphan_all_maps(struct net_device *netdev)
500{
501 struct bpf_offloaded_map *offmap, *tmp;
502
503 list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
504 if (offmap->netdev == netdev)
505 __bpf_map_offload_destroy(offmap);
506}
507
170static int bpf_offload_notification(struct notifier_block *notifier, 508static int bpf_offload_notification(struct notifier_block *notifier,
171 ulong event, void *ptr) 509 ulong event, void *ptr)
172{ 510{
173 struct net_device *netdev = netdev_notifier_info_to_dev(ptr); 511 struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
174 struct bpf_dev_offload *offload, *tmp;
175 512
176 ASSERT_RTNL(); 513 ASSERT_RTNL();
177 514
@@ -181,11 +518,10 @@ static int bpf_offload_notification(struct notifier_block *notifier,
181 if (netdev->reg_state != NETREG_UNREGISTERING) 518 if (netdev->reg_state != NETREG_UNREGISTERING)
182 break; 519 break;
183 520
184 list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, 521 down_write(&bpf_devs_lock);
185 offloads) { 522 bpf_offload_orphan_all_progs(netdev);
186 if (offload->netdev == netdev) 523 bpf_offload_orphan_all_maps(netdev);
187 __bpf_prog_offload_destroy(offload->prog); 524 up_write(&bpf_devs_lock);
188 }
189 break; 525 break;
190 default: 526 default:
191 break; 527 break;
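
The offload.c rework above drops the verifier waitqueue, guards the offload lists with the bpf_devs_lock rwsem, and adds map offload: BPF_OFFLOAD_MAP_ALLOC/FREE are forwarded to the device via ndo_bpf, and per-element operations are routed through offmap->dev_ops under the read lock. Below is a sketch of the driver half those hooks expect; "exdev" and its hw helpers are a hypothetical driver, while struct netdev_bpf, struct bpf_map_dev_ops and the commands are the interfaces used by bpf_map_offload_ndo() and the bpf_map_offload_*_elem() helpers above.

/* Device-access helpers of the hypothetical driver. */
void *exdev_hw_map_alloc(struct net_device *netdev,
			 struct bpf_offloaded_map *offmap);
void exdev_hw_map_free(void *priv);
int exdev_hw_lookup(void *priv, void *key, void *value);

static int exdev_map_lookup(struct bpf_offloaded_map *offmap,
			    void *key, void *value)
{
	/* called by bpf_map_offload_lookup_elem() with bpf_devs_lock held */
	return exdev_hw_lookup(offmap->dev_priv, key, value);
}

static const struct bpf_map_dev_ops exdev_map_dev_ops = {
	.map_lookup_elem	= exdev_map_lookup,
	/* .map_update_elem, .map_delete_elem, .map_get_next_key likewise */
};

static int exdev_ndo_bpf(struct net_device *netdev, struct netdev_bpf *bpf)
{
	switch (bpf->command) {
	case BPF_OFFLOAD_MAP_ALLOC:
		/* reserve device state, then publish the element ops */
		bpf->offmap->dev_priv = exdev_hw_map_alloc(netdev, bpf->offmap);
		if (!bpf->offmap->dev_priv)
			return -ENOMEM;
		bpf->offmap->dev_ops = &exdev_map_dev_ops;
		return 0;
	case BPF_OFFLOAD_MAP_FREE:
		exdev_hw_map_free(bpf->offmap->dev_priv);
		return 0;
	default:
		return -EINVAL;
	}
}
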
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 1712d319c2d8..0314d1783d77 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -96,14 +96,6 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
96 return rcu_dereference_sk_user_data(sk); 96 return rcu_dereference_sk_user_data(sk);
97} 97}
98 98
99/* compute the linear packet data range [data, data_end) for skb when
100 * sk_skb type programs are in use.
101 */
102static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
103{
104 TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
105}
106
107enum __sk_action { 99enum __sk_action {
108 __SK_DROP = 0, 100 __SK_DROP = 0,
109 __SK_PASS, 101 __SK_PASS,
@@ -521,13 +513,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
521 if (!stab) 513 if (!stab)
522 return ERR_PTR(-ENOMEM); 514 return ERR_PTR(-ENOMEM);
523 515
524 /* mandatory map attributes */ 516 bpf_map_init_from_attr(&stab->map, attr);
525 stab->map.map_type = attr->map_type;
526 stab->map.key_size = attr->key_size;
527 stab->map.value_size = attr->value_size;
528 stab->map.max_entries = attr->max_entries;
529 stab->map.map_flags = attr->map_flags;
530 stab->map.numa_node = bpf_map_attr_numa_node(attr);
531 517
532 /* make sure page count doesn't overflow */ 518 /* make sure page count doesn't overflow */
533 cost = (u64) stab->map.max_entries * sizeof(struct sock *); 519 cost = (u64) stab->map.max_entries * sizeof(struct sock *);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a15bc636cc98..b0ecf43f5894 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
88 if (cost >= U32_MAX - PAGE_SIZE) 88 if (cost >= U32_MAX - PAGE_SIZE)
89 goto free_smap; 89 goto free_smap;
90 90
91 smap->map.map_type = attr->map_type; 91 bpf_map_init_from_attr(&smap->map, attr);
92 smap->map.key_size = attr->key_size;
93 smap->map.value_size = value_size; 92 smap->map.value_size = value_size;
94 smap->map.max_entries = attr->max_entries;
95 smap->map.map_flags = attr->map_flags;
96 smap->n_buckets = n_buckets; 93 smap->n_buckets = n_buckets;
97 smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 94 smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
98 smap->map.numa_node = bpf_map_attr_numa_node(attr);
99 95
100 err = bpf_map_precharge_memlock(smap->map.pages); 96 err = bpf_map_precharge_memlock(smap->map.pages);
101 if (err) 97 if (err)
@@ -226,9 +222,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
226 return 0; 222 return 0;
227} 223}
228 224
229static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 225static int stack_map_get_next_key(struct bpf_map *map, void *key,
226 void *next_key)
230{ 227{
231 return -EINVAL; 228 struct bpf_stack_map *smap = container_of(map,
229 struct bpf_stack_map, map);
230 u32 id;
231
232 WARN_ON_ONCE(!rcu_read_lock_held());
233
234 if (!key) {
235 id = 0;
236 } else {
237 id = *(u32 *)key;
238 if (id >= smap->n_buckets || !smap->buckets[id])
239 id = 0;
240 else
241 id++;
242 }
243
244 while (id < smap->n_buckets && !smap->buckets[id])
245 id++;
246
247 if (id >= smap->n_buckets)
248 return -ENOENT;
249
250 *(u32 *)next_key = id;
251 return 0;
232} 252}
233 253
234static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, 254static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
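
stack_map_get_next_key() likewise stops returning -EINVAL and scans the bucket array, so user space can finally enumerate (and typically garbage-collect) the stack IDs held in a BPF_MAP_TYPE_STACK_TRACE map. A rough user-space sketch of draining such a map with the raw bpf(2) syscall; it assumes the map stores at most MAX_STACK_DEPTH frames per stack, and error handling is trimmed.

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define MAX_STACK_DEPTH	127	/* default perf_event_max_stack */

static long bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

static void drain_stack_map(int map_fd)
{
	__u64 ips[MAX_STACK_DEPTH];
	__u32 stack_id, next_id;
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = 0;			/* NULL key: return the first stored id */
	attr.next_key = (__u64)(unsigned long)&next_id;

	while (!bpf(BPF_MAP_GET_NEXT_KEY, &attr)) {
		union bpf_attr elem;

		memset(&elem, 0, sizeof(elem));
		elem.map_fd = map_fd;
		elem.key = (__u64)(unsigned long)&next_id;
		elem.value = (__u64)(unsigned long)ips;
		if (!bpf(BPF_MAP_LOOKUP_ELEM, &elem))
			printf("stack id %u, first ip 0x%llx\n",
			       next_id, (unsigned long long)ips[0]);

		elem.value = 0;		/* DELETE only takes map_fd + key */
		bpf(BPF_MAP_DELETE_ELEM, &elem);

		stack_id = next_id;	/* resume the walk after this id */
		attr.key = (__u64)(unsigned long)&stack_id;
	}
}
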
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5cb783fc8224..e24aa3241387 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,18 +94,34 @@ static int check_uarg_tail_zero(void __user *uaddr,
94 return 0; 94 return 0;
95} 95}
96 96
97const struct bpf_map_ops bpf_map_offload_ops = {
98 .map_alloc = bpf_map_offload_map_alloc,
99 .map_free = bpf_map_offload_map_free,
100};
101
97static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 102static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
98{ 103{
104 const struct bpf_map_ops *ops;
99 struct bpf_map *map; 105 struct bpf_map *map;
106 int err;
100 107
101 if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || 108 if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
102 !bpf_map_types[attr->map_type]) 109 return ERR_PTR(-EINVAL);
110 ops = bpf_map_types[attr->map_type];
111 if (!ops)
103 return ERR_PTR(-EINVAL); 112 return ERR_PTR(-EINVAL);
104 113
105 map = bpf_map_types[attr->map_type]->map_alloc(attr); 114 if (ops->map_alloc_check) {
115 err = ops->map_alloc_check(attr);
116 if (err)
117 return ERR_PTR(err);
118 }
119 if (attr->map_ifindex)
120 ops = &bpf_map_offload_ops;
121 map = ops->map_alloc(attr);
106 if (IS_ERR(map)) 122 if (IS_ERR(map))
107 return map; 123 return map;
108 map->ops = bpf_map_types[attr->map_type]; 124 map->ops = ops;
109 map->map_type = attr->map_type; 125 map->map_type = attr->map_type;
110 return map; 126 return map;
111} 127}
@@ -134,6 +150,16 @@ void bpf_map_area_free(void *area)
134 kvfree(area); 150 kvfree(area);
135} 151}
136 152
153void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
154{
155 map->map_type = attr->map_type;
156 map->key_size = attr->key_size;
157 map->value_size = attr->value_size;
158 map->max_entries = attr->max_entries;
159 map->map_flags = attr->map_flags;
160 map->numa_node = bpf_map_attr_numa_node(attr);
161}
162
137int bpf_map_precharge_memlock(u32 pages) 163int bpf_map_precharge_memlock(u32 pages)
138{ 164{
139 struct user_struct *user = get_current_user(); 165 struct user_struct *user = get_current_user();
@@ -189,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
189 return id > 0 ? 0 : id; 215 return id > 0 ? 0 : id;
190} 216}
191 217
192static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) 218void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
193{ 219{
194 unsigned long flags; 220 unsigned long flags;
195 221
222 /* Offloaded maps are removed from the IDR store when their device
223 * disappears - even if someone holds an fd to them they are unusable,
224 * the memory is gone, all ops will fail; they are simply waiting for
225 * refcnt to drop to be freed.
226 */
227 if (!map->id)
228 return;
229
196 if (do_idr_lock) 230 if (do_idr_lock)
197 spin_lock_irqsave(&map_idr_lock, flags); 231 spin_lock_irqsave(&map_idr_lock, flags);
198 else 232 else
199 __acquire(&map_idr_lock); 233 __acquire(&map_idr_lock);
200 234
201 idr_remove(&map_idr, map->id); 235 idr_remove(&map_idr, map->id);
236 map->id = 0;
202 237
203 if (do_idr_lock) 238 if (do_idr_lock)
204 spin_unlock_irqrestore(&map_idr_lock, flags); 239 spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -378,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
378 return 0; 413 return 0;
379} 414}
380 415
381#define BPF_MAP_CREATE_LAST_FIELD map_name 416#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
382/* called via syscall */ 417/* called via syscall */
383static int map_create(union bpf_attr *attr) 418static int map_create(union bpf_attr *attr)
384{ 419{
@@ -566,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
566 if (!value) 601 if (!value)
567 goto free_key; 602 goto free_key;
568 603
569 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 604 if (bpf_map_is_dev_bound(map)) {
570 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 605 err = bpf_map_offload_lookup_elem(map, key, value);
606 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
607 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
571 err = bpf_percpu_hash_copy(map, key, value); 608 err = bpf_percpu_hash_copy(map, key, value);
572 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 609 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
573 err = bpf_percpu_array_copy(map, key, value); 610 err = bpf_percpu_array_copy(map, key, value);
@@ -654,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr)
654 goto free_value; 691 goto free_value;
655 692
656 /* Need to create a kthread, thus must support schedule */ 693 /* Need to create a kthread, thus must support schedule */
657 if (map->map_type == BPF_MAP_TYPE_CPUMAP) { 694 if (bpf_map_is_dev_bound(map)) {
695 err = bpf_map_offload_update_elem(map, key, value, attr->flags);
696 goto out;
697 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
658 err = map->ops->map_update_elem(map, key, value, attr->flags); 698 err = map->ops->map_update_elem(map, key, value, attr->flags);
659 goto out; 699 goto out;
660 } 700 }
@@ -669,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr)
669 err = bpf_percpu_hash_update(map, key, value, attr->flags); 709 err = bpf_percpu_hash_update(map, key, value, attr->flags);
670 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 710 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
671 err = bpf_percpu_array_update(map, key, value, attr->flags); 711 err = bpf_percpu_array_update(map, key, value, attr->flags);
672 } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || 712 } else if (IS_FD_ARRAY(map)) {
673 map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
674 map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
675 map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
676 rcu_read_lock(); 713 rcu_read_lock();
677 err = bpf_fd_array_map_update_elem(map, f.file, key, value, 714 err = bpf_fd_array_map_update_elem(map, f.file, key, value,
678 attr->flags); 715 attr->flags);
@@ -731,6 +768,11 @@ static int map_delete_elem(union bpf_attr *attr)
731 goto err_put; 768 goto err_put;
732 } 769 }
733 770
771 if (bpf_map_is_dev_bound(map)) {
772 err = bpf_map_offload_delete_elem(map, key);
773 goto out;
774 }
775
734 preempt_disable(); 776 preempt_disable();
735 __this_cpu_inc(bpf_prog_active); 777 __this_cpu_inc(bpf_prog_active);
736 rcu_read_lock(); 778 rcu_read_lock();
@@ -738,7 +780,7 @@ static int map_delete_elem(union bpf_attr *attr)
738 rcu_read_unlock(); 780 rcu_read_unlock();
739 __this_cpu_dec(bpf_prog_active); 781 __this_cpu_dec(bpf_prog_active);
740 preempt_enable(); 782 preempt_enable();
741 783out:
742 if (!err) 784 if (!err)
743 trace_bpf_map_delete_elem(map, ufd, key); 785 trace_bpf_map_delete_elem(map, ufd, key);
744 kfree(key); 786 kfree(key);
@@ -788,9 +830,15 @@ static int map_get_next_key(union bpf_attr *attr)
788 if (!next_key) 830 if (!next_key)
789 goto free_key; 831 goto free_key;
790 832
833 if (bpf_map_is_dev_bound(map)) {
834 err = bpf_map_offload_get_next_key(map, key, next_key);
835 goto out;
836 }
837
791 rcu_read_lock(); 838 rcu_read_lock();
792 err = map->ops->map_get_next_key(map, key, next_key); 839 err = map->ops->map_get_next_key(map, key, next_key);
793 rcu_read_unlock(); 840 rcu_read_unlock();
841out:
794 if (err) 842 if (err)
795 goto free_next_key; 843 goto free_next_key;
796 844
@@ -905,9 +953,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
905 return id > 0 ? 0 : id; 953 return id > 0 ? 0 : id;
906} 954}
907 955
908static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) 956void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
909{ 957{
910 /* cBPF to eBPF migrations are currently not in the idr store. */ 958 /* cBPF to eBPF migrations are currently not in the idr store.
959 * Offloaded programs are removed from the store when their device
960 * disappears - even if someone grabs an fd to them they are unusable,
961 * simply waiting for refcnt to drop to be freed.
962 */
911 if (!prog->aux->id) 963 if (!prog->aux->id)
912 return; 964 return;
913 965
@@ -917,6 +969,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
917 __acquire(&prog_idr_lock); 969 __acquire(&prog_idr_lock);
918 970
919 idr_remove(&prog_idr, prog->aux->id); 971 idr_remove(&prog_idr, prog->aux->id);
972 prog->aux->id = 0;
920 973
921 if (do_idr_lock) 974 if (do_idr_lock)
922 spin_unlock_bh(&prog_idr_lock); 975 spin_unlock_bh(&prog_idr_lock);
@@ -937,10 +990,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
937static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) 990static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
938{ 991{
939 if (atomic_dec_and_test(&prog->aux->refcnt)) { 992 if (atomic_dec_and_test(&prog->aux->refcnt)) {
993 int i;
994
940 trace_bpf_prog_put_rcu(prog); 995 trace_bpf_prog_put_rcu(prog);
941 /* bpf_prog_free_id() must be called first */ 996 /* bpf_prog_free_id() must be called first */
942 bpf_prog_free_id(prog, do_idr_lock); 997 bpf_prog_free_id(prog, do_idr_lock);
998
999 for (i = 0; i < prog->aux->func_cnt; i++)
1000 bpf_prog_kallsyms_del(prog->aux->func[i]);
943 bpf_prog_kallsyms_del(prog); 1001 bpf_prog_kallsyms_del(prog);
1002
944 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); 1003 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
945 } 1004 }
946} 1005}
@@ -1151,6 +1210,8 @@ static int bpf_prog_load(union bpf_attr *attr)
1151 if (!prog) 1210 if (!prog)
1152 return -ENOMEM; 1211 return -ENOMEM;
1153 1212
1213 prog->aux->offload_requested = !!attr->prog_ifindex;
1214
1154 err = security_bpf_prog_alloc(prog->aux); 1215 err = security_bpf_prog_alloc(prog->aux);
1155 if (err) 1216 if (err)
1156 goto free_prog_nouncharge; 1217 goto free_prog_nouncharge;
@@ -1172,7 +1233,7 @@ static int bpf_prog_load(union bpf_attr *attr)
1172 atomic_set(&prog->aux->refcnt, 1); 1233 atomic_set(&prog->aux->refcnt, 1);
1173 prog->gpl_compatible = is_gpl ? 1 : 0; 1234 prog->gpl_compatible = is_gpl ? 1 : 0;
1174 1235
1175 if (attr->prog_ifindex) { 1236 if (bpf_prog_is_dev_bound(prog->aux)) {
1176 err = bpf_prog_offload_init(prog, attr); 1237 err = bpf_prog_offload_init(prog, attr);
1177 if (err) 1238 if (err)
1178 goto free_prog; 1239 goto free_prog;
@@ -1194,7 +1255,8 @@ static int bpf_prog_load(union bpf_attr *attr)
1194 goto free_used_maps; 1255 goto free_used_maps;
1195 1256
1196 /* eBPF program is ready to be JITed */ 1257 /* eBPF program is ready to be JITed */
1197 prog = bpf_prog_select_runtime(prog, &err); 1258 if (!prog->bpf_func)
1259 prog = bpf_prog_select_runtime(prog, &err);
1198 if (err < 0) 1260 if (err < 0)
1199 goto free_used_maps; 1261 goto free_used_maps;
1200 1262
@@ -1439,6 +1501,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
1439 struct bpf_prog *prog; 1501 struct bpf_prog *prog;
1440 int ret = -ENOTSUPP; 1502 int ret = -ENOTSUPP;
1441 1503
1504 if (!capable(CAP_SYS_ADMIN))
1505 return -EPERM;
1442 if (CHECK_ATTR(BPF_PROG_TEST_RUN)) 1506 if (CHECK_ATTR(BPF_PROG_TEST_RUN))
1443 return -EINVAL; 1507 return -EINVAL;
1444 1508
@@ -1551,6 +1615,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
1551 return fd; 1615 return fd;
1552} 1616}
1553 1617
1618static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
1619 unsigned long addr)
1620{
1621 int i;
1622
1623 for (i = 0; i < prog->aux->used_map_cnt; i++)
1624 if (prog->aux->used_maps[i] == (void *)addr)
1625 return prog->aux->used_maps[i];
1626 return NULL;
1627}
1628
1629static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
1630{
1631 const struct bpf_map *map;
1632 struct bpf_insn *insns;
1633 u64 imm;
1634 int i;
1635
1636 insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
1637 GFP_USER);
1638 if (!insns)
1639 return insns;
1640
1641 for (i = 0; i < prog->len; i++) {
1642 if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) {
1643 insns[i].code = BPF_JMP | BPF_CALL;
1644 insns[i].imm = BPF_FUNC_tail_call;
1645 /* fall-through */
1646 }
1647 if (insns[i].code == (BPF_JMP | BPF_CALL) ||
1648 insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) {
1649 if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS))
1650 insns[i].code = BPF_JMP | BPF_CALL;
1651 if (!bpf_dump_raw_ok())
1652 insns[i].imm = 0;
1653 continue;
1654 }
1655
1656 if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW))
1657 continue;
1658
1659 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
1660 map = bpf_map_from_imm(prog, imm);
1661 if (map) {
1662 insns[i].src_reg = BPF_PSEUDO_MAP_FD;
1663 insns[i].imm = map->id;
1664 insns[i + 1].imm = 0;
1665 continue;
1666 }
1667
1668 if (!bpf_dump_raw_ok() &&
1669 imm == (unsigned long)prog->aux) {
1670 insns[i].imm = 0;
1671 insns[i + 1].imm = 0;
1672 continue;
1673 }
1674 }
1675
1676 return insns;
1677}
1678
1554static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, 1679static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
1555 const union bpf_attr *attr, 1680 const union bpf_attr *attr,
1556 union bpf_attr __user *uattr) 1681 union bpf_attr __user *uattr)
@@ -1598,24 +1723,51 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
1598 goto done; 1723 goto done;
1599 } 1724 }
1600 1725
1601 ulen = info.jited_prog_len;
1602 info.jited_prog_len = prog->jited_len;
1603 if (info.jited_prog_len && ulen) {
1604 uinsns = u64_to_user_ptr(info.jited_prog_insns);
1605 ulen = min_t(u32, info.jited_prog_len, ulen);
1606 if (copy_to_user(uinsns, prog->bpf_func, ulen))
1607 return -EFAULT;
1608 }
1609
1610 ulen = info.xlated_prog_len; 1726 ulen = info.xlated_prog_len;
1611 info.xlated_prog_len = bpf_prog_insn_size(prog); 1727 info.xlated_prog_len = bpf_prog_insn_size(prog);
1612 if (info.xlated_prog_len && ulen) { 1728 if (info.xlated_prog_len && ulen) {
1729 struct bpf_insn *insns_sanitized;
1730 bool fault;
1731
1732 if (prog->blinded && !bpf_dump_raw_ok()) {
1733 info.xlated_prog_insns = 0;
1734 goto done;
1735 }
1736 insns_sanitized = bpf_insn_prepare_dump(prog);
1737 if (!insns_sanitized)
1738 return -ENOMEM;
1613 uinsns = u64_to_user_ptr(info.xlated_prog_insns); 1739 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
1614 ulen = min_t(u32, info.xlated_prog_len, ulen); 1740 ulen = min_t(u32, info.xlated_prog_len, ulen);
1615 if (copy_to_user(uinsns, prog->insnsi, ulen)) 1741 fault = copy_to_user(uinsns, insns_sanitized, ulen);
1742 kfree(insns_sanitized);
1743 if (fault)
1616 return -EFAULT; 1744 return -EFAULT;
1617 } 1745 }
1618 1746
1747 if (bpf_prog_is_dev_bound(prog->aux)) {
1748 err = bpf_prog_offload_info_fill(&info, prog);
1749 if (err)
1750 return err;
1751 goto done;
1752 }
1753
1754 /* NOTE: the following code is supposed to be skipped for offload.
1755 * bpf_prog_offload_info_fill() is the place to fill similar fields
1756 * for offload.
1757 */
1758 ulen = info.jited_prog_len;
1759 info.jited_prog_len = prog->jited_len;
1760 if (info.jited_prog_len && ulen) {
1761 if (bpf_dump_raw_ok()) {
1762 uinsns = u64_to_user_ptr(info.jited_prog_insns);
1763 ulen = min_t(u32, info.jited_prog_len, ulen);
1764 if (copy_to_user(uinsns, prog->bpf_func, ulen))
1765 return -EFAULT;
1766 } else {
1767 info.jited_prog_insns = 0;
1768 }
1769 }
1770
1619done: 1771done:
1620 if (copy_to_user(uinfo, &info, info_len) || 1772 if (copy_to_user(uinfo, &info, info_len) ||
1621 put_user(info_len, &uattr->info.info_len)) 1773 put_user(info_len, &uattr->info.info_len))
@@ -1646,6 +1798,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
1646 info.map_flags = map->map_flags; 1798 info.map_flags = map->map_flags;
1647 memcpy(info.name, map->name, sizeof(map->name)); 1799 memcpy(info.name, map->name, sizeof(map->name));
1648 1800
1801 if (bpf_map_is_dev_bound(map)) {
1802 err = bpf_map_offload_info_fill(&info, map);
1803 if (err)
1804 return err;
1805 }
1806
1649 if (copy_to_user(uinfo, &info, info_len) || 1807 if (copy_to_user(uinfo, &info, info_len) ||
1650 put_user(info_len, &uattr->info.info_len)) 1808 put_user(info_len, &uattr->info.info_len))
1651 return -EFAULT; 1809 return -EFAULT;
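
Tying the syscall.c pieces together: BPF_MAP_CREATE_LAST_FIELD now ends at map_ifindex, find_and_alloc_map() runs the type's .map_alloc_check and then, when an ifindex was supplied, allocates through bpf_map_offload_ops, and the element paths branch to the bpf_map_offload_*() helpers for dev-bound maps. From user space the whole feature boils down to one extra attribute. A rough sketch with the raw syscall follows; it assumes the named interface is driven by a device that implements the map-offload ndo_bpf commands, otherwise the create fails (e.g. -EOPNOTSUPP or -EINVAL), and error handling is trimmed.

#include <linux/bpf.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int create_offloaded_hash_map(const char *ifname)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type	 = BPF_MAP_TYPE_HASH;
	attr.key_size	 = 4;
	attr.value_size  = 8;
	attr.max_entries = 1024;
	/* non-zero ifindex requests a device-bound map; 0 means host map */
	attr.map_ifindex = if_nametoindex(ifname);
	strncpy(attr.map_name, "offloaded_map", sizeof(attr.map_name) - 1);

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
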
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 13551e623501..5fb69a85d967 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20,6 +20,8 @@
20#include <linux/file.h> 20#include <linux/file.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/stringify.h> 22#include <linux/stringify.h>
23#include <linux/bsearch.h>
24#include <linux/sort.h>
23 25
24#include "disasm.h" 26#include "disasm.h"
25 27
@@ -167,11 +169,11 @@ struct bpf_call_arg_meta {
167static DEFINE_MUTEX(bpf_verifier_lock); 169static DEFINE_MUTEX(bpf_verifier_lock);
168 170
169/* log_level controls verbosity level of eBPF verifier. 171/* log_level controls verbosity level of eBPF verifier.
170 * verbose() is used to dump the verification trace to the log, so the user 172 * bpf_verifier_log_write() is used to dump the verification trace to the log,
171 * can figure out what's wrong with the program 173 * so the user can figure out what's wrong with the program
172 */ 174 */
173static __printf(2, 3) void verbose(struct bpf_verifier_env *env, 175__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
174 const char *fmt, ...) 176 const char *fmt, ...)
175{ 177{
176 struct bpf_verifer_log *log = &env->log; 178 struct bpf_verifer_log *log = &env->log;
177 unsigned int n; 179 unsigned int n;
@@ -195,6 +197,14 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
195 else 197 else
196 log->ubuf = NULL; 198 log->ubuf = NULL;
197} 199}
200EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
201/* Historically bpf_verifier_log_write was called verbose, but the name was too
202 * generic for symbol export. The function was renamed, but not the calls in
203 * the verifier to avoid complicating backports. Hence the alias below.
204 */
205static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
206 const char *fmt, ...)
207 __attribute__((alias("bpf_verifier_log_write")));
198 208
199static bool type_is_pkt_pointer(enum bpf_reg_type type) 209static bool type_is_pkt_pointer(enum bpf_reg_type type)
200{ 210{
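
The alias above keeps the verifier's terse internal name while exporting a descriptive symbol for drivers that need to print into the same verifier log. The same pattern fits any compile-time rename where touching every call site would churn backports; a generic sketch with made-up names ("mydrv_log_write", "dbg", struct mydrv_log):

/* the new, exported spelling is the real definition */
__printf(2, 3) void mydrv_log_write(struct mydrv_log *log, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* ... append the formatted message to @log ... */
	va_end(args);
}
EXPORT_SYMBOL_GPL(mydrv_log_write);

/* old internal spelling, kept so existing callers build unchanged */
static __printf(2, 3) void dbg(struct mydrv_log *log, const char *fmt, ...)
	__attribute__((alias("mydrv_log_write")));
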
@@ -216,23 +226,48 @@ static const char * const reg_type_str[] = {
216 [PTR_TO_PACKET_END] = "pkt_end", 226 [PTR_TO_PACKET_END] = "pkt_end",
217}; 227};
218 228
229static void print_liveness(struct bpf_verifier_env *env,
230 enum bpf_reg_liveness live)
231{
232 if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
233 verbose(env, "_");
234 if (live & REG_LIVE_READ)
235 verbose(env, "r");
236 if (live & REG_LIVE_WRITTEN)
237 verbose(env, "w");
238}
239
240static struct bpf_func_state *func(struct bpf_verifier_env *env,
241 const struct bpf_reg_state *reg)
242{
243 struct bpf_verifier_state *cur = env->cur_state;
244
245 return cur->frame[reg->frameno];
246}
247
219static void print_verifier_state(struct bpf_verifier_env *env, 248static void print_verifier_state(struct bpf_verifier_env *env,
220 struct bpf_verifier_state *state) 249 const struct bpf_func_state *state)
221{ 250{
222 struct bpf_reg_state *reg; 251 const struct bpf_reg_state *reg;
223 enum bpf_reg_type t; 252 enum bpf_reg_type t;
224 int i; 253 int i;
225 254
255 if (state->frameno)
256 verbose(env, " frame%d:", state->frameno);
226 for (i = 0; i < MAX_BPF_REG; i++) { 257 for (i = 0; i < MAX_BPF_REG; i++) {
227 reg = &state->regs[i]; 258 reg = &state->regs[i];
228 t = reg->type; 259 t = reg->type;
229 if (t == NOT_INIT) 260 if (t == NOT_INIT)
230 continue; 261 continue;
231 verbose(env, " R%d=%s", i, reg_type_str[t]); 262 verbose(env, " R%d", i);
263 print_liveness(env, reg->live);
264 verbose(env, "=%s", reg_type_str[t]);
232 if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && 265 if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
233 tnum_is_const(reg->var_off)) { 266 tnum_is_const(reg->var_off)) {
234 /* reg->off should be 0 for SCALAR_VALUE */ 267 /* reg->off should be 0 for SCALAR_VALUE */
235 verbose(env, "%lld", reg->var_off.value + reg->off); 268 verbose(env, "%lld", reg->var_off.value + reg->off);
269 if (t == PTR_TO_STACK)
270 verbose(env, ",call_%d", func(env, reg)->callsite);
236 } else { 271 } else {
237 verbose(env, "(id=%d", reg->id); 272 verbose(env, "(id=%d", reg->id);
238 if (t != SCALAR_VALUE) 273 if (t != SCALAR_VALUE)
@@ -277,16 +312,21 @@ static void print_verifier_state(struct bpf_verifier_env *env,
277 } 312 }
278 } 313 }
279 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { 314 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
280 if (state->stack[i].slot_type[0] == STACK_SPILL) 315 if (state->stack[i].slot_type[0] == STACK_SPILL) {
281 verbose(env, " fp%d=%s", 316 verbose(env, " fp%d",
282 -MAX_BPF_STACK + i * BPF_REG_SIZE, 317 (-i - 1) * BPF_REG_SIZE);
318 print_liveness(env, state->stack[i].spilled_ptr.live);
319 verbose(env, "=%s",
283 reg_type_str[state->stack[i].spilled_ptr.type]); 320 reg_type_str[state->stack[i].spilled_ptr.type]);
321 }
322 if (state->stack[i].slot_type[0] == STACK_ZERO)
323 verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
284 } 324 }
285 verbose(env, "\n"); 325 verbose(env, "\n");
286} 326}
287 327
288static int copy_stack_state(struct bpf_verifier_state *dst, 328static int copy_stack_state(struct bpf_func_state *dst,
289 const struct bpf_verifier_state *src) 329 const struct bpf_func_state *src)
290{ 330{
291 if (!src->stack) 331 if (!src->stack)
292 return 0; 332 return 0;
@@ -302,13 +342,13 @@ static int copy_stack_state(struct bpf_verifier_state *dst,
302 342
303/* do_check() starts with zero-sized stack in struct bpf_verifier_state to 343/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
304 * make it consume minimal amount of memory. check_stack_write() access from 344 * make it consume minimal amount of memory. check_stack_write() access from
305 * the program calls into realloc_verifier_state() to grow the stack size. 345 * the program calls into realloc_func_state() to grow the stack size.
306 * Note there is a non-zero 'parent' pointer inside bpf_verifier_state 346 * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
307 * which this function copies over. It points to previous bpf_verifier_state 347 * which this function copies over. It points to previous bpf_verifier_state
308 * which is never reallocated 348 * which is never reallocated
309 */ 349 */
310static int realloc_verifier_state(struct bpf_verifier_state *state, int size, 350static int realloc_func_state(struct bpf_func_state *state, int size,
311 bool copy_old) 351 bool copy_old)
312{ 352{
313 u32 old_size = state->allocated_stack; 353 u32 old_size = state->allocated_stack;
314 struct bpf_stack_state *new_stack; 354 struct bpf_stack_state *new_stack;
@@ -341,10 +381,23 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
341 return 0; 381 return 0;
342} 382}
343 383
384static void free_func_state(struct bpf_func_state *state)
385{
386 if (!state)
387 return;
388 kfree(state->stack);
389 kfree(state);
390}
391
344static void free_verifier_state(struct bpf_verifier_state *state, 392static void free_verifier_state(struct bpf_verifier_state *state,
345 bool free_self) 393 bool free_self)
346{ 394{
347 kfree(state->stack); 395 int i;
396
397 for (i = 0; i <= state->curframe; i++) {
398 free_func_state(state->frame[i]);
399 state->frame[i] = NULL;
400 }
348 if (free_self) 401 if (free_self)
349 kfree(state); 402 kfree(state);
350} 403}
@@ -352,18 +405,46 @@ static void free_verifier_state(struct bpf_verifier_state *state,
352/* copy verifier state from src to dst growing dst stack space 405/* copy verifier state from src to dst growing dst stack space
353 * when necessary to accommodate larger src stack 406 * when necessary to accommodate larger src stack
354 */ 407 */
355static int copy_verifier_state(struct bpf_verifier_state *dst, 408static int copy_func_state(struct bpf_func_state *dst,
356 const struct bpf_verifier_state *src) 409 const struct bpf_func_state *src)
357{ 410{
358 int err; 411 int err;
359 412
360 err = realloc_verifier_state(dst, src->allocated_stack, false); 413 err = realloc_func_state(dst, src->allocated_stack, false);
361 if (err) 414 if (err)
362 return err; 415 return err;
363 memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); 416 memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));
364 return copy_stack_state(dst, src); 417 return copy_stack_state(dst, src);
365} 418}
366 419
420static int copy_verifier_state(struct bpf_verifier_state *dst_state,
421 const struct bpf_verifier_state *src)
422{
423 struct bpf_func_state *dst;
424 int i, err;
425
426 /* if dst has more stack frames then src frame, free them */
427 for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
428 free_func_state(dst_state->frame[i]);
429 dst_state->frame[i] = NULL;
430 }
431 dst_state->curframe = src->curframe;
432 dst_state->parent = src->parent;
433 for (i = 0; i <= src->curframe; i++) {
434 dst = dst_state->frame[i];
435 if (!dst) {
436 dst = kzalloc(sizeof(*dst), GFP_KERNEL);
437 if (!dst)
438 return -ENOMEM;
439 dst_state->frame[i] = dst;
440 }
441 err = copy_func_state(dst, src->frame[i]);
442 if (err)
443 return err;
444 }
445 return 0;
446}
447
367static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, 448static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
368 int *insn_idx) 449 int *insn_idx)
369{ 450{
@@ -416,6 +497,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
416 } 497 }
417 return &elem->st; 498 return &elem->st;
418err: 499err:
500 free_verifier_state(env->cur_state, true);
501 env->cur_state = NULL;
419 /* pop all elements and return */ 502 /* pop all elements and return */
420 while (!pop_stack(env, NULL, NULL)); 503 while (!pop_stack(env, NULL, NULL));
421 return NULL; 504 return NULL;
@@ -425,6 +508,10 @@ err:
425static const int caller_saved[CALLER_SAVED_REGS] = { 508static const int caller_saved[CALLER_SAVED_REGS] = {
426 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 509 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
427}; 510};
511#define CALLEE_SAVED_REGS 5
512static const int callee_saved[CALLEE_SAVED_REGS] = {
513 BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9
514};
428 515
429static void __mark_reg_not_init(struct bpf_reg_state *reg); 516static void __mark_reg_not_init(struct bpf_reg_state *reg);
430 517
@@ -449,6 +536,13 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
449 __mark_reg_known(reg, 0); 536 __mark_reg_known(reg, 0);
450} 537}
451 538
539static void __mark_reg_const_zero(struct bpf_reg_state *reg)
540{
541 __mark_reg_known(reg, 0);
542 reg->off = 0;
543 reg->type = SCALAR_VALUE;
544}
545
452static void mark_reg_known_zero(struct bpf_verifier_env *env, 546static void mark_reg_known_zero(struct bpf_verifier_env *env,
453 struct bpf_reg_state *regs, u32 regno) 547 struct bpf_reg_state *regs, u32 regno)
454{ 548{
@@ -560,6 +654,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
560 reg->id = 0; 654 reg->id = 0;
561 reg->off = 0; 655 reg->off = 0;
562 reg->var_off = tnum_unknown; 656 reg->var_off = tnum_unknown;
657 reg->frameno = 0;
563 __mark_reg_unbounded(reg); 658 __mark_reg_unbounded(reg);
564} 659}
565 660
@@ -568,8 +663,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
568{ 663{
569 if (WARN_ON(regno >= MAX_BPF_REG)) { 664 if (WARN_ON(regno >= MAX_BPF_REG)) {
570 verbose(env, "mark_reg_unknown(regs, %u)\n", regno); 665 verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
571 /* Something bad happened, let's kill all regs */ 666 /* Something bad happened, let's kill all regs except FP */
572 for (regno = 0; regno < MAX_BPF_REG; regno++) 667 for (regno = 0; regno < BPF_REG_FP; regno++)
573 __mark_reg_not_init(regs + regno); 668 __mark_reg_not_init(regs + regno);
574 return; 669 return;
575 } 670 }
@@ -587,8 +682,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
587{ 682{
588 if (WARN_ON(regno >= MAX_BPF_REG)) { 683 if (WARN_ON(regno >= MAX_BPF_REG)) {
589 verbose(env, "mark_reg_not_init(regs, %u)\n", regno); 684 verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
590 /* Something bad happened, let's kill all regs */ 685 /* Something bad happened, let's kill all regs except FP */
591 for (regno = 0; regno < MAX_BPF_REG; regno++) 686 for (regno = 0; regno < BPF_REG_FP; regno++)
592 __mark_reg_not_init(regs + regno); 687 __mark_reg_not_init(regs + regno);
593 return; 688 return;
594 } 689 }
@@ -596,8 +691,9 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
596} 691}
597 692
598static void init_reg_state(struct bpf_verifier_env *env, 693static void init_reg_state(struct bpf_verifier_env *env,
599 struct bpf_reg_state *regs) 694 struct bpf_func_state *state)
600{ 695{
696 struct bpf_reg_state *regs = state->regs;
601 int i; 697 int i;
602 698
603 for (i = 0; i < MAX_BPF_REG; i++) { 699 for (i = 0; i < MAX_BPF_REG; i++) {
@@ -608,41 +704,218 @@ static void init_reg_state(struct bpf_verifier_env *env,
608 /* frame pointer */ 704 /* frame pointer */
609 regs[BPF_REG_FP].type = PTR_TO_STACK; 705 regs[BPF_REG_FP].type = PTR_TO_STACK;
610 mark_reg_known_zero(env, regs, BPF_REG_FP); 706 mark_reg_known_zero(env, regs, BPF_REG_FP);
707 regs[BPF_REG_FP].frameno = state->frameno;
611 708
612 /* 1st arg to a function */ 709 /* 1st arg to a function */
613 regs[BPF_REG_1].type = PTR_TO_CTX; 710 regs[BPF_REG_1].type = PTR_TO_CTX;
614 mark_reg_known_zero(env, regs, BPF_REG_1); 711 mark_reg_known_zero(env, regs, BPF_REG_1);
615} 712}
616 713
714#define BPF_MAIN_FUNC (-1)
715static void init_func_state(struct bpf_verifier_env *env,
716 struct bpf_func_state *state,
717 int callsite, int frameno, int subprogno)
718{
719 state->callsite = callsite;
720 state->frameno = frameno;
721 state->subprogno = subprogno;
722 init_reg_state(env, state);
723}
724
617enum reg_arg_type { 725enum reg_arg_type {
618 SRC_OP, /* register is used as source operand */ 726 SRC_OP, /* register is used as source operand */
619 DST_OP, /* register is used as destination operand */ 727 DST_OP, /* register is used as destination operand */
620 DST_OP_NO_MARK /* same as above, check only, don't mark */ 728 DST_OP_NO_MARK /* same as above, check only, don't mark */
621}; 729};
622 730
623static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) 731static int cmp_subprogs(const void *a, const void *b)
732{
733 return *(int *)a - *(int *)b;
734}
735
736static int find_subprog(struct bpf_verifier_env *env, int off)
624{ 737{
625 struct bpf_verifier_state *parent = state->parent; 738 u32 *p;
739
740 p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
741 sizeof(env->subprog_starts[0]), cmp_subprogs);
742 if (!p)
743 return -ENOENT;
744 return p - env->subprog_starts;
745
746}
747
748static int add_subprog(struct bpf_verifier_env *env, int off)
749{
750 int insn_cnt = env->prog->len;
751 int ret;
752
753 if (off >= insn_cnt || off < 0) {
754 verbose(env, "call to invalid destination\n");
755 return -EINVAL;
756 }
757 ret = find_subprog(env, off);
758 if (ret >= 0)
759 return 0;
760 if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
761 verbose(env, "too many subprograms\n");
762 return -E2BIG;
763 }
764 env->subprog_starts[env->subprog_cnt++] = off;
765 sort(env->subprog_starts, env->subprog_cnt,
766 sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
767 return 0;
768}
769
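As a rough standalone model of what find_subprog()/add_subprog() maintain, a sorted array of subprogram start offsets plus a bsearch() lookup, the sketch below is ordinary userspace C with made-up names and bounds, not kernel code:

#include <stdio.h>
#include <stdlib.h>

#define MAX_SUBPROGS 256

static int starts[MAX_SUBPROGS];
static int nr_starts;

static int cmp_int(const void *a, const void *b)
{
        return *(const int *)a - *(const int *)b;
}

/* return index of the subprog starting at 'off', or -1 if none */
static int find_start(int off)
{
        int *p = bsearch(&off, starts, nr_starts, sizeof(starts[0]), cmp_int);

        return p ? (int)(p - starts) : -1;
}

/* record a new subprog start, keeping the array sorted */
static int add_start(int off)
{
        if (find_start(off) >= 0)
                return 0;               /* already known */
        if (nr_starts >= MAX_SUBPROGS)
                return -1;              /* too many subprograms */
        starts[nr_starts++] = off;
        qsort(starts, nr_starts, sizeof(starts[0]), cmp_int);
        return 0;
}

int main(void)
{
        add_start(40);
        add_start(12);
        add_start(40);                  /* duplicate is ignored */
        printf("subprog at 12 -> index %d\n", find_start(12));
        printf("subprog at 13 -> index %d\n", find_start(13));
        return 0;
}

Keeping the array fully sorted on every insert is cheap here because the number of subprograms is small and bounded.
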
770static int check_subprogs(struct bpf_verifier_env *env)
771{
772 int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
773 struct bpf_insn *insn = env->prog->insnsi;
774 int insn_cnt = env->prog->len;
775
776 /* determine subprog starts. The end is one before the next starts */
777 for (i = 0; i < insn_cnt; i++) {
778 if (insn[i].code != (BPF_JMP | BPF_CALL))
779 continue;
780 if (insn[i].src_reg != BPF_PSEUDO_CALL)
781 continue;
782 if (!env->allow_ptr_leaks) {
783 verbose(env, "function calls to other bpf functions are allowed for root only\n");
784 return -EPERM;
785 }
786 if (bpf_prog_is_dev_bound(env->prog->aux)) {
787 verbose(env, "function calls in offloaded programs are not supported yet\n");
788 return -EINVAL;
789 }
790 ret = add_subprog(env, i + insn[i].imm + 1);
791 if (ret < 0)
792 return ret;
793 }
794
795 if (env->log.level > 1)
796 for (i = 0; i < env->subprog_cnt; i++)
797 verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);
798
799 /* now check that all jumps are within the same subprog */
800 subprog_start = 0;
801 if (env->subprog_cnt == cur_subprog)
802 subprog_end = insn_cnt;
803 else
804 subprog_end = env->subprog_starts[cur_subprog++];
805 for (i = 0; i < insn_cnt; i++) {
806 u8 code = insn[i].code;
807
808 if (BPF_CLASS(code) != BPF_JMP)
809 goto next;
810 if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
811 goto next;
812 off = i + insn[i].off + 1;
813 if (off < subprog_start || off >= subprog_end) {
814 verbose(env, "jump out of range from insn %d to %d\n", i, off);
815 return -EINVAL;
816 }
817next:
818 if (i == subprog_end - 1) {
819 /* to avoid fall-through from one subprog into another
820 * the last insn of the subprog should be either exit
821 * or unconditional jump back
822 */
823 if (code != (BPF_JMP | BPF_EXIT) &&
824 code != (BPF_JMP | BPF_JA)) {
825 verbose(env, "last insn is not an exit or jmp\n");
826 return -EINVAL;
827 }
828 subprog_start = subprog_end;
829 if (env->subprog_cnt == cur_subprog)
830 subprog_end = insn_cnt;
831 else
832 subprog_end = env->subprog_starts[cur_subprog++];
833 }
834 }
835 return 0;
836}
837
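The boundary rules that check_subprogs() enforces can be pictured on a toy instruction stream: every jump must land inside its own subprogram, and the last instruction of each subprogram must be an exit or an unconditional jump so control cannot fall through into the next function. A minimal userspace sketch of that rule, with invented opcode names:

#include <stdio.h>

enum { OP_ALU, OP_JA, OP_JMP_COND, OP_CALL, OP_EXIT };

struct insn { int op; int off; };      /* 'off' is a relative jump target */

/* check one subprog spanning [start, end): jumps must stay inside and
 * the last insn must be an exit or an unconditional jump
 */
static int check_subprog(const struct insn *insn, int start, int end)
{
        int i;

        for (i = start; i < end; i++) {
                if (insn[i].op == OP_JA || insn[i].op == OP_JMP_COND) {
                        int target = i + insn[i].off + 1;

                        if (target < start || target >= end)
                                return -1;      /* jump out of range */
                }
        }
        if (insn[end - 1].op != OP_EXIT && insn[end - 1].op != OP_JA)
                return -1;                      /* would fall through */
        return 0;
}

int main(void)
{
        /* main prog: insns 0..2, callee: insns 3..4 */
        struct insn prog[] = {
                { OP_ALU, 0 }, { OP_CALL, 2 }, { OP_EXIT, 0 },
                { OP_ALU, 0 }, { OP_EXIT, 0 },
        };

        printf("main ok: %d\n", check_subprog(prog, 0, 3) == 0);
        printf("callee ok: %d\n", check_subprog(prog, 3, 5) == 0);
        return 0;
}
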
838static
839struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
840 const struct bpf_verifier_state *state,
841 struct bpf_verifier_state *parent,
842 u32 regno)
843{
844 struct bpf_verifier_state *tmp = NULL;
845
846 /* 'parent' could be a state of caller and
847 * 'state' could be a state of callee. In such case
848 * parent->curframe < state->curframe
849 * and it's ok for r1 - r5 registers
850 *
851 * 'parent' could be a callee's state after it bpf_exit-ed.
852 * In such case parent->curframe > state->curframe
853 * and it's ok for r0 only
854 */
855 if (parent->curframe == state->curframe ||
856 (parent->curframe < state->curframe &&
857 regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
858 (parent->curframe > state->curframe &&
859 regno == BPF_REG_0))
860 return parent;
861
862 if (parent->curframe > state->curframe &&
863 regno >= BPF_REG_6) {
864 /* for callee saved regs we have to skip the whole chain
865 * of states that belong to callee and mark as LIVE_READ
866 * the registers before the call
867 */
868 tmp = parent;
869 while (tmp && tmp->curframe != state->curframe) {
870 tmp = tmp->parent;
871 }
872 if (!tmp)
873 goto bug;
874 parent = tmp;
875 } else {
876 goto bug;
877 }
878 return parent;
879bug:
880 verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
881 verbose(env, "regno %d parent frame %d current frame %d\n",
882 regno, parent->curframe, state->curframe);
883 return NULL;
884}
885
886static int mark_reg_read(struct bpf_verifier_env *env,
887 const struct bpf_verifier_state *state,
888 struct bpf_verifier_state *parent,
889 u32 regno)
890{
891 bool writes = parent == state->parent; /* Observe write marks */
626 892
627 if (regno == BPF_REG_FP) 893 if (regno == BPF_REG_FP)
628 /* We don't need to worry about FP liveness because it's read-only */ 894 /* We don't need to worry about FP liveness because it's read-only */
629 return; 895 return 0;
630 896
631 while (parent) { 897 while (parent) {
632 /* if read wasn't screened by an earlier write ... */ 898 /* if read wasn't screened by an earlier write ... */
633 if (state->regs[regno].live & REG_LIVE_WRITTEN) 899 if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
634 break; 900 break;
901 parent = skip_callee(env, state, parent, regno);
902 if (!parent)
903 return -EFAULT;
635 /* ... then we depend on parent's value */ 904 /* ... then we depend on parent's value */
636 parent->regs[regno].live |= REG_LIVE_READ; 905 parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
637 state = parent; 906 state = parent;
638 parent = state->parent; 907 parent = state->parent;
908 writes = true;
639 } 909 }
910 return 0;
640} 911}
641 912
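The read-mark propagation in mark_reg_read() is essentially a walk up the chain of parent states that stops at the first write mark. A simplified standalone model, ignoring the multi-frame skip_callee() handling and the 'writes' gating used when propagating from an equivalent state, with invented flag names:

#include <stdio.h>

#define LIVE_WRITTEN  1
#define LIVE_READ     2

struct st {
        struct st *parent;
        int live;       /* liveness of one register in this state */
};

/* propagate a read up the parent chain until an earlier write screens it */
static void mark_read(struct st *state)
{
        struct st *parent = state->parent;

        while (parent) {
                if (state->live & LIVE_WRITTEN)
                        break;                  /* read satisfied locally */
                parent->live |= LIVE_READ;      /* we depend on parent's value */
                state = parent;
                parent = state->parent;
        }
}

int main(void)
{
        struct st grand = { NULL, 0 };
        struct st parent = { &grand, LIVE_WRITTEN };    /* writes the reg */
        struct st cur = { &parent, 0 };

        mark_read(&cur);
        /* parent is marked read, but its write screens off grand */
        printf("parent read=%d grand read=%d\n",
               !!(parent.live & LIVE_READ), !!(grand.live & LIVE_READ));
        return 0;
}
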
642static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, 913static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
643 enum reg_arg_type t) 914 enum reg_arg_type t)
644{ 915{
645 struct bpf_reg_state *regs = env->cur_state->regs; 916 struct bpf_verifier_state *vstate = env->cur_state;
917 struct bpf_func_state *state = vstate->frame[vstate->curframe];
918 struct bpf_reg_state *regs = state->regs;
646 919
647 if (regno >= MAX_BPF_REG) { 920 if (regno >= MAX_BPF_REG) {
648 verbose(env, "R%d is invalid\n", regno); 921 verbose(env, "R%d is invalid\n", regno);
@@ -655,7 +928,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
655 verbose(env, "R%d !read_ok\n", regno); 928 verbose(env, "R%d !read_ok\n", regno);
656 return -EACCES; 929 return -EACCES;
657 } 930 }
658 mark_reg_read(env->cur_state, regno); 931 return mark_reg_read(env, vstate, vstate->parent, regno);
659 } else { 932 } else {
660 /* check whether register used as dest operand can be written to */ 933 /* check whether register used as dest operand can be written to */
661 if (regno == BPF_REG_FP) { 934 if (regno == BPF_REG_FP) {
@@ -686,17 +959,25 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
686 } 959 }
687} 960}
688 961
962/* Does this register contain a constant zero? */
963static bool register_is_null(struct bpf_reg_state *reg)
964{
965 return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
966}
967
689/* check_stack_read/write functions track spill/fill of registers, 968/* check_stack_read/write functions track spill/fill of registers,
690 * stack boundary and alignment are checked in check_mem_access() 969 * stack boundary and alignment are checked in check_mem_access()
691 */ 970 */
692static int check_stack_write(struct bpf_verifier_env *env, 971static int check_stack_write(struct bpf_verifier_env *env,
693 struct bpf_verifier_state *state, int off, 972 struct bpf_func_state *state, /* func where register points to */
694 int size, int value_regno) 973 int off, int size, int value_regno)
695{ 974{
975 struct bpf_func_state *cur; /* state of the current function */
696 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; 976 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
977 enum bpf_reg_type type;
697 978
698 err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), 979 err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
699 true); 980 true);
700 if (err) 981 if (err)
701 return err; 982 return err;
702 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, 983 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -709,8 +990,9 @@ static int check_stack_write(struct bpf_verifier_env *env,
709 return -EACCES; 990 return -EACCES;
710 } 991 }
711 992
993 cur = env->cur_state->frame[env->cur_state->curframe];
712 if (value_regno >= 0 && 994 if (value_regno >= 0 &&
713 is_spillable_regtype(state->regs[value_regno].type)) { 995 is_spillable_regtype((type = cur->regs[value_regno].type))) {
714 996
715 /* register containing pointer is being spilled into stack */ 997 /* register containing pointer is being spilled into stack */
716 if (size != BPF_REG_SIZE) { 998 if (size != BPF_REG_SIZE) {
@@ -718,51 +1000,116 @@ static int check_stack_write(struct bpf_verifier_env *env,
718 return -EACCES; 1000 return -EACCES;
719 } 1001 }
720 1002
1003 if (state != cur && type == PTR_TO_STACK) {
1004 verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
1005 return -EINVAL;
1006 }
1007
721 /* save register state */ 1008 /* save register state */
722 state->stack[spi].spilled_ptr = state->regs[value_regno]; 1009 state->stack[spi].spilled_ptr = cur->regs[value_regno];
723 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; 1010 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
724 1011
725 for (i = 0; i < BPF_REG_SIZE; i++) 1012 for (i = 0; i < BPF_REG_SIZE; i++)
726 state->stack[spi].slot_type[i] = STACK_SPILL; 1013 state->stack[spi].slot_type[i] = STACK_SPILL;
727 } else { 1014 } else {
1015 u8 type = STACK_MISC;
1016
728 /* regular write of data into stack */ 1017 /* regular write of data into stack */
729 state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; 1018 state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
730 1019
1020 /* only mark the slot as written if all 8 bytes were written
1021 * otherwise read propagation may incorrectly stop too soon
1022 * when stack slots are partially written.
1023 * This heuristic means that read propagation will be
1024 * conservative, since it will add reg_live_read marks
1025 * to stack slots all the way to first state when programs
1026 * writes+reads less than 8 bytes
1027 */
1028 if (size == BPF_REG_SIZE)
1029 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
1030
1031 /* when we zero initialize stack slots mark them as such */
1032 if (value_regno >= 0 &&
1033 register_is_null(&cur->regs[value_regno]))
1034 type = STACK_ZERO;
1035
731 for (i = 0; i < size; i++) 1036 for (i = 0; i < size; i++)
732 state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = 1037 state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
733 STACK_MISC; 1038 type;
734 } 1039 }
735 return 0; 1040 return 0;
736} 1041}
737 1042
738static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) 1043/* registers of every function are unique and mark_reg_read() propagates
1044 * the liveness in the following cases:
1045 * - from callee into caller for R1 - R5 that were used as arguments
1046 * - from caller into callee for R0 that used as result of the call
1047 * - from caller to the same caller skipping states of the callee for R6 - R9,
1048 * since R6 - R9 are callee saved by implicit function prologue and
1049 * caller's R6 != callee's R6, so when we propagate liveness up to
1050 * parent states we need to skip callee states for R6 - R9.
1051 *
1052 * stack slot marking is different, since stacks of caller and callee are
1053 * accessible in both (since caller can pass a pointer to caller's stack to
1054 * callee which can pass it to another function), hence mark_stack_slot_read()
1055 * has to propagate the stack liveness to all parent states at given frame number.
1056 * Consider code:
1057 * f1() {
1058 * ptr = fp - 8;
1059 * *ptr = ctx;
1060 * call f2 {
1061 * .. = *ptr;
1062 * }
1063 * .. = *ptr;
1064 * }
1065 * First *ptr is reading from f1's stack and mark_stack_slot_read() has
1066 * to mark liveness at the f1's frame and not f2's frame.
1067 * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
1068 * to propagate liveness to f2 states at f1's frame level and further into
1069 * f1 states at f1's frame level until write into that stack slot
1070 */
1071static void mark_stack_slot_read(struct bpf_verifier_env *env,
1072 const struct bpf_verifier_state *state,
1073 struct bpf_verifier_state *parent,
1074 int slot, int frameno)
739{ 1075{
740 struct bpf_verifier_state *parent = state->parent; 1076 bool writes = parent == state->parent; /* Observe write marks */
741 1077
742 while (parent) { 1078 while (parent) {
1079 if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
1080 /* since LIVE_WRITTEN mark is only done for full 8-byte
1081 * write the read marks are conservative and parent
1082 * state may not even have the stack allocated. In such case
1083 * end the propagation, since the loop reached beginning
1084 * of the function
1085 */
1086 break;
743 /* if read wasn't screened by an earlier write ... */ 1087 /* if read wasn't screened by an earlier write ... */
744 if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) 1088 if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
745 break; 1089 break;
746 /* ... then we depend on parent's value */ 1090 /* ... then we depend on parent's value */
747 parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; 1091 parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
748 state = parent; 1092 state = parent;
749 parent = state->parent; 1093 parent = state->parent;
1094 writes = true;
750 } 1095 }
751} 1096}
752 1097
753static int check_stack_read(struct bpf_verifier_env *env, 1098static int check_stack_read(struct bpf_verifier_env *env,
754 struct bpf_verifier_state *state, int off, int size, 1099 struct bpf_func_state *reg_state /* func where register points to */,
755 int value_regno) 1100 int off, int size, int value_regno)
756{ 1101{
1102 struct bpf_verifier_state *vstate = env->cur_state;
1103 struct bpf_func_state *state = vstate->frame[vstate->curframe];
757 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; 1104 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
758 u8 *stype; 1105 u8 *stype;
759 1106
760 if (state->allocated_stack <= slot) { 1107 if (reg_state->allocated_stack <= slot) {
761 verbose(env, "invalid read from stack off %d+0 size %d\n", 1108 verbose(env, "invalid read from stack off %d+0 size %d\n",
762 off, size); 1109 off, size);
763 return -EACCES; 1110 return -EACCES;
764 } 1111 }
765 stype = state->stack[spi].slot_type; 1112 stype = reg_state->stack[spi].slot_type;
766 1113
767 if (stype[0] == STACK_SPILL) { 1114 if (stype[0] == STACK_SPILL) {
768 if (size != BPF_REG_SIZE) { 1115 if (size != BPF_REG_SIZE) {
@@ -778,21 +1125,44 @@ static int check_stack_read(struct bpf_verifier_env *env,
778 1125
779 if (value_regno >= 0) { 1126 if (value_regno >= 0) {
780 /* restore register state from stack */ 1127 /* restore register state from stack */
781 state->regs[value_regno] = state->stack[spi].spilled_ptr; 1128 state->regs[value_regno] = reg_state->stack[spi].spilled_ptr;
782 mark_stack_slot_read(state, spi); 1129 /* mark reg as written since spilled pointer state likely
1130 * has its liveness marks cleared by is_state_visited()
1131 * which resets stack/reg liveness for state transitions
1132 */
1133 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
783 } 1134 }
1135 mark_stack_slot_read(env, vstate, vstate->parent, spi,
1136 reg_state->frameno);
784 return 0; 1137 return 0;
785 } else { 1138 } else {
1139 int zeros = 0;
1140
786 for (i = 0; i < size; i++) { 1141 for (i = 0; i < size; i++) {
787 if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { 1142 if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
788 verbose(env, "invalid read from stack off %d+%d size %d\n", 1143 continue;
789 off, i, size); 1144 if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
790 return -EACCES; 1145 zeros++;
1146 continue;
1147 }
1148 verbose(env, "invalid read from stack off %d+%d size %d\n",
1149 off, i, size);
1150 return -EACCES;
1151 }
1152 mark_stack_slot_read(env, vstate, vstate->parent, spi,
1153 reg_state->frameno);
1154 if (value_regno >= 0) {
1155 if (zeros == size) {
1156 /* any size read into register is zero extended,
1157 * so the whole register == const_zero
1158 */
1159 __mark_reg_const_zero(&state->regs[value_regno]);
1160 } else {
1161 /* have read misc data from the stack */
1162 mark_reg_unknown(env, state->regs, value_regno);
791 } 1163 }
1164 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
792 } 1165 }
793 if (value_regno >= 0)
794 /* have read misc data from the stack */
795 mark_reg_unknown(env, state->regs, value_regno);
796 return 0; 1166 return 0;
797 } 1167 }
798} 1168}
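
The new non-spill read path can be summarized as: a read succeeds only if every byte is initialized, and the destination register becomes a known zero only when every byte read was STACK_ZERO; a single STACK_MISC byte is enough to make it unknown. A small standalone model of that classification:

#include <stdio.h>

enum slot { STACK_INVALID, STACK_MISC, STACK_ZERO };

enum result { REG_CONST_ZERO, REG_UNKNOWN, READ_ERROR };

/* classify the destination register after reading 'size' stack bytes */
static enum result read_stack(const enum slot *bytes, int size)
{
        int i, zeros = 0;

        for (i = 0; i < size; i++) {
                if (bytes[i] == STACK_MISC)
                        continue;
                if (bytes[i] == STACK_ZERO) {
                        zeros++;
                        continue;
                }
                return READ_ERROR;      /* uninitialized byte */
        }
        /* any-size read is zero extended, so all-zero bytes == const zero */
        return zeros == size ? REG_CONST_ZERO : REG_UNKNOWN;
}

int main(void)
{
        enum slot zeroed[4] = { STACK_ZERO, STACK_ZERO, STACK_ZERO, STACK_ZERO };
        enum slot mixed[4]  = { STACK_ZERO, STACK_MISC, STACK_ZERO, STACK_ZERO };

        printf("all-zero read -> %d (const zero)\n", read_stack(zeroed, 4));
        printf("mixed read    -> %d (unknown)\n", read_stack(mixed, 4));
        return 0;
}
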
@@ -817,7 +1187,8 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
817static int check_map_access(struct bpf_verifier_env *env, u32 regno, 1187static int check_map_access(struct bpf_verifier_env *env, u32 regno,
818 int off, int size, bool zero_size_allowed) 1188 int off, int size, bool zero_size_allowed)
819{ 1189{
820 struct bpf_verifier_state *state = env->cur_state; 1190 struct bpf_verifier_state *vstate = env->cur_state;
1191 struct bpf_func_state *state = vstate->frame[vstate->curframe];
821 struct bpf_reg_state *reg = &state->regs[regno]; 1192 struct bpf_reg_state *reg = &state->regs[regno];
822 int err; 1193 int err;
823 1194
@@ -1079,6 +1450,103 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
1079 strict); 1450 strict);
1080} 1451}
1081 1452
1453static int update_stack_depth(struct bpf_verifier_env *env,
1454 const struct bpf_func_state *func,
1455 int off)
1456{
1457 u16 stack = env->subprog_stack_depth[func->subprogno];
1458
1459 if (stack >= -off)
1460 return 0;
1461
1462 /* update known max for given subprogram */
1463 env->subprog_stack_depth[func->subprogno] = -off;
1464 return 0;
1465}
1466
1467/* starting from main bpf function walk all instructions of the function
1468 * and recursively walk all callees that given function can call.
1469 * Ignore jump and exit insns.
1470 * Since recursion is prevented by check_cfg() this algorithm
1471 * only needs a local stack of MAX_CALL_FRAMES to remember callsites
1472 */
1473static int check_max_stack_depth(struct bpf_verifier_env *env)
1474{
1475 int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
1476 struct bpf_insn *insn = env->prog->insnsi;
1477 int insn_cnt = env->prog->len;
1478 int ret_insn[MAX_CALL_FRAMES];
1479 int ret_prog[MAX_CALL_FRAMES];
1480
1481process_func:
1482 /* round up to 32-bytes, since this is granularity
1483 * of interpreter stack size
1484 */
1485 depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
1486 if (depth > MAX_BPF_STACK) {
1487 verbose(env, "combined stack size of %d calls is %d. Too large\n",
1488 frame + 1, depth);
1489 return -EACCES;
1490 }
1491continue_func:
1492 if (env->subprog_cnt == subprog)
1493 subprog_end = insn_cnt;
1494 else
1495 subprog_end = env->subprog_starts[subprog];
1496 for (; i < subprog_end; i++) {
1497 if (insn[i].code != (BPF_JMP | BPF_CALL))
1498 continue;
1499 if (insn[i].src_reg != BPF_PSEUDO_CALL)
1500 continue;
1501 /* remember insn and function to return to */
1502 ret_insn[frame] = i + 1;
1503 ret_prog[frame] = subprog;
1504
1505 /* find the callee */
1506 i = i + insn[i].imm + 1;
1507 subprog = find_subprog(env, i);
1508 if (subprog < 0) {
1509 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
1510 i);
1511 return -EFAULT;
1512 }
1513 subprog++;
1514 frame++;
1515 if (frame >= MAX_CALL_FRAMES) {
1516 WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
1517 return -EFAULT;
1518 }
1519 goto process_func;
1520 }
1521 /* end of for() loop means the last insn of the 'subprog'
1522 * was reached. Doesn't matter whether it was JA or EXIT
1523 */
1524 if (frame == 0)
1525 return 0;
1526 depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
1527 frame--;
1528 i = ret_insn[frame];
1529 subprog = ret_prog[frame];
1530 goto continue_func;
1531}
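
A worked example of the combined stack check above, using made-up per-subprog stack sizes for a main -> f1 -> f2 call chain: each frame is rounded up to the interpreter's 32-byte granularity before being added, and the sum is compared against MAX_BPF_STACK.

#include <stdio.h>

#define MAX_BPF_STACK 512

/* round 'x' up to a multiple of 32, the interpreter's stack granularity */
static unsigned int round_up_32(unsigned int x)
{
        return (x + 31u) & ~31u;
}

int main(void)
{
        /* hypothetical per-subprog stack usage along one call chain */
        unsigned int usage[] = { 200, 120, 40 };
        unsigned int depth = 0;
        unsigned int i;

        for (i = 0; i < sizeof(usage) / sizeof(usage[0]); i++) {
                unsigned int frame = round_up_32(usage[i] ? usage[i] : 1);

                depth += frame;
                printf("frame %u: +%u -> combined %u\n", i, frame, depth);
        }
        printf("limit %d exceeded: %s\n", MAX_BPF_STACK,
               depth > MAX_BPF_STACK ? "yes" : "no");
        return 0;
}

Here 200, 120 and 40 round up to 224, 128 and 64, for a combined depth of 416, which still fits under the 512-byte limit.
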
1532
1533#ifndef CONFIG_BPF_JIT_ALWAYS_ON
1534static int get_callee_stack_depth(struct bpf_verifier_env *env,
1535 const struct bpf_insn *insn, int idx)
1536{
1537 int start = idx + insn->imm + 1, subprog;
1538
1539 subprog = find_subprog(env, start);
1540 if (subprog < 0) {
1541 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
1542 start);
1543 return -EFAULT;
1544 }
1545 subprog++;
1546 return env->subprog_stack_depth[subprog];
1547}
1548#endif
1549
1082/* truncate register to smaller size (in bytes) 1550/* truncate register to smaller size (in bytes)
1083 * must be called with size < BPF_REG_SIZE 1551 * must be called with size < BPF_REG_SIZE
1084 */ 1552 */
@@ -1112,9 +1580,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
1112 int bpf_size, enum bpf_access_type t, 1580 int bpf_size, enum bpf_access_type t,
1113 int value_regno) 1581 int value_regno)
1114{ 1582{
1115 struct bpf_verifier_state *state = env->cur_state;
1116 struct bpf_reg_state *regs = cur_regs(env); 1583 struct bpf_reg_state *regs = cur_regs(env);
1117 struct bpf_reg_state *reg = regs + regno; 1584 struct bpf_reg_state *reg = regs + regno;
1585 struct bpf_func_state *state;
1118 int size, err = 0; 1586 int size, err = 0;
1119 1587
1120 size = bpf_size_to_bytes(bpf_size); 1588 size = bpf_size_to_bytes(bpf_size);
@@ -1203,8 +1671,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
1203 return -EACCES; 1671 return -EACCES;
1204 } 1672 }
1205 1673
1206 if (env->prog->aux->stack_depth < -off) 1674 state = func(env, reg);
1207 env->prog->aux->stack_depth = -off; 1675 err = update_stack_depth(env, state, off);
1676 if (err)
1677 return err;
1208 1678
1209 if (t == BPF_WRITE) 1679 if (t == BPF_WRITE)
1210 err = check_stack_write(env, state, off, size, 1680 err = check_stack_write(env, state, off, size,
@@ -1282,12 +1752,6 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
1282 BPF_SIZE(insn->code), BPF_WRITE, -1); 1752 BPF_SIZE(insn->code), BPF_WRITE, -1);
1283} 1753}
1284 1754
1285/* Does this register contain a constant zero? */
1286static bool register_is_null(struct bpf_reg_state reg)
1287{
1288 return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0);
1289}
1290
1291/* when register 'regno' is passed into function that will read 'access_size' 1755/* when register 'regno' is passed into function that will read 'access_size'
1292 * bytes from that pointer, make sure that it's within stack boundary 1756 * bytes from that pointer, make sure that it's within stack boundary
1293 * and all elements of stack are initialized. 1757 * and all elements of stack are initialized.
@@ -1298,32 +1762,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1298 int access_size, bool zero_size_allowed, 1762 int access_size, bool zero_size_allowed,
1299 struct bpf_call_arg_meta *meta) 1763 struct bpf_call_arg_meta *meta)
1300{ 1764{
1301 struct bpf_verifier_state *state = env->cur_state; 1765 struct bpf_reg_state *reg = cur_regs(env) + regno;
1302 struct bpf_reg_state *regs = state->regs; 1766 struct bpf_func_state *state = func(env, reg);
1303 int off, i, slot, spi; 1767 int off, i, slot, spi;
1304 1768
1305 if (regs[regno].type != PTR_TO_STACK) { 1769 if (reg->type != PTR_TO_STACK) {
1306 /* Allow zero-byte read from NULL, regardless of pointer type */ 1770 /* Allow zero-byte read from NULL, regardless of pointer type */
1307 if (zero_size_allowed && access_size == 0 && 1771 if (zero_size_allowed && access_size == 0 &&
1308 register_is_null(regs[regno])) 1772 register_is_null(reg))
1309 return 0; 1773 return 0;
1310 1774
1311 verbose(env, "R%d type=%s expected=%s\n", regno, 1775 verbose(env, "R%d type=%s expected=%s\n", regno,
1312 reg_type_str[regs[regno].type], 1776 reg_type_str[reg->type],
1313 reg_type_str[PTR_TO_STACK]); 1777 reg_type_str[PTR_TO_STACK]);
1314 return -EACCES; 1778 return -EACCES;
1315 } 1779 }
1316 1780
1317 /* Only allow fixed-offset stack reads */ 1781 /* Only allow fixed-offset stack reads */
1318 if (!tnum_is_const(regs[regno].var_off)) { 1782 if (!tnum_is_const(reg->var_off)) {
1319 char tn_buf[48]; 1783 char tn_buf[48];
1320 1784
1321 tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); 1785 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1322 verbose(env, "invalid variable stack read R%d var_off=%s\n", 1786 verbose(env, "invalid variable stack read R%d var_off=%s\n",
1323 regno, tn_buf); 1787 regno, tn_buf);
1324 return -EACCES; 1788 return -EACCES;
1325 } 1789 }
1326 off = regs[regno].off + regs[regno].var_off.value; 1790 off = reg->off + reg->var_off.value;
1327 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || 1791 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
1328 access_size < 0 || (access_size == 0 && !zero_size_allowed)) { 1792 access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
1329 verbose(env, "invalid stack type R%d off=%d access_size=%d\n", 1793 verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
@@ -1331,9 +1795,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1331 return -EACCES; 1795 return -EACCES;
1332 } 1796 }
1333 1797
1334 if (env->prog->aux->stack_depth < -off)
1335 env->prog->aux->stack_depth = -off;
1336
1337 if (meta && meta->raw_mode) { 1798 if (meta && meta->raw_mode) {
1338 meta->access_size = access_size; 1799 meta->access_size = access_size;
1339 meta->regno = regno; 1800 meta->regno = regno;
@@ -1341,17 +1802,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1341 } 1802 }
1342 1803
1343 for (i = 0; i < access_size; i++) { 1804 for (i = 0; i < access_size; i++) {
1805 u8 *stype;
1806
1344 slot = -(off + i) - 1; 1807 slot = -(off + i) - 1;
1345 spi = slot / BPF_REG_SIZE; 1808 spi = slot / BPF_REG_SIZE;
1346 if (state->allocated_stack <= slot || 1809 if (state->allocated_stack <= slot)
1347 state->stack[spi].slot_type[slot % BPF_REG_SIZE] != 1810 goto err;
1348 STACK_MISC) { 1811 stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
1349 verbose(env, "invalid indirect read from stack off %d+%d size %d\n", 1812 if (*stype == STACK_MISC)
1350 off, i, access_size); 1813 goto mark;
1351 return -EACCES; 1814 if (*stype == STACK_ZERO) {
1815 /* helper can write anything into the stack */
1816 *stype = STACK_MISC;
1817 goto mark;
1352 } 1818 }
1819err:
1820 verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
1821 off, i, access_size);
1822 return -EACCES;
1823mark:
1824 /* reading any byte out of 8-byte 'spill_slot' will cause
1825 * the whole slot to be marked as 'read'
1826 */
1827 mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
1828 spi, state->frameno);
1353 } 1829 }
1354 return 0; 1830 return update_stack_depth(env, state, off);
1355} 1831}
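
For the helper-argument path above, zero-initialized slots are accepted like STACK_MISC but are downgraded on the spot, since the helper may overwrite them. A standalone sketch of that byte-by-byte check, with the slot names reused for illustration only:

#include <stdio.h>

enum slot { STACK_INVALID, STACK_MISC, STACK_ZERO };

/* a helper is about to access 'size' bytes starting at 'bytes':
 * every byte must be initialized; zero-initialized bytes are fine,
 * but since the helper may overwrite them they are downgraded to MISC
 */
static int check_helper_access(enum slot *bytes, int size)
{
        int i;

        for (i = 0; i < size; i++) {
                if (bytes[i] == STACK_INVALID)
                        return -1;              /* uninitialized stack */
                if (bytes[i] == STACK_ZERO)
                        bytes[i] = STACK_MISC;  /* helper can write anything */
        }
        return 0;
}

int main(void)
{
        enum slot buf[4] = { STACK_ZERO, STACK_ZERO, STACK_MISC, STACK_MISC };

        printf("access ok: %d\n", check_helper_access(buf, 4) == 0);
        printf("slot 0 is now MISC: %d\n", buf[0] == STACK_MISC);
        return 0;
}
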
1356 1832
1357static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, 1833static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
@@ -1374,6 +1850,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
1374 } 1850 }
1375} 1851}
1376 1852
1853static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
1854{
1855 return type == ARG_PTR_TO_MEM ||
1856 type == ARG_PTR_TO_MEM_OR_NULL ||
1857 type == ARG_PTR_TO_UNINIT_MEM;
1858}
1859
1860static bool arg_type_is_mem_size(enum bpf_arg_type type)
1861{
1862 return type == ARG_CONST_SIZE ||
1863 type == ARG_CONST_SIZE_OR_ZERO;
1864}
1865
1377static int check_func_arg(struct bpf_verifier_env *env, u32 regno, 1866static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
1378 enum bpf_arg_type arg_type, 1867 enum bpf_arg_type arg_type,
1379 struct bpf_call_arg_meta *meta) 1868 struct bpf_call_arg_meta *meta)
@@ -1423,15 +1912,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
1423 expected_type = PTR_TO_CTX; 1912 expected_type = PTR_TO_CTX;
1424 if (type != expected_type) 1913 if (type != expected_type)
1425 goto err_type; 1914 goto err_type;
1426 } else if (arg_type == ARG_PTR_TO_MEM || 1915 } else if (arg_type_is_mem_ptr(arg_type)) {
1427 arg_type == ARG_PTR_TO_MEM_OR_NULL ||
1428 arg_type == ARG_PTR_TO_UNINIT_MEM) {
1429 expected_type = PTR_TO_STACK; 1916 expected_type = PTR_TO_STACK;
1430 /* One exception here. In case function allows for NULL to be 1917 /* One exception here. In case function allows for NULL to be
1431 * passed in as argument, it's a SCALAR_VALUE type. Final test 1918 * passed in as argument, it's a SCALAR_VALUE type. Final test
1432 * happens during stack boundary checking. 1919 * happens during stack boundary checking.
1433 */ 1920 */
1434 if (register_is_null(*reg) && 1921 if (register_is_null(reg) &&
1435 arg_type == ARG_PTR_TO_MEM_OR_NULL) 1922 arg_type == ARG_PTR_TO_MEM_OR_NULL)
1436 /* final test in check_stack_boundary() */; 1923 /* final test in check_stack_boundary() */;
1437 else if (!type_is_pkt_pointer(type) && 1924 else if (!type_is_pkt_pointer(type) &&
@@ -1486,25 +1973,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
1486 err = check_stack_boundary(env, regno, 1973 err = check_stack_boundary(env, regno,
1487 meta->map_ptr->value_size, 1974 meta->map_ptr->value_size,
1488 false, NULL); 1975 false, NULL);
1489 } else if (arg_type == ARG_CONST_SIZE || 1976 } else if (arg_type_is_mem_size(arg_type)) {
1490 arg_type == ARG_CONST_SIZE_OR_ZERO) {
1491 bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); 1977 bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
1492 1978
1493 /* bpf_xxx(..., buf, len) call will access 'len' bytes
1494 * from stack pointer 'buf'. Check it
1495 * note: regno == len, regno - 1 == buf
1496 */
1497 if (regno == 0) {
1498 /* kernel subsystem misconfigured verifier */
1499 verbose(env,
1500 "ARG_CONST_SIZE cannot be first argument\n");
1501 return -EACCES;
1502 }
1503
1504 /* The register is SCALAR_VALUE; the access check 1979 /* The register is SCALAR_VALUE; the access check
1505 * happens using its boundaries. 1980 * happens using its boundaries.
1506 */ 1981 */
1507
1508 if (!tnum_is_const(reg->var_off)) 1982 if (!tnum_is_const(reg->var_off))
1509 /* For unprivileged variable accesses, disable raw 1983 /* For unprivileged variable accesses, disable raw
1510 * mode so that the program is required to 1984 * mode so that the program is required to
@@ -1604,6 +2078,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
1604 case BPF_FUNC_tail_call: 2078 case BPF_FUNC_tail_call:
1605 if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) 2079 if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
1606 goto error; 2080 goto error;
2081 if (env->subprog_cnt) {
2082 verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
2083 return -EINVAL;
2084 }
1607 break; 2085 break;
1608 case BPF_FUNC_perf_event_read: 2086 case BPF_FUNC_perf_event_read:
1609 case BPF_FUNC_perf_event_output: 2087 case BPF_FUNC_perf_event_output:
@@ -1644,7 +2122,7 @@ error:
1644 return -EINVAL; 2122 return -EINVAL;
1645} 2123}
1646 2124
1647static int check_raw_mode(const struct bpf_func_proto *fn) 2125static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
1648{ 2126{
1649 int count = 0; 2127 int count = 0;
1650 2128
@@ -1659,15 +2137,52 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
1659 if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) 2137 if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
1660 count++; 2138 count++;
1661 2139
1662 return count > 1 ? -EINVAL : 0; 2140 /* We only support one arg being in raw mode at the moment,
2141 * which is sufficient for the helper functions we have
2142 * right now.
2143 */
2144 return count <= 1;
2145}
2146
2147static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
2148 enum bpf_arg_type arg_next)
2149{
2150 return (arg_type_is_mem_ptr(arg_curr) &&
2151 !arg_type_is_mem_size(arg_next)) ||
2152 (!arg_type_is_mem_ptr(arg_curr) &&
2153 arg_type_is_mem_size(arg_next));
2154}
2155
2156static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
2157{
2158 /* bpf_xxx(..., buf, len) call will access 'len'
2159 * bytes from memory 'buf'. Both arg types need
2160 * to be paired, so make sure there's no buggy
2161 * helper function specification.
2162 */
2163 if (arg_type_is_mem_size(fn->arg1_type) ||
2164 arg_type_is_mem_ptr(fn->arg5_type) ||
2165 check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
2166 check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
2167 check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
2168 check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
2169 return false;
2170
2171 return true;
2172}
2173
2174static int check_func_proto(const struct bpf_func_proto *fn)
2175{
2176 return check_raw_mode_ok(fn) &&
2177 check_arg_pair_ok(fn) ? 0 : -EINVAL;
1663} 2178}
1664 2179
1665/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] 2180/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
1666 * are now invalid, so turn them into unknown SCALAR_VALUE. 2181 * are now invalid, so turn them into unknown SCALAR_VALUE.
1667 */ 2182 */
1668static void clear_all_pkt_pointers(struct bpf_verifier_env *env) 2183static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
2184 struct bpf_func_state *state)
1669{ 2185{
1670 struct bpf_verifier_state *state = env->cur_state;
1671 struct bpf_reg_state *regs = state->regs, *reg; 2186 struct bpf_reg_state *regs = state->regs, *reg;
1672 int i; 2187 int i;
1673 2188
@@ -1684,7 +2199,121 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
1684 } 2199 }
1685} 2200}
1686 2201
1687static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) 2202static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
2203{
2204 struct bpf_verifier_state *vstate = env->cur_state;
2205 int i;
2206
2207 for (i = 0; i <= vstate->curframe; i++)
2208 __clear_all_pkt_pointers(env, vstate->frame[i]);
2209}
2210
2211static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
2212 int *insn_idx)
2213{
2214 struct bpf_verifier_state *state = env->cur_state;
2215 struct bpf_func_state *caller, *callee;
2216 int i, subprog, target_insn;
2217
2218 if (state->curframe + 1 >= MAX_CALL_FRAMES) {
2219 verbose(env, "the call stack of %d frames is too deep\n",
2220 state->curframe + 2);
2221 return -E2BIG;
2222 }
2223
2224 target_insn = *insn_idx + insn->imm;
2225 subprog = find_subprog(env, target_insn + 1);
2226 if (subprog < 0) {
2227 verbose(env, "verifier bug. No program starts at insn %d\n",
2228 target_insn + 1);
2229 return -EFAULT;
2230 }
2231
2232 caller = state->frame[state->curframe];
2233 if (state->frame[state->curframe + 1]) {
2234 verbose(env, "verifier bug. Frame %d already allocated\n",
2235 state->curframe + 1);
2236 return -EFAULT;
2237 }
2238
2239 callee = kzalloc(sizeof(*callee), GFP_KERNEL);
2240 if (!callee)
2241 return -ENOMEM;
2242 state->frame[state->curframe + 1] = callee;
2243
2244 /* callee cannot access r0, r6 - r9 for reading and has to write
2245 * into its own stack before reading from it.
2246 * callee can read/write into caller's stack
2247 */
2248 init_func_state(env, callee,
2249 /* remember the callsite, it will be used by bpf_exit */
2250 *insn_idx /* callsite */,
2251 state->curframe + 1 /* frameno within this callchain */,
2252 subprog + 1 /* subprog number within this prog */);
2253
2254 /* copy r1 - r5 args that callee can access */
2255 for (i = BPF_REG_1; i <= BPF_REG_5; i++)
2256 callee->regs[i] = caller->regs[i];
2257
 2258	/* after the call registers r0 - r5 were scratched */
2259 for (i = 0; i < CALLER_SAVED_REGS; i++) {
2260 mark_reg_not_init(env, caller->regs, caller_saved[i]);
2261 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
2262 }
2263
2264 /* only increment it after check_reg_arg() finished */
2265 state->curframe++;
2266
2267 /* and go analyze first insn of the callee */
2268 *insn_idx = target_insn;
2269
2270 if (env->log.level) {
2271 verbose(env, "caller:\n");
2272 print_verifier_state(env, caller);
2273 verbose(env, "callee:\n");
2274 print_verifier_state(env, callee);
2275 }
2276 return 0;
2277}
2278
2279static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
2280{
2281 struct bpf_verifier_state *state = env->cur_state;
2282 struct bpf_func_state *caller, *callee;
2283 struct bpf_reg_state *r0;
2284
2285 callee = state->frame[state->curframe];
2286 r0 = &callee->regs[BPF_REG_0];
2287 if (r0->type == PTR_TO_STACK) {
2288 /* technically it's ok to return caller's stack pointer
2289 * (or caller's caller's pointer) back to the caller,
2290 * since these pointers are valid. Only current stack
2291 * pointer will be invalid as soon as function exits,
2292 * but let's be conservative
2293 */
2294 verbose(env, "cannot return stack pointer to the caller\n");
2295 return -EINVAL;
2296 }
2297
2298 state->curframe--;
2299 caller = state->frame[state->curframe];
2300 /* return to the caller whatever r0 had in the callee */
2301 caller->regs[BPF_REG_0] = *r0;
2302
2303 *insn_idx = callee->callsite + 1;
2304 if (env->log.level) {
2305 verbose(env, "returning from callee:\n");
2306 print_verifier_state(env, callee);
2307 verbose(env, "to caller at %d:\n", *insn_idx);
2308 print_verifier_state(env, caller);
2309 }
2310 /* clear everything in the callee */
2311 free_func_state(callee);
2312 state->frame[state->curframe + 1] = NULL;
2313 return 0;
2314}
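
Taken together, check_func_call() and prepare_func_exit() implement a simple calling convention at verification time: on a call only r1 - r5 flow into a fresh callee frame and the caller-saved registers are scratched, and on exit only r0 flows back to the caller. A minimal standalone model of that data flow, using plain longs instead of bpf_reg_state:

#include <stdio.h>
#include <string.h>

#define NR_REGS 11      /* r0 - r10 */

struct frame {
        long regs[NR_REGS];
};

/* model of a bpf-to-bpf call: the callee sees only r1 - r5 */
static void push_call(const struct frame *caller, struct frame *callee)
{
        int i;

        memset(callee, 0, sizeof(*callee));
        for (i = 1; i <= 5; i++)
                callee->regs[i] = caller->regs[i];
}

/* model of bpf_exit from a callee: only r0 flows back to the caller */
static void pop_call(struct frame *caller, const struct frame *callee)
{
        caller->regs[0] = callee->regs[0];
}

int main(void)
{
        struct frame caller = { .regs = { 0, 7, 8, 9, 10, 11, 42 } };
        struct frame callee;

        push_call(&caller, &callee);
        callee.regs[0] = callee.regs[1] + callee.regs[2];       /* callee computes r0 */
        pop_call(&caller, &callee);

        printf("caller r0 = %ld, caller r6 untouched = %ld\n",
               caller.regs[0], caller.regs[6]);
        return 0;
}
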
2315
2316static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1688{ 2317{
1689 const struct bpf_func_proto *fn = NULL; 2318 const struct bpf_func_proto *fn = NULL;
1690 struct bpf_reg_state *regs; 2319 struct bpf_reg_state *regs;
@@ -1701,7 +2330,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1701 2330
1702 if (env->ops->get_func_proto) 2331 if (env->ops->get_func_proto)
1703 fn = env->ops->get_func_proto(func_id); 2332 fn = env->ops->get_func_proto(func_id);
1704
1705 if (!fn) { 2333 if (!fn) {
1706 verbose(env, "unknown func %s#%d\n", func_id_name(func_id), 2334 verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
1707 func_id); 2335 func_id);
@@ -1725,10 +2353,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1725 memset(&meta, 0, sizeof(meta)); 2353 memset(&meta, 0, sizeof(meta));
1726 meta.pkt_access = fn->pkt_access; 2354 meta.pkt_access = fn->pkt_access;
1727 2355
1728 /* We only support one arg being in raw mode at the moment, which 2356 err = check_func_proto(fn);
1729 * is sufficient for the helper functions we have right now.
1730 */
1731 err = check_raw_mode(fn);
1732 if (err) { 2357 if (err) {
1733 verbose(env, "kernel subsystem misconfigured func %s#%d\n", 2358 verbose(env, "kernel subsystem misconfigured func %s#%d\n",
1734 func_id_name(func_id), func_id); 2359 func_id_name(func_id), func_id);
@@ -1884,7 +2509,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1884 const struct bpf_reg_state *ptr_reg, 2509 const struct bpf_reg_state *ptr_reg,
1885 const struct bpf_reg_state *off_reg) 2510 const struct bpf_reg_state *off_reg)
1886{ 2511{
1887 struct bpf_reg_state *regs = cur_regs(env), *dst_reg; 2512 struct bpf_verifier_state *vstate = env->cur_state;
2513 struct bpf_func_state *state = vstate->frame[vstate->curframe];
2514 struct bpf_reg_state *regs = state->regs, *dst_reg;
1888 bool known = tnum_is_const(off_reg->var_off); 2515 bool known = tnum_is_const(off_reg->var_off);
1889 s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, 2516 s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
1890 smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; 2517 smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
@@ -2319,7 +2946,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2319static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, 2946static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2320 struct bpf_insn *insn) 2947 struct bpf_insn *insn)
2321{ 2948{
2322 struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; 2949 struct bpf_verifier_state *vstate = env->cur_state;
2950 struct bpf_func_state *state = vstate->frame[vstate->curframe];
2951 struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
2323 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; 2952 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
2324 u8 opcode = BPF_OP(insn->code); 2953 u8 opcode = BPF_OP(insn->code);
2325 2954
@@ -2370,12 +2999,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2370 2999
2371 /* Got here implies adding two SCALAR_VALUEs */ 3000 /* Got here implies adding two SCALAR_VALUEs */
2372 if (WARN_ON_ONCE(ptr_reg)) { 3001 if (WARN_ON_ONCE(ptr_reg)) {
2373 print_verifier_state(env, env->cur_state); 3002 print_verifier_state(env, state);
2374 verbose(env, "verifier internal error: unexpected ptr_reg\n"); 3003 verbose(env, "verifier internal error: unexpected ptr_reg\n");
2375 return -EINVAL; 3004 return -EINVAL;
2376 } 3005 }
2377 if (WARN_ON(!src_reg)) { 3006 if (WARN_ON(!src_reg)) {
2378 print_verifier_state(env, env->cur_state); 3007 print_verifier_state(env, state);
2379 verbose(env, "verifier internal error: no src_reg\n"); 3008 verbose(env, "verifier internal error: no src_reg\n");
2380 return -EINVAL; 3009 return -EINVAL;
2381 } 3010 }
@@ -2537,14 +3166,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
2537 return 0; 3166 return 0;
2538} 3167}
2539 3168
2540static void find_good_pkt_pointers(struct bpf_verifier_state *state, 3169static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
2541 struct bpf_reg_state *dst_reg, 3170 struct bpf_reg_state *dst_reg,
2542 enum bpf_reg_type type, 3171 enum bpf_reg_type type,
2543 bool range_right_open) 3172 bool range_right_open)
2544{ 3173{
3174 struct bpf_func_state *state = vstate->frame[vstate->curframe];
2545 struct bpf_reg_state *regs = state->regs, *reg; 3175 struct bpf_reg_state *regs = state->regs, *reg;
2546 u16 new_range; 3176 u16 new_range;
2547 int i; 3177 int i, j;
2548 3178
2549 if (dst_reg->off < 0 || 3179 if (dst_reg->off < 0 ||
2550 (dst_reg->off == 0 && range_right_open)) 3180 (dst_reg->off == 0 && range_right_open))
@@ -2614,12 +3244,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
2614 /* keep the maximum range already checked */ 3244 /* keep the maximum range already checked */
2615 regs[i].range = max(regs[i].range, new_range); 3245 regs[i].range = max(regs[i].range, new_range);
2616 3246
2617 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { 3247 for (j = 0; j <= vstate->curframe; j++) {
2618 if (state->stack[i].slot_type[0] != STACK_SPILL) 3248 state = vstate->frame[j];
2619 continue; 3249 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
2620 reg = &state->stack[i].spilled_ptr; 3250 if (state->stack[i].slot_type[0] != STACK_SPILL)
2621 if (reg->type == type && reg->id == dst_reg->id) 3251 continue;
2622 reg->range = max(reg->range, new_range); 3252 reg = &state->stack[i].spilled_ptr;
3253 if (reg->type == type && reg->id == dst_reg->id)
3254 reg->range = max(reg->range, new_range);
3255 }
2623 } 3256 }
2624} 3257}
2625 3258
@@ -2857,20 +3490,24 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
2857/* The logic is similar to find_good_pkt_pointers(), both could eventually 3490/* The logic is similar to find_good_pkt_pointers(), both could eventually
2858 * be folded together at some point. 3491 * be folded together at some point.
2859 */ 3492 */
2860static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, 3493static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
2861 bool is_null) 3494 bool is_null)
2862{ 3495{
3496 struct bpf_func_state *state = vstate->frame[vstate->curframe];
2863 struct bpf_reg_state *regs = state->regs; 3497 struct bpf_reg_state *regs = state->regs;
2864 u32 id = regs[regno].id; 3498 u32 id = regs[regno].id;
2865 int i; 3499 int i, j;
2866 3500
2867 for (i = 0; i < MAX_BPF_REG; i++) 3501 for (i = 0; i < MAX_BPF_REG; i++)
2868 mark_map_reg(regs, i, id, is_null); 3502 mark_map_reg(regs, i, id, is_null);
2869 3503
2870 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { 3504 for (j = 0; j <= vstate->curframe; j++) {
2871 if (state->stack[i].slot_type[0] != STACK_SPILL) 3505 state = vstate->frame[j];
2872 continue; 3506 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
2873 mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); 3507 if (state->stack[i].slot_type[0] != STACK_SPILL)
3508 continue;
3509 mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
3510 }
2874 } 3511 }
2875} 3512}
2876 3513
@@ -2970,8 +3607,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
2970static int check_cond_jmp_op(struct bpf_verifier_env *env, 3607static int check_cond_jmp_op(struct bpf_verifier_env *env,
2971 struct bpf_insn *insn, int *insn_idx) 3608 struct bpf_insn *insn, int *insn_idx)
2972{ 3609{
2973 struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; 3610 struct bpf_verifier_state *this_branch = env->cur_state;
2974 struct bpf_reg_state *regs = this_branch->regs, *dst_reg; 3611 struct bpf_verifier_state *other_branch;
3612 struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
3613 struct bpf_reg_state *dst_reg, *other_branch_regs;
2975 u8 opcode = BPF_OP(insn->code); 3614 u8 opcode = BPF_OP(insn->code);
2976 int err; 3615 int err;
2977 3616
@@ -3014,8 +3653,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3014 if (BPF_SRC(insn->code) == BPF_K && 3653 if (BPF_SRC(insn->code) == BPF_K &&
3015 (opcode == BPF_JEQ || opcode == BPF_JNE) && 3654 (opcode == BPF_JEQ || opcode == BPF_JNE) &&
3016 dst_reg->type == SCALAR_VALUE && 3655 dst_reg->type == SCALAR_VALUE &&
3017 tnum_equals_const(dst_reg->var_off, insn->imm)) { 3656 tnum_is_const(dst_reg->var_off)) {
3018 if (opcode == BPF_JEQ) { 3657 if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) ||
3658 (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) {
3019 /* if (imm == imm) goto pc+off; 3659 /* if (imm == imm) goto pc+off;
3020 * only follow the goto, ignore fall-through 3660 * only follow the goto, ignore fall-through
3021 */ 3661 */
@@ -3033,6 +3673,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3033 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); 3673 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
3034 if (!other_branch) 3674 if (!other_branch)
3035 return -EFAULT; 3675 return -EFAULT;
3676 other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
3036 3677
3037 /* detect if we are comparing against a constant value so we can adjust 3678 /* detect if we are comparing against a constant value so we can adjust
3038 * our min/max values for our dst register. 3679 * our min/max values for our dst register.
@@ -3045,22 +3686,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3045 if (dst_reg->type == SCALAR_VALUE && 3686 if (dst_reg->type == SCALAR_VALUE &&
3046 regs[insn->src_reg].type == SCALAR_VALUE) { 3687 regs[insn->src_reg].type == SCALAR_VALUE) {
3047 if (tnum_is_const(regs[insn->src_reg].var_off)) 3688 if (tnum_is_const(regs[insn->src_reg].var_off))
3048 reg_set_min_max(&other_branch->regs[insn->dst_reg], 3689 reg_set_min_max(&other_branch_regs[insn->dst_reg],
3049 dst_reg, regs[insn->src_reg].var_off.value, 3690 dst_reg, regs[insn->src_reg].var_off.value,
3050 opcode); 3691 opcode);
3051 else if (tnum_is_const(dst_reg->var_off)) 3692 else if (tnum_is_const(dst_reg->var_off))
3052 reg_set_min_max_inv(&other_branch->regs[insn->src_reg], 3693 reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
3053 &regs[insn->src_reg], 3694 &regs[insn->src_reg],
3054 dst_reg->var_off.value, opcode); 3695 dst_reg->var_off.value, opcode);
3055 else if (opcode == BPF_JEQ || opcode == BPF_JNE) 3696 else if (opcode == BPF_JEQ || opcode == BPF_JNE)
3056 /* Comparing for equality, we can combine knowledge */ 3697 /* Comparing for equality, we can combine knowledge */
3057 reg_combine_min_max(&other_branch->regs[insn->src_reg], 3698 reg_combine_min_max(&other_branch_regs[insn->src_reg],
3058 &other_branch->regs[insn->dst_reg], 3699 &other_branch_regs[insn->dst_reg],
3059 &regs[insn->src_reg], 3700 &regs[insn->src_reg],
3060 &regs[insn->dst_reg], opcode); 3701 &regs[insn->dst_reg], opcode);
3061 } 3702 }
3062 } else if (dst_reg->type == SCALAR_VALUE) { 3703 } else if (dst_reg->type == SCALAR_VALUE) {
3063 reg_set_min_max(&other_branch->regs[insn->dst_reg], 3704 reg_set_min_max(&other_branch_regs[insn->dst_reg],
3064 dst_reg, insn->imm, opcode); 3705 dst_reg, insn->imm, opcode);
3065 } 3706 }
3066 3707
@@ -3081,7 +3722,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3081 return -EACCES; 3722 return -EACCES;
3082 } 3723 }
3083 if (env->log.level) 3724 if (env->log.level)
3084 print_verifier_state(env, this_branch); 3725 print_verifier_state(env, this_branch->frame[this_branch->curframe]);
3085 return 0; 3726 return 0;
3086} 3727}
3087 3728
@@ -3166,6 +3807,18 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
3166 return -EINVAL; 3807 return -EINVAL;
3167 } 3808 }
3168 3809
3810 if (env->subprog_cnt) {
3811 /* when program has LD_ABS insn JITs and interpreter assume
3812 * that r1 == ctx == skb which is not the case for callees
3813 * that can have arbitrary arguments. It's problematic
3814 * for main prog as well since JITs would need to analyze
3815 * all functions in order to make proper register save/restore
3816 * decisions in the main prog. Hence disallow LD_ABS with calls
3817 */
3818 verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n");
3819 return -EINVAL;
3820 }
3821
3169 if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || 3822 if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
3170 BPF_SIZE(insn->code) == BPF_DW || 3823 BPF_SIZE(insn->code) == BPF_DW ||
3171 (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { 3824 (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
@@ -3342,6 +3995,10 @@ static int check_cfg(struct bpf_verifier_env *env)
3342 int ret = 0; 3995 int ret = 0;
3343 int i, t; 3996 int i, t;
3344 3997
3998 ret = check_subprogs(env);
3999 if (ret < 0)
4000 return ret;
4001
3345 insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); 4002 insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
3346 if (!insn_state) 4003 if (!insn_state)
3347 return -ENOMEM; 4004 return -ENOMEM;
@@ -3374,6 +4031,14 @@ peek_stack:
3374 goto err_free; 4031 goto err_free;
3375 if (t + 1 < insn_cnt) 4032 if (t + 1 < insn_cnt)
3376 env->explored_states[t + 1] = STATE_LIST_MARK; 4033 env->explored_states[t + 1] = STATE_LIST_MARK;
4034 if (insns[t].src_reg == BPF_PSEUDO_CALL) {
4035 env->explored_states[t] = STATE_LIST_MARK;
4036 ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
4037 if (ret == 1)
4038 goto peek_stack;
4039 else if (ret < 0)
4040 goto err_free;
4041 }
3377 } else if (opcode == BPF_JA) { 4042 } else if (opcode == BPF_JA) {
3378 if (BPF_SRC(insns[t].code) != BPF_K) { 4043 if (BPF_SRC(insns[t].code) != BPF_K) {
3379 ret = -EINVAL; 4044 ret = -EINVAL;
@@ -3492,11 +4157,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
3492static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, 4157static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
3493 struct idpair *idmap) 4158 struct idpair *idmap)
3494{ 4159{
4160 bool equal;
4161
3495 if (!(rold->live & REG_LIVE_READ)) 4162 if (!(rold->live & REG_LIVE_READ))
3496 /* explored state didn't use this */ 4163 /* explored state didn't use this */
3497 return true; 4164 return true;
3498 4165
3499 if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) 4166 equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
4167
4168 if (rold->type == PTR_TO_STACK)
4169 /* two stack pointers are equal only if they're pointing to
4170 * the same stack frame, since fp-8 in foo != fp-8 in bar
4171 */
4172 return equal && rold->frameno == rcur->frameno;
4173
4174 if (equal)
3500 return true; 4175 return true;
3501 4176
3502 if (rold->type == NOT_INIT) 4177 if (rold->type == NOT_INIT)
@@ -3568,7 +4243,6 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
3568 tnum_in(rold->var_off, rcur->var_off); 4243 tnum_in(rold->var_off, rcur->var_off);
3569 case PTR_TO_CTX: 4244 case PTR_TO_CTX:
3570 case CONST_PTR_TO_MAP: 4245 case CONST_PTR_TO_MAP:
3571 case PTR_TO_STACK:
3572 case PTR_TO_PACKET_END: 4246 case PTR_TO_PACKET_END:
3573 /* Only valid matches are exact, which memcmp() above 4247 /* Only valid matches are exact, which memcmp() above
3574 * would have accepted 4248 * would have accepted
@@ -3583,8 +4257,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
3583 return false; 4257 return false;
3584} 4258}
3585 4259
3586static bool stacksafe(struct bpf_verifier_state *old, 4260static bool stacksafe(struct bpf_func_state *old,
3587 struct bpf_verifier_state *cur, 4261 struct bpf_func_state *cur,
3588 struct idpair *idmap) 4262 struct idpair *idmap)
3589{ 4263{
3590 int i, spi; 4264 int i, spi;
@@ -3602,8 +4276,19 @@ static bool stacksafe(struct bpf_verifier_state *old,
3602 for (i = 0; i < old->allocated_stack; i++) { 4276 for (i = 0; i < old->allocated_stack; i++) {
3603 spi = i / BPF_REG_SIZE; 4277 spi = i / BPF_REG_SIZE;
3604 4278
4279 if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
4280 /* explored state didn't use this */
4281 continue;
4282
3605 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) 4283 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
3606 continue; 4284 continue;
4285 /* if old state was safe with misc data in the stack
4286 * it will be safe with zero-initialized stack.
4287 * The opposite is not true
4288 */
4289 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
4290 cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
4291 continue;
3607 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != 4292 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
3608 cur->stack[spi].slot_type[i % BPF_REG_SIZE]) 4293 cur->stack[spi].slot_type[i % BPF_REG_SIZE])
3609 /* Ex: old explored (safe) state has STACK_SPILL in 4294 /* Ex: old explored (safe) state has STACK_SPILL in
@@ -3660,9 +4345,8 @@ static bool stacksafe(struct bpf_verifier_state *old,
3660 * whereas register type in current state is meaningful, it means that 4345 * whereas register type in current state is meaningful, it means that
3661 * the current state will reach 'bpf_exit' instruction safely 4346 * the current state will reach 'bpf_exit' instruction safely
3662 */ 4347 */
3663static bool states_equal(struct bpf_verifier_env *env, 4348static bool func_states_equal(struct bpf_func_state *old,
3664 struct bpf_verifier_state *old, 4349 struct bpf_func_state *cur)
3665 struct bpf_verifier_state *cur)
3666{ 4350{
3667 struct idpair *idmap; 4351 struct idpair *idmap;
3668 bool ret = false; 4352 bool ret = false;
@@ -3686,71 +4370,72 @@ out_free:
3686 return ret; 4370 return ret;
3687} 4371}
3688 4372
4373static bool states_equal(struct bpf_verifier_env *env,
4374 struct bpf_verifier_state *old,
4375 struct bpf_verifier_state *cur)
4376{
4377 int i;
4378
4379 if (old->curframe != cur->curframe)
4380 return false;
4381
4382 /* for states to be equal callsites have to be the same
4383 * and all frame states need to be equivalent
4384 */
4385 for (i = 0; i <= old->curframe; i++) {
4386 if (old->frame[i]->callsite != cur->frame[i]->callsite)
4387 return false;
4388 if (!func_states_equal(old->frame[i], cur->frame[i]))
4389 return false;
4390 }
4391 return true;
4392}
4393
3689/* A write screens off any subsequent reads; but write marks come from the 4394/* A write screens off any subsequent reads; but write marks come from the
3690 * straight-line code between a state and its parent. When we arrive at a 4395 * straight-line code between a state and its parent. When we arrive at an
3691 * jump target (in the first iteration of the propagate_liveness() loop), 4396 * equivalent state (jump target or such) we didn't arrive by the straight-line
3692 * we didn't arrive by the straight-line code, so read marks in state must 4397 * code, so read marks in the state must propagate to the parent regardless
3693 * propagate to parent regardless of state's write marks. 4398 * of the state's write marks. That's what 'parent == state->parent' comparison
4399 * in mark_reg_read() and mark_stack_slot_read() is for.
3694 */ 4400 */
3695static bool do_propagate_liveness(const struct bpf_verifier_state *state, 4401static int propagate_liveness(struct bpf_verifier_env *env,
3696 struct bpf_verifier_state *parent) 4402 const struct bpf_verifier_state *vstate,
4403 struct bpf_verifier_state *vparent)
3697{ 4404{
3698 bool writes = parent == state->parent; /* Observe write marks */ 4405 int i, frame, err = 0;
3699 bool touched = false; /* any changes made? */ 4406 struct bpf_func_state *state, *parent;
3700 int i;
3701 4407
3702 if (!parent) 4408 if (vparent->curframe != vstate->curframe) {
3703 return touched; 4409 WARN(1, "propagate_live: parent frame %d current frame %d\n",
4410 vparent->curframe, vstate->curframe);
4411 return -EFAULT;
4412 }
3704 /* Propagate read liveness of registers... */ 4413 /* Propagate read liveness of registers... */
3705 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); 4414 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
3706 /* We don't need to worry about FP liveness because it's read-only */ 4415 /* We don't need to worry about FP liveness because it's read-only */
3707 for (i = 0; i < BPF_REG_FP; i++) { 4416 for (i = 0; i < BPF_REG_FP; i++) {
3708 if (parent->regs[i].live & REG_LIVE_READ) 4417 if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
3709 continue; 4418 continue;
3710 if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) 4419 if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
3711 continue; 4420 err = mark_reg_read(env, vstate, vparent, i);
3712 if (state->regs[i].live & REG_LIVE_READ) { 4421 if (err)
3713 parent->regs[i].live |= REG_LIVE_READ; 4422 return err;
3714 touched = true;
3715 } 4423 }
3716 } 4424 }
4425
3717 /* ... and stack slots */ 4426 /* ... and stack slots */
3718 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && 4427 for (frame = 0; frame <= vstate->curframe; frame++) {
3719 i < parent->allocated_stack / BPF_REG_SIZE; i++) { 4428 state = vstate->frame[frame];
3720 if (parent->stack[i].slot_type[0] != STACK_SPILL) 4429 parent = vparent->frame[frame];
3721 continue; 4430 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
3722 if (state->stack[i].slot_type[0] != STACK_SPILL) 4431 i < parent->allocated_stack / BPF_REG_SIZE; i++) {
3723 continue; 4432 if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
3724 if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) 4433 continue;
3725 continue; 4434 if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
3726 if (writes && 4435 mark_stack_slot_read(env, vstate, vparent, i, frame);
3727 (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
3728 continue;
3729 if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
3730 parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
3731 touched = true;
3732 } 4436 }
3733 } 4437 }
3734 return touched; 4438 return err;
3735}
3736
3737/* "parent" is "a state from which we reach the current state", but initially
3738 * it is not the state->parent (i.e. "the state whose straight-line code leads
3739 * to the current state"), instead it is the state that happened to arrive at
3740 * a (prunable) equivalent of the current state. See comment above
3741 * do_propagate_liveness() for consequences of this.
3742 * This function is just a more efficient way of calling mark_reg_read() or
3743 * mark_stack_slot_read() on each reg in "parent" that is read in "state",
3744 * though it requires that parent != state->parent in the call arguments.
3745 */
3746static void propagate_liveness(const struct bpf_verifier_state *state,
3747 struct bpf_verifier_state *parent)
3748{
3749 while (do_propagate_liveness(state, parent)) {
3750 /* Something changed, so we need to feed those changes onward */
3751 state = parent;
3752 parent = state->parent;
3753 }
3754} 4439}
3755 4440
3756static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) 4441static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
@@ -3758,7 +4443,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
3758 struct bpf_verifier_state_list *new_sl; 4443 struct bpf_verifier_state_list *new_sl;
3759 struct bpf_verifier_state_list *sl; 4444 struct bpf_verifier_state_list *sl;
3760 struct bpf_verifier_state *cur = env->cur_state; 4445 struct bpf_verifier_state *cur = env->cur_state;
3761 int i, err; 4446 int i, j, err;
3762 4447
3763 sl = env->explored_states[insn_idx]; 4448 sl = env->explored_states[insn_idx];
3764 if (!sl) 4449 if (!sl)
@@ -3779,7 +4464,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
3779 * they'll be immediately forgotten as we're pruning 4464 * they'll be immediately forgotten as we're pruning
3780 * this state and will pop a new one. 4465 * this state and will pop a new one.
3781 */ 4466 */
3782 propagate_liveness(&sl->state, cur); 4467 err = propagate_liveness(env, &sl->state, cur);
4468 if (err)
4469 return err;
3783 return 1; 4470 return 1;
3784 } 4471 }
3785 sl = sl->next; 4472 sl = sl->next;
@@ -3787,9 +4474,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
3787 4474
3788 /* there were no equivalent states, remember current one. 4475 /* there were no equivalent states, remember current one.
3789 * technically the current state is not proven to be safe yet, 4476 * technically the current state is not proven to be safe yet,
3790 * but it will either reach bpf_exit (which means it's safe) or 4477 * but it will either reach the outermost bpf_exit (which means it's safe)
3791 * it will be rejected. Since there are no loops, we won't be 4478 * or it will be rejected. Since there are no loops, we won't be
3792 * seeing this 'insn_idx' instruction again on the way to bpf_exit 4479 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
4480 * again on the way to bpf_exit
3793 */ 4481 */
3794 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); 4482 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
3795 if (!new_sl) 4483 if (!new_sl)
@@ -3813,19 +4501,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
3813 * explored_states can get read marks.) 4501 * explored_states can get read marks.)
3814 */ 4502 */
3815 for (i = 0; i < BPF_REG_FP; i++) 4503 for (i = 0; i < BPF_REG_FP; i++)
3816 cur->regs[i].live = REG_LIVE_NONE; 4504 cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
3817 for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
3818 if (cur->stack[i].slot_type[0] == STACK_SPILL)
3819 cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
3820 return 0;
3821}
3822 4505
3823static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, 4506 /* all stack frames are accessible from callee, clear them all */
3824 int insn_idx, int prev_insn_idx) 4507 for (j = 0; j <= cur->curframe; j++) {
3825{ 4508 struct bpf_func_state *frame = cur->frame[j];
3826 if (env->dev_ops && env->dev_ops->insn_hook)
3827 return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
3828 4509
4510 for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
4511 frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
4512 }
3829 return 0; 4513 return 0;
3830} 4514}
3831 4515
@@ -3834,7 +4518,7 @@ static int do_check(struct bpf_verifier_env *env)
3834 struct bpf_verifier_state *state; 4518 struct bpf_verifier_state *state;
3835 struct bpf_insn *insns = env->prog->insnsi; 4519 struct bpf_insn *insns = env->prog->insnsi;
3836 struct bpf_reg_state *regs; 4520 struct bpf_reg_state *regs;
3837 int insn_cnt = env->prog->len; 4521 int insn_cnt = env->prog->len, i;
3838 int insn_idx, prev_insn_idx = 0; 4522 int insn_idx, prev_insn_idx = 0;
3839 int insn_processed = 0; 4523 int insn_processed = 0;
3840 bool do_print_state = false; 4524 bool do_print_state = false;
@@ -3842,9 +4526,18 @@ static int do_check(struct bpf_verifier_env *env)
3842 state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); 4526 state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
3843 if (!state) 4527 if (!state)
3844 return -ENOMEM; 4528 return -ENOMEM;
3845 env->cur_state = state; 4529 state->curframe = 0;
3846 init_reg_state(env, state->regs);
3847 state->parent = NULL; 4530 state->parent = NULL;
4531 state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
4532 if (!state->frame[0]) {
4533 kfree(state);
4534 return -ENOMEM;
4535 }
4536 env->cur_state = state;
4537 init_func_state(env, state->frame[0],
4538 BPF_MAIN_FUNC /* callsite */,
4539 0 /* frameno */,
4540 0 /* subprogno, zero == main subprog */);
3848 insn_idx = 0; 4541 insn_idx = 0;
3849 for (;;) { 4542 for (;;) {
3850 struct bpf_insn *insn; 4543 struct bpf_insn *insn;
@@ -3891,19 +4584,25 @@ static int do_check(struct bpf_verifier_env *env)
3891 else 4584 else
3892 verbose(env, "\nfrom %d to %d:", 4585 verbose(env, "\nfrom %d to %d:",
3893 prev_insn_idx, insn_idx); 4586 prev_insn_idx, insn_idx);
3894 print_verifier_state(env, state); 4587 print_verifier_state(env, state->frame[state->curframe]);
3895 do_print_state = false; 4588 do_print_state = false;
3896 } 4589 }
3897 4590
3898 if (env->log.level) { 4591 if (env->log.level) {
4592 const struct bpf_insn_cbs cbs = {
4593 .cb_print = verbose,
4594 };
4595
3899 verbose(env, "%d: ", insn_idx); 4596 verbose(env, "%d: ", insn_idx);
3900 print_bpf_insn(verbose, env, insn, 4597 print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks);
3901 env->allow_ptr_leaks);
3902 } 4598 }
3903 4599
3904 err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); 4600 if (bpf_prog_is_dev_bound(env->prog->aux)) {
3905 if (err) 4601 err = bpf_prog_offload_verify_insn(env, insn_idx,
3906 return err; 4602 prev_insn_idx);
4603 if (err)
4604 return err;
4605 }
3907 4606
3908 regs = cur_regs(env); 4607 regs = cur_regs(env);
3909 env->insn_aux_data[insn_idx].seen = true; 4608 env->insn_aux_data[insn_idx].seen = true;
@@ -4030,13 +4729,17 @@ static int do_check(struct bpf_verifier_env *env)
4030 if (opcode == BPF_CALL) { 4729 if (opcode == BPF_CALL) {
4031 if (BPF_SRC(insn->code) != BPF_K || 4730 if (BPF_SRC(insn->code) != BPF_K ||
4032 insn->off != 0 || 4731 insn->off != 0 ||
4033 insn->src_reg != BPF_REG_0 || 4732 (insn->src_reg != BPF_REG_0 &&
4733 insn->src_reg != BPF_PSEUDO_CALL) ||
4034 insn->dst_reg != BPF_REG_0) { 4734 insn->dst_reg != BPF_REG_0) {
4035 verbose(env, "BPF_CALL uses reserved fields\n"); 4735 verbose(env, "BPF_CALL uses reserved fields\n");
4036 return -EINVAL; 4736 return -EINVAL;
4037 } 4737 }
4038 4738
4039 err = check_call(env, insn->imm, insn_idx); 4739 if (insn->src_reg == BPF_PSEUDO_CALL)
4740 err = check_func_call(env, insn, &insn_idx);
4741 else
4742 err = check_helper_call(env, insn->imm, insn_idx);
4040 if (err) 4743 if (err)
4041 return err; 4744 return err;
4042 4745
@@ -4061,6 +4764,16 @@ static int do_check(struct bpf_verifier_env *env)
4061 return -EINVAL; 4764 return -EINVAL;
4062 } 4765 }
4063 4766
4767 if (state->curframe) {
4768 /* exit from nested function */
4769 prev_insn_idx = insn_idx;
4770 err = prepare_func_exit(env, &insn_idx);
4771 if (err)
4772 return err;
4773 do_print_state = true;
4774 continue;
4775 }
4776
4064 /* eBPF calling convention is such that R0 is used 4777
4065 * to return the value from eBPF program. 4778 * to return the value from eBPF program.
4066 * Make sure that it's readable at this time 4779 * Make sure that it's readable at this time
@@ -4121,8 +4834,17 @@ process_bpf_exit:
4121 insn_idx++; 4834 insn_idx++;
4122 } 4835 }
4123 4836
4124 verbose(env, "processed %d insns, stack depth %d\n", insn_processed, 4837 verbose(env, "processed %d insns (limit %d), stack depth ",
4125 env->prog->aux->stack_depth); 4838 insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
4839 for (i = 0; i < env->subprog_cnt + 1; i++) {
4840 u32 depth = env->subprog_stack_depth[i];
4841
4842 verbose(env, "%d", depth);
4843 if (i + 1 < env->subprog_cnt + 1)
4844 verbose(env, "+");
4845 }
4846 verbose(env, "\n");
4847 env->prog->aux->stack_depth = env->subprog_stack_depth[0];
4126 return 0; 4848 return 0;
4127} 4849}
4128 4850
@@ -4155,6 +4877,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
4155 return -EINVAL; 4877 return -EINVAL;
4156 } 4878 }
4157 } 4879 }
4880
4881 if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
4882 !bpf_offload_dev_match(prog, map)) {
4883 verbose(env, "offload device mismatch between prog and map\n");
4884 return -EINVAL;
4885 }
4886
4158 return 0; 4887 return 0;
4159} 4888}
4160 4889
@@ -4252,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
4252next_insn: 4981next_insn:
4253 insn++; 4982 insn++;
4254 i++; 4983 i++;
4984 continue;
4985 }
4986
4987 /* Basic sanity check before we invest more work here. */
4988 if (!bpf_opcode_in_insntable(insn->code)) {
4989 verbose(env, "unknown opcode %02x\n", insn->code);
4990 return -EINVAL;
4255 } 4991 }
4256 } 4992 }
4257 4993
@@ -4308,6 +5044,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
4308 return 0; 5044 return 0;
4309} 5045}
4310 5046
5047static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
5048{
5049 int i;
5050
5051 if (len == 1)
5052 return;
5053 for (i = 0; i < env->subprog_cnt; i++) {
5054 if (env->subprog_starts[i] < off)
5055 continue;
5056 env->subprog_starts[i] += len - 1;
5057 }
5058}
5059
4311static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, 5060static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
4312 const struct bpf_insn *patch, u32 len) 5061 const struct bpf_insn *patch, u32 len)
4313{ 5062{
@@ -4318,17 +5067,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
4318 return NULL; 5067 return NULL;
4319 if (adjust_insn_aux_data(env, new_prog->len, off, len)) 5068 if (adjust_insn_aux_data(env, new_prog->len, off, len))
4320 return NULL; 5069 return NULL;
5070 adjust_subprog_starts(env, off, len);
4321 return new_prog; 5071 return new_prog;
4322} 5072}
4323 5073
4324/* The verifier does more data flow analysis than llvm and will not explore 5074/* The verifier does more data flow analysis than llvm and will not
4325 * branches that are dead at run time. Malicious programs can have dead code 5075 * explore branches that are dead at run time. Malicious programs can
4326 * too. Therefore replace all dead at-run-time code with nops. 5076 * have dead code too. Therefore replace all dead at-run-time code
5077 * with 'ja -1'.
5078 *
5079 * Just nops are not optimal, e.g. if they would sit at the end of the
5080 * program and through another bug we would manage to jump there, then
5081 * we'd execute beyond program memory. Returning exception
5082 * code also wouldn't work since we can have subprogs where the dead
5083 * code could be located.
4327 */ 5084 */
4328static void sanitize_dead_code(struct bpf_verifier_env *env) 5085static void sanitize_dead_code(struct bpf_verifier_env *env)
4329{ 5086{
4330 struct bpf_insn_aux_data *aux_data = env->insn_aux_data; 5087 struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
4331 struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); 5088 struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
4332 struct bpf_insn *insn = env->prog->insnsi; 5089 struct bpf_insn *insn = env->prog->insnsi;
4333 const int insn_cnt = env->prog->len; 5090 const int insn_cnt = env->prog->len;
4334 int i; 5091 int i;
@@ -4336,7 +5093,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
4336 for (i = 0; i < insn_cnt; i++) { 5093 for (i = 0; i < insn_cnt; i++) {
4337 if (aux_data[i].seen) 5094 if (aux_data[i].seen)
4338 continue; 5095 continue;
4339 memcpy(insn + i, &nop, sizeof(nop)); 5096 memcpy(insn + i, &trap, sizeof(trap));
4340 } 5097 }
4341} 5098}
4342 5099
@@ -4452,6 +5209,180 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
4452 return 0; 5209 return 0;
4453} 5210}
4454 5211
5212static int jit_subprogs(struct bpf_verifier_env *env)
5213{
5214 struct bpf_prog *prog = env->prog, **func, *tmp;
5215 int i, j, subprog_start, subprog_end = 0, len, subprog;
5216 struct bpf_insn *insn;
5217 void *old_bpf_func;
5218 int err = -ENOMEM;
5219
5220 if (env->subprog_cnt == 0)
5221 return 0;
5222
5223 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
5224 if (insn->code != (BPF_JMP | BPF_CALL) ||
5225 insn->src_reg != BPF_PSEUDO_CALL)
5226 continue;
5227 subprog = find_subprog(env, i + insn->imm + 1);
5228 if (subprog < 0) {
5229 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
5230 i + insn->imm + 1);
5231 return -EFAULT;
5232 }
5233 /* temporarily remember subprog id inside insn instead of
5234 * aux_data, since next loop will split up all insns into funcs
5235 */
5236 insn->off = subprog + 1;
5237 /* remember original imm in case JIT fails and fallback
5238 * to interpreter will be needed
5239 */
5240 env->insn_aux_data[i].call_imm = insn->imm;
5241 /* point imm to __bpf_call_base+1 from JITs point of view */
5242 insn->imm = 1;
5243 }
5244
5245 func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
5246 if (!func)
5247 return -ENOMEM;
5248
5249 for (i = 0; i <= env->subprog_cnt; i++) {
5250 subprog_start = subprog_end;
5251 if (env->subprog_cnt == i)
5252 subprog_end = prog->len;
5253 else
5254 subprog_end = env->subprog_starts[i];
5255
5256 len = subprog_end - subprog_start;
5257 func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
5258 if (!func[i])
5259 goto out_free;
5260 memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
5261 len * sizeof(struct bpf_insn));
5262 func[i]->type = prog->type;
5263 func[i]->len = len;
5264 if (bpf_prog_calc_tag(func[i]))
5265 goto out_free;
5266 func[i]->is_func = 1;
5267 /* Use bpf_prog_F_tag to indicate functions in stack traces.
5268 * Long term would need debug info to populate names
5269 */
5270 func[i]->aux->name[0] = 'F';
5271 func[i]->aux->stack_depth = env->subprog_stack_depth[i];
5272 func[i]->jit_requested = 1;
5273 func[i] = bpf_int_jit_compile(func[i]);
5274 if (!func[i]->jited) {
5275 err = -ENOTSUPP;
5276 goto out_free;
5277 }
5278 cond_resched();
5279 }
5280 /* at this point all bpf functions were successfully JITed
5281 * now populate all bpf_calls with correct addresses and
5282 * run last pass of JIT
5283 */
5284 for (i = 0; i <= env->subprog_cnt; i++) {
5285 insn = func[i]->insnsi;
5286 for (j = 0; j < func[i]->len; j++, insn++) {
5287 if (insn->code != (BPF_JMP | BPF_CALL) ||
5288 insn->src_reg != BPF_PSEUDO_CALL)
5289 continue;
5290 subprog = insn->off;
5291 insn->off = 0;
5292 insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
5293 func[subprog]->bpf_func -
5294 __bpf_call_base;
5295 }
5296 }
5297 for (i = 0; i <= env->subprog_cnt; i++) {
5298 old_bpf_func = func[i]->bpf_func;
5299 tmp = bpf_int_jit_compile(func[i]);
5300 if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
5301 verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
5302 err = -EFAULT;
5303 goto out_free;
5304 }
5305 cond_resched();
5306 }
5307
5308 /* finally lock prog and jit images for all functions and
5309 * populate kallsyms
5310 */
5311 for (i = 0; i <= env->subprog_cnt; i++) {
5312 bpf_prog_lock_ro(func[i]);
5313 bpf_prog_kallsyms_add(func[i]);
5314 }
5315
5316 /* Last step: make now unused interpreter insns from main
5317 * prog consistent for later dump requests, so they can
5318 * later look the same as if they were interpreted only.
5319 */
5320 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
5321 unsigned long addr;
5322
5323 if (insn->code != (BPF_JMP | BPF_CALL) ||
5324 insn->src_reg != BPF_PSEUDO_CALL)
5325 continue;
5326 insn->off = env->insn_aux_data[i].call_imm;
5327 subprog = find_subprog(env, i + insn->off + 1);
5328 addr = (unsigned long)func[subprog + 1]->bpf_func;
5329 addr &= PAGE_MASK;
5330 insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
5331 addr - __bpf_call_base;
5332 }
5333
5334 prog->jited = 1;
5335 prog->bpf_func = func[0]->bpf_func;
5336 prog->aux->func = func;
5337 prog->aux->func_cnt = env->subprog_cnt + 1;
5338 return 0;
5339out_free:
5340 for (i = 0; i <= env->subprog_cnt; i++)
5341 if (func[i])
5342 bpf_jit_free(func[i]);
5343 kfree(func);
5344 /* cleanup main prog to be interpreted */
5345 prog->jit_requested = 0;
5346 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
5347 if (insn->code != (BPF_JMP | BPF_CALL) ||
5348 insn->src_reg != BPF_PSEUDO_CALL)
5349 continue;
5350 insn->off = 0;
5351 insn->imm = env->insn_aux_data[i].call_imm;
5352 }
5353 return err;
5354}
5355
5356static int fixup_call_args(struct bpf_verifier_env *env)
5357{
5358#ifndef CONFIG_BPF_JIT_ALWAYS_ON
5359 struct bpf_prog *prog = env->prog;
5360 struct bpf_insn *insn = prog->insnsi;
5361 int i, depth;
5362#endif
5363 int err;
5364
5365 err = 0;
5366 if (env->prog->jit_requested) {
5367 err = jit_subprogs(env);
5368 if (err == 0)
5369 return 0;
5370 }
5371#ifndef CONFIG_BPF_JIT_ALWAYS_ON
5372 for (i = 0; i < prog->len; i++, insn++) {
5373 if (insn->code != (BPF_JMP | BPF_CALL) ||
5374 insn->src_reg != BPF_PSEUDO_CALL)
5375 continue;
5376 depth = get_callee_stack_depth(env, insn, i);
5377 if (depth < 0)
5378 return depth;
5379 bpf_patch_call_args(insn, depth);
5380 }
5381 err = 0;
5382#endif
5383 return err;
5384}
5385
4455/* fixup insn->imm field of bpf_call instructions 5386/* fixup insn->imm field of bpf_call instructions
4456 * and inline eligible helpers as explicit sequence of BPF instructions 5387 * and inline eligible helpers as explicit sequence of BPF instructions
4457 * 5388 *
@@ -4469,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4469 int i, cnt, delta = 0; 5400 int i, cnt, delta = 0;
4470 5401
4471 for (i = 0; i < insn_cnt; i++, insn++) { 5402 for (i = 0; i < insn_cnt; i++, insn++) {
4472 if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || 5403 if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
5404 insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
5405 insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
4473 insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { 5406 insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
4474 /* due to JIT bugs clear upper 32-bits of src register 5407 bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
4475 * before div/mod operation 5408 struct bpf_insn mask_and_div[] = {
4476 */ 5409 BPF_MOV32_REG(insn->src_reg, insn->src_reg),
4477 insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); 5410 /* Rx div 0 -> 0 */
4478 insn_buf[1] = *insn; 5411 BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
4479 cnt = 2; 5412 BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
4480 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 5413 BPF_JMP_IMM(BPF_JA, 0, 0, 1),
5414 *insn,
5415 };
5416 struct bpf_insn mask_and_mod[] = {
5417 BPF_MOV32_REG(insn->src_reg, insn->src_reg),
5418 /* Rx mod 0 -> Rx */
5419 BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
5420 *insn,
5421 };
5422 struct bpf_insn *patchlet;
5423
5424 if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
5425 insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
5426 patchlet = mask_and_div + (is64 ? 1 : 0);
5427 cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
5428 } else {
5429 patchlet = mask_and_mod + (is64 ? 1 : 0);
5430 cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
5431 }
5432
5433 new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
4481 if (!new_prog) 5434 if (!new_prog)
4482 return -ENOMEM; 5435 return -ENOMEM;
4483 5436
@@ -4489,11 +5442,15 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4489 5442
4490 if (insn->code != (BPF_JMP | BPF_CALL)) 5443 if (insn->code != (BPF_JMP | BPF_CALL))
4491 continue; 5444 continue;
5445 if (insn->src_reg == BPF_PSEUDO_CALL)
5446 continue;
4492 5447
4493 if (insn->imm == BPF_FUNC_get_route_realm) 5448 if (insn->imm == BPF_FUNC_get_route_realm)
4494 prog->dst_needed = 1; 5449 prog->dst_needed = 1;
4495 if (insn->imm == BPF_FUNC_get_prandom_u32) 5450 if (insn->imm == BPF_FUNC_get_prandom_u32)
4496 bpf_user_rnd_init_once(); 5451 bpf_user_rnd_init_once();
5452 if (insn->imm == BPF_FUNC_override_return)
5453 prog->kprobe_override = 1;
4497 if (insn->imm == BPF_FUNC_tail_call) { 5454 if (insn->imm == BPF_FUNC_tail_call) {
4498 /* If we tail call into other programs, we 5455 /* If we tail call into other programs, we
4499 * cannot make any assumptions since they can 5456 * cannot make any assumptions since they can
@@ -4545,7 +5502,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4545 /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup 5502 /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
4546 * handlers are currently limited to 64 bit only. 5503 * handlers are currently limited to 64 bit only.
4547 */ 5504 */
4548 if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && 5505 if (prog->jit_requested && BITS_PER_LONG == 64 &&
4549 insn->imm == BPF_FUNC_map_lookup_elem) { 5506 insn->imm == BPF_FUNC_map_lookup_elem) {
4550 map_ptr = env->insn_aux_data[i + delta].map_ptr; 5507 map_ptr = env->insn_aux_data[i + delta].map_ptr;
4551 if (map_ptr == BPF_MAP_PTR_POISON || 5508 if (map_ptr == BPF_MAP_PTR_POISON ||
@@ -4680,7 +5637,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
4680 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) 5637 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
4681 env->strict_alignment = true; 5638 env->strict_alignment = true;
4682 5639
4683 if (env->prog->aux->offload) { 5640 if (bpf_prog_is_dev_bound(env->prog->aux)) {
4684 ret = bpf_prog_offload_verifier_prep(env); 5641 ret = bpf_prog_offload_verifier_prep(env);
4685 if (ret) 5642 if (ret)
4686 goto err_unlock; 5643 goto err_unlock;
@@ -4697,12 +5654,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
4697 if (!env->explored_states) 5654 if (!env->explored_states)
4698 goto skip_full_check; 5655 goto skip_full_check;
4699 5656
5657 env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
5658
4700 ret = check_cfg(env); 5659 ret = check_cfg(env);
4701 if (ret < 0) 5660 if (ret < 0)
4702 goto skip_full_check; 5661 goto skip_full_check;
4703 5662
4704 env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
4705
4706 ret = do_check(env); 5663 ret = do_check(env);
4707 if (env->cur_state) { 5664 if (env->cur_state) {
4708 free_verifier_state(env->cur_state, true); 5665 free_verifier_state(env->cur_state, true);
@@ -4717,12 +5674,18 @@ skip_full_check:
4717 sanitize_dead_code(env); 5674 sanitize_dead_code(env);
4718 5675
4719 if (ret == 0) 5676 if (ret == 0)
5677 ret = check_max_stack_depth(env);
5678
5679 if (ret == 0)
4720 /* program is valid, convert *(u32*)(ctx + off) accesses */ 5680 /* program is valid, convert *(u32*)(ctx + off) accesses */
4721 ret = convert_ctx_accesses(env); 5681 ret = convert_ctx_accesses(env);
4722 5682
4723 if (ret == 0) 5683 if (ret == 0)
4724 ret = fixup_bpf_calls(env); 5684 ret = fixup_bpf_calls(env);
4725 5685
5686 if (ret == 0)
5687 ret = fixup_call_args(env);
5688
4726 if (log->level && bpf_verifier_log_full(log)) 5689 if (log->level && bpf_verifier_log_full(log))
4727 ret = -ENOSPC; 5690 ret = -ENOSPC;
4728 if (log->level && !log->ubuf) { 5691 if (log->level && !log->ubuf) {
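
The mask_and_div / mask_and_mod patchlets above take divide-by-zero behaviour out of the JITs' hands: a register divisor of zero now yields 0 for BPF_DIV and leaves the destination untouched for BPF_MOD, with the ALU32 variants first zero-extending the source via the leading BPF_MOV32_REG. A minimal user-space C sketch of the resulting semantics (helper names are invented for illustration, not kernel APIs):

#include <stdint.h>

/* semantics enforced by the rewritten BPF_DIV with a register divisor */
static uint64_t bpf_div_x(uint64_t dst, uint64_t src, int is64)
{
	if (!is64)
		src = (uint32_t)src;		/* leading MOV32 zero-extends src */
	if (!src)
		return 0;			/* Rx div 0 -> 0 (dst is xor'ed with itself) */
	return is64 ? dst / src : (uint32_t)dst / (uint32_t)src;
}

/* semantics enforced by the rewritten BPF_MOD with a register divisor */
static uint64_t bpf_mod_x(uint64_t dst, uint64_t src, int is64)
{
	if (!is64)
		src = (uint32_t)src;
	if (!src)
		return dst;			/* Rx mod 0 -> Rx (the mod insn is skipped) */
	return is64 ? dst % src : (uint32_t)dst % (uint32_t)src;
}
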
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7e4c44538119..8cda3bc3ae22 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1397 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, 1397 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1398 cft->name); 1398 cft->name);
1399 else 1399 else
1400 strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 1400 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1401 return buf; 1401 return buf;
1402} 1402}
1403 1403
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1864 1864
1865 root->flags = opts->flags; 1865 root->flags = opts->flags;
1866 if (opts->release_agent) 1866 if (opts->release_agent)
1867 strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); 1867 strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
1868 if (opts->name) 1868 if (opts->name)
1869 strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); 1869 strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
1870 if (opts->cpuset_clone_children) 1870 if (opts->cpuset_clone_children)
1871 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1871 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1872} 1872}
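
The strlcpy() -> strscpy() conversions above tighten the truncation contract: strlcpy() returns strlen(src) and therefore always walks the entire source string, while strscpy() is bounded by the destination size and reports truncation as -E2BIG. A small in-kernel sketch of the calling pattern (the wrapper and its names are illustrative, not from this patch):

#include <linux/printk.h>
#include <linux/string.h>
#include <linux/types.h>

/* copy a name into a fixed-size buffer and warn if it did not fit */
static void example_copy_name(char *dst, size_t dst_size, const char *src)
{
	if (strscpy(dst, src, dst_size) < 0)	/* -E2BIG: truncated, dst still NUL-terminated */
		pr_warn("name '%s' truncated to %zu bytes\n", src, dst_size - 1);
}
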
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index c8146d53ca67..dbb0781a0533 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2441,7 +2441,6 @@ static int kdb_kill(int argc, const char **argv)
2441 long sig, pid; 2441 long sig, pid;
2442 char *endp; 2442 char *endp;
2443 struct task_struct *p; 2443 struct task_struct *p;
2444 struct siginfo info;
2445 2444
2446 if (argc != 2) 2445 if (argc != 2)
2447 return KDB_ARGCOUNT; 2446 return KDB_ARGCOUNT;
@@ -2449,7 +2448,7 @@ static int kdb_kill(int argc, const char **argv)
2449 sig = simple_strtol(argv[1], &endp, 0); 2448 sig = simple_strtol(argv[1], &endp, 0);
2450 if (*endp) 2449 if (*endp)
2451 return KDB_BADINT; 2450 return KDB_BADINT;
2452 if (sig >= 0) { 2451 if ((sig >= 0) || !valid_signal(-sig)) {
2453 kdb_printf("Invalid signal parameter.<-signal>\n"); 2452 kdb_printf("Invalid signal parameter.<-signal>\n");
2454 return 0; 2453 return 0;
2455 } 2454 }
@@ -2470,12 +2469,7 @@ static int kdb_kill(int argc, const char **argv)
2470 return 0; 2469 return 0;
2471 } 2470 }
2472 p = p->group_leader; 2471 p = p->group_leader;
2473 info.si_signo = sig; 2472 kdb_send_sig(p, sig);
2474 info.si_errno = 0;
2475 info.si_code = SI_USER;
2476 info.si_pid = pid; /* same capabilities as process being signalled */
2477 info.si_uid = 0; /* kdb has root authority */
2478 kdb_send_sig_info(p, &info);
2479 return 0; 2473 return 0;
2480} 2474}
2481 2475
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index fc224fbcf954..1e5a502ba4a7 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -208,7 +208,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p,
208extern void kdb_ps_suppressed(void); 208extern void kdb_ps_suppressed(void);
209extern void kdb_ps1(const struct task_struct *p); 209extern void kdb_ps1(const struct task_struct *p);
210extern void kdb_print_nameval(const char *name, unsigned long val); 210extern void kdb_print_nameval(const char *name, unsigned long val);
211extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 211extern void kdb_send_sig(struct task_struct *p, int sig);
212extern void kdb_meminfo_proc_show(void); 212extern void kdb_meminfo_proc_show(void);
213extern char *kdb_getstr(char *, size_t, const char *); 213extern char *kdb_getstr(char *, size_t, const char *);
214extern void kdb_gdb_state_pass(char *buf); 214extern void kdb_gdb_state_pass(char *buf);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d99fe3fdec8a..f0549e79978b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4520,11 +4520,11 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4520 return ret; 4520 return ret;
4521} 4521}
4522 4522
4523static unsigned int perf_poll(struct file *file, poll_table *wait) 4523static __poll_t perf_poll(struct file *file, poll_table *wait)
4524{ 4524{
4525 struct perf_event *event = file->private_data; 4525 struct perf_event *event = file->private_data;
4526 struct ring_buffer *rb; 4526 struct ring_buffer *rb;
4527 unsigned int events = POLLHUP; 4527 __poll_t events = POLLHUP;
4528 4528
4529 poll_wait(file, &event->waitq, wait); 4529 poll_wait(file, &event->waitq, wait);
4530 4530
@@ -4732,6 +4732,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
4732 rcu_read_unlock(); 4732 rcu_read_unlock();
4733 return 0; 4733 return 0;
4734 } 4734 }
4735
4736 case PERF_EVENT_IOC_QUERY_BPF:
4737 return perf_event_query_prog_array(event, (void __user *)arg);
4735 default: 4738 default:
4736 return -ENOTTY; 4739 return -ENOTTY;
4737 } 4740 }
@@ -4913,6 +4916,7 @@ void perf_event_update_userpage(struct perf_event *event)
4913unlock: 4916unlock:
4914 rcu_read_unlock(); 4917 rcu_read_unlock();
4915} 4918}
4919EXPORT_SYMBOL_GPL(perf_event_update_userpage);
4916 4920
4917static int perf_mmap_fault(struct vm_fault *vmf) 4921static int perf_mmap_fault(struct vm_fault *vmf)
4918{ 4922{
@@ -8099,6 +8103,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8099 return -EINVAL; 8103 return -EINVAL;
8100 } 8104 }
8101 8105
8106 /* Kprobe override only works for kprobes, not uprobes. */
8107 if (prog->kprobe_override &&
8108 !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
8109 bpf_prog_put(prog);
8110 return -EINVAL;
8111 }
8112
8102 if (is_tracepoint || is_syscall_tp) { 8113 if (is_tracepoint || is_syscall_tp) {
8103 int off = trace_event_get_offsets(event->tp_event); 8114 int off = trace_event_get_offsets(event->tp_event);
8104 8115
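
The new PERF_EVENT_IOC_QUERY_BPF case above lets user space list the BPF programs attached to a kprobe/tracepoint perf event. A hedged user-space sketch; the struct perf_event_query_bpf layout (ids_len, prog_cnt, ids[]) is recalled from the matching uapi change and should be treated as an assumption, as should the event-fd setup that is omitted:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>	/* assumed to provide PERF_EVENT_IOC_QUERY_BPF */

/* fd: perf event file descriptor for a kprobe/tracepoint event (setup omitted) */
static void example_query_bpf(int fd)
{
	const unsigned int max_ids = 16;
	struct perf_event_query_bpf *query;
	unsigned int i;

	query = calloc(1, sizeof(*query) + max_ids * sizeof(query->ids[0]));
	if (!query)
		return;
	query->ids_len = max_ids;
	if (ioctl(fd, PERF_EVENT_IOC_QUERY_BPF, query) == 0) {
		for (i = 0; i < query->prog_cnt && i < max_ids; i++)
			printf("attached bpf prog id %u\n", query->ids[i]);
	}
	free(query);
}
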
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
new file mode 100644
index 000000000000..21b0122cb39c
--- /dev/null
+++ b/kernel/fail_function.c
@@ -0,0 +1,349 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * fail_function.c: Function-based error injection
4 */
5#include <linux/error-injection.h>
6#include <linux/debugfs.h>
7#include <linux/fault-inject.h>
8#include <linux/kallsyms.h>
9#include <linux/kprobes.h>
10#include <linux/module.h>
11#include <linux/mutex.h>
12#include <linux/slab.h>
13#include <linux/uaccess.h>
14
15static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
16
17struct fei_attr {
18 struct list_head list;
19 struct kprobe kp;
20 unsigned long retval;
21};
22static DEFINE_MUTEX(fei_lock);
23static LIST_HEAD(fei_attr_list);
24static DECLARE_FAULT_ATTR(fei_fault_attr);
25static struct dentry *fei_debugfs_dir;
26
27static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
28{
29 switch (get_injectable_error_type(addr)) {
30 case EI_ETYPE_NULL:
31 if (retv != 0)
32 return 0;
33 break;
34 case EI_ETYPE_ERRNO:
35 if (retv < (unsigned long)-MAX_ERRNO)
36 return (unsigned long)-EINVAL;
37 break;
38 case EI_ETYPE_ERRNO_NULL:
39 if (retv != 0 && retv < (unsigned long)-MAX_ERRNO)
40 return (unsigned long)-EINVAL;
41 break;
42 }
43
44 return retv;
45}
46
47static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
48{
49 struct fei_attr *attr;
50
51 attr = kzalloc(sizeof(*attr), GFP_KERNEL);
52 if (attr) {
53 attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL);
54 if (!attr->kp.symbol_name) {
55 kfree(attr);
56 return NULL;
57 }
58 attr->kp.pre_handler = fei_kprobe_handler;
59 attr->retval = adjust_error_retval(addr, 0);
60 INIT_LIST_HEAD(&attr->list);
61 }
62 return attr;
63}
64
65static void fei_attr_free(struct fei_attr *attr)
66{
67 if (attr) {
68 kfree(attr->kp.symbol_name);
69 kfree(attr);
70 }
71}
72
73static struct fei_attr *fei_attr_lookup(const char *sym)
74{
75 struct fei_attr *attr;
76
77 list_for_each_entry(attr, &fei_attr_list, list) {
78 if (!strcmp(attr->kp.symbol_name, sym))
79 return attr;
80 }
81
82 return NULL;
83}
84
85static bool fei_attr_is_valid(struct fei_attr *_attr)
86{
87 struct fei_attr *attr;
88
89 list_for_each_entry(attr, &fei_attr_list, list) {
90 if (attr == _attr)
91 return true;
92 }
93
94 return false;
95}
96
97static int fei_retval_set(void *data, u64 val)
98{
99 struct fei_attr *attr = data;
100 unsigned long retv = (unsigned long)val;
101 int err = 0;
102
103 mutex_lock(&fei_lock);
104 /*
105 * Since this operation can be done after retval file is removed,
106 * It is safer to check the attr is still valid before accessing
107 * its member.
108 */
109 if (!fei_attr_is_valid(attr)) {
110 err = -ENOENT;
111 goto out;
112 }
113
114 if (attr->kp.addr) {
115 if (adjust_error_retval((unsigned long)attr->kp.addr,
116 val) != retv)
117 err = -EINVAL;
118 }
119 if (!err)
120 attr->retval = val;
121out:
122 mutex_unlock(&fei_lock);
123
124 return err;
125}
126
127static int fei_retval_get(void *data, u64 *val)
128{
129 struct fei_attr *attr = data;
130 int err = 0;
131
132 mutex_lock(&fei_lock);
133 /* Here we also validate @attr to ensure it still exists. */
134 if (!fei_attr_is_valid(attr))
135 err = -ENOENT;
136 else
137 *val = attr->retval;
138 mutex_unlock(&fei_lock);
139
140 return err;
141}
142DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
143 "%llx\n");
144
145static int fei_debugfs_add_attr(struct fei_attr *attr)
146{
147 struct dentry *dir;
148
149 dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
150 if (!dir)
151 return -ENOMEM;
152
153 if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
154 debugfs_remove_recursive(dir);
155 return -ENOMEM;
156 }
157
158 return 0;
159}
160
161static void fei_debugfs_remove_attr(struct fei_attr *attr)
162{
163 struct dentry *dir;
164
165 dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
166 if (dir)
167 debugfs_remove_recursive(dir);
168}
169
170static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
171{
172 struct fei_attr *attr = container_of(kp, struct fei_attr, kp);
173
174 if (should_fail(&fei_fault_attr, 1)) {
175 regs_set_return_value(regs, attr->retval);
176 override_function_with_return(regs);
177 /* Kprobe specific fixup */
178 reset_current_kprobe();
179 preempt_enable_no_resched();
180 return 1;
181 }
182
183 return 0;
184}
185NOKPROBE_SYMBOL(fei_kprobe_handler)
186
187static void *fei_seq_start(struct seq_file *m, loff_t *pos)
188{
189 mutex_lock(&fei_lock);
190 return seq_list_start(&fei_attr_list, *pos);
191}
192
193static void fei_seq_stop(struct seq_file *m, void *v)
194{
195 mutex_unlock(&fei_lock);
196}
197
198static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos)
199{
200 return seq_list_next(v, &fei_attr_list, pos);
201}
202
203static int fei_seq_show(struct seq_file *m, void *v)
204{
205 struct fei_attr *attr = list_entry(v, struct fei_attr, list);
206
207 seq_printf(m, "%pf\n", attr->kp.addr);
208 return 0;
209}
210
211static const struct seq_operations fei_seq_ops = {
212 .start = fei_seq_start,
213 .next = fei_seq_next,
214 .stop = fei_seq_stop,
215 .show = fei_seq_show,
216};
217
218static int fei_open(struct inode *inode, struct file *file)
219{
220 return seq_open(file, &fei_seq_ops);
221}
222
223static void fei_attr_remove(struct fei_attr *attr)
224{
225 fei_debugfs_remove_attr(attr);
226 unregister_kprobe(&attr->kp);
227 list_del(&attr->list);
228 fei_attr_free(attr);
229}
230
231static void fei_attr_remove_all(void)
232{
233 struct fei_attr *attr, *n;
234
235 list_for_each_entry_safe(attr, n, &fei_attr_list, list) {
236 fei_attr_remove(attr);
237 }
238}
239
240static ssize_t fei_write(struct file *file, const char __user *buffer,
241 size_t count, loff_t *ppos)
242{
243 struct fei_attr *attr;
244 unsigned long addr;
245 char *buf, *sym;
246 int ret;
247
248 /* cut off if it is too long */
249 if (count > KSYM_NAME_LEN)
250 count = KSYM_NAME_LEN;
251 buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL);
252 if (!buf)
253 return -ENOMEM;
254
255 if (copy_from_user(buf, buffer, count)) {
256 ret = -EFAULT;
257 goto out;
258 }
259 buf[count] = '\0';
260 sym = strstrip(buf);
261
262 mutex_lock(&fei_lock);
263
264 /* Writing just spaces will remove all injection points */
265 if (sym[0] == '\0') {
266 fei_attr_remove_all();
267 ret = count;
268 goto out;
269 }
270 /* Writing !function will remove one injection point */
271 if (sym[0] == '!') {
272 attr = fei_attr_lookup(sym + 1);
273 if (!attr) {
274 ret = -ENOENT;
275 goto out;
276 }
277 fei_attr_remove(attr);
278 ret = count;
279 goto out;
280 }
281
282 addr = kallsyms_lookup_name(sym);
283 if (!addr) {
284 ret = -EINVAL;
285 goto out;
286 }
287 if (!within_error_injection_list(addr)) {
288 ret = -ERANGE;
289 goto out;
290 }
291 if (fei_attr_lookup(sym)) {
292 ret = -EBUSY;
293 goto out;
294 }
295 attr = fei_attr_new(sym, addr);
296 if (!attr) {
297 ret = -ENOMEM;
298 goto out;
299 }
300
301 ret = register_kprobe(&attr->kp);
302 if (!ret)
303 ret = fei_debugfs_add_attr(attr);
304 if (ret < 0)
305 fei_attr_remove(attr);
306 else {
307 list_add_tail(&attr->list, &fei_attr_list);
308 ret = count;
309 }
310out:
311 kfree(buf);
312 mutex_unlock(&fei_lock);
313 return ret;
314}
315
316static const struct file_operations fei_ops = {
317 .open = fei_open,
318 .read = seq_read,
319 .write = fei_write,
320 .llseek = seq_lseek,
321 .release = seq_release,
322};
323
324static int __init fei_debugfs_init(void)
325{
326 struct dentry *dir;
327
328 dir = fault_create_debugfs_attr("fail_function", NULL,
329 &fei_fault_attr);
330 if (IS_ERR(dir))
331 return PTR_ERR(dir);
332
333 /* the injectable attribute is just a symlink to error_injection/list */
334 if (!debugfs_create_symlink("injectable", dir,
335 "../error_injection/list"))
336 goto error;
337
338 if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops))
339 goto error;
340
341 fei_debugfs_dir = dir;
342
343 return 0;
344error:
345 debugfs_remove_recursive(dir);
346 return -ENOMEM;
347}
348
349late_initcall(fei_debugfs_init);
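
fei_write() above only accepts symbols that pass within_error_injection_list(), so a function has to opt in before it can be failed. A hedged sketch of such an opt-in, assuming the ALLOW_ERROR_INJECTION() marker and header from the companion error-injection series (the target function is hypothetical):

#include <linux/errno.h>
#include <linux/error-injection.h>	/* assumed home of ALLOW_ERROR_INJECTION() */

/* hypothetical target: returns 0 or a negative errno, so the ERRNO class fits */
static int example_setup_hw(void)
{
	return 0;
}
ALLOW_ERROR_INJECTION(example_setup_hw, ERRNO);

With that marker in place, writing "example_setup_hw" to the inject file created by fei_debugfs_init() arms the kprobe, and adjust_error_retval() clamps any value written through the per-symbol retval file to a sane -errno.
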
diff --git a/kernel/fork.c b/kernel/fork.c
index 2295fc69717f..c7c112391d79 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@
77#include <linux/blkdev.h> 77#include <linux/blkdev.h>
78#include <linux/fs_struct.h> 78#include <linux/fs_struct.h>
79#include <linux/magic.h> 79#include <linux/magic.h>
80#include <linux/sched/mm.h>
80#include <linux/perf_event.h> 81#include <linux/perf_event.h>
81#include <linux/posix-timers.h> 82#include <linux/posix-timers.h>
82#include <linux/user-return-notifier.h> 83#include <linux/user-return-notifier.h>
@@ -282,8 +283,9 @@ static void free_thread_stack(struct task_struct *tsk)
282 283
283void thread_stack_cache_init(void) 284void thread_stack_cache_init(void)
284{ 285{
285 thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, 286 thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
286 THREAD_SIZE, 0, NULL); 287 THREAD_SIZE, THREAD_SIZE, 0, 0,
288 THREAD_SIZE, NULL);
287 BUG_ON(thread_stack_cache == NULL); 289 BUG_ON(thread_stack_cache == NULL);
288} 290}
289# endif 291# endif
@@ -390,6 +392,246 @@ void free_task(struct task_struct *tsk)
390} 392}
391EXPORT_SYMBOL(free_task); 393EXPORT_SYMBOL(free_task);
392 394
395#ifdef CONFIG_MMU
396static __latent_entropy int dup_mmap(struct mm_struct *mm,
397 struct mm_struct *oldmm)
398{
399 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
400 struct rb_node **rb_link, *rb_parent;
401 int retval;
402 unsigned long charge;
403 LIST_HEAD(uf);
404
405 uprobe_start_dup_mmap();
406 if (down_write_killable(&oldmm->mmap_sem)) {
407 retval = -EINTR;
408 goto fail_uprobe_end;
409 }
410 flush_cache_dup_mm(oldmm);
411 uprobe_dup_mmap(oldmm, mm);
412 /*
413 * Not linked in yet - no deadlock potential:
414 */
415 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
416
417 /* No ordering required: file already has been exposed. */
418 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
419
420 mm->total_vm = oldmm->total_vm;
421 mm->data_vm = oldmm->data_vm;
422 mm->exec_vm = oldmm->exec_vm;
423 mm->stack_vm = oldmm->stack_vm;
424
425 rb_link = &mm->mm_rb.rb_node;
426 rb_parent = NULL;
427 pprev = &mm->mmap;
428 retval = ksm_fork(mm, oldmm);
429 if (retval)
430 goto out;
431 retval = khugepaged_fork(mm, oldmm);
432 if (retval)
433 goto out;
434
435 prev = NULL;
436 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
437 struct file *file;
438
439 if (mpnt->vm_flags & VM_DONTCOPY) {
440 vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
441 continue;
442 }
443 charge = 0;
444 if (mpnt->vm_flags & VM_ACCOUNT) {
445 unsigned long len = vma_pages(mpnt);
446
447 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
448 goto fail_nomem;
449 charge = len;
450 }
451 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
452 if (!tmp)
453 goto fail_nomem;
454 *tmp = *mpnt;
455 INIT_LIST_HEAD(&tmp->anon_vma_chain);
456 retval = vma_dup_policy(mpnt, tmp);
457 if (retval)
458 goto fail_nomem_policy;
459 tmp->vm_mm = mm;
460 retval = dup_userfaultfd(tmp, &uf);
461 if (retval)
462 goto fail_nomem_anon_vma_fork;
463 if (tmp->vm_flags & VM_WIPEONFORK) {
464 /* VM_WIPEONFORK gets a clean slate in the child. */
465 tmp->anon_vma = NULL;
466 if (anon_vma_prepare(tmp))
467 goto fail_nomem_anon_vma_fork;
468 } else if (anon_vma_fork(tmp, mpnt))
469 goto fail_nomem_anon_vma_fork;
470 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
471 tmp->vm_next = tmp->vm_prev = NULL;
472 file = tmp->vm_file;
473 if (file) {
474 struct inode *inode = file_inode(file);
475 struct address_space *mapping = file->f_mapping;
476
477 get_file(file);
478 if (tmp->vm_flags & VM_DENYWRITE)
479 atomic_dec(&inode->i_writecount);
480 i_mmap_lock_write(mapping);
481 if (tmp->vm_flags & VM_SHARED)
482 atomic_inc(&mapping->i_mmap_writable);
483 flush_dcache_mmap_lock(mapping);
484 /* insert tmp into the share list, just after mpnt */
485 vma_interval_tree_insert_after(tmp, mpnt,
486 &mapping->i_mmap);
487 flush_dcache_mmap_unlock(mapping);
488 i_mmap_unlock_write(mapping);
489 }
490
491 /*
492 * Clear hugetlb-related page reserves for children. This only
493 * affects MAP_PRIVATE mappings. Faults generated by the child
494 * are not guaranteed to succeed, even if read-only
495 */
496 if (is_vm_hugetlb_page(tmp))
497 reset_vma_resv_huge_pages(tmp);
498
499 /*
500 * Link in the new vma and copy the page table entries.
501 */
502 *pprev = tmp;
503 pprev = &tmp->vm_next;
504 tmp->vm_prev = prev;
505 prev = tmp;
506
507 __vma_link_rb(mm, tmp, rb_link, rb_parent);
508 rb_link = &tmp->vm_rb.rb_right;
509 rb_parent = &tmp->vm_rb;
510
511 mm->map_count++;
512 if (!(tmp->vm_flags & VM_WIPEONFORK))
513 retval = copy_page_range(mm, oldmm, mpnt);
514
515 if (tmp->vm_ops && tmp->vm_ops->open)
516 tmp->vm_ops->open(tmp);
517
518 if (retval)
519 goto out;
520 }
521 /* a new mm has just been created */
522 arch_dup_mmap(oldmm, mm);
523 retval = 0;
524out:
525 up_write(&mm->mmap_sem);
526 flush_tlb_mm(oldmm);
527 up_write(&oldmm->mmap_sem);
528 dup_userfaultfd_complete(&uf);
529fail_uprobe_end:
530 uprobe_end_dup_mmap();
531 return retval;
532fail_nomem_anon_vma_fork:
533 mpol_put(vma_policy(tmp));
534fail_nomem_policy:
535 kmem_cache_free(vm_area_cachep, tmp);
536fail_nomem:
537 retval = -ENOMEM;
538 vm_unacct_memory(charge);
539 goto out;
540}
541
542static inline int mm_alloc_pgd(struct mm_struct *mm)
543{
544 mm->pgd = pgd_alloc(mm);
545 if (unlikely(!mm->pgd))
546 return -ENOMEM;
547 return 0;
548}
549
550static inline void mm_free_pgd(struct mm_struct *mm)
551{
552 pgd_free(mm, mm->pgd);
553}
554#else
555static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
556{
557 down_write(&oldmm->mmap_sem);
558 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
559 up_write(&oldmm->mmap_sem);
560 return 0;
561}
562#define mm_alloc_pgd(mm) (0)
563#define mm_free_pgd(mm)
564#endif /* CONFIG_MMU */
565
566static void check_mm(struct mm_struct *mm)
567{
568 int i;
569
570 for (i = 0; i < NR_MM_COUNTERS; i++) {
571 long x = atomic_long_read(&mm->rss_stat.count[i]);
572
573 if (unlikely(x))
574 printk(KERN_ALERT "BUG: Bad rss-counter state "
575 "mm:%p idx:%d val:%ld\n", mm, i, x);
576 }
577
578 if (mm_pgtables_bytes(mm))
579 pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
580 mm_pgtables_bytes(mm));
581
582#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
583 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
584#endif
585}
586
587#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
588#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
589
590/*
591 * Called when the last reference to the mm
592 * is dropped: either by a lazy thread or by
593 * mmput. Free the page directory and the mm.
594 */
595static void __mmdrop(struct mm_struct *mm)
596{
597 BUG_ON(mm == &init_mm);
598 mm_free_pgd(mm);
599 destroy_context(mm);
600 hmm_mm_destroy(mm);
601 mmu_notifier_mm_destroy(mm);
602 check_mm(mm);
603 put_user_ns(mm->user_ns);
604 free_mm(mm);
605}
606
607void mmdrop(struct mm_struct *mm)
608{
609 /*
610 * The implicit full barrier implied by atomic_dec_and_test() is
611 * required by the membarrier system call before returning to
612 * user-space, after storing to rq->curr.
613 */
614 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
615 __mmdrop(mm);
616}
617EXPORT_SYMBOL_GPL(mmdrop);
618
619static void mmdrop_async_fn(struct work_struct *work)
620{
621 struct mm_struct *mm;
622
623 mm = container_of(work, struct mm_struct, async_put_work);
624 __mmdrop(mm);
625}
626
627static void mmdrop_async(struct mm_struct *mm)
628{
629 if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
630 INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
631 schedule_work(&mm->async_put_work);
632 }
633}
634
393static inline void free_signal_struct(struct signal_struct *sig) 635static inline void free_signal_struct(struct signal_struct *sig)
394{ 636{
395 taskstats_tgid_free(sig); 637 taskstats_tgid_free(sig);
@@ -457,6 +699,21 @@ static void set_max_threads(unsigned int max_threads_suggested)
457int arch_task_struct_size __read_mostly; 699int arch_task_struct_size __read_mostly;
458#endif 700#endif
459 701
702static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
703{
704 /* Fetch thread_struct whitelist for the architecture. */
705 arch_thread_struct_whitelist(offset, size);
706
707 /*
708 * Handle zero-sized whitelist or empty thread_struct, otherwise
709 * adjust offset to position of thread_struct in task_struct.
710 */
711 if (unlikely(*size == 0))
712 *offset = 0;
713 else
714 *offset += offsetof(struct task_struct, thread);
715}
716
460void __init fork_init(void) 717void __init fork_init(void)
461{ 718{
462 int i; 719 int i;
@@ -465,11 +722,14 @@ void __init fork_init(void)
465#define ARCH_MIN_TASKALIGN 0 722#define ARCH_MIN_TASKALIGN 0
466#endif 723#endif
467 int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); 724 int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
725 unsigned long useroffset, usersize;
468 726
469 /* create a slab on which task_structs can be allocated */ 727 /* create a slab on which task_structs can be allocated */
470 task_struct_cachep = kmem_cache_create("task_struct", 728 task_struct_whitelist(&useroffset, &usersize);
729 task_struct_cachep = kmem_cache_create_usercopy("task_struct",
471 arch_task_struct_size, align, 730 arch_task_struct_size, align,
472 SLAB_PANIC|SLAB_ACCOUNT, NULL); 731 SLAB_PANIC|SLAB_ACCOUNT,
732 useroffset, usersize, NULL);
473#endif 733#endif
474 734
475 /* do the arch specific task caches init */ 735 /* do the arch specific task caches init */
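
fork_init() above switches the task_struct cache to kmem_cache_create_usercopy(), whitelisting only the thread_struct region for user copies. The same pattern for an invented cache, as a hedged sketch (the struct and its fields are hypothetical; FIELD_SIZEOF() is assumed available in this tree):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>

struct example_obj {
	spinlock_t lock;	/* kernel-only state, never copied to user space */
	char user_blob[64];	/* the only region copy_{to,from}_user() may touch */
};

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
	example_cachep = kmem_cache_create_usercopy("example_obj",
			sizeof(struct example_obj), 0,
			SLAB_HWCACHE_ALIGN | SLAB_PANIC,
			offsetof(struct example_obj, user_blob),
			FIELD_SIZEOF(struct example_obj, user_blob),
			NULL);
	return 0;
}
core_initcall(example_cache_init);

Hardened usercopy will then reject any user copy into this cache's objects that strays outside user_blob, which is what the task_struct whitelist does for the thread_struct member.
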
@@ -594,181 +854,8 @@ free_tsk:
594 return NULL; 854 return NULL;
595} 855}
596 856
597#ifdef CONFIG_MMU
598static __latent_entropy int dup_mmap(struct mm_struct *mm,
599 struct mm_struct *oldmm)
600{
601 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
602 struct rb_node **rb_link, *rb_parent;
603 int retval;
604 unsigned long charge;
605 LIST_HEAD(uf);
606
607 uprobe_start_dup_mmap();
608 if (down_write_killable(&oldmm->mmap_sem)) {
609 retval = -EINTR;
610 goto fail_uprobe_end;
611 }
612 flush_cache_dup_mm(oldmm);
613 uprobe_dup_mmap(oldmm, mm);
614 /*
615 * Not linked in yet - no deadlock potential:
616 */
617 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
618
619 /* No ordering required: file already has been exposed. */
620 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
621
622 mm->total_vm = oldmm->total_vm;
623 mm->data_vm = oldmm->data_vm;
624 mm->exec_vm = oldmm->exec_vm;
625 mm->stack_vm = oldmm->stack_vm;
626
627 rb_link = &mm->mm_rb.rb_node;
628 rb_parent = NULL;
629 pprev = &mm->mmap;
630 retval = ksm_fork(mm, oldmm);
631 if (retval)
632 goto out;
633 retval = khugepaged_fork(mm, oldmm);
634 if (retval)
635 goto out;
636
637 prev = NULL;
638 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
639 struct file *file;
640
641 if (mpnt->vm_flags & VM_DONTCOPY) {
642 vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
643 continue;
644 }
645 charge = 0;
646 if (mpnt->vm_flags & VM_ACCOUNT) {
647 unsigned long len = vma_pages(mpnt);
648
649 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
650 goto fail_nomem;
651 charge = len;
652 }
653 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
654 if (!tmp)
655 goto fail_nomem;
656 *tmp = *mpnt;
657 INIT_LIST_HEAD(&tmp->anon_vma_chain);
658 retval = vma_dup_policy(mpnt, tmp);
659 if (retval)
660 goto fail_nomem_policy;
661 tmp->vm_mm = mm;
662 retval = dup_userfaultfd(tmp, &uf);
663 if (retval)
664 goto fail_nomem_anon_vma_fork;
665 if (tmp->vm_flags & VM_WIPEONFORK) {
666 /* VM_WIPEONFORK gets a clean slate in the child. */
667 tmp->anon_vma = NULL;
668 if (anon_vma_prepare(tmp))
669 goto fail_nomem_anon_vma_fork;
670 } else if (anon_vma_fork(tmp, mpnt))
671 goto fail_nomem_anon_vma_fork;
672 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
673 tmp->vm_next = tmp->vm_prev = NULL;
674 file = tmp->vm_file;
675 if (file) {
676 struct inode *inode = file_inode(file);
677 struct address_space *mapping = file->f_mapping;
678
679 get_file(file);
680 if (tmp->vm_flags & VM_DENYWRITE)
681 atomic_dec(&inode->i_writecount);
682 i_mmap_lock_write(mapping);
683 if (tmp->vm_flags & VM_SHARED)
684 atomic_inc(&mapping->i_mmap_writable);
685 flush_dcache_mmap_lock(mapping);
686 /* insert tmp into the share list, just after mpnt */
687 vma_interval_tree_insert_after(tmp, mpnt,
688 &mapping->i_mmap);
689 flush_dcache_mmap_unlock(mapping);
690 i_mmap_unlock_write(mapping);
691 }
692
693 /*
694 * Clear hugetlb-related page reserves for children. This only
695 * affects MAP_PRIVATE mappings. Faults generated by the child
696 * are not guaranteed to succeed, even if read-only
697 */
698 if (is_vm_hugetlb_page(tmp))
699 reset_vma_resv_huge_pages(tmp);
700
701 /*
702 * Link in the new vma and copy the page table entries.
703 */
704 *pprev = tmp;
705 pprev = &tmp->vm_next;
706 tmp->vm_prev = prev;
707 prev = tmp;
708
709 __vma_link_rb(mm, tmp, rb_link, rb_parent);
710 rb_link = &tmp->vm_rb.rb_right;
711 rb_parent = &tmp->vm_rb;
712
713 mm->map_count++;
714 if (!(tmp->vm_flags & VM_WIPEONFORK))
715 retval = copy_page_range(mm, oldmm, mpnt);
716
717 if (tmp->vm_ops && tmp->vm_ops->open)
718 tmp->vm_ops->open(tmp);
719
720 if (retval)
721 goto out;
722 }
723 /* a new mm has just been created */
724 retval = arch_dup_mmap(oldmm, mm);
725out:
726 up_write(&mm->mmap_sem);
727 flush_tlb_mm(oldmm);
728 up_write(&oldmm->mmap_sem);
729 dup_userfaultfd_complete(&uf);
730fail_uprobe_end:
731 uprobe_end_dup_mmap();
732 return retval;
733fail_nomem_anon_vma_fork:
734 mpol_put(vma_policy(tmp));
735fail_nomem_policy:
736 kmem_cache_free(vm_area_cachep, tmp);
737fail_nomem:
738 retval = -ENOMEM;
739 vm_unacct_memory(charge);
740 goto out;
741}
742
743static inline int mm_alloc_pgd(struct mm_struct *mm)
744{
745 mm->pgd = pgd_alloc(mm);
746 if (unlikely(!mm->pgd))
747 return -ENOMEM;
748 return 0;
749}
750
751static inline void mm_free_pgd(struct mm_struct *mm)
752{
753 pgd_free(mm, mm->pgd);
754}
755#else
756static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
757{
758 down_write(&oldmm->mmap_sem);
759 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
760 up_write(&oldmm->mmap_sem);
761 return 0;
762}
763#define mm_alloc_pgd(mm) (0)
764#define mm_free_pgd(mm)
765#endif /* CONFIG_MMU */
766
767__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 857__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
768 858
769#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
770#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
771
772static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; 859static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
773 860
774static int __init coredump_filter_setup(char *s) 861static int __init coredump_filter_setup(char *s)
@@ -858,27 +945,6 @@ fail_nopgd:
858 return NULL; 945 return NULL;
859} 946}
860 947
861static void check_mm(struct mm_struct *mm)
862{
863 int i;
864
865 for (i = 0; i < NR_MM_COUNTERS; i++) {
866 long x = atomic_long_read(&mm->rss_stat.count[i]);
867
868 if (unlikely(x))
869 printk(KERN_ALERT "BUG: Bad rss-counter state "
870 "mm:%p idx:%d val:%ld\n", mm, i, x);
871 }
872
873 if (mm_pgtables_bytes(mm))
874 pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
875 mm_pgtables_bytes(mm));
876
877#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
878 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
879#endif
880}
881
882/* 948/*
883 * Allocate and initialize an mm_struct. 949 * Allocate and initialize an mm_struct.
884 */ 950 */
@@ -894,24 +960,6 @@ struct mm_struct *mm_alloc(void)
894 return mm_init(mm, current, current_user_ns()); 960 return mm_init(mm, current, current_user_ns());
895} 961}
896 962
897/*
898 * Called when the last reference to the mm
899 * is dropped: either by a lazy thread or by
900 * mmput. Free the page directory and the mm.
901 */
902void __mmdrop(struct mm_struct *mm)
903{
904 BUG_ON(mm == &init_mm);
905 mm_free_pgd(mm);
906 destroy_context(mm);
907 hmm_mm_destroy(mm);
908 mmu_notifier_mm_destroy(mm);
909 check_mm(mm);
910 put_user_ns(mm->user_ns);
911 free_mm(mm);
912}
913EXPORT_SYMBOL_GPL(__mmdrop);
914
915static inline void __mmput(struct mm_struct *mm) 963static inline void __mmput(struct mm_struct *mm)
916{ 964{
917 VM_BUG_ON(atomic_read(&mm->mm_users)); 965 VM_BUG_ON(atomic_read(&mm->mm_users));
@@ -2224,9 +2272,11 @@ void __init proc_caches_init(void)
2224 * maximum number of CPU's we can ever have. The cpumask_allocation 2272 * maximum number of CPU's we can ever have. The cpumask_allocation
2225 * is at the end of the structure, exactly for that reason. 2273 * is at the end of the structure, exactly for that reason.
2226 */ 2274 */
2227 mm_cachep = kmem_cache_create("mm_struct", 2275 mm_cachep = kmem_cache_create_usercopy("mm_struct",
2228 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 2276 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
2229 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, 2277 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2278 offsetof(struct mm_struct, saved_auxv),
2279 sizeof_field(struct mm_struct, saved_auxv),
2230 NULL); 2280 NULL);
2231 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); 2281 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
2232 mmap_init(); 2282 mmap_init();
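
The mm_struct cache above is switched to kmem_cache_create_usercopy(), whose two new arguments in this hunk (offsetof/sizeof_field of saved_auxv) bound the only region of the structure that hardened usercopy will allow to be copied to or from user space. A minimal userspace sketch of that offset/size windowing follows; struct demo_mm and copy_allowed() are hypothetical stand-ins for illustration, not the kernel layout or API:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-in for mm_struct; only saved_auxv is user-copyable. */
struct demo_mm {
	unsigned long total_vm;
	unsigned long data_vm;
	unsigned long saved_auxv[46];
	unsigned long exec_vm;
};

#define sizeof_field(T, f)	sizeof(((T *)0)->f)

/* Allow a copy only if it is fully contained in the whitelisted window. */
static int copy_allowed(size_t off, size_t len)
{
	size_t wl_off = offsetof(struct demo_mm, saved_auxv);
	size_t wl_len = sizeof_field(struct demo_mm, saved_auxv);

	return off >= wl_off && len <= wl_len && off - wl_off <= wl_len - len;
}

int main(void)
{
	printf("whole saved_auxv ok: %d\n",
	       copy_allowed(offsetof(struct demo_mm, saved_auxv),
			    sizeof_field(struct demo_mm, saved_auxv)));
	printf("total_vm ok: %d\n",
	       copy_allowed(offsetof(struct demo_mm, total_vm),
			    sizeof(unsigned long)));
	return 0;
}

The (useroffset, usersize) pair recorded by kmem_cache_create_usercopy() is exactly this window; everything else in mm_struct stays off limits to usercopy.
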
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 4e8089b319ae..8c82ea26e837 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -71,7 +71,7 @@ unsigned long probe_irq_on(void)
71 raw_spin_lock_irq(&desc->lock); 71 raw_spin_lock_irq(&desc->lock);
72 if (!desc->action && irq_settings_can_probe(desc)) { 72 if (!desc->action && irq_settings_can_probe(desc)) {
73 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; 73 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
74 if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) 74 if (irq_activate_and_startup(desc, IRQ_NORESEND))
75 desc->istate |= IRQS_PENDING; 75 desc->istate |= IRQS_PENDING;
76 } 76 }
77 raw_spin_unlock_irq(&desc->lock); 77 raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 043bfc35b353..c69357a43849 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc)
294 return 0; 294 return 0;
295} 295}
296 296
297void irq_activate_and_startup(struct irq_desc *desc, bool resend) 297int irq_activate_and_startup(struct irq_desc *desc, bool resend)
298{ 298{
299 if (WARN_ON(irq_activate(desc))) 299 if (WARN_ON(irq_activate(desc)))
300 return; 300 return 0;
301 irq_startup(desc, resend, IRQ_START_FORCE); 301 return irq_startup(desc, resend, IRQ_START_FORCE);
302} 302}
303 303
304static void __irq_disable(struct irq_desc *desc, bool mask); 304static void __irq_disable(struct irq_desc *desc, bool mask);
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index e4d3819a91cc..8ccb326d2977 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -3,8 +3,6 @@
3 * Debugging printout: 3 * Debugging printout:
4 */ 4 */
5 5
6#include <linux/kallsyms.h>
7
8#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) 6#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
9#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) 7#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
10/* FIXME */ 8/* FIXME */
@@ -19,14 +17,14 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
19 17
20 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 18 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
21 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); 19 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
22 printk("->handle_irq(): %p, ", desc->handle_irq); 20 printk("->handle_irq(): %p, %pS\n",
23 print_symbol("%s\n", (unsigned long)desc->handle_irq); 21 desc->handle_irq, desc->handle_irq);
24 printk("->irq_data.chip(): %p, ", desc->irq_data.chip); 22 printk("->irq_data.chip(): %p, %pS\n",
25 print_symbol("%s\n", (unsigned long)desc->irq_data.chip); 23 desc->irq_data.chip, desc->irq_data.chip);
26 printk("->action(): %p\n", desc->action); 24 printk("->action(): %p\n", desc->action);
27 if (desc->action) { 25 if (desc->action) {
28 printk("->action->handler(): %p, ", desc->action->handler); 26 printk("->action->handler(): %p, %pS\n",
29 print_symbol("%s\n", (unsigned long)desc->action->handler); 27 desc->action->handler, desc->action->handler);
30 } 28 }
31 29
32 ___P(IRQ_LEVEL); 30 ___P(IRQ_LEVEL);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ab19371eab9b..ca6afa267070 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc);
76#define IRQ_START_COND false 76#define IRQ_START_COND false
77 77
78extern int irq_activate(struct irq_desc *desc); 78extern int irq_activate(struct irq_desc *desc);
79extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); 79extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
80extern int irq_startup(struct irq_desc *desc, bool resend, bool force); 80extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
81 81
82extern void irq_shutdown(struct irq_desc *desc); 82extern void irq_shutdown(struct irq_desc *desc);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index d5fa4116688a..a23e21ada81b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -12,7 +12,6 @@
12 * compression (see scripts/kallsyms.c for a more complete description) 12 * compression (see scripts/kallsyms.c for a more complete description)
13 */ 13 */
14#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
15#include <linux/module.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/seq_file.h> 16#include <linux/seq_file.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
@@ -20,15 +19,12 @@
20#include <linux/err.h> 19#include <linux/err.h>
21#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
22#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
23#include <linux/mm.h>
24#include <linux/ctype.h> 22#include <linux/ctype.h>
25#include <linux/slab.h> 23#include <linux/slab.h>
26#include <linux/filter.h> 24#include <linux/filter.h>
27#include <linux/ftrace.h> 25#include <linux/ftrace.h>
28#include <linux/compiler.h> 26#include <linux/compiler.h>
29 27
30#include <asm/sections.h>
31
32/* 28/*
33 * These will be re-linked against their real values 29 * These will be re-linked against their real values
34 * during the second link stage. 30 * during the second link stage.
@@ -52,37 +48,6 @@ extern const u16 kallsyms_token_index[] __weak;
52 48
53extern const unsigned long kallsyms_markers[] __weak; 49extern const unsigned long kallsyms_markers[] __weak;
54 50
55static inline int is_kernel_inittext(unsigned long addr)
56{
57 if (addr >= (unsigned long)_sinittext
58 && addr <= (unsigned long)_einittext)
59 return 1;
60 return 0;
61}
62
63static inline int is_kernel_text(unsigned long addr)
64{
65 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
66 arch_is_kernel_text(addr))
67 return 1;
68 return in_gate_area_no_mm(addr);
69}
70
71static inline int is_kernel(unsigned long addr)
72{
73 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
74 return 1;
75 return in_gate_area_no_mm(addr);
76}
77
78static int is_ksym_addr(unsigned long addr)
79{
80 if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
81 return is_kernel(addr);
82
83 return is_kernel_text(addr) || is_kernel_inittext(addr);
84}
85
86/* 51/*
87 * Expand a compressed symbol data into the resulting uncompressed string, 52 * Expand a compressed symbol data into the resulting uncompressed string,
88 * if uncompressed string is too long (>= maxlen), it will be truncated, 53 * if uncompressed string is too long (>= maxlen), it will be truncated,
@@ -464,17 +429,6 @@ int sprint_backtrace(char *buffer, unsigned long address)
464 return __sprint_symbol(buffer, address, -1, 1); 429 return __sprint_symbol(buffer, address, -1, 1);
465} 430}
466 431
467/* Look up a kernel symbol and print it to the kernel messages. */
468void __print_symbol(const char *fmt, unsigned long address)
469{
470 char buffer[KSYM_SYMBOL_LEN];
471
472 sprint_symbol(buffer, address);
473
474 printk(fmt, buffer);
475}
476EXPORT_SYMBOL(__print_symbol);
477
478/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ 432/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
479struct kallsym_iter { 433struct kallsym_iter {
480 loff_t pos; 434 loff_t pos;
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index de9e45dca70f..3a4656fb7047 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -366,11 +366,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
366 /* 366 /*
367 * A reference is taken on the patch module to prevent it from being 367 * A reference is taken on the patch module to prevent it from being
368 * unloaded. 368 * unloaded.
369 *
370 * Note: For immediate (no consistency model) patches we don't allow
371 * patch modules to unload since there is no safe/sane method to
372 * determine if a thread is still running in the patched code contained
373 * in the patch module once the ftrace registration is successful.
374 */ 369 */
375 if (!try_module_get(patch->mod)) 370 if (!try_module_get(patch->mod))
376 return -ENODEV; 371 return -ENODEV;
@@ -454,6 +449,8 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
454 * /sys/kernel/livepatch/<patch> 449 * /sys/kernel/livepatch/<patch>
455 * /sys/kernel/livepatch/<patch>/enabled 450 * /sys/kernel/livepatch/<patch>/enabled
456 * /sys/kernel/livepatch/<patch>/transition 451 * /sys/kernel/livepatch/<patch>/transition
452 * /sys/kernel/livepatch/<patch>/signal
453 * /sys/kernel/livepatch/<patch>/force
457 * /sys/kernel/livepatch/<patch>/<object> 454 * /sys/kernel/livepatch/<patch>/<object>
458 * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> 455 * /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
459 */ 456 */
@@ -528,11 +525,73 @@ static ssize_t transition_show(struct kobject *kobj,
528 patch == klp_transition_patch); 525 patch == klp_transition_patch);
529} 526}
530 527
528static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr,
529 const char *buf, size_t count)
530{
531 struct klp_patch *patch;
532 int ret;
533 bool val;
534
535 ret = kstrtobool(buf, &val);
536 if (ret)
537 return ret;
538
539 if (!val)
540 return count;
541
542 mutex_lock(&klp_mutex);
543
544 patch = container_of(kobj, struct klp_patch, kobj);
545 if (patch != klp_transition_patch) {
546 mutex_unlock(&klp_mutex);
547 return -EINVAL;
548 }
549
550 klp_send_signals();
551
552 mutex_unlock(&klp_mutex);
553
554 return count;
555}
556
557static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
558 const char *buf, size_t count)
559{
560 struct klp_patch *patch;
561 int ret;
562 bool val;
563
564 ret = kstrtobool(buf, &val);
565 if (ret)
566 return ret;
567
568 if (!val)
569 return count;
570
571 mutex_lock(&klp_mutex);
572
573 patch = container_of(kobj, struct klp_patch, kobj);
574 if (patch != klp_transition_patch) {
575 mutex_unlock(&klp_mutex);
576 return -EINVAL;
577 }
578
579 klp_force_transition();
580
581 mutex_unlock(&klp_mutex);
582
583 return count;
584}
585
531static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); 586static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
532static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); 587static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
588static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal);
589static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
533static struct attribute *klp_patch_attrs[] = { 590static struct attribute *klp_patch_attrs[] = {
534 &enabled_kobj_attr.attr, 591 &enabled_kobj_attr.attr,
535 &transition_kobj_attr.attr, 592 &transition_kobj_attr.attr,
593 &signal_kobj_attr.attr,
594 &force_kobj_attr.attr,
536 NULL 595 NULL
537}; 596};
538 597
@@ -830,12 +889,7 @@ int klp_register_patch(struct klp_patch *patch)
830 if (!klp_initialized()) 889 if (!klp_initialized())
831 return -ENODEV; 890 return -ENODEV;
832 891
833 /* 892 if (!klp_have_reliable_stack()) {
834 * Architectures without reliable stack traces have to set
835 * patch->immediate because there's currently no way to patch kthreads
836 * with the consistency model.
837 */
838 if (!klp_have_reliable_stack() && !patch->immediate) {
839 pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); 893 pr_err("This architecture doesn't have support for the livepatch consistency model.\n");
840 return -ENOSYS; 894 return -ENOSYS;
841 } 895 }
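
For reference, the two new write-only attributes are driven from user space like any other sysfs boolean: writing a kstrtobool()-true value while the named patch is the one in transition triggers klp_send_signals() or klp_force_transition(); otherwise the write fails with -EINVAL. A minimal usage sketch, assuming a loaded patch module called demo_patch (a hypothetical name):

#include <stdio.h>
#include <errno.h>

/* "demo_patch" is a placeholder for the name of a loaded livepatch. */
static int write_klp_attr(const char *attr, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/livepatch/demo_patch/%s", attr);
	f = fopen(path, "w");
	if (!f)
		return -errno;
	fputs(val, f);		/* kstrtobool() takes "1", "y", "on", ... */
	if (fclose(f))
		return -errno;
	return 0;
}

int main(void)
{
	/* Nudge straggler tasks first; "force" is the last-resort hammer. */
	if (write_klp_attr("signal", "1"))
		perror("signal");
	if (write_klp_attr("force", "1"))
		perror("force");
	return 0;
}
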
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 56add6327736..7c6631e693bc 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -33,6 +33,8 @@ struct klp_patch *klp_transition_patch;
33 33
34static int klp_target_state = KLP_UNDEFINED; 34static int klp_target_state = KLP_UNDEFINED;
35 35
36static bool klp_forced = false;
37
36/* 38/*
37 * This work can be performed periodically to finish patching or unpatching any 39 * This work can be performed periodically to finish patching or unpatching any
38 * "straggler" tasks which failed to transition in the first attempt. 40 * "straggler" tasks which failed to transition in the first attempt.
@@ -80,7 +82,6 @@ static void klp_complete_transition(void)
80 struct klp_func *func; 82 struct klp_func *func;
81 struct task_struct *g, *task; 83 struct task_struct *g, *task;
82 unsigned int cpu; 84 unsigned int cpu;
83 bool immediate_func = false;
84 85
85 pr_debug("'%s': completing %s transition\n", 86 pr_debug("'%s': completing %s transition\n",
86 klp_transition_patch->mod->name, 87 klp_transition_patch->mod->name,
@@ -102,16 +103,9 @@ static void klp_complete_transition(void)
102 klp_synchronize_transition(); 103 klp_synchronize_transition();
103 } 104 }
104 105
105 if (klp_transition_patch->immediate) 106 klp_for_each_object(klp_transition_patch, obj)
106 goto done; 107 klp_for_each_func(obj, func)
107
108 klp_for_each_object(klp_transition_patch, obj) {
109 klp_for_each_func(obj, func) {
110 func->transition = false; 108 func->transition = false;
111 if (func->immediate)
112 immediate_func = true;
113 }
114 }
115 109
116 /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ 110 /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
117 if (klp_target_state == KLP_PATCHED) 111 if (klp_target_state == KLP_PATCHED)
@@ -130,7 +124,6 @@ static void klp_complete_transition(void)
130 task->patch_state = KLP_UNDEFINED; 124 task->patch_state = KLP_UNDEFINED;
131 } 125 }
132 126
133done:
134 klp_for_each_object(klp_transition_patch, obj) { 127 klp_for_each_object(klp_transition_patch, obj) {
135 if (!klp_is_object_loaded(obj)) 128 if (!klp_is_object_loaded(obj))
136 continue; 129 continue;
@@ -144,13 +137,11 @@ done:
144 klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); 137 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
145 138
146 /* 139 /*
147 * See complementary comment in __klp_enable_patch() for why we 140 * klp_forced set implies unbounded increase of module's ref count if
148 * keep the module reference for immediate patches. 141 * the module is disabled/enabled in a loop.
149 */ 142 */
150 if (!klp_transition_patch->immediate && !immediate_func && 143 if (!klp_forced && klp_target_state == KLP_UNPATCHED)
151 klp_target_state == KLP_UNPATCHED) {
152 module_put(klp_transition_patch->mod); 144 module_put(klp_transition_patch->mod);
153 }
154 145
155 klp_target_state = KLP_UNDEFINED; 146 klp_target_state = KLP_UNDEFINED;
156 klp_transition_patch = NULL; 147 klp_transition_patch = NULL;
@@ -218,9 +209,6 @@ static int klp_check_stack_func(struct klp_func *func,
218 struct klp_ops *ops; 209 struct klp_ops *ops;
219 int i; 210 int i;
220 211
221 if (func->immediate)
222 return 0;
223
224 for (i = 0; i < trace->nr_entries; i++) { 212 for (i = 0; i < trace->nr_entries; i++) {
225 address = trace->entries[i]; 213 address = trace->entries[i];
226 214
@@ -383,13 +371,6 @@ void klp_try_complete_transition(void)
383 WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); 371 WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
384 372
385 /* 373 /*
386 * If the patch can be applied or reverted immediately, skip the
387 * per-task transitions.
388 */
389 if (klp_transition_patch->immediate)
390 goto success;
391
392 /*
393 * Try to switch the tasks to the target patch state by walking their 374 * Try to switch the tasks to the target patch state by walking their
394 * stacks and looking for any to-be-patched or to-be-unpatched 375 * stacks and looking for any to-be-patched or to-be-unpatched
395 * functions. If such functions are found on a stack, or if the stack 376 * functions. If such functions are found on a stack, or if the stack
@@ -432,7 +413,6 @@ void klp_try_complete_transition(void)
432 return; 413 return;
433 } 414 }
434 415
435success:
436 /* we're done, now cleanup the data structures */ 416 /* we're done, now cleanup the data structures */
437 klp_complete_transition(); 417 klp_complete_transition();
438} 418}
@@ -453,13 +433,6 @@ void klp_start_transition(void)
453 klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); 433 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
454 434
455 /* 435 /*
456 * If the patch can be applied or reverted immediately, skip the
457 * per-task transitions.
458 */
459 if (klp_transition_patch->immediate)
460 return;
461
462 /*
463 * Mark all normal tasks as needing a patch state update. They'll 436 * Mark all normal tasks as needing a patch state update. They'll
464 * switch either in klp_try_complete_transition() or as they exit the 437 * switch either in klp_try_complete_transition() or as they exit the
465 * kernel. 438 * kernel.
@@ -509,13 +482,6 @@ void klp_init_transition(struct klp_patch *patch, int state)
509 klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); 482 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
510 483
511 /* 484 /*
512 * If the patch can be applied or reverted immediately, skip the
513 * per-task transitions.
514 */
515 if (patch->immediate)
516 return;
517
518 /*
519 * Initialize all tasks to the initial patch state to prepare them for 485 * Initialize all tasks to the initial patch state to prepare them for
520 * switching to the target state. 486 * switching to the target state.
521 */ 487 */
@@ -608,3 +574,71 @@ void klp_copy_process(struct task_struct *child)
608 574
609 /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ 575 /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
610} 576}
577
578/*
579 * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
580 * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this
581 * action currently.
582 */
583void klp_send_signals(void)
584{
585 struct task_struct *g, *task;
586
587 pr_notice("signaling remaining tasks\n");
588
589 read_lock(&tasklist_lock);
590 for_each_process_thread(g, task) {
591 if (!klp_patch_pending(task))
592 continue;
593
594 /*
595 * There is a small race here. We could see TIF_PATCH_PENDING
596 * set and decide to wake up a kthread or send a fake signal.
597 * Meanwhile the task could migrate itself and the action
598 * would be meaningless. It is not serious though.
599 */
600 if (task->flags & PF_KTHREAD) {
601 /*
602 * Wake up a kthread which sleeps interruptibly and
603 * still has not been migrated.
604 */
605 wake_up_state(task, TASK_INTERRUPTIBLE);
606 } else {
607 /*
608 * Send fake signal to all non-kthread tasks which are
609 * still not migrated.
610 */
611 spin_lock_irq(&task->sighand->siglock);
612 signal_wake_up(task, 0);
613 spin_unlock_irq(&task->sighand->siglock);
614 }
615 }
616 read_unlock(&tasklist_lock);
617}
618
619/*
620 * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an
621 * existing transition to finish.
622 *
623 * NOTE: klp_update_patch_state(task) requires the task to be inactive or
624 * 'current'. This is not the case here and the consistency model could be
625 * broken. Administrator, who is the only one to execute the
626 * klp_force_transitions(), has to be aware of this.
627 */
628void klp_force_transition(void)
629{
630 struct task_struct *g, *task;
631 unsigned int cpu;
632
633 pr_warn("forcing remaining tasks to the patched state\n");
634
635 read_lock(&tasklist_lock);
636 for_each_process_thread(g, task)
637 klp_update_patch_state(task);
638 read_unlock(&tasklist_lock);
639
640 for_each_possible_cpu(cpu)
641 klp_update_patch_state(idle_task(cpu));
642
643 klp_forced = true;
644}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
index 0f6e27c481f9..f9d0bc016067 100644
--- a/kernel/livepatch/transition.h
+++ b/kernel/livepatch/transition.h
@@ -11,5 +11,7 @@ void klp_cancel_transition(void);
11void klp_start_transition(void); 11void klp_start_transition(void);
12void klp_try_complete_transition(void); 12void klp_try_complete_transition(void);
13void klp_reverse_transition(void); 13void klp_reverse_transition(void);
14void klp_send_signals(void);
15void klp_force_transition(void);
14 16
15#endif /* _LIVEPATCH_TRANSITION_H */ 17#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 403ab9cdb949..4849be5f9b3c 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -188,13 +188,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL);
188#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) 188#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
189#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) 189#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
190 190
191struct page_map {
192 struct resource res;
193 struct percpu_ref *ref;
194 struct dev_pagemap pgmap;
195 struct vmem_altmap altmap;
196};
197
198static unsigned long order_at(struct resource *res, unsigned long pgoff) 191static unsigned long order_at(struct resource *res, unsigned long pgoff)
199{ 192{
200 unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; 193 unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
@@ -248,34 +241,36 @@ int device_private_entry_fault(struct vm_area_struct *vma,
248EXPORT_SYMBOL(device_private_entry_fault); 241EXPORT_SYMBOL(device_private_entry_fault);
249#endif /* CONFIG_DEVICE_PRIVATE */ 242#endif /* CONFIG_DEVICE_PRIVATE */
250 243
251static void pgmap_radix_release(struct resource *res) 244static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff)
252{ 245{
253 unsigned long pgoff, order; 246 unsigned long pgoff, order;
254 247
255 mutex_lock(&pgmap_lock); 248 mutex_lock(&pgmap_lock);
256 foreach_order_pgoff(res, order, pgoff) 249 foreach_order_pgoff(res, order, pgoff) {
250 if (pgoff >= end_pgoff)
251 break;
257 radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); 252 radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
253 }
258 mutex_unlock(&pgmap_lock); 254 mutex_unlock(&pgmap_lock);
259 255
260 synchronize_rcu(); 256 synchronize_rcu();
261} 257}
262 258
263static unsigned long pfn_first(struct page_map *page_map) 259static unsigned long pfn_first(struct dev_pagemap *pgmap)
264{ 260{
265 struct dev_pagemap *pgmap = &page_map->pgmap; 261 const struct resource *res = &pgmap->res;
266 const struct resource *res = &page_map->res; 262 struct vmem_altmap *altmap = &pgmap->altmap;
267 struct vmem_altmap *altmap = pgmap->altmap;
268 unsigned long pfn; 263 unsigned long pfn;
269 264
270 pfn = res->start >> PAGE_SHIFT; 265 pfn = res->start >> PAGE_SHIFT;
271 if (altmap) 266 if (pgmap->altmap_valid)
272 pfn += vmem_altmap_offset(altmap); 267 pfn += vmem_altmap_offset(altmap);
273 return pfn; 268 return pfn;
274} 269}
275 270
276static unsigned long pfn_end(struct page_map *page_map) 271static unsigned long pfn_end(struct dev_pagemap *pgmap)
277{ 272{
278 const struct resource *res = &page_map->res; 273 const struct resource *res = &pgmap->res;
279 274
280 return (res->start + resource_size(res)) >> PAGE_SHIFT; 275 return (res->start + resource_size(res)) >> PAGE_SHIFT;
281} 276}
@@ -283,15 +278,15 @@ static unsigned long pfn_end(struct page_map *page_map)
283#define for_each_device_pfn(pfn, map) \ 278#define for_each_device_pfn(pfn, map) \
284 for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) 279 for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
285 280
286static void devm_memremap_pages_release(struct device *dev, void *data) 281static void devm_memremap_pages_release(void *data)
287{ 282{
288 struct page_map *page_map = data; 283 struct dev_pagemap *pgmap = data;
289 struct resource *res = &page_map->res; 284 struct device *dev = pgmap->dev;
285 struct resource *res = &pgmap->res;
290 resource_size_t align_start, align_size; 286 resource_size_t align_start, align_size;
291 struct dev_pagemap *pgmap = &page_map->pgmap;
292 unsigned long pfn; 287 unsigned long pfn;
293 288
294 for_each_device_pfn(pfn, page_map) 289 for_each_device_pfn(pfn, pgmap)
295 put_page(pfn_to_page(pfn)); 290 put_page(pfn_to_page(pfn));
296 291
297 if (percpu_ref_tryget_live(pgmap->ref)) { 292 if (percpu_ref_tryget_live(pgmap->ref)) {
@@ -301,56 +296,51 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
301 296
302 /* pages are dead and unused, undo the arch mapping */ 297 /* pages are dead and unused, undo the arch mapping */
303 align_start = res->start & ~(SECTION_SIZE - 1); 298 align_start = res->start & ~(SECTION_SIZE - 1);
304 align_size = ALIGN(resource_size(res), SECTION_SIZE); 299 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
300 - align_start;
305 301
306 mem_hotplug_begin(); 302 mem_hotplug_begin();
307 arch_remove_memory(align_start, align_size); 303 arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
304 &pgmap->altmap : NULL);
308 mem_hotplug_done(); 305 mem_hotplug_done();
309 306
310 untrack_pfn(NULL, PHYS_PFN(align_start), align_size); 307 untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
311 pgmap_radix_release(res); 308 pgmap_radix_release(res, -1);
312 dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, 309 dev_WARN_ONCE(dev, pgmap->altmap.alloc,
313 "%s: failed to free all reserved pages\n", __func__); 310 "%s: failed to free all reserved pages\n", __func__);
314}
315
316/* assumes rcu_read_lock() held at entry */
317struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
318{
319 struct page_map *page_map;
320
321 WARN_ON_ONCE(!rcu_read_lock_held());
322
323 page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
324 return page_map ? &page_map->pgmap : NULL;
325} 311}
326 312
327/** 313/**
328 * devm_memremap_pages - remap and provide memmap backing for the given resource 314 * devm_memremap_pages - remap and provide memmap backing for the given resource
329 * @dev: hosting device for @res 315 * @dev: hosting device for @res
330 * @res: "host memory" address range 316 * @pgmap: pointer to a struct dev_pgmap
331 * @ref: a live per-cpu reference count
332 * @altmap: optional descriptor for allocating the memmap from @res
333 * 317 *
334 * Notes: 318 * Notes:
335 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time 319 * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
336 * (or devm release event). The expected order of events is that @ref has 320 * by the caller before passing it to this function
321 *
322 * 2/ The altmap field may optionally be initialized, in which case altmap_valid
323 * must be set to true
324 *
325 * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
326 * time (or devm release event). The expected order of events is that ref has
337 * been through percpu_ref_kill() before devm_memremap_pages_release(). The 327 * been through percpu_ref_kill() before devm_memremap_pages_release(). The
338 * wait for the completion of all references being dropped and 328 * wait for the completion of all references being dropped and
339 * percpu_ref_exit() must occur after devm_memremap_pages_release(). 329 * percpu_ref_exit() must occur after devm_memremap_pages_release().
340 * 330 *
341 * 2/ @res is expected to be a host memory range that could feasibly be 331 * 4/ res is expected to be a host memory range that could feasibly be
342 * treated as a "System RAM" range, i.e. not a device mmio range, but 332 * treated as a "System RAM" range, i.e. not a device mmio range, but
343 * this is not enforced. 333 * this is not enforced.
344 */ 334 */
345void *devm_memremap_pages(struct device *dev, struct resource *res, 335void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
346 struct percpu_ref *ref, struct vmem_altmap *altmap)
347{ 336{
348 resource_size_t align_start, align_size, align_end; 337 resource_size_t align_start, align_size, align_end;
338 struct vmem_altmap *altmap = pgmap->altmap_valid ?
339 &pgmap->altmap : NULL;
349 unsigned long pfn, pgoff, order; 340 unsigned long pfn, pgoff, order;
350 pgprot_t pgprot = PAGE_KERNEL; 341 pgprot_t pgprot = PAGE_KERNEL;
351 struct dev_pagemap *pgmap;
352 struct page_map *page_map;
353 int error, nid, is_ram, i = 0; 342 int error, nid, is_ram, i = 0;
343 struct resource *res = &pgmap->res;
354 344
355 align_start = res->start & ~(SECTION_SIZE - 1); 345 align_start = res->start & ~(SECTION_SIZE - 1);
356 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) 346 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -367,47 +357,18 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
367 if (is_ram == REGION_INTERSECTS) 357 if (is_ram == REGION_INTERSECTS)
368 return __va(res->start); 358 return __va(res->start);
369 359
370 if (!ref) 360 if (!pgmap->ref)
371 return ERR_PTR(-EINVAL); 361 return ERR_PTR(-EINVAL);
372 362
373 page_map = devres_alloc_node(devm_memremap_pages_release,
374 sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
375 if (!page_map)
376 return ERR_PTR(-ENOMEM);
377 pgmap = &page_map->pgmap;
378
379 memcpy(&page_map->res, res, sizeof(*res));
380
381 pgmap->dev = dev; 363 pgmap->dev = dev;
382 if (altmap) {
383 memcpy(&page_map->altmap, altmap, sizeof(*altmap));
384 pgmap->altmap = &page_map->altmap;
385 }
386 pgmap->ref = ref;
387 pgmap->res = &page_map->res;
388 pgmap->type = MEMORY_DEVICE_HOST;
389 pgmap->page_fault = NULL;
390 pgmap->page_free = NULL;
391 pgmap->data = NULL;
392 364
393 mutex_lock(&pgmap_lock); 365 mutex_lock(&pgmap_lock);
394 error = 0; 366 error = 0;
395 align_end = align_start + align_size - 1; 367 align_end = align_start + align_size - 1;
396 368
397 foreach_order_pgoff(res, order, pgoff) { 369 foreach_order_pgoff(res, order, pgoff) {
398 struct dev_pagemap *dup;
399
400 rcu_read_lock();
401 dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
402 rcu_read_unlock();
403 if (dup) {
404 dev_err(dev, "%s: %pr collides with mapping for %s\n",
405 __func__, res, dev_name(dup->dev));
406 error = -EBUSY;
407 break;
408 }
409 error = __radix_tree_insert(&pgmap_radix, 370 error = __radix_tree_insert(&pgmap_radix,
410 PHYS_PFN(res->start) + pgoff, order, page_map); 371 PHYS_PFN(res->start) + pgoff, order, pgmap);
411 if (error) { 372 if (error) {
412 dev_err(dev, "%s: failed: %d\n", __func__, error); 373 dev_err(dev, "%s: failed: %d\n", __func__, error);
413 break; 374 break;
@@ -427,16 +388,16 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
427 goto err_pfn_remap; 388 goto err_pfn_remap;
428 389
429 mem_hotplug_begin(); 390 mem_hotplug_begin();
430 error = arch_add_memory(nid, align_start, align_size, false); 391 error = arch_add_memory(nid, align_start, align_size, altmap, false);
431 if (!error) 392 if (!error)
432 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], 393 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
433 align_start >> PAGE_SHIFT, 394 align_start >> PAGE_SHIFT,
434 align_size >> PAGE_SHIFT); 395 align_size >> PAGE_SHIFT, altmap);
435 mem_hotplug_done(); 396 mem_hotplug_done();
436 if (error) 397 if (error)
437 goto err_add_memory; 398 goto err_add_memory;
438 399
439 for_each_device_pfn(pfn, page_map) { 400 for_each_device_pfn(pfn, pgmap) {
440 struct page *page = pfn_to_page(pfn); 401 struct page *page = pfn_to_page(pfn);
441 402
442 /* 403 /*
@@ -447,19 +408,21 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
447 */ 408 */
448 list_del(&page->lru); 409 list_del(&page->lru);
449 page->pgmap = pgmap; 410 page->pgmap = pgmap;
450 percpu_ref_get(ref); 411 percpu_ref_get(pgmap->ref);
451 if (!(++i % 1024)) 412 if (!(++i % 1024))
452 cond_resched(); 413 cond_resched();
453 } 414 }
454 devres_add(dev, page_map); 415
416 devm_add_action(dev, devm_memremap_pages_release, pgmap);
417
455 return __va(res->start); 418 return __va(res->start);
456 419
457 err_add_memory: 420 err_add_memory:
458 untrack_pfn(NULL, PHYS_PFN(align_start), align_size); 421 untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
459 err_pfn_remap: 422 err_pfn_remap:
460 err_radix: 423 err_radix:
461 pgmap_radix_release(res); 424 pgmap_radix_release(res, pgoff);
462 devres_free(page_map); 425 devres_free(pgmap);
463 return ERR_PTR(error); 426 return ERR_PTR(error);
464} 427}
465EXPORT_SYMBOL(devm_memremap_pages); 428EXPORT_SYMBOL(devm_memremap_pages);
@@ -475,34 +438,39 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
475 altmap->alloc -= nr_pfns; 438 altmap->alloc -= nr_pfns;
476} 439}
477 440
478struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) 441/**
442 * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
443 * @pfn: page frame number to lookup page_map
444 * @pgmap: optional known pgmap that already has a reference
445 *
446 * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
447 * is non-NULL but does not cover @pfn the reference to it will be released.
448 */
449struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
450 struct dev_pagemap *pgmap)
479{ 451{
480 /* 452 resource_size_t phys = PFN_PHYS(pfn);
481 * 'memmap_start' is the virtual address for the first "struct
482 * page" in this range of the vmemmap array. In the case of
483 * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
484 * pointer arithmetic, so we can perform this to_vmem_altmap()
485 * conversion without concern for the initialization state of
486 * the struct page fields.
487 */
488 struct page *page = (struct page *) memmap_start;
489 struct dev_pagemap *pgmap;
490 453
491 /* 454 /*
492 * Unconditionally retrieve a dev_pagemap associated with the 455 * In the cached case we're already holding a live reference.
493 * given physical address, this is only for use in the
494 * arch_{add|remove}_memory() for setting up and tearing down
495 * the memmap.
496 */ 456 */
457 if (pgmap) {
458 if (phys >= pgmap->res.start && phys <= pgmap->res.end)
459 return pgmap;
460 put_dev_pagemap(pgmap);
461 }
462
463 /* fall back to slow path lookup */
497 rcu_read_lock(); 464 rcu_read_lock();
498 pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); 465 pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
466 if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
467 pgmap = NULL;
499 rcu_read_unlock(); 468 rcu_read_unlock();
500 469
501 return pgmap ? pgmap->altmap : NULL; 470 return pgmap;
502} 471}
503#endif /* CONFIG_ZONE_DEVICE */ 472#endif /* CONFIG_ZONE_DEVICE */
504 473
505
506#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) 474#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
507void put_zone_device_private_or_public_page(struct page *page) 475void put_zone_device_private_or_public_page(struct page *page)
508{ 476{
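
The reworked devm_memremap_pages() no longer carves out a private page_map: the caller owns the struct dev_pagemap and, per the kernel-doc notes above, must fill in res, ref and type (plus altmap/altmap_valid when an altmap is used) before the call. A hedged caller-side sketch of that convention; the function names, release callback and resource belong to an imaginary driver, not to this patch:

/* Sketch only: an imaginary driver mapping one host-memory range. */
static struct percpu_ref demo_ref;
static struct dev_pagemap demo_pgmap;

static void demo_ref_release(struct percpu_ref *ref)
{
	/* e.g. complete() a completion waited on after percpu_ref_kill() */
}

static void *demo_map_range(struct device *dev, struct resource *res)
{
	if (percpu_ref_init(&demo_ref, demo_ref_release, 0, GFP_KERNEL))
		return ERR_PTR(-ENOMEM);

	demo_pgmap.res = *res;			/* host memory range to map */
	demo_pgmap.ref = &demo_ref;		/* must be live on entry */
	demo_pgmap.type = MEMORY_DEVICE_HOST;	/* caller picks the type now */
	demo_pgmap.altmap_valid = false;	/* no altmap in this sketch */

	return devm_memremap_pages(dev, &demo_pgmap);
}

Teardown stays as note 3/ describes: kill and drain the ref before the devm release action runs, then percpu_ref_exit() afterwards.
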
diff --git a/kernel/module.c b/kernel/module.c
index 09e48eee4d55..ccdf24c4949e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3129,7 +3129,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
3129 sizeof(*mod->ftrace_callsites), 3129 sizeof(*mod->ftrace_callsites),
3130 &mod->num_ftrace_callsites); 3130 &mod->num_ftrace_callsites);
3131#endif 3131#endif
3132 3132#ifdef CONFIG_FUNCTION_ERROR_INJECTION
3133 mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
3134 sizeof(*mod->ei_funcs),
3135 &mod->num_ei_funcs);
3136#endif
3133 mod->extable = section_objs(info, "__ex_table", 3137 mod->extable = section_objs(info, "__ex_table",
3134 sizeof(*mod->extable), &mod->num_exentries); 3138 sizeof(*mod->extable), &mod->num_exentries);
3135 3139
@@ -3949,6 +3953,12 @@ static const char *get_ksymbol(struct module *mod,
3949 return symname(kallsyms, best); 3953 return symname(kallsyms, best);
3950} 3954}
3951 3955
3956void * __weak dereference_module_function_descriptor(struct module *mod,
3957 void *ptr)
3958{
3959 return ptr;
3960}
3961
3952/* For kallsyms to ask for address resolution. NULL means not found. Careful 3962/* For kallsyms to ask for address resolution. NULL means not found. Careful
3953 * not to lock to avoid deadlock on oopses, simply disable preemption. */ 3963 * not to lock to avoid deadlock on oopses, simply disable preemption. */
3954const char *module_address_lookup(unsigned long addr, 3964const char *module_address_lookup(unsigned long addr,
diff --git a/kernel/padata.c b/kernel/padata.c
index 57c0074d50cc..d568cc56405f 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * padata.c - generic interface to process data streams in parallel 3 * padata.c - generic interface to process data streams in parallel
3 * 4 *
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f29cd178df90..9e58bdc8a562 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -104,9 +104,6 @@ extern int in_suspend;
104extern dev_t swsusp_resume_device; 104extern dev_t swsusp_resume_device;
105extern sector_t swsusp_resume_block; 105extern sector_t swsusp_resume_block;
106 106
107extern asmlinkage int swsusp_arch_suspend(void);
108extern asmlinkage int swsusp_arch_resume(void);
109
110extern int create_basic_memory_bitmaps(void); 107extern int create_basic_memory_bitmaps(void);
111extern void free_basic_memory_bitmaps(void); 108extern void free_basic_memory_bitmaps(void);
112extern int hibernate_preallocate_memory(void); 109extern int hibernate_preallocate_memory(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b9006617710f..db4b9b8929eb 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -131,13 +131,10 @@ static int __init control_devkmsg(char *str)
131 /* 131 /*
132 * Set sysctl string accordingly: 132 * Set sysctl string accordingly:
133 */ 133 */
134 if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { 134 if (devkmsg_log == DEVKMSG_LOG_MASK_ON)
135 memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); 135 strcpy(devkmsg_log_str, "on");
136 strncpy(devkmsg_log_str, "on", 2); 136 else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF)
137 } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { 137 strcpy(devkmsg_log_str, "off");
138 memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
139 strncpy(devkmsg_log_str, "off", 3);
140 }
141 /* else "ratelimit" which is set by default. */ 138 /* else "ratelimit" which is set by default. */
142 139
143 /* 140 /*
@@ -277,6 +274,13 @@ EXPORT_SYMBOL(console_set_on_cmdline);
277/* Flag: console code may call schedule() */ 274/* Flag: console code may call schedule() */
278static int console_may_schedule; 275static int console_may_schedule;
279 276
277enum con_msg_format_flags {
278 MSG_FORMAT_DEFAULT = 0,
279 MSG_FORMAT_SYSLOG = (1 << 0),
280};
281
282static int console_msg_format = MSG_FORMAT_DEFAULT;
283
280/* 284/*
281 * The printk log buffer consists of a chain of concatenated variable 285 * The printk log buffer consists of a chain of concatenated variable
282 * length records. Every record starts with a record header, containing 286 * length records. Every record starts with a record header, containing
@@ -920,10 +924,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
920 return ret; 924 return ret;
921} 925}
922 926
923static unsigned int devkmsg_poll(struct file *file, poll_table *wait) 927static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
924{ 928{
925 struct devkmsg_user *user = file->private_data; 929 struct devkmsg_user *user = file->private_data;
926 int ret = 0; 930 __poll_t ret = 0;
927 931
928 if (!user) 932 if (!user)
929 return POLLERR|POLLNVAL; 933 return POLLERR|POLLNVAL;
@@ -1544,6 +1548,146 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1544} 1548}
1545 1549
1546/* 1550/*
1551 * Special console_lock variants that help to reduce the risk of soft-lockups.
1552 * They allow to pass console_lock to another printk() call using a busy wait.
1553 */
1554
1555#ifdef CONFIG_LOCKDEP
1556static struct lockdep_map console_owner_dep_map = {
1557 .name = "console_owner"
1558};
1559#endif
1560
1561static DEFINE_RAW_SPINLOCK(console_owner_lock);
1562static struct task_struct *console_owner;
1563static bool console_waiter;
1564
1565/**
1566 * console_lock_spinning_enable - mark beginning of code where another
1567 * thread might safely busy wait
1568 *
1569 * This basically converts console_lock into a spinlock. This marks
1570 * the section where the console_lock owner can not sleep, because
1571 * there may be a waiter spinning (like a spinlock). Also it must be
1572 * ready to hand over the lock at the end of the section.
1573 */
1574static void console_lock_spinning_enable(void)
1575{
1576 raw_spin_lock(&console_owner_lock);
1577 console_owner = current;
1578 raw_spin_unlock(&console_owner_lock);
1579
1580 /* The waiter may spin on us after setting console_owner */
1581 spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
1582}
1583
1584/**
1585 * console_lock_spinning_disable_and_check - mark end of code where another
1586 * thread was able to busy wait and check if there is a waiter
1587 *
1588 * This is called at the end of the section where spinning is allowed.
1589 * It has two functions. First, it is a signal that it is no longer
1590 * safe to start busy waiting for the lock. Second, it checks if
1591 * there is a busy waiter and passes the lock rights to her.
1592 *
1593 * Important: Callers lose the lock if there was a busy waiter.
1594 * They must not touch items synchronized by console_lock
1595 * in this case.
1596 *
1597 * Return: 1 if the lock rights were passed, 0 otherwise.
1598 */
1599static int console_lock_spinning_disable_and_check(void)
1600{
1601 int waiter;
1602
1603 raw_spin_lock(&console_owner_lock);
1604 waiter = READ_ONCE(console_waiter);
1605 console_owner = NULL;
1606 raw_spin_unlock(&console_owner_lock);
1607
1608 if (!waiter) {
1609 spin_release(&console_owner_dep_map, 1, _THIS_IP_);
1610 return 0;
1611 }
1612
1613 /* The waiter is now free to continue */
1614 WRITE_ONCE(console_waiter, false);
1615
1616 spin_release(&console_owner_dep_map, 1, _THIS_IP_);
1617
1618 /*
1619 * Hand off console_lock to waiter. The waiter will perform
1620 * the up(). After this, the waiter is the console_lock owner.
1621 */
1622 mutex_release(&console_lock_dep_map, 1, _THIS_IP_);
1623 return 1;
1624}
1625
1626/**
1627 * console_trylock_spinning - try to get console_lock by busy waiting
1628 *
1629 * This allows busy-waiting for the console_lock when the current
1630 * owner is running in specially marked sections. It means that
1631 * the current owner is running and cannot reschedule until it
1632 * is ready to lose the lock.
1633 *
1634 * Return: 1 if we got the lock, 0 otherwise
1635 */
1636static int console_trylock_spinning(void)
1637{
1638 struct task_struct *owner = NULL;
1639 bool waiter;
1640 bool spin = false;
1641 unsigned long flags;
1642
1643 if (console_trylock())
1644 return 1;
1645
1646 printk_safe_enter_irqsave(flags);
1647
1648 raw_spin_lock(&console_owner_lock);
1649 owner = READ_ONCE(console_owner);
1650 waiter = READ_ONCE(console_waiter);
1651 if (!waiter && owner && owner != current) {
1652 WRITE_ONCE(console_waiter, true);
1653 spin = true;
1654 }
1655 raw_spin_unlock(&console_owner_lock);
1656
1657 /*
1658 * If there is an active printk() writing to the
1659 * consoles, instead of having it write our data too,
1660 * see if we can offload that load from the active
1661 * printer, and do some printing ourselves.
1662 * Go into a spin only if there isn't already a waiter
1663 * spinning, and there is an active printer, and
1664 * that active printer isn't us (recursive printk?).
1665 */
1666 if (!spin) {
1667 printk_safe_exit_irqrestore(flags);
1668 return 0;
1669 }
1670
1671 /* We spin waiting for the owner to release us */
1672 spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
1673 /* Owner will clear console_waiter on hand off */
1674 while (READ_ONCE(console_waiter))
1675 cpu_relax();
1676 spin_release(&console_owner_dep_map, 1, _THIS_IP_);
1677
1678 printk_safe_exit_irqrestore(flags);
1679 /*
1680 * The owner passed the console lock to us.
1681 * Since we did not spin on console lock, annotate
1682 * this as a trylock. Otherwise lockdep will
1683 * complain.
1684 */
1685 mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
1686
1687 return 1;
1688}
1689
1690/*
1547 * Call the console drivers, asking them to write out 1691 * Call the console drivers, asking them to write out
1548 * log_buf[start] to log_buf[end - 1]. 1692 * log_buf[start] to log_buf[end - 1].
1549 * The console_lock must be held. 1693 * The console_lock must be held.
@@ -1749,12 +1893,19 @@ asmlinkage int vprintk_emit(int facility, int level,
1749 /* If called from the scheduler, we can not call up(). */ 1893 /* If called from the scheduler, we can not call up(). */
1750 if (!in_sched) { 1894 if (!in_sched) {
1751 /* 1895 /*
1896 * Disable preemption to avoid being preempted while holding
1897 * console_sem which would prevent anyone from printing to
1898 * console
1899 */
1900 preempt_disable();
1901 /*
1752 * Try to acquire and then immediately release the console 1902 * Try to acquire and then immediately release the console
1753 * semaphore. The release will print out buffers and wake up 1903 * semaphore. The release will print out buffers and wake up
1754 * /dev/kmsg and syslog() users. 1904 * /dev/kmsg and syslog() users.
1755 */ 1905 */
1756 if (console_trylock()) 1906 if (console_trylock_spinning())
1757 console_unlock(); 1907 console_unlock();
1908 preempt_enable();
1758 } 1909 }
1759 1910
1760 return printed_len; 1911 return printed_len;
@@ -1855,6 +2006,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
1855static ssize_t msg_print_ext_body(char *buf, size_t size, 2006static ssize_t msg_print_ext_body(char *buf, size_t size,
1856 char *dict, size_t dict_len, 2007 char *dict, size_t dict_len,
1857 char *text, size_t text_len) { return 0; } 2008 char *text, size_t text_len) { return 0; }
2009static void console_lock_spinning_enable(void) { }
2010static int console_lock_spinning_disable_and_check(void) { return 0; }
1858static void call_console_drivers(const char *ext_text, size_t ext_len, 2011static void call_console_drivers(const char *ext_text, size_t ext_len,
1859 const char *text, size_t len) {} 2012 const char *text, size_t len) {}
1860static size_t msg_print_text(const struct printk_log *msg, 2013static size_t msg_print_text(const struct printk_log *msg,
@@ -1913,6 +2066,17 @@ static int __add_preferred_console(char *name, int idx, char *options,
1913 c->index = idx; 2066 c->index = idx;
1914 return 0; 2067 return 0;
1915} 2068}
2069
2070static int __init console_msg_format_setup(char *str)
2071{
2072 if (!strcmp(str, "syslog"))
2073 console_msg_format = MSG_FORMAT_SYSLOG;
2074 if (!strcmp(str, "default"))
2075 console_msg_format = MSG_FORMAT_DEFAULT;
2076 return 1;
2077}
2078__setup("console_msg_format=", console_msg_format_setup);
2079
1916/* 2080/*
1917 * Set up a console. Called via do_early_param() in init/main.c 2081 * Set up a console. Called via do_early_param() in init/main.c
1918 * for each "console=" parameter in the boot command line. 2082 * for each "console=" parameter in the boot command line.
@@ -2069,20 +2233,7 @@ int console_trylock(void)
2069 return 0; 2233 return 0;
2070 } 2234 }
2071 console_locked = 1; 2235 console_locked = 1;
2072 /* 2236 console_may_schedule = 0;
2073 * When PREEMPT_COUNT disabled we can't reliably detect if it's
2074 * safe to schedule (e.g. calling printk while holding a spin_lock),
2075 * because preempt_disable()/preempt_enable() are just barriers there
2076 * and preempt_count() is always 0.
2077 *
2078 * RCU read sections have a separate preemption counter when
2079 * PREEMPT_RCU enabled thus we must take extra care and check
2080 * rcu_preempt_depth(), otherwise RCU read sections modify
2081 * preempt_count().
2082 */
2083 console_may_schedule = !oops_in_progress &&
2084 preemptible() &&
2085 !rcu_preempt_depth();
2086 return 1; 2237 return 1;
2087} 2238}
2088EXPORT_SYMBOL(console_trylock); 2239EXPORT_SYMBOL(console_trylock);
@@ -2215,7 +2366,10 @@ skip:
2215 goto skip; 2366 goto skip;
2216 } 2367 }
2217 2368
2218 len += msg_print_text(msg, false, text + len, sizeof(text) - len); 2369 len += msg_print_text(msg,
2370 console_msg_format & MSG_FORMAT_SYSLOG,
2371 text + len,
2372 sizeof(text) - len);
2219 if (nr_ext_console_drivers) { 2373 if (nr_ext_console_drivers) {
2220 ext_len = msg_print_ext_header(ext_text, 2374 ext_len = msg_print_ext_header(ext_text,
2221 sizeof(ext_text), 2375 sizeof(ext_text),
@@ -2229,14 +2383,29 @@ skip:
2229 console_seq++; 2383 console_seq++;
2230 raw_spin_unlock(&logbuf_lock); 2384 raw_spin_unlock(&logbuf_lock);
2231 2385
2386 /*
2387 * While actively printing out messages, if another printk()
2388 * were to occur on another CPU, it may wait for this one to
2389 * finish. This task can not be preempted if there is a
2390 * waiter waiting to take over.
2391 */
2392 console_lock_spinning_enable();
2393
2232 stop_critical_timings(); /* don't trace print latency */ 2394 stop_critical_timings(); /* don't trace print latency */
2233 call_console_drivers(ext_text, ext_len, text, len); 2395 call_console_drivers(ext_text, ext_len, text, len);
2234 start_critical_timings(); 2396 start_critical_timings();
2397
2398 if (console_lock_spinning_disable_and_check()) {
2399 printk_safe_exit_irqrestore(flags);
2400 return;
2401 }
2402
2235 printk_safe_exit_irqrestore(flags); 2403 printk_safe_exit_irqrestore(flags);
2236 2404
2237 if (do_cond_resched) 2405 if (do_cond_resched)
2238 cond_resched(); 2406 cond_resched();
2239 } 2407 }
2408
2240 console_locked = 0; 2409 console_locked = 0;
2241 2410
2242 /* Release the exclusive_console once it is used */ 2411 /* Release the exclusive_console once it is used */
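
The console_owner/console_waiter pair above turns the tail of each console_unlock() iteration into a hand-off point: a second printer registers as the waiter and spins, and the current owner passes console_sem to it instead of releasing and re-taking it. A userspace analogue of that hand-off, built on a semaphore plus a small owner lock; this illustrates the pattern only and is not the kernel code:

#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static sem_t console;				/* plays the role of console_sem */
static pthread_mutex_t owner_lock = PTHREAD_MUTEX_INITIALIZER;
static bool owner_active;			/* "console_owner != NULL" */
static atomic_bool waiter;			/* "console_waiter" */

static void *printer(void *arg)
{
	long id = (long)arg;
	bool spin = false;

	if (sem_trywait(&console) != 0) {
		/* Lock is busy: spin only if someone is actively printing. */
		pthread_mutex_lock(&owner_lock);
		if (owner_active && !atomic_load(&waiter)) {
			atomic_store(&waiter, true);
			spin = true;
		}
		pthread_mutex_unlock(&owner_lock);
		if (!spin)
			return NULL;	/* the current holder flushes for us */
		while (atomic_load(&waiter))
			;		/* cpu_relax() in the kernel version */
		/* The owner handed the semaphore to us without posting it. */
	}

	pthread_mutex_lock(&owner_lock);
	owner_active = true;		/* from here on, spinning on us is safe */
	pthread_mutex_unlock(&owner_lock);

	printf("printer %ld: flushing records\n", id);	/* call_console_drivers() */

	pthread_mutex_lock(&owner_lock);
	owner_active = false;
	bool handoff = atomic_exchange(&waiter, false);
	pthread_mutex_unlock(&owner_lock);

	if (!handoff)
		sem_post(&console);	/* nobody waiting: release normally */
	return NULL;			/* otherwise the waiter is the new owner */
}

int main(void)
{
	pthread_t a, b;

	sem_init(&console, 0, 1);
	pthread_create(&a, NULL, printer, (void *)0L);
	pthread_create(&b, NULL, printer, (void *)1L);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	sem_destroy(&console);
	return 0;
}

The owner_lock here mirrors console_owner_lock: the waiter flag is only set while an owner is marked active, and the owner clears its mark and collects the flag under the same lock, so the hand-off can never be missed.
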
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 84b1367935e4..5e1d713c8e61 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -659,7 +659,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
659 if (lock_task_sighand(child, &flags)) { 659 if (lock_task_sighand(child, &flags)) {
660 error = -EINVAL; 660 error = -EINVAL;
661 if (likely(child->last_siginfo != NULL)) { 661 if (likely(child->last_siginfo != NULL)) {
662 *info = *child->last_siginfo; 662 copy_siginfo(info, child->last_siginfo);
663 error = 0; 663 error = 0;
664 } 664 }
665 unlock_task_sighand(child, &flags); 665 unlock_task_sighand(child, &flags);
@@ -675,7 +675,7 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
675 if (lock_task_sighand(child, &flags)) { 675 if (lock_task_sighand(child, &flags)) {
676 error = -EINVAL; 676 error = -EINVAL;
677 if (likely(child->last_siginfo != NULL)) { 677 if (likely(child->last_siginfo != NULL)) {
678 *child->last_siginfo = *info; 678 copy_siginfo(child->last_siginfo, info);
679 error = 0; 679 error = 0;
680 } 680 }
681 unlock_task_sighand(child, &flags); 681 unlock_task_sighand(child, &flags);
@@ -1092,6 +1092,10 @@ int ptrace_request(struct task_struct *child, long request,
1092 ret = seccomp_get_filter(child, addr, datavp); 1092 ret = seccomp_get_filter(child, addr, datavp);
1093 break; 1093 break;
1094 1094
1095 case PTRACE_SECCOMP_GET_METADATA:
1096 ret = seccomp_get_metadata(child, addr, datavp);
1097 break;
1098
1095 default: 1099 default:
1096 break; 1100 break;
1097 } 1101 }
@@ -1226,7 +1230,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1226 break; 1230 break;
1227 1231
1228 case PTRACE_SETSIGINFO: 1232 case PTRACE_SETSIGINFO:
1229 memset(&siginfo, 0, sizeof siginfo);
1230 if (copy_siginfo_from_user32( 1233 if (copy_siginfo_from_user32(
1231 &siginfo, (struct compat_siginfo __user *) datap)) 1234 &siginfo, (struct compat_siginfo __user *) datap))
1232 ret = -EFAULT; 1235 ret = -EFAULT;
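
The new PTRACE_SECCOMP_GET_METADATA request lets a tracer read back the flags of a tracee's attached seccomp filter by index. A heavily hedged userspace sketch follows; the request number, the struct seccomp_metadata layout (filter_off in, flags out) and the "addr carries the struct size" convention are assumptions to verify against this kernel's uapi headers, and the target pid is hypothetical:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/ptrace.h>

#ifndef PTRACE_SECCOMP_GET_METADATA
#define PTRACE_SECCOMP_GET_METADATA 0x420d	/* assumed request number */
#endif

/* Assumed uapi layout -- check include/uapi/linux/ptrace.h. */
struct seccomp_metadata {
	uint64_t filter_off;	/* in: filter index (0 = most recent, assumed) */
	uint64_t flags;		/* out: SECCOMP_FILTER_FLAG_* of that filter */
};

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : -1;
	struct seccomp_metadata md = { .filter_off = 0 };

	/* The tracee must already be ptrace-attached and stopped. */
	if (ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md) < 0) {
		perror("PTRACE_SECCOMP_GET_METADATA");
		return 1;
	}
	printf("filter 0 flags: 0x%llx\n", (unsigned long long)md.flags);
	return 0;
}
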
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index fbd56d6e575b..68fa19a5e7bd 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -422,11 +422,13 @@ void init_rcu_head(struct rcu_head *head)
422{ 422{
423 debug_object_init(head, &rcuhead_debug_descr); 423 debug_object_init(head, &rcuhead_debug_descr);
424} 424}
425EXPORT_SYMBOL_GPL(init_rcu_head);
425 426
426void destroy_rcu_head(struct rcu_head *head) 427void destroy_rcu_head(struct rcu_head *head)
427{ 428{
428 debug_object_free(head, &rcuhead_debug_descr); 429 debug_object_free(head, &rcuhead_debug_descr);
429} 430}
431EXPORT_SYMBOL_GPL(destroy_rcu_head);
430 432
431static bool rcuhead_is_static_object(void *addr) 433static bool rcuhead_is_static_object(void *addr)
432{ 434{
diff --git a/kernel/relay.c b/kernel/relay.c
index 39a9dfc69486..41280033a4c5 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -919,9 +919,9 @@ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
919 * 919 *
920 * Poll implementation. 920 * Poll implementation.
921 */ 921 */
922static unsigned int relay_file_poll(struct file *filp, poll_table *wait) 922static __poll_t relay_file_poll(struct file *filp, poll_table *wait)
923{ 923{
924 unsigned int mask = 0; 924 __poll_t mask = 0;
925 struct rchan_buf *buf = filp->private_data; 925 struct rchan_buf *buf = filp->private_data;
926 926
927 if (buf->finalized) 927 if (buf->finalized)
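
This is one of several conversions in this merge from unsigned int to the sparse-checked __poll_t type for poll handlers (posix-clock and the tracing files below get the same treatment). A minimal sketch of the resulting shape of a poll method, with an illustrative device structure:

#include <linux/poll.h>
#include <linux/fs.h>

struct demo_dev {
        wait_queue_head_t wait;
        bool data_ready;
};

static __poll_t demo_poll(struct file *filp, poll_table *wait)
{
        struct demo_dev *dev = filp->private_data;
        __poll_t mask = 0;

        poll_wait(filp, &dev->wait, wait);
        if (dev->data_ready)
                mask |= POLLIN | POLLRDNORM;
        return mask;
}
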
diff --git a/kernel/resource.c b/kernel/resource.c
index 54ba6de3757c..8c527d83ca76 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1022,6 +1022,7 @@ static void __init __reserve_region_with_split(struct resource *root,
1022 struct resource *conflict; 1022 struct resource *conflict;
1023 struct resource *res = alloc_resource(GFP_ATOMIC); 1023 struct resource *res = alloc_resource(GFP_ATOMIC);
1024 struct resource *next_res = NULL; 1024 struct resource *next_res = NULL;
1025 int type = resource_type(root);
1025 1026
1026 if (!res) 1027 if (!res)
1027 return; 1028 return;
@@ -1029,7 +1030,7 @@ static void __init __reserve_region_with_split(struct resource *root,
1029 res->name = name; 1030 res->name = name;
1030 res->start = start; 1031 res->start = start;
1031 res->end = end; 1032 res->end = end;
1032 res->flags = IORESOURCE_BUSY; 1033 res->flags = type | IORESOURCE_BUSY;
1033 res->desc = IORES_DESC_NONE; 1034 res->desc = IORES_DESC_NONE;
1034 1035
1035 while (1) { 1036 while (1) {
@@ -1064,7 +1065,7 @@ static void __init __reserve_region_with_split(struct resource *root,
1064 next_res->name = name; 1065 next_res->name = name;
1065 next_res->start = conflict->end + 1; 1066 next_res->start = conflict->end + 1;
1066 next_res->end = end; 1067 next_res->end = end;
1067 next_res->flags = IORESOURCE_BUSY; 1068 next_res->flags = type | IORESOURCE_BUSY;
1068 next_res->desc = IORES_DESC_NONE; 1069 next_res->desc = IORES_DESC_NONE;
1069 } 1070 }
1070 } else { 1071 } else {
@@ -1478,7 +1479,7 @@ void __devm_release_region(struct device *dev, struct resource *parent,
1478EXPORT_SYMBOL(__devm_release_region); 1479EXPORT_SYMBOL(__devm_release_region);
1479 1480
1480/* 1481/*
1481 * Called from init/main.c to reserve IO ports. 1482 * Reserve I/O ports or memory based on "reserve=" kernel parameter.
1482 */ 1483 */
1483#define MAXRESERVE 4 1484#define MAXRESERVE 4
1484static int __init reserve_setup(char *str) 1485static int __init reserve_setup(char *str)
@@ -1489,26 +1490,38 @@ static int __init reserve_setup(char *str)
1489 for (;;) { 1490 for (;;) {
1490 unsigned int io_start, io_num; 1491 unsigned int io_start, io_num;
1491 int x = reserved; 1492 int x = reserved;
1493 struct resource *parent;
1492 1494
1493 if (get_option (&str, &io_start) != 2) 1495 if (get_option(&str, &io_start) != 2)
1494 break; 1496 break;
1495 if (get_option (&str, &io_num) == 0) 1497 if (get_option(&str, &io_num) == 0)
1496 break; 1498 break;
1497 if (x < MAXRESERVE) { 1499 if (x < MAXRESERVE) {
1498 struct resource *res = reserve + x; 1500 struct resource *res = reserve + x;
1501
1502 /*
1503 * If the region starts below 0x10000, we assume it's
1504 * I/O port space; otherwise assume it's memory.
1505 */
1506 if (io_start < 0x10000) {
1507 res->flags = IORESOURCE_IO;
1508 parent = &ioport_resource;
1509 } else {
1510 res->flags = IORESOURCE_MEM;
1511 parent = &iomem_resource;
1512 }
1499 res->name = "reserved"; 1513 res->name = "reserved";
1500 res->start = io_start; 1514 res->start = io_start;
1501 res->end = io_start + io_num - 1; 1515 res->end = io_start + io_num - 1;
1502 res->flags = IORESOURCE_BUSY; 1516 res->flags |= IORESOURCE_BUSY;
1503 res->desc = IORES_DESC_NONE; 1517 res->desc = IORES_DESC_NONE;
1504 res->child = NULL; 1518 res->child = NULL;
1505 if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) 1519 if (request_resource(parent, res) == 0)
1506 reserved = x+1; 1520 reserved = x+1;
1507 } 1521 }
1508 } 1522 }
1509 return 1; 1523 return 1;
1510} 1524}
1511
1512__setup("reserve=", reserve_setup); 1525__setup("reserve=", reserve_setup);
1513 1526
1514/* 1527/*
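
With the hunks above, the reserve= boot parameter classifies each region by its start address: regions starting below 0x10000 are registered as I/O port space under ioport_resource, anything higher as memory under iomem_resource, and __reserve_region_with_split() now carries the parent's resource type into the pieces it creates. As an illustrative example (values made up), reserve=0x320,32 would claim I/O ports 0x320 through 0x33f, while reserve=0xfed40000,0x1000 would be treated as a reserved memory region.
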
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index a43df5193538..bb4b9fe026a1 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,13 +1,12 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4#include <linux/proc_fs.h> 2#include <linux/proc_fs.h>
5#include <linux/seq_file.h> 3#include <linux/seq_file.h>
6#include <linux/kallsyms.h>
7#include <linux/utsname.h> 4#include <linux/utsname.h>
8#include <linux/security.h> 5#include <linux/security.h>
9#include <linux/export.h> 6#include <linux/export.h>
10 7
8#include "sched.h"
9
11unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 10unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
12static struct autogroup autogroup_default; 11static struct autogroup autogroup_default;
13static atomic_t autogroup_seq_nr; 12static atomic_t autogroup_seq_nr;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5f0dfb2abb8d..940fa408a288 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -515,7 +515,7 @@ void put_seccomp_filter(struct task_struct *tsk)
515 515
516static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) 516static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
517{ 517{
518 memset(info, 0, sizeof(*info)); 518 clear_siginfo(info);
519 info->si_signo = SIGSYS; 519 info->si_signo = SIGSYS;
520 info->si_code = SYS_SECCOMP; 520 info->si_code = SYS_SECCOMP;
521 info->si_call_addr = (void __user *)KSTK_EIP(current); 521 info->si_call_addr = (void __user *)KSTK_EIP(current);
@@ -978,49 +978,68 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
978} 978}
979 979
980#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) 980#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
981long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, 981static struct seccomp_filter *get_nth_filter(struct task_struct *task,
982 void __user *data) 982 unsigned long filter_off)
983{ 983{
984 struct seccomp_filter *filter; 984 struct seccomp_filter *orig, *filter;
985 struct sock_fprog_kern *fprog; 985 unsigned long count;
986 long ret;
987 unsigned long count = 0;
988
989 if (!capable(CAP_SYS_ADMIN) ||
990 current->seccomp.mode != SECCOMP_MODE_DISABLED) {
991 return -EACCES;
992 }
993 986
987 /*
988 * Note: this is only correct because the caller should be the (ptrace)
989 * tracer of the task, otherwise lock_task_sighand is needed.
990 */
994 spin_lock_irq(&task->sighand->siglock); 991 spin_lock_irq(&task->sighand->siglock);
992
995 if (task->seccomp.mode != SECCOMP_MODE_FILTER) { 993 if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
996 ret = -EINVAL; 994 spin_unlock_irq(&task->sighand->siglock);
997 goto out; 995 return ERR_PTR(-EINVAL);
998 } 996 }
999 997
1000 filter = task->seccomp.filter; 998 orig = task->seccomp.filter;
1001 while (filter) { 999 __get_seccomp_filter(orig);
1002 filter = filter->prev; 1000 spin_unlock_irq(&task->sighand->siglock);
1001
1002 count = 0;
1003 for (filter = orig; filter; filter = filter->prev)
1003 count++; 1004 count++;
1004 }
1005 1005
1006 if (filter_off >= count) { 1006 if (filter_off >= count) {
1007 ret = -ENOENT; 1007 filter = ERR_PTR(-ENOENT);
1008 goto out; 1008 goto out;
1009 } 1009 }
1010 count -= filter_off;
1011 1010
1012 filter = task->seccomp.filter; 1011 count -= filter_off;
1013 while (filter && count > 1) { 1012 for (filter = orig; filter && count > 1; filter = filter->prev)
1014 filter = filter->prev;
1015 count--; 1013 count--;
1016 }
1017 1014
1018 if (WARN_ON(count != 1 || !filter)) { 1015 if (WARN_ON(count != 1 || !filter)) {
1019 /* The filter tree shouldn't shrink while we're using it. */ 1016 filter = ERR_PTR(-ENOENT);
1020 ret = -ENOENT;
1021 goto out; 1017 goto out;
1022 } 1018 }
1023 1019
1020 __get_seccomp_filter(filter);
1021
1022out:
1023 __put_seccomp_filter(orig);
1024 return filter;
1025}
1026
1027long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
1028 void __user *data)
1029{
1030 struct seccomp_filter *filter;
1031 struct sock_fprog_kern *fprog;
1032 long ret;
1033
1034 if (!capable(CAP_SYS_ADMIN) ||
1035 current->seccomp.mode != SECCOMP_MODE_DISABLED) {
1036 return -EACCES;
1037 }
1038
1039 filter = get_nth_filter(task, filter_off);
1040 if (IS_ERR(filter))
1041 return PTR_ERR(filter);
1042
1024 fprog = filter->prog->orig_prog; 1043 fprog = filter->prog->orig_prog;
1025 if (!fprog) { 1044 if (!fprog) {
1026 /* This must be a new non-cBPF filter, since we save 1045 /* This must be a new non-cBPF filter, since we save
@@ -1035,17 +1054,44 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
1035 if (!data) 1054 if (!data)
1036 goto out; 1055 goto out;
1037 1056
1038 __get_seccomp_filter(filter);
1039 spin_unlock_irq(&task->sighand->siglock);
1040
1041 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) 1057 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
1042 ret = -EFAULT; 1058 ret = -EFAULT;
1043 1059
1060out:
1044 __put_seccomp_filter(filter); 1061 __put_seccomp_filter(filter);
1045 return ret; 1062 return ret;
1063}
1046 1064
1047out: 1065long seccomp_get_metadata(struct task_struct *task,
1048 spin_unlock_irq(&task->sighand->siglock); 1066 unsigned long size, void __user *data)
1067{
1068 long ret;
1069 struct seccomp_filter *filter;
1070 struct seccomp_metadata kmd = {};
1071
1072 if (!capable(CAP_SYS_ADMIN) ||
1073 current->seccomp.mode != SECCOMP_MODE_DISABLED) {
1074 return -EACCES;
1075 }
1076
1077 size = min_t(unsigned long, size, sizeof(kmd));
1078
1079 if (copy_from_user(&kmd, data, size))
1080 return -EFAULT;
1081
1082 filter = get_nth_filter(task, kmd.filter_off);
1083 if (IS_ERR(filter))
1084 return PTR_ERR(filter);
1085
1086 memset(&kmd, 0, sizeof(kmd));
1087 if (filter->log)
1088 kmd.flags |= SECCOMP_FILTER_FLAG_LOG;
1089
1090 ret = size;
1091 if (copy_to_user(data, &kmd, size))
1092 ret = -EFAULT;
1093
1094 __put_seccomp_filter(filter);
1049 return ret; 1095 return ret;
1050} 1096}
1051#endif 1097#endif
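
The seccomp changes factor the n-th filter lookup into get_nth_filter(), which takes a reference to the newest filter under siglock and walks the ->prev chain with the lock dropped; seccomp_get_filter() and the new seccomp_get_metadata() both build on it, the latter reporting per-filter flags such as SECCOMP_FILTER_FLAG_LOG. For context, a hedged userspace sketch of installing a filter with that flag set, which is exactly what the metadata call later reports back; the headers and the allow-everything program are illustrative, and SECCOMP_FILTER_FLAG_LOG needs 4.14+ headers.

#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int demo_install_logging_filter(void)
{
        struct sock_filter insns[] = {
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len = sizeof(insns) / sizeof(insns[0]),
                .filter = insns,
        };

        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                return -1;
        /* glibc has no seccomp() wrapper, so go through syscall(2) */
        if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER,
                    SECCOMP_FILTER_FLAG_LOG, &prog))
                return -1;
        return 0;
}
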
diff --git a/kernel/signal.c b/kernel/signal.c
index 9558664bd9ec..c6e4c83dc090 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -40,6 +40,7 @@
40#include <linux/cn_proc.h> 40#include <linux/cn_proc.h>
41#include <linux/compiler.h> 41#include <linux/compiler.h>
42#include <linux/posix-timers.h> 42#include <linux/posix-timers.h>
43#include <linux/livepatch.h>
43 44
44#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
45#include <trace/events/signal.h> 46#include <trace/events/signal.h>
@@ -165,7 +166,8 @@ void recalc_sigpending_and_wake(struct task_struct *t)
165 166
166void recalc_sigpending(void) 167void recalc_sigpending(void)
167{ 168{
168 if (!recalc_sigpending_tsk(current) && !freezing(current)) 169 if (!recalc_sigpending_tsk(current) && !freezing(current) &&
170 !klp_patch_pending(current))
169 clear_thread_flag(TIF_SIGPENDING); 171 clear_thread_flag(TIF_SIGPENDING);
170 172
171} 173}
@@ -549,6 +551,7 @@ still_pending:
549 * a fast-pathed signal or we must have been 551 * a fast-pathed signal or we must have been
550 * out of queue space. So zero out the info. 552 * out of queue space. So zero out the info.
551 */ 553 */
554 clear_siginfo(info);
552 info->si_signo = sig; 555 info->si_signo = sig;
553 info->si_errno = 0; 556 info->si_errno = 0;
554 info->si_code = SI_USER; 557 info->si_code = SI_USER;
@@ -642,6 +645,9 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
642 spin_unlock(&tsk->sighand->siglock); 645 spin_unlock(&tsk->sighand->siglock);
643 posixtimer_rearm(info); 646 posixtimer_rearm(info);
644 spin_lock(&tsk->sighand->siglock); 647 spin_lock(&tsk->sighand->siglock);
648
649 /* Don't expose the si_sys_private value to userspace */
650 info->si_sys_private = 0;
645 } 651 }
646#endif 652#endif
647 return signr; 653 return signr;
@@ -1043,6 +1049,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1043 list_add_tail(&q->list, &pending->list); 1049 list_add_tail(&q->list, &pending->list);
1044 switch ((unsigned long) info) { 1050 switch ((unsigned long) info) {
1045 case (unsigned long) SEND_SIG_NOINFO: 1051 case (unsigned long) SEND_SIG_NOINFO:
1052 clear_siginfo(&q->info);
1046 q->info.si_signo = sig; 1053 q->info.si_signo = sig;
1047 q->info.si_errno = 0; 1054 q->info.si_errno = 0;
1048 q->info.si_code = SI_USER; 1055 q->info.si_code = SI_USER;
@@ -1051,6 +1058,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1051 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 1058 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
1052 break; 1059 break;
1053 case (unsigned long) SEND_SIG_PRIV: 1060 case (unsigned long) SEND_SIG_PRIV:
1061 clear_siginfo(&q->info);
1054 q->info.si_signo = sig; 1062 q->info.si_signo = sig;
1055 q->info.si_errno = 0; 1063 q->info.si_errno = 0;
1056 q->info.si_code = SI_KERNEL; 1064 q->info.si_code = SI_KERNEL;
@@ -1485,6 +1493,129 @@ force_sigsegv(int sig, struct task_struct *p)
1485 return 0; 1493 return 0;
1486} 1494}
1487 1495
1496int force_sig_fault(int sig, int code, void __user *addr
1497 ___ARCH_SI_TRAPNO(int trapno)
1498 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
1499 , struct task_struct *t)
1500{
1501 struct siginfo info;
1502
1503 clear_siginfo(&info);
1504 info.si_signo = sig;
1505 info.si_errno = 0;
1506 info.si_code = code;
1507 info.si_addr = addr;
1508#ifdef __ARCH_SI_TRAPNO
1509 info.si_trapno = trapno;
1510#endif
1511#ifdef __ia64__
1512 info.si_imm = imm;
1513 info.si_flags = flags;
1514 info.si_isr = isr;
1515#endif
1516 return force_sig_info(info.si_signo, &info, t);
1517}
1518
1519int send_sig_fault(int sig, int code, void __user *addr
1520 ___ARCH_SI_TRAPNO(int trapno)
1521 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
1522 , struct task_struct *t)
1523{
1524 struct siginfo info;
1525
1526 clear_siginfo(&info);
1527 info.si_signo = sig;
1528 info.si_errno = 0;
1529 info.si_code = code;
1530 info.si_addr = addr;
1531#ifdef __ARCH_SI_TRAPNO
1532 info.si_trapno = trapno;
1533#endif
1534#ifdef __ia64__
1535 info.si_imm = imm;
1536 info.si_flags = flags;
1537 info.si_isr = isr;
1538#endif
1539 return send_sig_info(info.si_signo, &info, t);
1540}
1541
1542#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR)
1543int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
1544{
1545 struct siginfo info;
1546
1547 WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
1548 clear_siginfo(&info);
1549 info.si_signo = SIGBUS;
1550 info.si_errno = 0;
1551 info.si_code = code;
1552 info.si_addr = addr;
1553 info.si_addr_lsb = lsb;
1554 return force_sig_info(info.si_signo, &info, t);
1555}
1556
1557int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
1558{
1559 struct siginfo info;
1560
1561 WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
1562 clear_siginfo(&info);
1563 info.si_signo = SIGBUS;
1564 info.si_errno = 0;
1565 info.si_code = code;
1566 info.si_addr = addr;
1567 info.si_addr_lsb = lsb;
1568 return send_sig_info(info.si_signo, &info, t);
1569}
1570EXPORT_SYMBOL(send_sig_mceerr);
1571#endif
1572
1573#ifdef SEGV_BNDERR
1574int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
1575{
1576 struct siginfo info;
1577
1578 clear_siginfo(&info);
1579 info.si_signo = SIGSEGV;
1580 info.si_errno = 0;
1581 info.si_code = SEGV_BNDERR;
1582 info.si_addr = addr;
1583 info.si_lower = lower;
1584 info.si_upper = upper;
1585 return force_sig_info(info.si_signo, &info, current);
1586}
1587#endif
1588
1589#ifdef SEGV_PKUERR
1590int force_sig_pkuerr(void __user *addr, u32 pkey)
1591{
1592 struct siginfo info;
1593
1594 clear_siginfo(&info);
1595 info.si_signo = SIGSEGV;
1596 info.si_errno = 0;
1597 info.si_code = SEGV_PKUERR;
1598 info.si_addr = addr;
1599 info.si_pkey = pkey;
1600 return force_sig_info(info.si_signo, &info, current);
1601}
1602#endif
1603
1604/* For the crazy architectures that include trap information in
1605 * the errno field, instead of an actual errno value.
1606 */
1607int force_sig_ptrace_errno_trap(int errno, void __user *addr)
1608{
1609 struct siginfo info;
1610
1611 clear_siginfo(&info);
1612 info.si_signo = SIGTRAP;
1613 info.si_errno = errno;
1614 info.si_code = TRAP_HWBKPT;
1615 info.si_addr = addr;
1616 return force_sig_info(info.si_signo, &info, current);
1617}
1618
1488int kill_pgrp(struct pid *pid, int sig, int priv) 1619int kill_pgrp(struct pid *pid, int sig, int priv)
1489{ 1620{
1490 int ret; 1621 int ret;
@@ -1623,6 +1754,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1623 sig = SIGCHLD; 1754 sig = SIGCHLD;
1624 } 1755 }
1625 1756
1757 clear_siginfo(&info);
1626 info.si_signo = sig; 1758 info.si_signo = sig;
1627 info.si_errno = 0; 1759 info.si_errno = 0;
1628 /* 1760 /*
@@ -1717,6 +1849,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1717 parent = tsk->real_parent; 1849 parent = tsk->real_parent;
1718 } 1850 }
1719 1851
1852 clear_siginfo(&info);
1720 info.si_signo = SIGCHLD; 1853 info.si_signo = SIGCHLD;
1721 info.si_errno = 0; 1854 info.si_errno = 0;
1722 /* 1855 /*
@@ -1929,7 +2062,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1929{ 2062{
1930 siginfo_t info; 2063 siginfo_t info;
1931 2064
1932 memset(&info, 0, sizeof info); 2065 clear_siginfo(&info);
1933 info.si_signo = signr; 2066 info.si_signo = signr;
1934 info.si_code = exit_code; 2067 info.si_code = exit_code;
1935 info.si_pid = task_pid_vnr(current); 2068 info.si_pid = task_pid_vnr(current);
@@ -2136,6 +2269,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
2136 * have updated *info via PTRACE_SETSIGINFO. 2269 * have updated *info via PTRACE_SETSIGINFO.
2137 */ 2270 */
2138 if (signr != info->si_signo) { 2271 if (signr != info->si_signo) {
2272 clear_siginfo(info);
2139 info->si_signo = signr; 2273 info->si_signo = signr;
2140 info->si_errno = 0; 2274 info->si_errno = 0;
2141 info->si_code = SI_USER; 2275 info->si_code = SI_USER;
@@ -2688,9 +2822,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
2688#endif 2822#endif
2689 [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, 2823 [SIGCHLD] = { NSIGCHLD, SIL_CHLD },
2690 [SIGPOLL] = { NSIGPOLL, SIL_POLL }, 2824 [SIGPOLL] = { NSIGPOLL, SIL_POLL },
2691#ifdef __ARCH_SIGSYS
2692 [SIGSYS] = { NSIGSYS, SIL_SYS }, 2825 [SIGSYS] = { NSIGSYS, SIL_SYS },
2693#endif
2694 }; 2826 };
2695 if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) 2827 if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
2696 layout = filter[sig].layout; 2828 layout = filter[sig].layout;
@@ -2712,12 +2844,14 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
2712 if ((sig == SIGFPE) && (si_code == FPE_FIXME)) 2844 if ((sig == SIGFPE) && (si_code == FPE_FIXME))
2713 layout = SIL_FAULT; 2845 layout = SIL_FAULT;
2714#endif 2846#endif
2847#ifdef BUS_FIXME
2848 if ((sig == SIGBUS) && (si_code == BUS_FIXME))
2849 layout = SIL_FAULT;
2850#endif
2715 } 2851 }
2716 return layout; 2852 return layout;
2717} 2853}
2718 2854
2719#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2720
2721int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) 2855int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2722{ 2856{
2723 int err; 2857 int err;
@@ -2756,13 +2890,21 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2756#ifdef __ARCH_SI_TRAPNO 2890#ifdef __ARCH_SI_TRAPNO
2757 err |= __put_user(from->si_trapno, &to->si_trapno); 2891 err |= __put_user(from->si_trapno, &to->si_trapno);
2758#endif 2892#endif
2759#ifdef BUS_MCEERR_AO 2893#ifdef __ia64__
2894 err |= __put_user(from->si_imm, &to->si_imm);
2895 err |= __put_user(from->si_flags, &to->si_flags);
2896 err |= __put_user(from->si_isr, &to->si_isr);
2897#endif
2760 /* 2898 /*
2761 * Other callers might not initialize the si_lsb field, 2899 * Other callers might not initialize the si_lsb field,
2762 * so check explicitly for the right codes here. 2900 * so check explicitly for the right codes here.
2763 */ 2901 */
2764 if (from->si_signo == SIGBUS && 2902#ifdef BUS_MCEERR_AR
2765 (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) 2903 if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR)
2904 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2905#endif
2906#ifdef BUS_MCEERR_AO
2907 if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO)
2766 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2908 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2767#endif 2909#endif
2768#ifdef SEGV_BNDERR 2910#ifdef SEGV_BNDERR
@@ -2788,18 +2930,185 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2788 err |= __put_user(from->si_uid, &to->si_uid); 2930 err |= __put_user(from->si_uid, &to->si_uid);
2789 err |= __put_user(from->si_ptr, &to->si_ptr); 2931 err |= __put_user(from->si_ptr, &to->si_ptr);
2790 break; 2932 break;
2791#ifdef __ARCH_SIGSYS
2792 case SIL_SYS: 2933 case SIL_SYS:
2793 err |= __put_user(from->si_call_addr, &to->si_call_addr); 2934 err |= __put_user(from->si_call_addr, &to->si_call_addr);
2794 err |= __put_user(from->si_syscall, &to->si_syscall); 2935 err |= __put_user(from->si_syscall, &to->si_syscall);
2795 err |= __put_user(from->si_arch, &to->si_arch); 2936 err |= __put_user(from->si_arch, &to->si_arch);
2796 break; 2937 break;
2797#endif
2798 } 2938 }
2799 return err; 2939 return err;
2800} 2940}
2801 2941
2942#ifdef CONFIG_COMPAT
2943int copy_siginfo_to_user32(struct compat_siginfo __user *to,
2944 const struct siginfo *from)
2945#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
2946{
2947 return __copy_siginfo_to_user32(to, from, in_x32_syscall());
2948}
2949int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
2950 const struct siginfo *from, bool x32_ABI)
2951#endif
2952{
2953 struct compat_siginfo new;
2954 memset(&new, 0, sizeof(new));
2955
2956 new.si_signo = from->si_signo;
2957 new.si_errno = from->si_errno;
2958 new.si_code = from->si_code;
2959 switch(siginfo_layout(from->si_signo, from->si_code)) {
2960 case SIL_KILL:
2961 new.si_pid = from->si_pid;
2962 new.si_uid = from->si_uid;
2963 break;
2964 case SIL_TIMER:
2965 new.si_tid = from->si_tid;
2966 new.si_overrun = from->si_overrun;
2967 new.si_int = from->si_int;
2968 break;
2969 case SIL_POLL:
2970 new.si_band = from->si_band;
2971 new.si_fd = from->si_fd;
2972 break;
2973 case SIL_FAULT:
2974 new.si_addr = ptr_to_compat(from->si_addr);
2975#ifdef __ARCH_SI_TRAPNO
2976 new.si_trapno = from->si_trapno;
2977#endif
2978#ifdef BUS_MCEERR_AR
2979 if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR))
2980 new.si_addr_lsb = from->si_addr_lsb;
2981#endif
2982#ifdef BUS_MCEERR_AO
2983 if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO))
2984 new.si_addr_lsb = from->si_addr_lsb;
2985#endif
2986#ifdef SEGV_BNDERR
2987 if ((from->si_signo == SIGSEGV) &&
2988 (from->si_code == SEGV_BNDERR)) {
2989 new.si_lower = ptr_to_compat(from->si_lower);
2990 new.si_upper = ptr_to_compat(from->si_upper);
2991 }
2992#endif
2993#ifdef SEGV_PKUERR
2994 if ((from->si_signo == SIGSEGV) &&
2995 (from->si_code == SEGV_PKUERR))
2996 new.si_pkey = from->si_pkey;
2997#endif
2998
2999 break;
3000 case SIL_CHLD:
3001 new.si_pid = from->si_pid;
3002 new.si_uid = from->si_uid;
3003 new.si_status = from->si_status;
3004#ifdef CONFIG_X86_X32_ABI
3005 if (x32_ABI) {
3006 new._sifields._sigchld_x32._utime = from->si_utime;
3007 new._sifields._sigchld_x32._stime = from->si_stime;
3008 } else
3009#endif
3010 {
3011 new.si_utime = from->si_utime;
3012 new.si_stime = from->si_stime;
3013 }
3014 break;
3015 case SIL_RT:
3016 new.si_pid = from->si_pid;
3017 new.si_uid = from->si_uid;
3018 new.si_int = from->si_int;
3019 break;
3020 case SIL_SYS:
3021 new.si_call_addr = ptr_to_compat(from->si_call_addr);
3022 new.si_syscall = from->si_syscall;
3023 new.si_arch = from->si_arch;
3024 break;
3025 }
3026
3027 if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
3028 return -EFAULT;
3029
3030 return 0;
3031}
3032
3033int copy_siginfo_from_user32(struct siginfo *to,
3034 const struct compat_siginfo __user *ufrom)
3035{
3036 struct compat_siginfo from;
3037
3038 if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo)))
3039 return -EFAULT;
3040
3041 clear_siginfo(to);
3042 to->si_signo = from.si_signo;
3043 to->si_errno = from.si_errno;
3044 to->si_code = from.si_code;
3045 switch(siginfo_layout(from.si_signo, from.si_code)) {
3046 case SIL_KILL:
3047 to->si_pid = from.si_pid;
3048 to->si_uid = from.si_uid;
3049 break;
3050 case SIL_TIMER:
3051 to->si_tid = from.si_tid;
3052 to->si_overrun = from.si_overrun;
3053 to->si_int = from.si_int;
3054 break;
3055 case SIL_POLL:
3056 to->si_band = from.si_band;
3057 to->si_fd = from.si_fd;
3058 break;
3059 case SIL_FAULT:
3060 to->si_addr = compat_ptr(from.si_addr);
3061#ifdef __ARCH_SI_TRAPNO
3062 to->si_trapno = from.si_trapno;
3063#endif
3064#ifdef BUS_MCEERR_AR
3065 if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR))
3066 to->si_addr_lsb = from.si_addr_lsb;
3067#endif
3068#ifdef BUS_MCEER_AO
3069 if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO))
3070 to->si_addr_lsb = from.si_addr_lsb;
3071#endif
3072#ifdef SEGV_BNDERR
3073 if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) {
3074 to->si_lower = compat_ptr(from.si_lower);
3075 to->si_upper = compat_ptr(from.si_upper);
3076 }
3077#endif
3078#ifdef SEGV_PKUERR
3079 if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR))
3080 to->si_pkey = from.si_pkey;
3081#endif
3082 break;
3083 case SIL_CHLD:
3084 to->si_pid = from.si_pid;
3085 to->si_uid = from.si_uid;
3086 to->si_status = from.si_status;
3087#ifdef CONFIG_X86_X32_ABI
3088 if (in_x32_syscall()) {
3089 to->si_utime = from._sifields._sigchld_x32._utime;
3090 to->si_stime = from._sifields._sigchld_x32._stime;
3091 } else
2802#endif 3092#endif
3093 {
3094 to->si_utime = from.si_utime;
3095 to->si_stime = from.si_stime;
3096 }
3097 break;
3098 case SIL_RT:
3099 to->si_pid = from.si_pid;
3100 to->si_uid = from.si_uid;
3101 to->si_int = from.si_int;
3102 break;
3103 case SIL_SYS:
3104 to->si_call_addr = compat_ptr(from.si_call_addr);
3105 to->si_syscall = from.si_syscall;
3106 to->si_arch = from.si_arch;
3107 break;
3108 }
3109 return 0;
3110}
3111#endif /* CONFIG_COMPAT */
2803 3112
2804/** 3113/**
2805 * do_sigtimedwait - wait for queued signals specified in @which 3114 * do_sigtimedwait - wait for queued signals specified in @which
@@ -2937,6 +3246,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2937{ 3246{
2938 struct siginfo info; 3247 struct siginfo info;
2939 3248
3249 clear_siginfo(&info);
2940 info.si_signo = sig; 3250 info.si_signo = sig;
2941 info.si_errno = 0; 3251 info.si_errno = 0;
2942 info.si_code = SI_USER; 3252 info.si_code = SI_USER;
@@ -2978,8 +3288,9 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2978 3288
2979static int do_tkill(pid_t tgid, pid_t pid, int sig) 3289static int do_tkill(pid_t tgid, pid_t pid, int sig)
2980{ 3290{
2981 struct siginfo info = {}; 3291 struct siginfo info;
2982 3292
3293 clear_siginfo(&info);
2983 info.si_signo = sig; 3294 info.si_signo = sig;
2984 info.si_errno = 0; 3295 info.si_errno = 0;
2985 info.si_code = SI_TKILL; 3296 info.si_code = SI_TKILL;
@@ -3060,7 +3371,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
3060 int, sig, 3371 int, sig,
3061 struct compat_siginfo __user *, uinfo) 3372 struct compat_siginfo __user *, uinfo)
3062{ 3373{
3063 siginfo_t info = {}; 3374 siginfo_t info;
3064 int ret = copy_siginfo_from_user32(&info, uinfo); 3375 int ret = copy_siginfo_from_user32(&info, uinfo);
3065 if (unlikely(ret)) 3376 if (unlikely(ret))
3066 return ret; 3377 return ret;
@@ -3104,7 +3415,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3104 int, sig, 3415 int, sig,
3105 struct compat_siginfo __user *, uinfo) 3416 struct compat_siginfo __user *, uinfo)
3106{ 3417{
3107 siginfo_t info = {}; 3418 siginfo_t info;
3108 3419
3109 if (copy_siginfo_from_user32(&info, uinfo)) 3420 if (copy_siginfo_from_user32(&info, uinfo))
3110 return -EFAULT; 3421 return -EFAULT;
@@ -3677,6 +3988,7 @@ void __init signals_init(void)
3677 /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ 3988 /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */
3678 BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE 3989 BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE
3679 != offsetof(struct siginfo, _sifields._pad)); 3990 != offsetof(struct siginfo, _sifields._pad));
3991 BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE);
3680 3992
3681 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); 3993 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
3682} 3994}
@@ -3684,26 +3996,25 @@ void __init signals_init(void)
3684#ifdef CONFIG_KGDB_KDB 3996#ifdef CONFIG_KGDB_KDB
3685#include <linux/kdb.h> 3997#include <linux/kdb.h>
3686/* 3998/*
3687 * kdb_send_sig_info - Allows kdb to send signals without exposing 3999 * kdb_send_sig - Allows kdb to send signals without exposing
3688 * signal internals. This function checks if the required locks are 4000 * signal internals. This function checks if the required locks are
3689 * available before calling the main signal code, to avoid kdb 4001 * available before calling the main signal code, to avoid kdb
3690 * deadlocks. 4002 * deadlocks.
3691 */ 4003 */
3692void 4004void kdb_send_sig(struct task_struct *t, int sig)
3693kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
3694{ 4005{
3695 static struct task_struct *kdb_prev_t; 4006 static struct task_struct *kdb_prev_t;
3696 int sig, new_t; 4007 int new_t, ret;
3697 if (!spin_trylock(&t->sighand->siglock)) { 4008 if (!spin_trylock(&t->sighand->siglock)) {
3698 kdb_printf("Can't do kill command now.\n" 4009 kdb_printf("Can't do kill command now.\n"
3699 "The sigmask lock is held somewhere else in " 4010 "The sigmask lock is held somewhere else in "
3700 "kernel, try again later\n"); 4011 "kernel, try again later\n");
3701 return; 4012 return;
3702 } 4013 }
3703 spin_unlock(&t->sighand->siglock);
3704 new_t = kdb_prev_t != t; 4014 new_t = kdb_prev_t != t;
3705 kdb_prev_t = t; 4015 kdb_prev_t = t;
3706 if (t->state != TASK_RUNNING && new_t) { 4016 if (t->state != TASK_RUNNING && new_t) {
4017 spin_unlock(&t->sighand->siglock);
3707 kdb_printf("Process is not RUNNING, sending a signal from " 4018 kdb_printf("Process is not RUNNING, sending a signal from "
3708 "kdb risks deadlock\n" 4019 "kdb risks deadlock\n"
3709 "on the run queue locks. " 4020 "on the run queue locks. "
@@ -3712,8 +4023,9 @@ kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
3712 "the deadlock.\n"); 4023 "the deadlock.\n");
3713 return; 4024 return;
3714 } 4025 }
3715 sig = info->si_signo; 4026 ret = send_signal(sig, SEND_SIG_PRIV, t, false);
3716 if (send_sig_info(sig, info, t)) 4027 spin_unlock(&t->sighand->siglock);
4028 if (ret)
3717 kdb_printf("Fail to deliver Signal %d to process %d.\n", 4029 kdb_printf("Fail to deliver Signal %d to process %d.\n",
3718 sig, t->pid); 4030 sig, t->pid);
3719 else 4031 else
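
Besides the widespread clear_siginfo() conversions, the signal.c hunks add force_sig_fault(), send_sig_fault() and related helpers so architecture fault paths stop hand-rolling siginfo, and they teach the compat copy routines about every siginfo layout. A hedged sketch of how an architecture might call the new helper; the header location and exact argument list are assumptions (the trapno/ia64 parameters compile away on most architectures, and at this point the helper still takes the target task explicitly):

#include <linux/sched/signal.h>
#include <linux/signal.h>

/* illustrative: report a bad user access from a fault handler */
static void demo_report_user_fault(unsigned long address)
{
        force_sig_fault(SIGSEGV, SEGV_MAPERR,
                        (void __user *)address, current);
}
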
diff --git a/kernel/sys.c b/kernel/sys.c
index 83ffd7dccf23..f2289de20e19 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(overflowgid);
135 */ 135 */
136 136
137int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; 137int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
138int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; 138int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
139 139
140EXPORT_SYMBOL(fs_overflowuid); 140EXPORT_SYMBOL(fs_overflowuid);
141EXPORT_SYMBOL(fs_overflowgid); 141EXPORT_SYMBOL(fs_overflowgid);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 557d46728577..2fb4e27c636a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = {
1374 .mode = 0644, 1374 .mode = 0644,
1375 .proc_handler = proc_dointvec, 1375 .proc_handler = proc_dointvec,
1376 }, 1376 },
1377 {
1378 .procname = "hugepages_treat_as_movable",
1379 .data = &hugepages_treat_as_movable,
1380 .maxlen = sizeof(int),
1381 .mode = 0644,
1382 .proc_handler = proc_dointvec,
1383 },
1384 { 1377 {
1385 .procname = "nr_overcommit_hugepages", 1378 .procname = "nr_overcommit_hugepages",
1386 .data = NULL, 1379 .data = NULL,
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index cc91d90abd84..94ad46d50b56 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -68,10 +68,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
68 return err; 68 return err;
69} 69}
70 70
71static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) 71static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
72{ 72{
73 struct posix_clock *clk = get_posix_clock(fp); 73 struct posix_clock *clk = get_posix_clock(fp);
74 unsigned int result = 0; 74 __poll_t result = 0;
75 75
76 if (!clk) 76 if (!clk)
77 return POLLERR; 77 return POLLERR;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index ec999f32c840..75043046914e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -462,7 +462,7 @@ static struct k_itimer * alloc_posix_timer(void)
462 kmem_cache_free(posix_timers_cache, tmr); 462 kmem_cache_free(posix_timers_cache, tmr);
463 return NULL; 463 return NULL;
464 } 464 }
465 memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); 465 clear_siginfo(&tmr->sigq->info);
466 return tmr; 466 return tmr;
467} 467}
468 468
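
clear_siginfo() keeps appearing throughout this merge because it funnels every siginfo initialization through one helper, which makes it easy to guarantee that no uninitialized padding can later be copied to userspace. At this point in the series it is essentially a memset wrapper; the following is an approximate reconstruction, deliberately given a different name (check include/linux/signal.h for the authoritative definition):

#include <linux/signal.h>
#include <linux/string.h>

/* mirrors what clear_siginfo() does at this point in the series */
static inline void demo_clear_siginfo(struct siginfo *info)
{
        memset(info, 0, sizeof(*info));
}
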
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f54dc62b599c..0b249e2f0c3c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -530,6 +530,15 @@ config FUNCTION_PROFILER
530 530
531 If in doubt, say N. 531 If in doubt, say N.
532 532
533config BPF_KPROBE_OVERRIDE
534 bool "Enable BPF programs to override a kprobed function"
535 depends on BPF_EVENTS
536 depends on FUNCTION_ERROR_INJECTION
537 default n
538 help
539 Allows BPF to override the execution of a probed function and
540 set a different return value. This is used for error injection.
541
533config FTRACE_MCOUNT_RECORD 542config FTRACE_MCOUNT_RECORD
534 def_bool y 543 def_bool y
535 depends on DYNAMIC_FTRACE 544 depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 40207c2a4113..fc2838ac8b78 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,10 @@
13#include <linux/filter.h> 13#include <linux/filter.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ctype.h> 15#include <linux/ctype.h>
16#include <linux/kprobes.h>
17#include <linux/error-injection.h>
18
19#include "trace_probe.h"
16#include "trace.h" 20#include "trace.h"
17 21
18u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); 22u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -76,6 +80,23 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
76} 80}
77EXPORT_SYMBOL_GPL(trace_call_bpf); 81EXPORT_SYMBOL_GPL(trace_call_bpf);
78 82
83#ifdef CONFIG_BPF_KPROBE_OVERRIDE
84BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
85{
86 regs_set_return_value(regs, rc);
87 override_function_with_return(regs);
88 return 0;
89}
90
91static const struct bpf_func_proto bpf_override_return_proto = {
92 .func = bpf_override_return,
93 .gpl_only = true,
94 .ret_type = RET_INTEGER,
95 .arg1_type = ARG_PTR_TO_CTX,
96 .arg2_type = ARG_ANYTHING,
97};
98#endif
99
79BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) 100BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
80{ 101{
81 int ret; 102 int ret;
@@ -224,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
224 */ 245 */
225#define __BPF_TP_EMIT() __BPF_ARG3_TP() 246#define __BPF_TP_EMIT() __BPF_ARG3_TP()
226#define __BPF_TP(...) \ 247#define __BPF_TP(...) \
227 __trace_printk(1 /* Fake ip will not be printed. */, \ 248 __trace_printk(0 /* Fake ip */, \
228 fmt, ##__VA_ARGS__) 249 fmt, ##__VA_ARGS__)
229 250
230#define __BPF_ARG1_TP(...) \ 251#define __BPF_ARG1_TP(...) \
@@ -556,6 +577,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
556 return &bpf_get_stackid_proto; 577 return &bpf_get_stackid_proto;
557 case BPF_FUNC_perf_event_read_value: 578 case BPF_FUNC_perf_event_read_value:
558 return &bpf_perf_event_read_value_proto; 579 return &bpf_perf_event_read_value_proto;
580#ifdef CONFIG_BPF_KPROBE_OVERRIDE
581 case BPF_FUNC_override_return:
582 return &bpf_override_return_proto;
583#endif
559 default: 584 default:
560 return tracing_func_proto(func_id); 585 return tracing_func_proto(func_id);
561 } 586 }
@@ -773,6 +798,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
773 struct bpf_prog_array *new_array; 798 struct bpf_prog_array *new_array;
774 int ret = -EEXIST; 799 int ret = -EEXIST;
775 800
801 /*
802 * Kprobe override only works if they are on the function entry,
803 * and only if they are on the opt-in list.
804 */
805 if (prog->kprobe_override &&
806 (!trace_kprobe_on_func_entry(event->tp_event) ||
807 !trace_kprobe_error_injectable(event->tp_event)))
808 return -EINVAL;
809
776 mutex_lock(&bpf_event_mutex); 810 mutex_lock(&bpf_event_mutex);
777 811
778 if (event->prog) 812 if (event->prog)
@@ -825,3 +859,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
825unlock: 859unlock:
826 mutex_unlock(&bpf_event_mutex); 860 mutex_unlock(&bpf_event_mutex);
827} 861}
862
863int perf_event_query_prog_array(struct perf_event *event, void __user *info)
864{
865 struct perf_event_query_bpf __user *uquery = info;
866 struct perf_event_query_bpf query = {};
867 int ret;
868
869 if (!capable(CAP_SYS_ADMIN))
870 return -EPERM;
871 if (event->attr.type != PERF_TYPE_TRACEPOINT)
872 return -EINVAL;
873 if (copy_from_user(&query, uquery, sizeof(query)))
874 return -EFAULT;
875
876 mutex_lock(&bpf_event_mutex);
877 ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
878 uquery->ids,
879 query.ids_len,
880 &uquery->prog_cnt);
881 mutex_unlock(&bpf_event_mutex);
882
883 return ret;
884}
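
The bpf_trace.c hunks add the bpf_override_return() helper (GPL-only, usable only from kprobe programs attached at a function entry that is on the error-injection allow list, as enforced in perf_event_attach_bpf_prog() above) plus a perf path for querying attached programs. A hedged sketch of a program using the helper; the section name, headers and target symbol are illustrative, and the target must be error-injection capable on the running kernel with CONFIG_BPF_KPROBE_OVERRIDE enabled.

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>   /* or the selftests' bpf_helpers.h on trees of this era */

SEC("kprobe/should_failslab")
int demo_inject_enomem(struct pt_regs *ctx)
{
        /* make the probed function return -ENOMEM without running its body */
        bpf_override_return(ctx, (unsigned long)-12 /* -ENOMEM */);
        return 0;
}

char _license[] SEC("license") = "GPL";
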
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 554b517c61a0..dabd9d167d42 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5015,7 +5015,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
5015 5015
5016 parser = &iter->parser; 5016 parser = &iter->parser;
5017 if (trace_parser_loaded(parser)) { 5017 if (trace_parser_loaded(parser)) {
5018 parser->buffer[parser->idx] = 0;
5019 ftrace_match_records(iter->hash, parser->buffer, parser->idx); 5018 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
5020 } 5019 }
5021 5020
@@ -5329,7 +5328,6 @@ ftrace_graph_release(struct inode *inode, struct file *file)
5329 parser = &fgd->parser; 5328 parser = &fgd->parser;
5330 5329
5331 if (trace_parser_loaded((parser))) { 5330 if (trace_parser_loaded((parser))) {
5332 parser->buffer[parser->idx] = 0;
5333 ret = ftrace_graph_set_hash(fgd->new_hash, 5331 ret = ftrace_graph_set_hash(fgd->new_hash,
5334 parser->buffer); 5332 parser->buffer);
5335 } 5333 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5af2842dea96..ca6930e0d25e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -630,7 +630,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
630 * Returns POLLIN | POLLRDNORM if data exists in the buffers, 630 * Returns POLLIN | POLLRDNORM if data exists in the buffers,
631 * zero otherwise. 631 * zero otherwise.
632 */ 632 */
633int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, 633__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
634 struct file *filp, poll_table *poll_table) 634 struct file *filp, poll_table *poll_table)
635{ 635{
636 struct ring_buffer_per_cpu *cpu_buffer; 636 struct ring_buffer_per_cpu *cpu_buffer;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f3a8e24b426..56608538a4ad 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -530,8 +530,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
530 ubuf += ret; 530 ubuf += ret;
531 cnt -= ret; 531 cnt -= ret;
532 532
533 parser.buffer[parser.idx] = 0;
534
535 ret = -EINVAL; 533 ret = -EINVAL;
536 if (kstrtoul(parser.buffer, 0, &val)) 534 if (kstrtoul(parser.buffer, 0, &val))
537 break; 535 break;
@@ -1236,18 +1234,18 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
1236 cnt--; 1234 cnt--;
1237 } 1235 }
1238 1236
1237 parser->idx = 0;
1238
1239 /* only spaces were written */ 1239 /* only spaces were written */
1240 if (isspace(ch)) { 1240 if (isspace(ch) || !ch) {
1241 *ppos += read; 1241 *ppos += read;
1242 ret = read; 1242 ret = read;
1243 goto out; 1243 goto out;
1244 } 1244 }
1245
1246 parser->idx = 0;
1247 } 1245 }
1248 1246
1249 /* read the non-space input */ 1247 /* read the non-space input */
1250 while (cnt && !isspace(ch)) { 1248 while (cnt && !isspace(ch) && ch) {
1251 if (parser->idx < parser->size - 1) 1249 if (parser->idx < parser->size - 1)
1252 parser->buffer[parser->idx++] = ch; 1250 parser->buffer[parser->idx++] = ch;
1253 else { 1251 else {
@@ -1262,12 +1260,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
1262 } 1260 }
1263 1261
1264 /* We either got finished input or we have to wait for another call. */ 1262 /* We either got finished input or we have to wait for another call. */
1265 if (isspace(ch)) { 1263 if (isspace(ch) || !ch) {
1266 parser->buffer[parser->idx] = 0; 1264 parser->buffer[parser->idx] = 0;
1267 parser->cont = false; 1265 parser->cont = false;
1268 } else if (parser->idx < parser->size - 1) { 1266 } else if (parser->idx < parser->size - 1) {
1269 parser->cont = true; 1267 parser->cont = true;
1270 parser->buffer[parser->idx++] = ch; 1268 parser->buffer[parser->idx++] = ch;
1269 /* Make sure the parsed string always terminates with '\0'. */
1270 parser->buffer[parser->idx] = 0;
1271 } else { 1271 } else {
1272 ret = -EINVAL; 1272 ret = -EINVAL;
1273 goto out; 1273 goto out;
@@ -5616,7 +5616,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
5616 return 0; 5616 return 0;
5617} 5617}
5618 5618
5619static unsigned int 5619static __poll_t
5620trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) 5620trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
5621{ 5621{
5622 struct trace_array *tr = iter->tr; 5622 struct trace_array *tr = iter->tr;
@@ -5635,7 +5635,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
5635 filp, poll_table); 5635 filp, poll_table);
5636} 5636}
5637 5637
5638static unsigned int 5638static __poll_t
5639tracing_poll_pipe(struct file *filp, poll_table *poll_table) 5639tracing_poll_pipe(struct file *filp, poll_table *poll_table)
5640{ 5640{
5641 struct trace_iterator *iter = filp->private_data; 5641 struct trace_iterator *iter = filp->private_data;
@@ -6589,7 +6589,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
6589 return ret; 6589 return ret;
6590} 6590}
6591 6591
6592static unsigned int 6592static __poll_t
6593tracing_buffers_poll(struct file *filp, poll_table *poll_table) 6593tracing_buffers_poll(struct file *filp, poll_table *poll_table)
6594{ 6594{
6595 struct ftrace_buffer_info *info = filp->private_data; 6595 struct ftrace_buffer_info *info = filp->private_data;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1b87157edbff..05c7172c6667 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -885,8 +885,6 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
885 if (*parser.buffer == '!') 885 if (*parser.buffer == '!')
886 set = 0; 886 set = 0;
887 887
888 parser.buffer[parser.idx] = 0;
889
890 ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); 888 ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
891 if (ret) 889 if (ret)
892 goto out_put; 890 goto out_put;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 492700c5fb4d..1fad24acd444 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/rculist.h> 23#include <linux/rculist.h>
24#include <linux/error-injection.h>
24 25
25#include "trace_probe.h" 26#include "trace_probe.h"
26 27
@@ -42,7 +43,6 @@ struct trace_kprobe {
42 (offsetof(struct trace_kprobe, tp.args) + \ 43 (offsetof(struct trace_kprobe, tp.args) + \
43 (sizeof(struct probe_arg) * (n))) 44 (sizeof(struct probe_arg) * (n)))
44 45
45
46static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) 46static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
47{ 47{
48 return tk->rp.handler != NULL; 48 return tk->rp.handler != NULL;
@@ -87,6 +87,30 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
87 return nhit; 87 return nhit;
88} 88}
89 89
90bool trace_kprobe_on_func_entry(struct trace_event_call *call)
91{
92 struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
93
94 return kprobe_on_func_entry(tk->rp.kp.addr,
95 tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
96 tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
97}
98
99bool trace_kprobe_error_injectable(struct trace_event_call *call)
100{
101 struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
102 unsigned long addr;
103
104 if (tk->symbol) {
105 addr = (unsigned long)
106 kallsyms_lookup_name(trace_kprobe_symbol(tk));
107 addr += tk->rp.kp.offset;
108 } else {
109 addr = (unsigned long)tk->rp.kp.addr;
110 }
111 return within_error_injection_list(addr);
112}
113
90static int register_kprobe_event(struct trace_kprobe *tk); 114static int register_kprobe_event(struct trace_kprobe *tk);
91static int unregister_kprobe_event(struct trace_kprobe *tk); 115static int unregister_kprobe_event(struct trace_kprobe *tk);
92 116
@@ -1170,7 +1194,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
1170#ifdef CONFIG_PERF_EVENTS 1194#ifdef CONFIG_PERF_EVENTS
1171 1195
1172/* Kprobe profile handler */ 1196/* Kprobe profile handler */
1173static void 1197static int
1174kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) 1198kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1175{ 1199{
1176 struct trace_event_call *call = &tk->tp.call; 1200 struct trace_event_call *call = &tk->tp.call;
@@ -1179,12 +1203,31 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1179 int size, __size, dsize; 1203 int size, __size, dsize;
1180 int rctx; 1204 int rctx;
1181 1205
1182 if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) 1206 if (bpf_prog_array_valid(call)) {
1183 return; 1207 unsigned long orig_ip = instruction_pointer(regs);
1208 int ret;
1209
1210 ret = trace_call_bpf(call, regs);
1211
1212 /*
1213 * We need to check and see if we modified the pc of the
1214 * pt_regs, and if so clear the kprobe and return 1 so that we
1215 * don't do the single stepping.
1216 * The ftrace kprobe handler leaves it up to us to re-enable
1217 * preemption here before returning if we've modified the ip.
1218 */
1219 if (orig_ip != instruction_pointer(regs)) {
1220 reset_current_kprobe();
1221 preempt_enable_no_resched();
1222 return 1;
1223 }
1224 if (!ret)
1225 return 0;
1226 }
1184 1227
1185 head = this_cpu_ptr(call->perf_events); 1228 head = this_cpu_ptr(call->perf_events);
1186 if (hlist_empty(head)) 1229 if (hlist_empty(head))
1187 return; 1230 return 0;
1188 1231
1189 dsize = __get_data_size(&tk->tp, regs); 1232 dsize = __get_data_size(&tk->tp, regs);
1190 __size = sizeof(*entry) + tk->tp.size + dsize; 1233 __size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1193,13 +1236,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1193 1236
1194 entry = perf_trace_buf_alloc(size, NULL, &rctx); 1237 entry = perf_trace_buf_alloc(size, NULL, &rctx);
1195 if (!entry) 1238 if (!entry)
1196 return; 1239 return 0;
1197 1240
1198 entry->ip = (unsigned long)tk->rp.kp.addr; 1241 entry->ip = (unsigned long)tk->rp.kp.addr;
1199 memset(&entry[1], 0, dsize); 1242 memset(&entry[1], 0, dsize);
1200 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); 1243 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1201 perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, 1244 perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
1202 head, NULL); 1245 head, NULL);
1246 return 0;
1203} 1247}
1204NOKPROBE_SYMBOL(kprobe_perf_func); 1248NOKPROBE_SYMBOL(kprobe_perf_func);
1205 1249
@@ -1275,6 +1319,7 @@ static int kprobe_register(struct trace_event_call *event,
1275static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1319static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1276{ 1320{
1277 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); 1321 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
1322 int ret = 0;
1278 1323
1279 raw_cpu_inc(*tk->nhit); 1324 raw_cpu_inc(*tk->nhit);
1280 1325
@@ -1282,9 +1327,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1282 kprobe_trace_func(tk, regs); 1327 kprobe_trace_func(tk, regs);
1283#ifdef CONFIG_PERF_EVENTS 1328#ifdef CONFIG_PERF_EVENTS
1284 if (tk->tp.flags & TP_FLAG_PROFILE) 1329 if (tk->tp.flags & TP_FLAG_PROFILE)
1285 kprobe_perf_func(tk, regs); 1330 ret = kprobe_perf_func(tk, regs);
1286#endif 1331#endif
1287 return 0; /* We don't tweek kernel, so just return 0 */ 1332 return ret;
1288} 1333}
1289NOKPROBE_SYMBOL(kprobe_dispatcher); 1334NOKPROBE_SYMBOL(kprobe_dispatcher);
1290 1335
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb66e3eaa192..e101c5bb9eda 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -252,6 +252,8 @@ struct symbol_cache;
252unsigned long update_symbol_cache(struct symbol_cache *sc); 252unsigned long update_symbol_cache(struct symbol_cache *sc);
253void free_symbol_cache(struct symbol_cache *sc); 253void free_symbol_cache(struct symbol_cache *sc);
254struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); 254struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
255bool trace_kprobe_on_func_entry(struct trace_event_call *call);
256bool trace_kprobe_error_injectable(struct trace_event_call *call);
255#else 257#else
256/* uprobes do not support symbol fetch methods */ 258/* uprobes do not support symbol fetch methods */
257#define fetch_symbol_u8 NULL 259#define fetch_symbol_u8 NULL
@@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset)
277{ 279{
278 return NULL; 280 return NULL;
279} 281}
282
283static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call)
284{
285 return false;
286}
287
288static inline bool trace_kprobe_error_injectable(struct trace_event_call *call)
289{
290 return false;
291}
280#endif /* CONFIG_KPROBE_EVENTS */ 292#endif /* CONFIG_KPROBE_EVENTS */
281 293
282struct probe_arg { 294struct probe_arg {
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 8cda06a10d66..c364cf777e1a 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -1,13 +1,14 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/compiler.h>
2#include "trace.h" 3#include "trace.h"
3 4
4int DYN_FTRACE_TEST_NAME(void) 5noinline __noclone int DYN_FTRACE_TEST_NAME(void)
5{ 6{
6 /* used to call mcount */ 7 /* used to call mcount */
7 return 0; 8 return 0;
8} 9}
9 10
10int DYN_FTRACE_TEST_NAME2(void) 11noinline __noclone int DYN_FTRACE_TEST_NAME2(void)
11{ 12{
12 /* used to call mcount */ 13 /* used to call mcount */
13 return 0; 14 return 0;
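
Marking the selftest stubs noinline __noclone keeps gcc from inlining them or emitting renamed clones, so the dynamic ftrace selftest can still set filters on the exact symbol names. The same pattern applies to any function that must stay findable by name for ftrace or kprobes; an illustrative example:

#include <linux/compiler.h>

noinline __noclone int demo_probe_target(int x)
{
        /* body kept trivial; the attributes are what matter */
        return x + 1;
}
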
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 40592e7b3568..268029ae1be6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
608 608
609 /* Don't print "0x (null)" when offset is 0 */ 609 /* Don't print "0x (null)" when offset is 0 */
610 if (tu->offset) { 610 if (tu->offset) {
611 seq_printf(m, "0x%p", (void *)tu->offset); 611 seq_printf(m, "0x%px", (void *)tu->offset);
612 } else { 612 } else {
613 switch (sizeof(void *)) { 613 switch (sizeof(void *)) {
614 case 4: 614 case 4:
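
Since the 4.15 pointer-hashing work, %p prints a hashed value while %px prints the raw one. The uprobe hunk switches to %px because the value being printed is a user-supplied file offset rather than a kernel address, so hashing it only destroys information. Illustrative use, reserving %px for values that genuinely are not sensitive kernel pointers:

#include <linux/printk.h>

static void demo_print_offset(unsigned long offset)
{
        /* %p would hash this; %px prints the raw value */
        pr_info("offset: 0x%px\n", (void *)offset);
}
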
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8c34981d90ad..017044c26233 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3807,6 +3807,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
3807 3807
3808 return ret; 3808 return ret;
3809} 3809}
3810EXPORT_SYMBOL_GPL(apply_workqueue_attrs);
3810 3811
3811/** 3812/**
3812 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug 3813 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
@@ -3940,6 +3941,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
3940 return clamp_val(max_active, 1, lim); 3941 return clamp_val(max_active, 1, lim);
3941} 3942}
3942 3943
3944/*
3945 * Workqueues which may be used during memory reclaim should have a rescuer
3946 * to guarantee forward progress.
3947 */
3948static int init_rescuer(struct workqueue_struct *wq)
3949{
3950 struct worker *rescuer;
3951 int ret;
3952
3953 if (!(wq->flags & WQ_MEM_RECLAIM))
3954 return 0;
3955
3956 rescuer = alloc_worker(NUMA_NO_NODE);
3957 if (!rescuer)
3958 return -ENOMEM;
3959
3960 rescuer->rescue_wq = wq;
3961 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
3962 ret = PTR_ERR_OR_ZERO(rescuer->task);
3963 if (ret) {
3964 kfree(rescuer);
3965 return ret;
3966 }
3967
3968 wq->rescuer = rescuer;
3969 kthread_bind_mask(rescuer->task, cpu_possible_mask);
3970 wake_up_process(rescuer->task);
3971
3972 return 0;
3973}
3974
3943struct workqueue_struct *__alloc_workqueue_key(const char *fmt, 3975struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3944 unsigned int flags, 3976 unsigned int flags,
3945 int max_active, 3977 int max_active,
@@ -4002,29 +4034,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4002 if (alloc_and_link_pwqs(wq) < 0) 4034 if (alloc_and_link_pwqs(wq) < 0)
4003 goto err_free_wq; 4035 goto err_free_wq;
4004 4036
4005 /* 4037 if (wq_online && init_rescuer(wq) < 0)
4006 * Workqueues which may be used during memory reclaim should 4038 goto err_destroy;
4007 * have a rescuer to guarantee forward progress.
4008 */
4009 if (flags & WQ_MEM_RECLAIM) {
4010 struct worker *rescuer;
4011
4012 rescuer = alloc_worker(NUMA_NO_NODE);
4013 if (!rescuer)
4014 goto err_destroy;
4015
4016 rescuer->rescue_wq = wq;
4017 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
4018 wq->name);
4019 if (IS_ERR(rescuer->task)) {
4020 kfree(rescuer);
4021 goto err_destroy;
4022 }
4023
4024 wq->rescuer = rescuer;
4025 kthread_bind_mask(rescuer->task, cpu_possible_mask);
4026 wake_up_process(rescuer->task);
4027 }
4028 4039
4029 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) 4040 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
4030 goto err_destroy; 4041 goto err_destroy;
@@ -5642,6 +5653,8 @@ int __init workqueue_init(void)
5642 * archs such as power and arm64. As per-cpu pools created 5653 * archs such as power and arm64. As per-cpu pools created
5643 * previously could be missing node hint and unbound pools NUMA 5654 * previously could be missing node hint and unbound pools NUMA
5644 * affinity, fix them up. 5655 * affinity, fix them up.
5656 *
5657 * Also, while iterating workqueues, create rescuers if requested.
5645 */ 5658 */
5646 wq_numa_init(); 5659 wq_numa_init();
5647 5660
@@ -5653,8 +5666,12 @@ int __init workqueue_init(void)
5653 } 5666 }
5654 } 5667 }
5655 5668
5656 list_for_each_entry(wq, &workqueues, list) 5669 list_for_each_entry(wq, &workqueues, list) {
5657 wq_update_unbound_numa(wq, smp_processor_id(), true); 5670 wq_update_unbound_numa(wq, smp_processor_id(), true);
5671 WARN(init_rescuer(wq),
5672 "workqueue: failed to create early rescuer for %s",
5673 wq->name);
5674 }
5658 5675
5659 mutex_unlock(&wq_pool_mutex); 5676 mutex_unlock(&wq_pool_mutex);
5660 5677
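
The workqueue changes export apply_workqueue_attrs() and pull rescuer creation out into init_rescuer(), which runs both from __alloc_workqueue_key() once workqueues are online and from workqueue_init() for workqueues created earlier during boot. A rescuer only exists for WQ_MEM_RECLAIM workqueues; a minimal driver-side sketch that would exercise that path, with illustrative names:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static int demo_init(void)
{
        /* WQ_MEM_RECLAIM is what requests a rescuer thread */
        demo_wq = alloc_workqueue("demo_reclaim_wq", WQ_MEM_RECLAIM, 0);
        if (!demo_wq)
                return -ENOMEM;
        return 0;
}

static void demo_exit(void)
{
        destroy_workqueue(demo_wq);
}
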