author    Alexei Starovoitov <ast@plumgrid.com>    2015-05-19 19:59:03 -0400
committer David S. Miller <davem@davemloft.net>   2015-05-21 17:07:59 -0400
commit    04fd61ab36ec065e194ab5e74ae34a5240d992bb (patch)
tree      e14531e8775c71ca0508f97ba25af09d8d3db426 /kernel/bpf
parent    e7582bab5d28ea72e07cf2c74632eaf46a6c1a50 (diff)
bpf: allow bpf programs to tail-call other bpf programs
Introduce the bpf_tail_call(ctx, &jmp_table, index) helper function, which can
be used from BPF programs like:

int bpf_prog(struct pt_regs *ctx)
{
  ...
  bpf_tail_call(ctx, &jmp_table, index);
  ...
}

that is roughly equivalent to:

int bpf_prog(struct pt_regs *ctx)
{
  ...
  if (jmp_table[index])
    return (*jmp_table[index])(ctx);
  ...
}

The important detail is that this is not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current stack frame
and jumps into another BPF program without adding an extra call frame.
It's trivially done in the interpreter and a bit trickier in JITs.
In the case of the x64 JIT, the bigger part of the generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do a similar prologue-skipping optimization or do a stack
unwind before jumping into the next program.

bpf_tail_call() arguments:
  ctx       - context pointer
  jmp_table - one of the BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
  index     - index in the jump table

Since all BPF programs are identified by file descriptor, user space needs to
populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty, bpf_tail_call() doesn't jump anywhere and
program execution continues as normal.

The new BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other BPF programs.
Programs can share the same jmp_table array or use multiple jmp_tables.

The chain of tail calls can form unpredictable dynamic loops, therefore
tail_call_cnt is used to limit the number of calls and is currently set to 32.

Use cases:
==========
- simplify complex programs by splitting them into a sequence of small programs

- dispatch routine
  For tracing and future seccomp the program may be triggered on all system
  calls, but processing of syscall arguments will be different. It's more
  efficient to implement them as:

  int syscall_entry(struct seccomp_data *ctx)
  {
     bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
     ... default: process unknown syscall ...
  }
  int sys_write_event(struct seccomp_data *ctx) {...}
  int sys_read_event(struct seccomp_data *ctx) {...}
  syscall_jmp_table[__NR_write] = sys_write_event;
  syscall_jmp_table[__NR_read] = sys_read_event;

  For networking the program may call into different parsers depending on
  packet format, like:

  int packet_parser(struct __sk_buff *skb)
  {
     ... parse L2, L3 here ...
     __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
     bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
     ... default: process unknown protocol ...
  }
  int parse_tcp(struct __sk_buff *skb) {...}
  int parse_udp(struct __sk_buff *skb) {...}
  ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
  ipproto_jmp_table[IPPROTO_UDP] = parse_udp;

- for the TC use case, bpf_tail_call() makes it possible to implement
  reclassify-like logic

- bpf_map_update_elem/delete calls into a BPF_MAP_TYPE_PROG_ARRAY jump table
  are atomic, so user space can build chains of BPF programs on the fly

Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
  It could have been implemented without JIT changes as a wrapper on top of
  the BPF_PROG_RUN() macro, but with two downsides:
  . all programs would have to pay a performance penalty for this feature and
    the tail call itself would be slower, since a mandatory stack unwind,
    return and stack allocate would be done for every tail call.
  . tail calls would be limited to programs running preempt_disabled, since
    the generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
    need to be either a global per_cpu variable accessed by helper and wrapper,
    or a global variable protected by locks.

  In this implementation the x64 JIT bypasses the stack unwind and jumps into
  the callee program after the prologue.

- bpf_prog_array_compatible() ensures that the prog_type of callee and caller
  are the same and that the JITed/non-JITed flag is the same, since calling a
  JITed program from a non-JITed one is invalid because the stack frames are
  different. Similarly, calling a kprobe type program from a socket type
  program is invalid.

- the jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse the 'map'
  abstraction, its user space API and all of the verifier logic.
  It lives in the existing arraymap.c file, since several functions are shared
  with the regular array map.

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
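As a rough illustration of the user-space flow described above (create a
BPF_MAP_TYPE_PROG_ARRAY and populate it with program FDs), a minimal sketch
built directly on the bpf(2) syscall could look as follows. It is not part of
this patch: sys_bpf() is a local wrapper, the index 6 is arbitrary, and
prog_fd is assumed to hold a descriptor obtained earlier via BPF_PROG_LOAD.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* thin wrapper around the bpf(2) syscall */
static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	__u32 key, value;
	int jmp_table_fd;
	int prog_fd = -1;	/* assumed: FD returned by a prior BPF_PROG_LOAD */

	/* create the jump table; value_size must be sizeof(u32), i.e. a prog FD */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_PROG_ARRAY;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(__u32);
	attr.max_entries = 256;
	jmp_table_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (jmp_table_fd < 0)
		return 1;

	/* install the program at index 6; the update is an atomic xchg */
	key = 6;
	value = prog_fd;
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = jmp_table_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&value;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) < 0)
		return 1;

	/* slots left empty simply make bpf_tail_call() fall through */
	close(jmp_table_fd);
	return 0;
}

On the BPF side, bpf_tail_call(ctx, &jmp_table, index) then jumps to whatever
program is installed at that index, as in the packet_parser example above.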
Diffstat (limited to 'kernel/bpf')
-rw-r--r--  kernel/bpf/arraymap.c | 113
-rw-r--r--  kernel/bpf/core.c     |  73
-rw-r--r--  kernel/bpf/syscall.c  |  23
-rw-r--r--  kernel/bpf/verifier.c |  17
4 files changed, 218 insertions, 8 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8a6616583f38..614bcd4c1d74 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,12 +14,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-
-struct bpf_array {
-	struct bpf_map map;
-	u32 elem_size;
-	char value[0] __aligned(8);
-};
+#include <linux/filter.h>
 
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
 	return 0;
 }
 late_initcall(register_array_map);
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	/* only bpf_prog file descriptors can be stored in prog_array map */
+	if (attr->value_size != sizeof(u32))
+		return ERR_PTR(-EINVAL);
+	return array_map_alloc(attr);
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* make sure it's empty */
+	for (i = 0; i < array->map.max_entries; i++)
+		BUG_ON(array->prog[i] != NULL);
+	kvfree(array);
+}
+
+static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* only called from syscall */
+static int prog_array_map_update_elem(struct bpf_map *map, void *key,
+				      void *value, u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog, *old_prog;
+	u32 index = *(u32 *)key, ufd;
+
+	if (map_flags != BPF_ANY)
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	ufd = *(u32 *)value;
+	prog = bpf_prog_get(ufd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	old_prog = xchg(array->prog + index, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *old_prog;
+	u32 index = *(u32 *)key;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	old_prog = xchg(array->prog + index, NULL);
+	if (old_prog) {
+		bpf_prog_put(old_prog);
+		return 0;
+	} else {
+		return -ENOENT;
+	}
+}
+
+/* decrement refcnt of all bpf_progs that are stored in this map */
+void bpf_prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		prog_array_map_delete_elem(map, &i);
+}
+
+static const struct bpf_map_ops prog_array_ops = {
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = prog_array_map_lookup_elem,
+	.map_update_elem = prog_array_map_update_elem,
+	.map_delete_elem = prog_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+	.ops = &prog_array_ops,
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+};
+
+static int __init register_prog_array_map(void)
+{
+	bpf_register_map_type(&prog_array_type);
+	return 0;
+}
+late_initcall(register_prog_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 54f0e7fcd0e2..d44b25cbe460 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -176,6 +176,15 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return 0;
 }
 
+const struct bpf_func_proto bpf_tail_call_proto = {
+	.func = NULL,
+	.gpl_only = false,
+	.ret_type = RET_VOID,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_CONST_MAP_PTR,
+	.arg3_type = ARG_ANYTHING,
+};
+
 /**
  * __bpf_prog_run - run eBPF program on a given context
  * @ctx: is the data we are operating on
@@ -244,6 +253,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 		/* Call instruction */
 		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+		[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
 		/* Jumps */
 		[BPF_JMP | BPF_JA] = &&JMP_JA,
 		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +296,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
 		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
+	u32 tail_call_cnt = 0;
 	void *ptr;
 	int off;
 
@@ -431,6 +442,30 @@ select_insn:
 						       BPF_R4, BPF_R5);
 		CONT;
 
+	JMP_TAIL_CALL: {
+		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+		struct bpf_prog *prog;
+		u64 index = BPF_R3;
+
+		if (unlikely(index >= array->map.max_entries))
+			goto out;
+
+		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+			goto out;
+
+		tail_call_cnt++;
+
+		prog = READ_ONCE(array->prog[index]);
+		if (unlikely(!prog))
+			goto out;
+
+		ARG1 = BPF_R1;
+		insn = prog->insnsi;
+		goto select_insn;
+out:
+		CONT;
+	}
 	/* JMP */
 	JMP_JA:
 		insn += insn->off;
@@ -619,6 +654,40 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
 }
 
+bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp)
+{
+	if (array->owner_prog_type) {
+		if (array->owner_prog_type != fp->type)
+			return false;
+		if (array->owner_jited != fp->jited)
+			return false;
+	} else {
+		array->owner_prog_type = fp->type;
+		array->owner_jited = fp->jited;
+	}
+	return true;
+}
+
+static int check_tail_call(const struct bpf_prog *fp)
+{
+	struct bpf_prog_aux *aux = fp->aux;
+	int i;
+
+	for (i = 0; i < aux->used_map_cnt; i++) {
+		struct bpf_array *array;
+		struct bpf_map *map;
+
+		map = aux->used_maps[i];
+		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+			continue;
+		array = container_of(map, struct bpf_array, map);
+		if (!bpf_prog_array_compatible(array, fp))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * bpf_prog_select_runtime - select execution runtime for BPF program
  * @fp: bpf_prog populated with internal BPF program
@@ -626,7 +695,7 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
  * try to JIT internal BPF program, if JIT is not available select interpreter
  * BPF program will be executed via BPF_PROG_RUN() macro
  */
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+int bpf_prog_select_runtime(struct bpf_prog *fp)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;
 
@@ -634,6 +703,8 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
 		bpf_int_jit_compile(fp);
 	/* Lock whole bpf_prog as read-only */
 	bpf_prog_lock_ro(fp);
+
+	return check_tail_call(fp);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3bae6c591914..98a69bd83069 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_map *map = filp->private_data;
 
+	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+		/* prog_array stores refcnt-ed bpf_prog pointers
+		 * release them all when user space closes prog_array_fd
+		 */
+		bpf_prog_array_map_clear(map);
+
 	bpf_map_put(map);
 	return 0;
 }
@@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 			 */
 			BUG_ON(!prog->aux->ops->get_func_proto);
 
+			if (insn->imm == BPF_FUNC_tail_call) {
+				/* mark bpf_tail_call as different opcode
+				 * to avoid conditional branch in
+				 * interpeter for every normal call
+				 * and to prevent accidental JITing by
+				 * JIT compiler that doesn't support
+				 * bpf_tail_call yet
+				 */
+				insn->imm = 0;
+				insn->code |= BPF_X;
+				continue;
+			}
+
 			fn = prog->aux->ops->get_func_proto(insn->imm);
 			/* all functions that have prototype and verifier allowed
 			 * programs to call them, must be real in-kernel functions
@@ -532,7 +551,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);
 
 	/* eBPF program is ready to be JITed */
-	bpf_prog_select_runtime(prog);
+	err = bpf_prog_select_runtime(prog);
+	if (err < 0)
+		goto free_used_maps;
 
 	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
 	if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47dcd3aa6e23..cfd9a40b9a5a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id)
 			fn->ret_type, func_id);
 		return -EINVAL;
 	}
+
+	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+	    func_id != BPF_FUNC_tail_call)
+		/* prog_array map type needs extra care:
+		 * only allow to pass it into bpf_tail_call() for now.
+		 * bpf_map_delete_elem() can be allowed in the future,
+		 * while bpf_map_update_elem() must only be done via syscall
+		 */
+		return -EINVAL;
+
+	if (func_id == BPF_FUNC_tail_call &&
+	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+		/* don't allow any other map type to be passed into
+		 * bpf_tail_call()
+		 */
+		return -EINVAL;
+
 	return 0;
 }
 