author    Alexei Starovoitov <ast@plumgrid.com>    2015-05-19 19:59:03 -0400
committer David S. Miller <davem@davemloft.net>   2015-05-21 17:07:59 -0400
commit    04fd61ab36ec065e194ab5e74ae34a5240d992bb (patch)
tree      e14531e8775c71ca0508f97ba25af09d8d3db426 /kernel/bpf
parent    e7582bab5d28ea72e07cf2c74632eaf46a6c1a50 (diff)
bpf: allow bpf programs to tail-call other bpf programs
Introduce the bpf_tail_call(ctx, &jmp_table, index) helper function, which can
be used from BPF programs like:

int bpf_prog(struct pt_regs *ctx)
{
  ...
  bpf_tail_call(ctx, &jmp_table, index);
  ...
}

that is roughly equivalent to:

int bpf_prog(struct pt_regs *ctx)
{
  ...
  if (jmp_table[index])
    return (*jmp_table[index])(ctx);
  ...
}

The important detail is that this is not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current stack frame
and jumps into another BPF program without adding an extra call frame.
It's trivially done in the interpreter and a bit trickier in JITs.
In the case of the x64 JIT, the bigger part of the generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do a similar prologue-skipping optimization or do a stack
unwind before jumping into the next program.

bpf_tail_call() arguments:
  ctx       - context pointer
  jmp_table - one of the BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
  index     - index in the jump table

Since all BPF programs are identified by file descriptor, user space needs to
populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty, bpf_tail_call() doesn't jump anywhere and
program execution continues as normal.

The new BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other BPF programs.
Programs can share the same jmp_table array or use multiple jmp_tables.

The chain of tail calls can form unpredictable dynamic loops, therefore
tail_call_cnt is used to limit the number of calls and is currently set to 32.

Use cases:
==========
- simplify complex programs by splitting them into a sequence of small programs

- dispatch routine
  For tracing and future seccomp the program may be triggered on all system
  calls, but processing of syscall arguments will be different. It's more
  efficient to implement them as:

  int syscall_entry(struct seccomp_data *ctx)
  {
     bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
     ... default: process unknown syscall ...
  }
  int sys_write_event(struct seccomp_data *ctx) {...}
  int sys_read_event(struct seccomp_data *ctx) {...}
  syscall_jmp_table[__NR_write] = sys_write_event;
  syscall_jmp_table[__NR_read] = sys_read_event;

  For networking the program may call into different parsers depending on
  packet format, like:

  int packet_parser(struct __sk_buff *skb)
  {
     ... parse L2, L3 here ...
     __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
     bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
     ... default: process unknown protocol ...
  }
  int parse_tcp(struct __sk_buff *skb) {...}
  int parse_udp(struct __sk_buff *skb) {...}
  ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
  ipproto_jmp_table[IPPROTO_UDP] = parse_udp;

- for the TC use case, bpf_tail_call() makes it possible to implement
  reclassify-like logic

- bpf_map_update_elem/delete calls into a BPF_MAP_TYPE_PROG_ARRAY jump table
  are atomic, so user space can build chains of BPF programs on the fly

Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
  It could have been implemented without JIT changes as a wrapper on top of
  the BPF_PROG_RUN() macro, but with two downsides:
  . all programs would have to pay a performance penalty for this feature and
    the tail call itself would be slower, since a mandatory stack unwind,
    return and stack allocate would be done for every tail call.
  . tail calls would be limited to programs running preempt_disabled, since
    the generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
    need to be either a global per_cpu variable accessed by helper and wrapper,
    or a global variable protected by locks.

  In this implementation the x64 JIT bypasses the stack unwind and jumps into
  the callee program after the prologue.

- bpf_prog_array_compatible() ensures that the prog_type of callee and caller
  are the same and that the JITed/non-JITed flag is the same, since calling a
  JITed program from a non-JITed one is invalid because the stack frames are
  different. Similarly, calling a kprobe type program from a socket type
  program is invalid.

- the jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse the 'map'
  abstraction, its user space API and all of the verifier logic.
  It lives in the existing arraymap.c file, since several functions are shared
  with the regular array map.

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
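As a rough illustration of the user-space flow described above (create a
BPF_MAP_TYPE_PROG_ARRAY and populate it with program FDs), a minimal sketch
built directly on the bpf(2) syscall could look as follows. It is not part of
this patch: sys_bpf() is a local wrapper, the index 6 is arbitrary, and
prog_fd is assumed to hold a descriptor obtained earlier via BPF_PROG_LOAD.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* thin wrapper around the bpf(2) syscall */
static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	__u32 key, value;
	int jmp_table_fd;
	int prog_fd = -1;	/* assumed: FD returned by a prior BPF_PROG_LOAD */

	/* create the jump table; value_size must be sizeof(u32), i.e. a prog FD */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_PROG_ARRAY;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(__u32);
	attr.max_entries = 256;
	jmp_table_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (jmp_table_fd < 0)
		return 1;

	/* install the program at index 6; the update is an atomic xchg */
	key = 6;
	value = prog_fd;
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = jmp_table_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&value;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) < 0)
		return 1;

	/* slots left empty simply make bpf_tail_call() fall through */
	close(jmp_table_fd);
	return 0;
}

On the BPF side, bpf_tail_call(ctx, &jmp_table, index) then jumps to whatever
program is installed at that index, as in the packet_parser example above.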
Diffstat (limited to 'kernel/bpf')
-rw-r--r--  kernel/bpf/arraymap.c | 113
-rw-r--r--  kernel/bpf/core.c     |  73
-rw-r--r--  kernel/bpf/syscall.c  |  23
-rw-r--r--  kernel/bpf/verifier.c |  17
4 files changed, 218 insertions, 8 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8a6616583f38..614bcd4c1d74 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,12 +14,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-
-struct bpf_array {
-	struct bpf_map map;
-	u32 elem_size;
-	char value[0] __aligned(8);
-};
+#include <linux/filter.h>
 
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
 	return 0;
 }
 late_initcall(register_array_map);
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	/* only bpf_prog file descriptors can be stored in prog_array map */
+	if (attr->value_size != sizeof(u32))
+		return ERR_PTR(-EINVAL);
+	return array_map_alloc(attr);
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* make sure it's empty */
+	for (i = 0; i < array->map.max_entries; i++)
+		BUG_ON(array->prog[i] != NULL);
+	kvfree(array);
+}
+
+static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* only called from syscall */
+static int prog_array_map_update_elem(struct bpf_map *map, void *key,
+				      void *value, u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog, *old_prog;
+	u32 index = *(u32 *)key, ufd;
+
+	if (map_flags != BPF_ANY)
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	ufd = *(u32 *)value;
+	prog = bpf_prog_get(ufd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	old_prog = xchg(array->prog + index, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *old_prog;
+	u32 index = *(u32 *)key;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	old_prog = xchg(array->prog + index, NULL);
+	if (old_prog) {
+		bpf_prog_put(old_prog);
+		return 0;
+	} else {
+		return -ENOENT;
+	}
+}
+
+/* decrement refcnt of all bpf_progs that are stored in this map */
+void bpf_prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		prog_array_map_delete_elem(map, &i);
+}
+
+static const struct bpf_map_ops prog_array_ops = {
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = prog_array_map_lookup_elem,
+	.map_update_elem = prog_array_map_update_elem,
+	.map_delete_elem = prog_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+	.ops = &prog_array_ops,
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+};
+
+static int __init register_prog_array_map(void)
+{
+	bpf_register_map_type(&prog_array_type);
+	return 0;
+}
+late_initcall(register_prog_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 54f0e7fcd0e2..d44b25cbe460 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -176,6 +176,15 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return 0;
 }
 
+const struct bpf_func_proto bpf_tail_call_proto = {
+	.func = NULL,
+	.gpl_only = false,
+	.ret_type = RET_VOID,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_CONST_MAP_PTR,
+	.arg3_type = ARG_ANYTHING,
+};
+
 /**
  * __bpf_prog_run - run eBPF program on a given context
  * @ctx: is the data we are operating on
@@ -244,6 +253,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 		/* Call instruction */
 		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+		[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
 		/* Jumps */
 		[BPF_JMP | BPF_JA] = &&JMP_JA,
 		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +296,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
 		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
+	u32 tail_call_cnt = 0;
 	void *ptr;
 	int off;
 
@@ -431,6 +442,30 @@ select_insn:
 						       BPF_R4, BPF_R5);
 		CONT;
 
+	JMP_TAIL_CALL: {
+		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+		struct bpf_prog *prog;
+		u64 index = BPF_R3;
+
+		if (unlikely(index >= array->map.max_entries))
+			goto out;
+
+		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+			goto out;
+
+		tail_call_cnt++;
+
+		prog = READ_ONCE(array->prog[index]);
+		if (unlikely(!prog))
+			goto out;
+
+		ARG1 = BPF_R1;
+		insn = prog->insnsi;
+		goto select_insn;
+out:
+		CONT;
+	}
 	/* JMP */
 	JMP_JA:
 		insn += insn->off;
@@ -619,6 +654,40 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
 }
 
+bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp)
+{
+	if (array->owner_prog_type) {
+		if (array->owner_prog_type != fp->type)
+			return false;
+		if (array->owner_jited != fp->jited)
+			return false;
+	} else {
+		array->owner_prog_type = fp->type;
+		array->owner_jited = fp->jited;
+	}
+	return true;
+}
+
+static int check_tail_call(const struct bpf_prog *fp)
+{
+	struct bpf_prog_aux *aux = fp->aux;
+	int i;
+
+	for (i = 0; i < aux->used_map_cnt; i++) {
+		struct bpf_array *array;
+		struct bpf_map *map;
+
+		map = aux->used_maps[i];
+		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+			continue;
+		array = container_of(map, struct bpf_array, map);
+		if (!bpf_prog_array_compatible(array, fp))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * bpf_prog_select_runtime - select execution runtime for BPF program
  * @fp: bpf_prog populated with internal BPF program
@@ -626,7 +695,7 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
  * try to JIT internal BPF program, if JIT is not available select interpreter
  * BPF program will be executed via BPF_PROG_RUN() macro
  */
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+int bpf_prog_select_runtime(struct bpf_prog *fp)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;
 
@@ -634,6 +703,8 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
 		bpf_int_jit_compile(fp);
 	/* Lock whole bpf_prog as read-only */
 	bpf_prog_lock_ro(fp);
+
+	return check_tail_call(fp);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3bae6c591914..98a69bd83069 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_map *map = filp->private_data;
 
+	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+		/* prog_array stores refcnt-ed bpf_prog pointers
+		 * release them all when user space closes prog_array_fd
+		 */
+		bpf_prog_array_map_clear(map);
+
 	bpf_map_put(map);
 	return 0;
 }
@@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 			 */
 			BUG_ON(!prog->aux->ops->get_func_proto);
 
+			if (insn->imm == BPF_FUNC_tail_call) {
+				/* mark bpf_tail_call as different opcode
+				 * to avoid conditional branch in
+				 * interpeter for every normal call
+				 * and to prevent accidental JITing by
+				 * JIT compiler that doesn't support
+				 * bpf_tail_call yet
+				 */
+				insn->imm = 0;
+				insn->code |= BPF_X;
+				continue;
+			}
+
 			fn = prog->aux->ops->get_func_proto(insn->imm);
 			/* all functions that have prototype and verifier allowed
 			 * programs to call them, must be real in-kernel functions
@@ -532,7 +551,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);
 
 	/* eBPF program is ready to be JITed */
-	bpf_prog_select_runtime(prog);
+	err = bpf_prog_select_runtime(prog);
+	if (err < 0)
+		goto free_used_maps;
 
 	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
 	if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47dcd3aa6e23..cfd9a40b9a5a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id)
 			fn->ret_type, func_id);
 		return -EINVAL;
 	}
+
+	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+	    func_id != BPF_FUNC_tail_call)
+		/* prog_array map type needs extra care:
+		 * only allow to pass it into bpf_tail_call() for now.
+		 * bpf_map_delete_elem() can be allowed in the future,
+		 * while bpf_map_update_elem() must only be done via syscall
+		 */
+		return -EINVAL;
+
+	if (func_id == BPF_FUNC_tail_call &&
+	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+		/* don't allow any other map type to be passed into
+		 * bpf_tail_call()
+		 */
+		return -EINVAL;
+
 	return 0;
 }
 