author    Daniel Borkmann <daniel@iogearbox.net>    2018-01-26 17:33:39 -0500
committer Alexei Starovoitov <ast@kernel.org>       2018-01-26 19:42:05 -0500
commit    f6b1b3bf0d5f681631a293cfe1ca934b81716f1e (patch)
tree      21736164f4586c38117c1bb74e423b37bf6ab0e6
parent    5e581dad4fec0e6d062740dc35b8dc248b39d224 (diff)
bpf: fix subprog verifier bypass by div/mod by 0 exception
One of the ugly leftovers from the early eBPF days is that div/mod operations based on registers have a hard-coded src_reg == 0 test in the interpreter as well as in JIT code generators that would return from the BPF program with exit code 0. This was basically adopted from the cBPF interpreter for historical reasons. There are multiple reasons why this is very suboptimal and prone to bugs. To name one: the return code mapping for such an abnormal program exit of 0 does not always match a suitable program type's exit code mapping. For example, '0' in tc means action 'ok' where the packet gets passed further up the stack, which is undesirable for such cases (e.g. when implementing policy) and also does not match other program types.

While trying to work out an exception handling scheme, I also noticed that programs crafted like the following will currently pass the verifier:

  0: (bf) r6 = r1
  1: (85) call pc+8
  caller:
   R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  callee:
   frame1: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_1
  10: (b4) (u32) r2 = (u32) 0
  11: (b4) (u32) r3 = (u32) 1
  12: (3c) (u32) r3 /= (u32) r2
  13: (61) r0 = *(u32 *)(r1 +76)
  14: (95) exit
  returning from callee:
   frame1: R0_w=pkt(id=0,off=0,r=0,imm=0) R1=ctx(id=0,off=0,imm=0)
           R2_w=inv0 R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
           R10=fp0,call_1
  to caller at 2:
   R0_w=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1

  from 14 to 2: R0=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  2: (bf) r1 = r6
  3: (61) r1 = *(u32 *)(r1 +80)
  4: (bf) r2 = r0
  5: (07) r2 += 8
  6: (2d) if r2 > r1 goto pc+1
   R0=pkt(id=0,off=0,r=8,imm=0) R1=pkt_end(id=0,off=0,imm=0)
   R2=pkt(id=0,off=8,r=8,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1
  7: (71) r0 = *(u8 *)(r0 +0)
  8: (b7) r0 = 1
  9: (95) exit
  from 6 to 8: safe
  processed 16 insns (limit 131072), stack depth 0+0

Basically what happens is that in the subprog we make use of a div/mod by 0 exception, and in the 'normal' subprog's exit path we just return skb->data back to the main prog. This has the implication that the verifier thinks we always get a pkt pointer in R0, while we still have the implicit 'return 0' from the div as an alternative unconditional return path earlier. Thus, R0 then contains 0, meaning back in the parent prog we get the address range of [0x0, skb->data_end] as read and writeable. Similar can be crafted with other pointer register types.

Since i) BPF_ABS/IND is not allowed in programs that contain BPF to BPF calls (and generally it's also disadvised to use in native eBPF context), ii) unknown opcodes don't return zero anymore, and iii) we don't return an exception code in dead branches, the only remaining affected case to fix is the div/mod handling. What we would really need is some infrastructure to propagate exceptions all the way to the original prog, unwinding the current stack and returning that code to the caller of the BPF program. In user space, such exception handling for similar runtimes is typically implemented with setjmp(3) and longjmp(3) as one possibility, which is not available in the kernel, though (kgdb used to implement it in the kernel a long time ago). I implemented a PoC exception handling mechanism into the BPF interpreter by porting setjmp()/longjmp() to x86_64 and adding a new internal BPF_ABRT opcode that can use a program specific exception code for all exception cases we have (e.g. div/mod by 0, unknown opcodes, etc).
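As a side note, the user-space pattern referenced above looks roughly like the following minimal sketch; the function and variable names are made up for illustration and this is not the in-kernel PoC:

#include <setjmp.h>
#include <stdio.h>

static jmp_buf exc_env;         /* saved register state of the caller */

/* Hypothetical "runtime" that aborts with an exception on div by 0. */
static int run_prog(int divisor)
{
        if (divisor == 0)
                longjmp(exc_env, 1);    /* unwind straight back to setjmp() */
        return 100 / divisor;
}

int main(void)
{
        if (setjmp(exc_env)) {          /* non-zero: we got here via longjmp() */
                fprintf(stderr, "exception: division by zero\n");
                return 1;
        }
        printf("%d\n", run_prog(0));
        return 0;
}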
While this seems to work in the constrained BPF environment (meaning that here we don't need to deal with state, e.g. from memory allocations, that we would need to undo before going into exception state), it still has various drawbacks: i) we would need to implement setjmp()/longjmp() for every arch supported in the kernel and for the x86_64, arm64 and sparc64 JITs currently supporting calls, and ii) it has an unconditional additional cost on main program entry to store the CPU register state in the initial setjmp() call, and we would need some way to pass the jmp_buf down into ___bpf_prog_run() for the main prog and all subprogs; storing it on the stack is not really nice either (another option would be per-cpu storage, but that has the drawback that we would need to disable preemption for every BPF program type). All in all this approach would add a lot of complexity.

Another poor-man's solution would be to have some sort of additional shared register or scratch buffer to hold state for exceptions, and test that after every call return to chain returns and pass R0 all the way down to the BPF prog caller. This is also problematic in various ways: i) an additional register doesn't map well into JITs, and some other scratch space could only be on per-cpu storage, which, again, has the side-effect that this only works when we disable preemption, or somewhere in the input context which is not available everywhere either, and ii) this adds significant runtime overhead by putting conditionals after each and every call, as well as implementation complexity.

Yet another option is to teach the verifier that div/mod can return an integer, which however is also complex to implement, as the verifier would need to walk such a fake 'mov r0,<code>; exit;' sequence, and there would still be no guarantee of propagating this further down to the BPF caller as a proper exception code. For the parent prog, it is also not distinguishable from a normal return of a constant scalar value.

The approach taken here is a completely different one with little complexity and no additional overhead involved, in that we make use of the fact that a div/mod by 0 is undefined behavior. Instead of bailing out, we adopt the same behavior as on some major archs like ARMv8 [0] into eBPF as well: X div 0 results in 0, and X mod 0 results in X. The aarch64 and aarch32 ISAs do not generate any traps or otherwise abort program execution for unsigned divides. I verified this also with a test program compiled by gcc and clang, and the behavior matches the spec. Going forward we adapt the eBPF verifier to emit such rewrites once a div/mod by register is seen. cBPF is not touched and will keep the existing 'return 0' semantics. Given the options, this seems the most suitable of all of them, also since major archs have similar schemes in place. Given this is all in the realm of undefined behavior, we still have the option to adapt if deemed necessary, and this way we would also have the option of more flexibility from the LLVM code generation side (which is then fully visible to the verifier). Thus, this patch i) fixes the panic seen in the above program and ii) doesn't bypass the verifier observations.

  [0] ARM Architecture Reference Manual, ARMv8 [ARM DDI 0487B.b]
      http://infocenter.arm.com/help/topic/com.arm.doc.ddi0487b.b/DDI0487B_b_armv8_arm.pdf
      1) aarch64 instruction set: section C3.4.7 and C6.2.279 (UDIV)
         "A division by zero results in a zero being written to
          the destination register, without any indication that
          the division by zero occurred."
      2) aarch32 instruction set: section F1.4.8 and F5.1.263 (UDIV)
         "For the SDIV and UDIV instructions, division by zero
          always returns a zero result."

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
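A minimal sketch of the kind of test mentioned in the message above, assuming an aarch64 host; it exercises UDIV directly via inline asm to sidestep C's undefined behavior for division by zero (the helper names are made up here, and this is not the author's actual test program):

#include <stdint.h>
#include <stdio.h>

/* aarch64 only: unsigned hardware divide, no trap on a zero divisor. */
static uint64_t udiv64(uint64_t x, uint64_t y)
{
        uint64_t q;

        asm("udiv %0, %1, %2" : "=r" (q) : "r" (x), "r" (y));
        return q;
}

/* Remainder as compilers emit it: x - (x / y) * y, i.e. udiv + msub. */
static uint64_t urem64(uint64_t x, uint64_t y)
{
        return x - udiv64(x, y) * y;
}

int main(void)
{
        printf("42 / 0 = %llu\n", (unsigned long long)udiv64(42, 0));   /* 0  */
        printf("42 %% 0 = %llu\n", (unsigned long long)urem64(42, 0));  /* 42 */
        return 0;
}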
-rw-r--r--   kernel/bpf/core.c       8
-rw-r--r--   kernel/bpf/verifier.c   38
-rw-r--r--   net/core/filter.c       9
3 files changed, 38 insertions(+), 17 deletions(-)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8301de2d1f96..5f35f93dcab2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -999,14 +999,10 @@ select_insn:
                 (*(s64 *) &DST) >>= IMM;
                 CONT;
         ALU64_MOD_X:
-                if (unlikely(SRC == 0))
-                        return 0;
                 div64_u64_rem(DST, SRC, &tmp);
                 DST = tmp;
                 CONT;
         ALU_MOD_X:
-                if (unlikely((u32)SRC == 0))
-                        return 0;
                 tmp = (u32) DST;
                 DST = do_div(tmp, (u32) SRC);
                 CONT;
@@ -1019,13 +1015,9 @@ select_insn:
                 DST = do_div(tmp, (u32) IMM);
                 CONT;
         ALU64_DIV_X:
-                if (unlikely(SRC == 0))
-                        return 0;
                 DST = div64_u64(DST, SRC);
                 CONT;
         ALU_DIV_X:
-                if (unlikely((u32)SRC == 0))
-                        return 0;
                 tmp = (u32) DST;
                 do_div(tmp, (u32) SRC);
                 DST = (u32) tmp;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0c5269415090..5fb69a85d967 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5400,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
         int i, cnt, delta = 0;
 
         for (i = 0; i < insn_cnt; i++, insn++) {
-                if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+                if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
+                    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+                    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
                     insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
-                        /* due to JIT bugs clear upper 32-bits of src register
-                         * before div/mod operation
-                         */
-                        insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
-                        insn_buf[1] = *insn;
-                        cnt = 2;
-                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+                        bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+                        struct bpf_insn mask_and_div[] = {
+                                BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+                                /* Rx div 0 -> 0 */
+                                BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
+                                BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
+                                BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+                                *insn,
+                        };
+                        struct bpf_insn mask_and_mod[] = {
+                                BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+                                /* Rx mod 0 -> Rx */
+                                BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
+                                *insn,
+                        };
+                        struct bpf_insn *patchlet;
+
+                        if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+                            insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+                                patchlet = mask_and_div + (is64 ? 1 : 0);
+                                cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
+                        } else {
+                                patchlet = mask_and_mod + (is64 ? 1 : 0);
+                                cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
+                        }
+
+                        new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
                         if (!new_prog)
                                 return -ENOMEM;
 
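The runtime effect of the emitted patchlets above is equivalent to the following C sketch (illustrative only, not part of the patch; helper names are made up). The 32-bit variants additionally truncate the source register first via the BPF_MOV32_REG mask:

#include <stdint.h>

/* mask_and_div semantics for a register-source divide: X div 0 -> 0. */
static uint64_t bpf_div64_x(uint64_t dst, uint64_t src)
{
        return src ? dst / src : 0;
}

/* mask_and_mod semantics: X mod 0 -> X, i.e. the mod instruction is
 * simply skipped when the divisor is zero and dst stays untouched.
 */
static uint64_t bpf_mod64_x(uint64_t dst, uint64_t src)
{
        return src ? dst % src : dst;
}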
diff --git a/net/core/filter.c b/net/core/filter.c
index 60d8c8712652..08ab4c65a998 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -459,8 +459,15 @@ do_pass:
                         break;
 
                 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
-                    fp->code == (BPF_ALU | BPF_MOD | BPF_X))
+                    fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
                         *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
+                        /* Error with exception code on div/mod by 0.
+                         * For cBPF programs, this was always return 0.
+                         */
+                        *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
+                        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+                        *insn++ = BPF_EXIT_INSN();
+                }
 
                 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
                 break;
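In contrast to the native eBPF rewrite, migrated cBPF programs thus keep their historical behavior. The converted sequence is roughly equivalent to this C sketch (illustrative only; the helper name is made up):

#include <stdbool.h>
#include <stdint.h>

/* Roughly what the converted cBPF sequence does for A = A / X:
 * a zero X still terminates the filter with a return value of 0.
 */
static bool cbpf_alu_div_x(uint32_t *A, uint32_t X, uint32_t *ret)
{
        if (X == 0) {
                *ret = 0;       /* BPF_XOR A,A followed by BPF_EXIT_INSN() */
                return false;   /* filter exits here */
        }
        *A /= X;
        return true;            /* fall through to the next instruction */
}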