aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@plumgrid.com>2015-10-08 01:23:21 -0400
committerDavid S. Miller <davem@davemloft.net>2015-10-12 22:13:35 -0400
commit1be7f75d1668d6296b80bf35dcf6762393530afc (patch)
tree319fe845ed6fc5f5f1b30f17983418d77196f313 /kernel
parent0fa28877b26641cca56b607ccec1fcbda7ae09c6 (diff)
bpf: enable non-root eBPF programs
In order to let unprivileged users load and execute eBPF programs, teach the verifier to prevent pointer leaks. The verifier will prevent - any arithmetic on pointers (except R10+Imm which is used to compute stack addresses) - comparison of pointers (except if (map_value_ptr == 0) ... ) - passing pointers to helper functions - indirectly passing pointers in stack to helper functions - returning pointer from bpf program - storing pointers into ctx or maps Spill/fill of pointers into stack is allowed, but mangling of pointers stored in the stack or reading them byte by byte is not. Within bpf programs the pointers do exist, since programs need to be able to access maps, pass skb pointer to LD_ABS insns, etc but programs cannot pass such pointer values to the outside or obfuscate them. Only allow BPF_PROG_TYPE_SOCKET_FILTER unprivileged programs, so that socket filters (tcpdump), af_packet (QUIC acceleration) and future kcm can use it. tracing and tc cls/act program types still require root permissions, since tracing actually needs to be able to see all kernel pointers and tc is for root only. For example, the following unprivileged socket filter program is allowed: int bpf_prog1(struct __sk_buff *skb) { u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); u64 *value = bpf_map_lookup_elem(&my_map, &index); if (value) *value += skb->len; return 0; } but the following program is not: int bpf_prog1(struct __sk_buff *skb) { u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); u64 *value = bpf_map_lookup_elem(&my_map, &index); if (value) *value += (u64) skb; return 0; } since it would leak the kernel address into the map.
Unprivileged socket filter bpf programs have access to the following helper functions: - map lookup/update/delete (but they cannot store kernel pointers into them) - get_random (it's already exposed to unprivileged user space) - get_smp_processor_id - tail_call into another socket filter program - ktime_get_ns The feature is controlled by sysctl kernel.unprivileged_bpf_disabled. This toggle defaults to off (0), but can be set to true (1). Once true, bpf programs and maps cannot be accessed from unprivileged processes, and the toggle cannot be set back to false. Signed-off-by: Alexei Starovoitov <ast@plumgrid.com> Reviewed-by: Kees Cook <keescook@chromium.org> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/syscall.c11
-rw-r--r--kernel/bpf/verifier.c106
-rw-r--r--kernel/sysctl.c13
3 files changed, 116 insertions, 14 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c868cafbc00c..83697bc8e574 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
18#include <linux/filter.h> 18#include <linux/filter.h>
19#include <linux/version.h> 19#include <linux/version.h>
20 20
21int sysctl_unprivileged_bpf_disabled __read_mostly;
22
21static LIST_HEAD(bpf_map_types); 23static LIST_HEAD(bpf_map_types);
22 24
23static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 25static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -544,6 +546,9 @@ static int bpf_prog_load(union bpf_attr *attr)
544 attr->kern_version != LINUX_VERSION_CODE) 546 attr->kern_version != LINUX_VERSION_CODE)
545 return -EINVAL; 547 return -EINVAL;
546 548
549 if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
550 return -EPERM;
551
547 /* plain bpf_prog allocation */ 552 /* plain bpf_prog allocation */
548 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); 553 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
549 if (!prog) 554 if (!prog)
@@ -599,11 +604,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
599 union bpf_attr attr = {}; 604 union bpf_attr attr = {};
600 int err; 605 int err;
601 606
602 /* the syscall is limited to root temporarily. This restriction will be 607 if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
603 * lifted when security audit is clean. Note that eBPF+tracing must have
604 * this restriction, since it may pass kernel data to user space
605 */
606 if (!capable(CAP_SYS_ADMIN))
607 return -EPERM; 608 return -EPERM;
608 609
609 if (!access_ok(VERIFY_READ, uattr, 1)) 610 if (!access_ok(VERIFY_READ, uattr, 1))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f8da034c2258..1d6b97be79e1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
199 struct verifier_state_list **explored_states; /* search pruning optimization */ 199 struct verifier_state_list **explored_states; /* search pruning optimization */
200 struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ 200 struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
201 u32 used_map_cnt; /* number of used maps */ 201 u32 used_map_cnt; /* number of used maps */
202 bool allow_ptr_leaks;
202}; 203};
203 204
204/* verbose verifier prints what it's seeing 205/* verbose verifier prints what it's seeing
@@ -538,6 +539,21 @@ static int bpf_size_to_bytes(int bpf_size)
538 return -EINVAL; 539 return -EINVAL;
539} 540}
540 541
542static bool is_spillable_regtype(enum bpf_reg_type type)
543{
544 switch (type) {
545 case PTR_TO_MAP_VALUE:
546 case PTR_TO_MAP_VALUE_OR_NULL:
547 case PTR_TO_STACK:
548 case PTR_TO_CTX:
549 case FRAME_PTR:
550 case CONST_PTR_TO_MAP:
551 return true;
552 default:
553 return false;
554 }
555}
556
541/* check_stack_read/write functions track spill/fill of registers, 557/* check_stack_read/write functions track spill/fill of registers,
542 * stack boundary and alignment are checked in check_mem_access() 558 * stack boundary and alignment are checked in check_mem_access()
543 */ 559 */
@@ -550,9 +566,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
550 */ 566 */
551 567
552 if (value_regno >= 0 && 568 if (value_regno >= 0 &&
553 (state->regs[value_regno].type == PTR_TO_MAP_VALUE || 569 is_spillable_regtype(state->regs[value_regno].type)) {
554 state->regs[value_regno].type == PTR_TO_STACK ||
555 state->regs[value_regno].type == PTR_TO_CTX)) {
556 570
557 /* register containing pointer is being spilled into stack */ 571 /* register containing pointer is being spilled into stack */
558 if (size != BPF_REG_SIZE) { 572 if (size != BPF_REG_SIZE) {
@@ -643,6 +657,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
643 return -EACCES; 657 return -EACCES;
644} 658}
645 659
660static bool is_pointer_value(struct verifier_env *env, int regno)
661{
662 if (env->allow_ptr_leaks)
663 return false;
664
665 switch (env->cur_state.regs[regno].type) {
666 case UNKNOWN_VALUE:
667 case CONST_IMM:
668 return false;
669 default:
670 return true;
671 }
672}
673
646/* check whether memory at (regno + off) is accessible for t = (read | write) 674/* check whether memory at (regno + off) is accessible for t = (read | write)
647 * if t==write, value_regno is a register which value is stored into memory 675 * if t==write, value_regno is a register which value is stored into memory
648 * if t==read, value_regno is a register which will receive the value from memory 676 * if t==read, value_regno is a register which will receive the value from memory
@@ -669,11 +697,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
669 } 697 }
670 698
671 if (state->regs[regno].type == PTR_TO_MAP_VALUE) { 699 if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
700 if (t == BPF_WRITE && value_regno >= 0 &&
701 is_pointer_value(env, value_regno)) {
702 verbose("R%d leaks addr into map\n", value_regno);
703 return -EACCES;
704 }
672 err = check_map_access(env, regno, off, size); 705 err = check_map_access(env, regno, off, size);
673 if (!err && t == BPF_READ && value_regno >= 0) 706 if (!err && t == BPF_READ && value_regno >= 0)
674 mark_reg_unknown_value(state->regs, value_regno); 707 mark_reg_unknown_value(state->regs, value_regno);
675 708
676 } else if (state->regs[regno].type == PTR_TO_CTX) { 709 } else if (state->regs[regno].type == PTR_TO_CTX) {
710 if (t == BPF_WRITE && value_regno >= 0 &&
711 is_pointer_value(env, value_regno)) {
712 verbose("R%d leaks addr into ctx\n", value_regno);
713 return -EACCES;
714 }
677 err = check_ctx_access(env, off, size, t); 715 err = check_ctx_access(env, off, size, t);
678 if (!err && t == BPF_READ && value_regno >= 0) 716 if (!err && t == BPF_READ && value_regno >= 0)
679 mark_reg_unknown_value(state->regs, value_regno); 717 mark_reg_unknown_value(state->regs, value_regno);
@@ -684,10 +722,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
684 verbose("invalid stack off=%d size=%d\n", off, size); 722 verbose("invalid stack off=%d size=%d\n", off, size);
685 return -EACCES; 723 return -EACCES;
686 } 724 }
687 if (t == BPF_WRITE) 725 if (t == BPF_WRITE) {
726 if (!env->allow_ptr_leaks &&
727 state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
728 size != BPF_REG_SIZE) {
729 verbose("attempt to corrupt spilled pointer on stack\n");
730 return -EACCES;
731 }
688 err = check_stack_write(state, off, size, value_regno); 732 err = check_stack_write(state, off, size, value_regno);
689 else 733 } else {
690 err = check_stack_read(state, off, size, value_regno); 734 err = check_stack_read(state, off, size, value_regno);
735 }
691 } else { 736 } else {
692 verbose("R%d invalid mem access '%s'\n", 737 verbose("R%d invalid mem access '%s'\n",
693 regno, reg_type_str[state->regs[regno].type]); 738 regno, reg_type_str[state->regs[regno].type]);
@@ -775,8 +820,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
775 return -EACCES; 820 return -EACCES;
776 } 821 }
777 822
778 if (arg_type == ARG_ANYTHING) 823 if (arg_type == ARG_ANYTHING) {
824 if (is_pointer_value(env, regno)) {
825 verbose("R%d leaks addr into helper function\n", regno);
826 return -EACCES;
827 }
779 return 0; 828 return 0;
829 }
780 830
781 if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || 831 if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
782 arg_type == ARG_PTR_TO_MAP_VALUE) { 832 arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -950,8 +1000,9 @@ static int check_call(struct verifier_env *env, int func_id)
950} 1000}
951 1001
952/* check validity of 32-bit and 64-bit arithmetic operations */ 1002/* check validity of 32-bit and 64-bit arithmetic operations */
953static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) 1003static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
954{ 1004{
1005 struct reg_state *regs = env->cur_state.regs;
955 u8 opcode = BPF_OP(insn->code); 1006 u8 opcode = BPF_OP(insn->code);
956 int err; 1007 int err;
957 1008
@@ -976,6 +1027,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
976 if (err) 1027 if (err)
977 return err; 1028 return err;
978 1029
1030 if (is_pointer_value(env, insn->dst_reg)) {
1031 verbose("R%d pointer arithmetic prohibited\n",
1032 insn->dst_reg);
1033 return -EACCES;
1034 }
1035
979 /* check dest operand */ 1036 /* check dest operand */
980 err = check_reg_arg(regs, insn->dst_reg, DST_OP); 1037 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
981 if (err) 1038 if (err)
@@ -1012,6 +1069,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
1012 */ 1069 */
1013 regs[insn->dst_reg] = regs[insn->src_reg]; 1070 regs[insn->dst_reg] = regs[insn->src_reg];
1014 } else { 1071 } else {
1072 if (is_pointer_value(env, insn->src_reg)) {
1073 verbose("R%d partial copy of pointer\n",
1074 insn->src_reg);
1075 return -EACCES;
1076 }
1015 regs[insn->dst_reg].type = UNKNOWN_VALUE; 1077 regs[insn->dst_reg].type = UNKNOWN_VALUE;
1016 regs[insn->dst_reg].map_ptr = NULL; 1078 regs[insn->dst_reg].map_ptr = NULL;
1017 } 1079 }
@@ -1061,8 +1123,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
1061 /* pattern match 'bpf_add Rx, imm' instruction */ 1123 /* pattern match 'bpf_add Rx, imm' instruction */
1062 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && 1124 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
1063 regs[insn->dst_reg].type == FRAME_PTR && 1125 regs[insn->dst_reg].type == FRAME_PTR &&
1064 BPF_SRC(insn->code) == BPF_K) 1126 BPF_SRC(insn->code) == BPF_K) {
1065 stack_relative = true; 1127 stack_relative = true;
1128 } else if (is_pointer_value(env, insn->dst_reg)) {
1129 verbose("R%d pointer arithmetic prohibited\n",
1130 insn->dst_reg);
1131 return -EACCES;
1132 } else if (BPF_SRC(insn->code) == BPF_X &&
1133 is_pointer_value(env, insn->src_reg)) {
1134 verbose("R%d pointer arithmetic prohibited\n",
1135 insn->src_reg);
1136 return -EACCES;
1137 }
1066 1138
1067 /* check dest operand */ 1139 /* check dest operand */
1068 err = check_reg_arg(regs, insn->dst_reg, DST_OP); 1140 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1101,6 +1173,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
1101 err = check_reg_arg(regs, insn->src_reg, SRC_OP); 1173 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1102 if (err) 1174 if (err)
1103 return err; 1175 return err;
1176
1177 if (is_pointer_value(env, insn->src_reg)) {
1178 verbose("R%d pointer comparison prohibited\n",
1179 insn->src_reg);
1180 return -EACCES;
1181 }
1104 } else { 1182 } else {
1105 if (insn->src_reg != BPF_REG_0) { 1183 if (insn->src_reg != BPF_REG_0) {
1106 verbose("BPF_JMP uses reserved fields\n"); 1184 verbose("BPF_JMP uses reserved fields\n");
@@ -1155,6 +1233,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
1155 regs[insn->dst_reg].type = CONST_IMM; 1233 regs[insn->dst_reg].type = CONST_IMM;
1156 regs[insn->dst_reg].imm = 0; 1234 regs[insn->dst_reg].imm = 0;
1157 } 1235 }
1236 } else if (is_pointer_value(env, insn->dst_reg)) {
1237 verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
1238 return -EACCES;
1158 } else if (BPF_SRC(insn->code) == BPF_K && 1239 } else if (BPF_SRC(insn->code) == BPF_K &&
1159 (opcode == BPF_JEQ || opcode == BPF_JNE)) { 1240 (opcode == BPF_JEQ || opcode == BPF_JNE)) {
1160 1241
@@ -1658,7 +1739,7 @@ static int do_check(struct verifier_env *env)
1658 } 1739 }
1659 1740
1660 if (class == BPF_ALU || class == BPF_ALU64) { 1741 if (class == BPF_ALU || class == BPF_ALU64) {
1661 err = check_alu_op(regs, insn); 1742 err = check_alu_op(env, insn);
1662 if (err) 1743 if (err)
1663 return err; 1744 return err;
1664 1745
@@ -1816,6 +1897,11 @@ static int do_check(struct verifier_env *env)
1816 if (err) 1897 if (err)
1817 return err; 1898 return err;
1818 1899
1900 if (is_pointer_value(env, BPF_REG_0)) {
1901 verbose("R0 leaks addr as return value\n");
1902 return -EACCES;
1903 }
1904
1819process_bpf_exit: 1905process_bpf_exit:
1820 insn_idx = pop_stack(env, &prev_insn_idx); 1906 insn_idx = pop_stack(env, &prev_insn_idx);
1821 if (insn_idx < 0) { 1907 if (insn_idx < 0) {
@@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
2144 if (ret < 0) 2230 if (ret < 0)
2145 goto skip_full_check; 2231 goto skip_full_check;
2146 2232
2233 env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
2234
2147 ret = do_check(env); 2235 ret = do_check(env);
2148 2236
2149skip_full_check: 2237skip_full_check:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d8094e..96c856b04081 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
64#include <linux/binfmts.h> 64#include <linux/binfmts.h>
65#include <linux/sched/sysctl.h> 65#include <linux/sched/sysctl.h>
66#include <linux/kexec.h> 66#include <linux/kexec.h>
67#include <linux/bpf.h>
67 68
68#include <asm/uaccess.h> 69#include <asm/uaccess.h>
69#include <asm/processor.h> 70#include <asm/processor.h>
@@ -1139,6 +1140,18 @@ static struct ctl_table kern_table[] = {
1139 .proc_handler = timer_migration_handler, 1140 .proc_handler = timer_migration_handler,
1140 }, 1141 },
1141#endif 1142#endif
1143#ifdef CONFIG_BPF_SYSCALL
1144 {
1145 .procname = "unprivileged_bpf_disabled",
1146 .data = &sysctl_unprivileged_bpf_disabled,
1147 .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
1148 .mode = 0644,
1149 /* only handle a transition from default "0" to "1" */
1150 .proc_handler = proc_dointvec_minmax,
1151 .extra1 = &one,
1152 .extra2 = &one,
1153 },
1154#endif
1142 { } 1155 { }
1143}; 1156};
1144 1157