author    Stanislav Fomichev <sdf@google.com>    2019-06-27 16:38:47 -0400
committer Alexei Starovoitov <ast@kernel.org>   2019-06-27 18:25:16 -0400
commit    0d01da6afc5402f60325c5da31b22f7d56689b49
tree      afb8dd2975f8aa50d3577cb42310eb298d6491ac /kernel/bpf
parent    3b1c667e47e4066cfe61610825ad50bc6b4a57e1
bpf: implement getsockopt and setsockopt hooks
Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and
BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks.

BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before
passing them down to the kernel, or bypass the kernel completely.
BPF_CGROUP_GETSOCKOPT can inspect/modify getsockopt arguments that
the kernel returns. Both hooks reuse the existing PTR_TO_PACKET{,_END}
infrastructure.

The buffer memory is pre-allocated (because I don't think there is
a precedent for working with __user memory from bpf). This might be
slow to do for each {s,g}etsockopt call; that's why I've added
__cgroup_bpf_prog_array_is_empty, which exits early if there is nothing
attached to a cgroup. Note, however, that there is a race between
__cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where the
cgroup program layout might have changed; this should not be a problem
because in general there is a race between multiple calls to
{s,g}etsockopt and a user adding/removing bpf progs from a cgroup.

The return code of the BPF program is handled as follows:
* 0: EPERM
* 1: success, continue with next BPF program in the cgroup chain

v9:
* allow overwriting setsockopt arguments (Alexei Starovoitov):
  * use set_fs (same as kernel_setsockopt)
  * buffer is always kzalloc'd (no small on-stack buffer)

v8:
* use s32 for optlen (Andrii Nakryiko)

v7:
* return only 0 or 1 (Alexei Starovoitov)
* always run all progs (Alexei Starovoitov)
* use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov)
  (decided to use optlen=-1 instead, optval=0 might be a valid input)
* call getsockopt hook after kernel handlers (Alexei Starovoitov)

v6:
* rework cgroup chaining; stop as soon as bpf program returns
  0 or 2; see patch with the documentation for the details
* drop Andrii's and Martin's Acked-by (not sure they are comfortable
  with the new state of things)

v5:
* skip copy_to_user() and put_user() when ret == 0 (Martin Lau)

v4:
* don't export bpf_sk_fullsock helper (Martin Lau)
* size != sizeof(__u64) for uapi pointers (Martin Lau)
* offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau)

v3:
* typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko)
* reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii
  Nakryiko)
* use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau)
* use BPF_FIELD_SIZEOF() for consistency (Martin Lau)
* new CG_SOCKOPT_ACCESS macro to wrap repeated parts

v2:
* moved bpf_sockopt_kern fields around to remove a hole (Martin Lau)
* aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau)
* bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau)
* added [0,2] return code check to verifier (Martin Lau)
* dropped unused buf[64] from the stack (Martin Lau)
* use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau)
* dropped bpf_target_off from ctx rewrites (Martin Lau)
* use return code for kernel bypass (Martin Lau & Andrii Nakryiko)

Cc: Andrii Nakryiko <andriin@fb.com>
Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
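As an illustration of that contract, here is a minimal BPF-side sketch of
a setsockopt hook. It is a hypothetical example, not part of this patch:
the SEC() name follows the libbpf/selftest conventions from the same
series, and the policy (swallowing IP_TOS) is invented.

#include <netinet/in.h>		/* SOL_IP, IP_TOS */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

SEC("cgroup/setsockopt")
int prevent_ip_tos(struct bpf_sockopt *ctx)
{
	if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
		/* optlen = -1 together with return code 1 tells the
		 * kernel wrapper to skip the real setsockopt handler.
		 */
		ctx->optlen = -1;
		return 1;
	}

	return 1;	/* 1: continue the chain; 0: fail the syscall with EPERM */
}

char _license[] SEC("license") = "GPL";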
Diffstat (limited to 'kernel/bpf')
-rw-r--r--  kernel/bpf/cgroup.c   | 333
-rw-r--r--  kernel/bpf/core.c     |   9
-rw-r--r--  kernel/bpf/syscall.c  |  19
-rw-r--r--  kernel/bpf/verifier.c |   8
4 files changed, 369 insertions, 0 deletions
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 077ed3a19848..76fa0076f20d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -15,6 +15,7 @@
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
+#include <net/bpf_sk_storage.h>
 
 #include "../cgroup/cgroup-internal.h"
 
@@ -938,6 +939,188 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
 
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+					     enum bpf_attach_type attach_type)
+{
+	struct bpf_prog_array *prog_array;
+	bool empty;
+
+	rcu_read_lock();
+	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+	empty = bpf_prog_array_is_empty(prog_array);
+	rcu_read_unlock();
+
+	return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+		return -EINVAL;
+
+	ctx->optval = kzalloc(max_optlen, GFP_USER);
+	if (!ctx->optval)
+		return -ENOMEM;
+
+	ctx->optval_end = ctx->optval + max_optlen;
+	ctx->optlen = max_optlen;
+
+	return 0;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+	kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+				       int *optname, char __user *optval,
+				       int *optlen, char **kernel_optval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = *level,
+		.optname = *optname,
+	};
+	int ret;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+		return 0;
+
+	ret = sockopt_alloc_buf(&ctx, *optlen);
+	if (ret)
+		return ret;
+
+	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (ctx.optlen == -1) {
+		/* optlen set to -1, bypass kernel */
+		ret = 1;
+	} else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+		/* optlen is out of bounds */
+		ret = -EFAULT;
+	} else {
+		/* optlen within bounds, run kernel handler */
+		ret = 0;
+
+		/* export any potential modifications */
+		*level = ctx.level;
+		*optname = ctx.optname;
+		*optlen = ctx.optlen;
+		*kernel_optval = ctx.optval;
+	}
+
+out:
+	if (ret)
+		sockopt_free_buf(&ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
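For context, the caller side of this hook lives in net/socket.c, outside
this kernel/bpf-limited diffstat. Per the commit message it switches to
the BPF-provided buffer with set_fs(), the same approach
kernel_setsockopt() uses. A rough sketch of how __sys_setsockopt()
consumes the result (the exact hunk may differ):

	char *kernel_optval = NULL;
	mm_segment_t oldfs = get_fs();

	err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname,
					     optval, &optlen, &kernel_optval);
	if (err < 0)
		goto out_put;
	if (err > 0) {		/* hook returned 1: bypass the kernel */
		err = 0;
		goto out_put;
	}
	if (kernel_optval) {	/* a program rewrote the buffer */
		set_fs(KERNEL_DS);
		optval = (char __user __force *)kernel_optval;
	}

	/* ... sock->ops->setsockopt() runs on the kernel copy ... */

	if (kernel_optval) {
		set_fs(oldfs);
		kfree(kernel_optval);
	}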
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+				       int optname, char __user *optval,
+				       int __user *optlen, int max_optlen,
+				       int retval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = level,
+		.optname = optname,
+		.retval = retval,
+	};
+	int ret;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+		return retval;
+
+	ret = sockopt_alloc_buf(&ctx, max_optlen);
+	if (ret)
+		return ret;
+
+	if (!retval) {
+		/* If kernel getsockopt finished successfully,
+		 * copy whatever was returned to the user back
+		 * into our temporary buffer. Set optlen to the
+		 * one that kernel returned as well to let
+		 * BPF programs inspect the value.
+		 */
+
+		if (get_user(ctx.optlen, optlen)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		if (ctx.optlen > max_optlen)
+			ctx.optlen = max_optlen;
+
+		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (ctx.optlen > max_optlen) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* BPF programs only allowed to set retval to 0, not some
+	 * arbitrary value.
+	 */
+	if (ctx.retval != 0 && ctx.retval != retval) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+	    put_user(ctx.optlen, optlen)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = ctx.retval;
+
+out:
+	sockopt_free_buf(&ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
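The runtime contract encoded above: the hook runs after the kernel
handler, a program may shrink (but never grow) optlen, and may only
leave retval untouched or clear it to 0. A hypothetical BPF-side sketch
in the style of the selftests from this series (the policy of masking
IP_TOS is invented):

#include <netinet/in.h>		/* SOL_IP, IP_TOS */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

SEC("cgroup/getsockopt")
int hide_ip_tos(struct bpf_sockopt *ctx)
{
	__u8 *optval_end = ctx->optval_end;
	__u8 *optval = ctx->optval;

	if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
		return 1;	/* not ours; pass the kernel result through */

	if (optval + 1 > optval_end)
		return 0;	/* bounds check failed: reject with EPERM */

	optval[0] = 0;		/* mask the real TOS from the application */
	ctx->optlen = 1;
	ctx->retval = 0;	/* only 0 survives the kernel-side check */

	return 1;
}

char _license[] SEC("license") = "GPL";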
+
 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
 			      size_t *lenp)
 {
@@ -1198,3 +1381,153 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
 
 const struct bpf_prog_ops cg_sysctl_prog_ops = {
 };
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_INET
+	case BPF_FUNC_tcp_sock:
+		return &bpf_tcp_sock_proto;
+#endif
+	default:
+		return cgroup_base_func_proto(func_id, prog);
+	}
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off >= sizeof(struct bpf_sockopt))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sockopt, retval):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_GETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optname):
+			/* fallthrough */
+		case offsetof(struct bpf_sockopt, level):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_SETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optlen):
+			return size == size_default;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case offsetof(struct bpf_sockopt, sk):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_SOCKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (size != size_default)
+			return false;
+		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+	default:
+		if (size != size_default)
+			return false;
+		break;
+	}
+	return true;
+}
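In short, the write matrix encoded here: setsockopt programs may write
level, optname, optlen and optval; getsockopt programs may write optlen
and retval; sk and optval_end are always read-only. Because
optval/optval_end get PTR_TO_PACKET{,_END}, every dereference must be
bounds-checked, exactly as with skb data (see the getsockopt sketch
above). A hypothetical setsockopt program exercising its writable
fields; SOL_CUSTOM is an invented level, not a real constant:

#include <linux/bpf.h>
#include <asm/socket.h>		/* SOL_SOCKET, SO_MARK */

#define SEC(name) __attribute__((section(name), used))
#define SOL_CUSTOM 0x70000000	/* invented level, for illustration only */

SEC("cgroup/setsockopt")
int remap_custom_level(struct bpf_sockopt *ctx)
{
	if (ctx->level == SOL_CUSTOM) {
		/* Allowed for BPF_CGROUP_SETSOCKOPT: redirect the call so
		 * the kernel handler sees a real option. Writing
		 * ctx->retval from this attach type would be rejected at
		 * load time by cg_sockopt_is_valid_access().
		 */
		ctx->level = SOL_SOCKET;
		ctx->optname = SO_MARK;
	}

	return 1;
}

char _license[] SEC("license") = "GPL";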
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
+	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
+	  si->dst_reg, si->src_reg,					\
+	  offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+					 const struct bpf_insn *si,
+					 struct bpf_insn *insn_buf,
+					 struct bpf_prog *prog,
+					 u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_sockopt, sk):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+		break;
+	case offsetof(struct bpf_sockopt, level):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+		break;
+	case offsetof(struct bpf_sockopt, optname):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+		break;
+	case offsetof(struct bpf_sockopt, optlen):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+				   bool direct_write,
+				   const struct bpf_prog *prog)
+{
+	/* Nothing to do for sockopt argument. The data is kzalloc'ated.
+	 */
+	return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+	.get_func_proto		= cg_sockopt_func_proto,
+	.is_valid_access	= cg_sockopt_is_valid_access,
+	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
+	.gen_prologue		= cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 561ed07d3007..e2c1b43728da 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1809,6 +1809,15 @@ int bpf_prog_array_length(struct bpf_prog_array *array)
 	return cnt;
 }
 
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
+{
+	struct bpf_prog_array_item *item;
+
+	for (item = array->items; item->prog; item++)
+		if (item->prog != &dummy_bpf_prog.prog)
+			return false;
+	return true;
+}
 
 static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
 				     u32 *prog_ids,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7713cf39795a..b0f545e07425 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1590,6 +1590,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 	default:
 		return -EINVAL;
 	}
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_SETSOCKOPT:
+		case BPF_CGROUP_GETSOCKOPT:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	default:
 		return 0;
 	}
@@ -1840,6 +1848,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	switch (prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
 		return prog->enforce_expected_attach_type &&
@@ -1912,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
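With these new switch arms, a loaded program of this type attaches
through the regular BPF_PROG_ATTACH command. A userspace sketch using
libbpf's bpf_prog_attach(); prog_fd and cgroup_fd are assumed to come
from a program load and an open() on the cgroup directory:

#include <bpf/bpf.h>

static int attach_setsockopt(int prog_fd, int cgroup_fd)
{
	/* BPF_CGROUP_SETSOCKOPT now maps to BPF_PROG_TYPE_CGROUP_SOCKOPT
	 * in bpf_prog_attach() above; use BPF_CGROUP_GETSOCKOPT for the
	 * getsockopt side.
	 */
	return bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_SETSOCKOPT, 0);
}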
@@ -1995,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -2031,6 +2048,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
 		break;
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0e079b2298f8..6b5623d320f9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2215,6 +2215,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 
 		env->seen_direct_write = true;
 		return true;
+
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+		if (t == BPF_WRITE)
+			env->seen_direct_write = true;
+
+		return true;
+
 	default:
 		return false;
 	}
@@ -6066,6 +6073,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		break;
 	default:
 		return 0;
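Adding BPF_PROG_TYPE_CGROUP_SOCKOPT to this list makes
check_return_code() apply the same R0 in [0, 1] range as the other
cgroup program types, enforcing the 0/1 contract from the commit message
at load time instead of at run time. A sketch of a program that would
now be rejected by the verifier:

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

SEC("cgroup/setsockopt")
int bad_return(struct bpf_sockopt *ctx)
{
	return 2;	/* load-time error: return value outside [0, 1] */
}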