aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author Andrey Ignatov <rdna@fb.com> 2018-03-30 18:08:02 -0400
committer Daniel Borkmann <daniel@iogearbox.net> 2018-03-30 20:15:18 -0400
commit 4fbac77d2d092b475dda9eea66da674369665427 (patch)
tree e565018845653a1d55241ffbc8f40cc30ae1e19a /kernel
parent d7be143b67c2cf99bf93279217b1cf93a1e8a6b1 (diff)
bpf: Hooks for sys_bind
== The problem == There is a use-case when all processes inside a cgroup should use one single IP address on a host that has multiple IPs configured. Those processes should use the IP for both ingress and egress, for TCP and UDP traffic. So TCP/UDP servers should be bound to that IP to accept incoming connections on it, and TCP/UDP clients should make outgoing connections from that IP. It should not require changing application code since it's often not possible. Currently it's solved by intercepting glibc wrappers around syscalls such as `bind(2)` and `connect(2)`. It's done by a shared library that is preloaded for every process in a cgroup so that whenever a TCP/UDP server calls `bind(2)`, the library replaces the IP in sockaddr before passing arguments to the syscall. When an application calls `connect(2)` the library transparently binds the local end of the connection to that IP (`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid a performance penalty). The shared library approach is fragile though, e.g.: * some applications clear env vars (incl. `LD_PRELOAD`); * `/etc/ld.so.preload` doesn't help since some applications are linked with option `-z nodefaultlib`; * other applications don't use glibc and there is nothing to intercept. == The solution == The patch provides a much more reliable in-kernel solution for the 1st part of the problem: binding TCP/UDP servers on the desired IP. It does not depend on application environment and implementation details (whether glibc is used or not). It adds a new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND` (similar to the already existing `BPF_CGROUP_INET_SOCK_CREATE`). The new program type is intended to be used with sockets (`struct sock`) in a cgroup and a user-provided `struct sockaddr`. Pointers to both of them are parts of the context passed to programs of the newly added types.
The new attach types provide hooks in the `bind(2)` system call for both IPv4 and IPv6 so that one can write a program to override the IP addresses and ports a user program tries to bind to, and apply such a program to a whole cgroup. == Implementation notes == [1] Separate attach types for `AF_INET` and `AF_INET6` are added intentionally to prevent reading/writing to offsets that don't make sense for the corresponding socket family. E.g. if the user passes `sockaddr_in` it doesn't make sense to read from / write to `user_ip6[]` context fields. [2] The write access to `struct bpf_sock_addr_kern` is implemented using a special field as an additional "register". There are just two registers in `sock_addr_convert_ctx_access`: `src` with the value to write and `dst` with the pointer to the context, which can't be changed so as not to break later instructions. But the fields that are allowed to be written to are not available directly, and to access them the address of the corresponding pointer has to be loaded first. To get an additional register, the first register not used by `src` and `dst` is taken, its content is saved to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load the address of the pointer field, and finally the register's content is restored from the temporary field after writing the `src` value. Signed-off-by: Andrey Ignatov <rdna@fb.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/cgroup.c 36
-rw-r--r-- kernel/bpf/syscall.c 36
-rw-r--r-- kernel/bpf/verifier.c 1
3 files changed, 65 insertions, 8 deletions
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8730b24ed540..43171a0bb02b 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -495,6 +495,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
495EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 495EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
496 496
497/** 497/**
498 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
499 * provided by user sockaddr
500 * @sk: sock struct that will use sockaddr
501 * @uaddr: sockaddr struct provided by user
502 * @type: The type of program to be executed
503 *
504 * socket is expected to be of type INET or INET6.
505 *
506 * This function will return %-EPERM if an attached program is found and
507 * returned value != 1 during execution. In all other cases, 0 is returned.
508 */
509int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
510 struct sockaddr *uaddr,
511 enum bpf_attach_type type)
512{
513 struct bpf_sock_addr_kern ctx = {
514 .sk = sk,
515 .uaddr = uaddr,
516 };
517 struct cgroup *cgrp;
518 int ret;
519
520 /* Check socket family since not all sockets represent network
521 * endpoint (e.g. AF_UNIX).
522 */
523 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
524 return 0;
525
526 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
527 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
528
529 return ret == 1 ? 0 : -EPERM;
530}
531EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
532
533/**
498 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock 534 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
499 * @sk: socket to get cgroup from 535 * @sk: socket to get cgroup from
500 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains 536 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9d3b572d4dec..2cad66a4cacb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1175,19 +1175,29 @@ static int
1175bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, 1175bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
1176 enum bpf_attach_type expected_attach_type) 1176 enum bpf_attach_type expected_attach_type)
1177{ 1177{
1178 /* There are currently no prog types that require specifying 1178 switch (prog_type) {
1179 * attach_type at load time. 1179 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
1180 */ 1180 switch (expected_attach_type) {
1181 return 0; 1181 case BPF_CGROUP_INET4_BIND:
1182 case BPF_CGROUP_INET6_BIND:
1183 return 0;
1184 default:
1185 return -EINVAL;
1186 }
1187 default:
1188 return 0;
1189 }
1182} 1190}
1183 1191
1184static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, 1192static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
1185 enum bpf_attach_type attach_type) 1193 enum bpf_attach_type attach_type)
1186{ 1194{
1187 /* There are currently no prog types that require specifying 1195 switch (prog->type) {
1188 * attach_type at load time. 1196 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
1189 */ 1197 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
1190 return 0; 1198 default:
1199 return 0;
1200 }
1191} 1201}
1192 1202
1193/* last field in 'union bpf_attr' used by this command */ 1203/* last field in 'union bpf_attr' used by this command */
@@ -1479,6 +1489,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
1479 case BPF_CGROUP_INET_SOCK_CREATE: 1489 case BPF_CGROUP_INET_SOCK_CREATE:
1480 ptype = BPF_PROG_TYPE_CGROUP_SOCK; 1490 ptype = BPF_PROG_TYPE_CGROUP_SOCK;
1481 break; 1491 break;
1492 case BPF_CGROUP_INET4_BIND:
1493 case BPF_CGROUP_INET6_BIND:
1494 ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
1495 break;
1482 case BPF_CGROUP_SOCK_OPS: 1496 case BPF_CGROUP_SOCK_OPS:
1483 ptype = BPF_PROG_TYPE_SOCK_OPS; 1497 ptype = BPF_PROG_TYPE_SOCK_OPS;
1484 break; 1498 break;
@@ -1541,6 +1555,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
1541 case BPF_CGROUP_INET_SOCK_CREATE: 1555 case BPF_CGROUP_INET_SOCK_CREATE:
1542 ptype = BPF_PROG_TYPE_CGROUP_SOCK; 1556 ptype = BPF_PROG_TYPE_CGROUP_SOCK;
1543 break; 1557 break;
1558 case BPF_CGROUP_INET4_BIND:
1559 case BPF_CGROUP_INET6_BIND:
1560 ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
1561 break;
1544 case BPF_CGROUP_SOCK_OPS: 1562 case BPF_CGROUP_SOCK_OPS:
1545 ptype = BPF_PROG_TYPE_SOCK_OPS; 1563 ptype = BPF_PROG_TYPE_SOCK_OPS;
1546 break; 1564 break;
@@ -1590,6 +1608,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
1590 case BPF_CGROUP_INET_INGRESS: 1608 case BPF_CGROUP_INET_INGRESS:
1591 case BPF_CGROUP_INET_EGRESS: 1609 case BPF_CGROUP_INET_EGRESS:
1592 case BPF_CGROUP_INET_SOCK_CREATE: 1610 case BPF_CGROUP_INET_SOCK_CREATE:
1611 case BPF_CGROUP_INET4_BIND:
1612 case BPF_CGROUP_INET6_BIND:
1593 case BPF_CGROUP_SOCK_OPS: 1613 case BPF_CGROUP_SOCK_OPS:
1594 case BPF_CGROUP_DEVICE: 1614 case BPF_CGROUP_DEVICE:
1595 break; 1615 break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 10024323031d..5dd1dcb902bf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3887,6 +3887,7 @@ static int check_return_code(struct bpf_verifier_env *env)
3887 switch (env->prog->type) { 3887 switch (env->prog->type) {
3888 case BPF_PROG_TYPE_CGROUP_SKB: 3888 case BPF_PROG_TYPE_CGROUP_SKB:
3889 case BPF_PROG_TYPE_CGROUP_SOCK: 3889 case BPF_PROG_TYPE_CGROUP_SOCK:
3890 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3890 case BPF_PROG_TYPE_SOCK_OPS: 3891 case BPF_PROG_TYPE_SOCK_OPS:
3891 case BPF_PROG_TYPE_CGROUP_DEVICE: 3892 case BPF_PROG_TYPE_CGROUP_DEVICE:
3892 break; 3893 break;