author    Daniel Borkmann <daniel@iogearbox.net>  2019-06-06 19:48:57 -0400
committer Alexei Starovoitov <ast@kernel.org>     2019-06-06 19:53:12 -0400
commit    983695fa676568fc0fe5ddd995c7267aabc24632
tree      a969b5a9d9c3eb66cd05c462bfb0c56f7d9615ca
parent    1884c066579a7a274dd981a4d9639ca63db66a23
bpf: fix unconnected udp hooks
Intention of cgroup bind/connect/sendmsg BPF hooks is to act transparently
to applications as also stated in original motivation in 7828f20e3779 ("Merge
branch 'bpf-cgroup-bind-connect'"). When recently integrating the latter
two hooks into Cilium to enable host based load-balancing with Kubernetes,
I ran into the issue that pods couldn't start up as DNS got broken.
Kubernetes typically sets up DNS as a service and is thus subject to
load-balancing.

Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API
is currently insufficient and thus not usable as-is for standard applications
shipped with most distros. To break down the issue we ran into with a simple
example:

  # cat /etc/resolv.conf
  nameserver 147.75.207.207
  nameserver 147.75.207.208

For the purpose of a simple test, we set up above IPs as service IPs and
transparently redirect traffic to a different DNS backend server for that
node:

  # cilium service list
  ID   Frontend            Backend
  1    147.75.207.207:53   1 => 8.8.8.8:53
  2    147.75.207.208:53   1 => 8.8.8.8:53

The attached BPF program is basically selecting one of the backends if the
service IP/port matches on the cgroup hook. DNS breaks here, because the
hooks are not transparent enough to applications which have built-in
msg_name address checks:

  # nslookup 1.1.1.1
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
  [...]
  ;; connection timed out; no servers could be reached

  # dig 1.1.1.1
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
  [...]

  ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1
  ;; global options: +cmd
  ;; connection timed out; no servers could be reached

For comparison, if none of the service IPs is used, and we tell nslookup
to use 8.8.8.8 directly it works just fine, of course:

  # nslookup 1.1.1.1 8.8.8.8
  1.1.1.1.in-addr.arpa    name = one.one.one.one.

In order to fix this and thus act more transparent to the application,
this needs reverse translation on recvmsg() side. A minimal fix for this
API is to add similar recvmsg() hooks behind the BPF cgroups static key
such that the program can track state and replace the current
sockaddr_in{,6} with the original service IP. From BPF side, this basically
tracks the service tuple plus socket cookie in an LRU map where the reverse
NAT can then be retrieved via map value as one example. Side-note: the BPF
cgroups static key should be converted to a per-hook static key in future.

Same example after this fix:

  # cilium service list
  ID   Frontend            Backend
  1    147.75.207.207:53   1 => 8.8.8.8:53
  2    147.75.207.208:53   1 => 8.8.8.8:53

Lookups work fine now:

  # nslookup 1.1.1.1
  1.1.1.1.in-addr.arpa    name = one.one.one.one.

  Authoritative answers can be found from:

  # dig 1.1.1.1

  ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1
  ;; global options: +cmd
  ;; Got answer:
  ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550
  ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1

  ;; OPT PSEUDOSECTION:
  ; EDNS: version: 0, flags:; udp: 512
  ;; QUESTION SECTION:
  ;1.1.1.1.                       IN      A

  ;; AUTHORITY SECTION:
  .   23426   IN   SOA   a.root-servers.net. nstld.verisign-grs.com. 2019052001 1800 900 604800 86400

  ;; Query time: 17 msec
  ;; SERVER: 147.75.207.207#53(147.75.207.207)
  ;; WHEN: Tue May 21 12:59:38 UTC 2019
  ;; MSG SIZE  rcvd: 111

And from an actual packet level it shows that we're using the back end
server when talking via 147.75.207.20{7,8} front end:

  # tcpdump -i any udp
  [...]
  12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38)
  12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38)
  12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67)
  12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67)
  [...]

In order to be flexible and to have same semantics as in sendmsg BPF
programs, we only allow return codes in [1,1] range. In the sendmsg case
the program is called if msg->msg_name is present which can be the case
in both, connected and unconnected UDP.

The former only relies on the sockaddr_in{,6} passed via connect(2) if
passed msg->msg_name was NULL. Therefore, on recvmsg side, we act in
similar way to call into the BPF program whenever a non-NULL msg->msg_name
was passed independent of sk->sk_state being TCP_ESTABLISHED or not. Note
that for TCP case, the msg->msg_name is ignored in the regular recvmsg
path and therefore not relevant.

For the case of ip{,v6}_recv_error() paths, picked up via MSG_ERRQUEUE,
the hook is not called. This is intentional as it aligns with the same
semantics as in case of TCP cgroup BPF hooks right now. This might be
better addressed in future through a different bpf_attach_type such that
this case can be distinguished from the regular recvmsg paths, for example.

Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Martynas Pumputis <m@lambda.lt>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
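Not part of this commit, but to illustrate the mechanism described above: a
minimal sketch of how such a sendmsg/recvmsg pair could look from the BPF
side, tracking the service tuple per socket cookie in an LRU map and
reversing the translation on recvmsg. All names (service_map, lb4_sendmsg,
lb4_recvmsg) and the hard-coded addresses are made up from the example; it
assumes bpf_get_socket_cookie() is usable from sock_addr programs and a
libbpf with BTF-style map definitions.

  /* Sketch only, not from this commit. */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_endian.h>

  struct svc_addr {
          __u32 ip;       /* network byte order, as in ctx->user_ip4 */
          __u32 port;     /* network byte order, as in ctx->user_port */
  };

  struct {
          __uint(type, BPF_MAP_TYPE_LRU_HASH);
          __uint(max_entries, 16384);
          __type(key, __u64);               /* socket cookie */
          __type(value, struct svc_addr);   /* original service IP/port */
  } service_map SEC(".maps");

  SEC("cgroup/sendmsg4")
  int lb4_sendmsg(struct bpf_sock_addr *ctx)
  {
          /* If the destination matches the service VIP, remember the
           * original tuple keyed by socket cookie and rewrite the
           * destination to the backend. */
          if (ctx->user_port == bpf_htons(53) &&
              ctx->user_ip4 == bpf_htonl(0x934bcfcf) /* 147.75.207.207 */) {
                  struct svc_addr orig = {
                          .ip   = ctx->user_ip4,
                          .port = ctx->user_port,
                  };
                  __u64 cookie = bpf_get_socket_cookie(ctx);

                  bpf_map_update_elem(&service_map, &cookie, &orig, BPF_ANY);
                  ctx->user_ip4 = bpf_htonl(0x08080808); /* 8.8.8.8 */
          }
          return 1;
  }

  SEC("cgroup/recvmsg4")
  int lb4_recvmsg(struct bpf_sock_addr *ctx)
  {
          /* Reverse translation: restore the service VIP in msg_name so
           * the application's address check (e.g. in nslookup) passes. */
          __u64 cookie = bpf_get_socket_cookie(ctx);
          struct svc_addr *orig;

          orig = bpf_map_lookup_elem(&service_map, &cookie);
          if (orig) {
                  ctx->user_ip4  = orig->ip;
                  ctx->user_port = orig->port;
          }
          return 1; /* recvmsg hooks may only return 1 */
  }

  char _license[] SEC("license") = "GPL";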
Diffstat (limited to 'kernel/bpf/syscall.c')
 kernel/bpf/syscall.c | 8 ++++++++
 1 file changed, 8 insertions(+), 0 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb5440b02e82..e8ba3a153691 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1581,6 +1581,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 		return 0;
 	default:
 		return -EINVAL;
@@ -1875,6 +1877,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1960,6 +1964,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -2011,6 +2017,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
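As a usage note (again a sketch, not part of this diff): once the new attach
types are accepted by bpf_prog_attach()/detach()/query() above, a loaded
cgroup/recvmsg4 program would be attached to a cgroup roughly as below via
libbpf's bpf_prog_attach() wrapper. The BPF_CGROUP_UDP4_RECVMSG enum value
itself comes from the UAPI header change elsewhere in this series, and
attach_recvmsg4() is a hypothetical helper; prog_fd is assumed to come from
a prior program load.

  /* Sketch only, minimal error handling. */
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>
  #include <linux/bpf.h>
  #include <bpf/bpf.h>

  int attach_recvmsg4(int prog_fd, const char *cgroup_path)
  {
          int cg_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY);

          if (cg_fd < 0) {
                  perror("open cgroup");
                  return -1;
          }
          /* BPF_CGROUP_UDP4_RECVMSG is one of the attach types the hunks
           * above whitelist in the syscall handlers. */
          if (bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_UDP4_RECVMSG,
                              BPF_F_ALLOW_MULTI)) {
                  perror("bpf_prog_attach");
                  close(cg_fd);
                  return -1;
          }
          return cg_fd;
  }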