diff options
| author | Martin KaFai Lau <kafai@fb.com> | 2018-08-08 04:01:25 -0400 |
|---|---|---|
| committer | Daniel Borkmann <daniel@iogearbox.net> | 2018-08-10 19:58:46 -0400 |
| commit | 2dbb9b9e6df67d444fbe425c7f6014858d337adf (patch) | |
| tree | bc048a092095423a9d0b5dfac0a154c2046793a2 /include/uapi/linux | |
| parent | 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c (diff) | |
bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT
This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select
a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other
non SK_FILTER/CGROUP_SKB programs, it requires CAP_SYS_ADMIN.
BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern"
to store the bpf context instead of using the skb->cb[48].
At SO_REUSEPORT sk lookup time, the skb is in the middle of transitioning
from a lower layer (ipv4/ipv6) to an upper layer (udp/tcp). At this
point, it is not always clear where the bpf context can be appended
in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting
aside the difference between ipv4-vs-ipv6 and udp-vs-tcp, it is not
clear whether the lower layer will only ever be ipv4 or ipv6 in the
future, or whether it will leave the cb[] untouched before transitioning
to the upper layer.
For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB
instead of IP[6]CB and it may still modify the cb[] after calling
the udp[46]_lib_lookup_skb(). For the above reasons, if
skb->cb is used for the bpf ctx, saving-and-restoring is needed
and likely the whole 48-byte cb[] has to be saved and restored.
Instead of saving, setting and restoring the cb[], this patch opts
to create a new "struct sk_reuseport_kern" and set the needed
values in there.
The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)"
will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol
specific usage at this point and it is also in line with the current
sock_reuseport.c implementation (i.e. no protocol specific requirement).
In "struct sk_reuseport_md", this patch exposes data/data_end/len
with semantics similar to other existing usages. Together
with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()",
the bpf prog can peek anywhere in the skb. The "bind_inany" tells
the bpf prog that the reuseport group is bound to a local
INANY address which cannot be learned from skb.
The new "bind_inany" is added to "struct sock_reuseport" which will be
used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order
to avoid repeating the "bind INANY" test on
"sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can
only be properly initialized when a "sk->sk_reuseport" enabled sk is
being added to a hashtable (i.e. during "reuseport_alloc()" and
"reuseport_add_sock()").
The new "sk_select_reuseport()" is the main helper that the
bpf prog will use to select a SO_REUSEPORT sk. It is the only function
that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in
the earlier patch, the validity of a selected sk is checked at
run time in "sk_select_reuseport()". Doing the check at
verification time is difficult and inflexible (consider the map-in-map
use case). The runtime check compares the selected sk's reuseport_id
with the reuseport_id that we want. This helper will return -EXXX if the
selected sk cannot serve the incoming request (e.g. the reuseport_id
does not match). The bpf prog can decide whether it wants to do SK_DROP at its
discretion.
When the bpf prog returns SK_PASS, the kernel will check if a
valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL").
If it has, it will use the selected sk. If not, the kernel
will select one from "reuse->socks[]" (as before this patch).
The SK_DROP and SK_PASS handling logic will be in the next patch.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Diffstat (limited to 'include/uapi/linux')
| -rw-r--r-- | include/uapi/linux/bpf.h | 36 |
1 files changed, 35 insertions, 1 deletions
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40f584bc7da0..3102a2a23c31 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h | |||
| @@ -151,6 +151,7 @@ enum bpf_prog_type { | |||
| 151 | BPF_PROG_TYPE_CGROUP_SOCK_ADDR, | 151 | BPF_PROG_TYPE_CGROUP_SOCK_ADDR, |
| 152 | BPF_PROG_TYPE_LWT_SEG6LOCAL, | 152 | BPF_PROG_TYPE_LWT_SEG6LOCAL, |
| 153 | BPF_PROG_TYPE_LIRC_MODE2, | 153 | BPF_PROG_TYPE_LIRC_MODE2, |
| 154 | BPF_PROG_TYPE_SK_REUSEPORT, | ||
| 154 | }; | 155 | }; |
| 155 | 156 | ||
| 156 | enum bpf_attach_type { | 157 | enum bpf_attach_type { |
| @@ -2114,6 +2115,14 @@ union bpf_attr { | |||
| 2114 | * the shared data. | 2115 | * the shared data. |
| 2115 | * Return | 2116 | * Return |
| 2116 | * Pointer to the local storage area. | 2117 | * Pointer to the local storage area. |
| 2118 | * | ||
| 2119 | * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) | ||
| 2120 | * Description | ||
| 2121 | * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map | ||
| 2122 | * It checks the selected sk is matching the incoming | ||
| 2123 | * request in the skb. | ||
| 2124 | * Return | ||
| 2125 | * 0 on success, or a negative error in case of failure. | ||
| 2117 | */ | 2126 | */ |
| 2118 | #define __BPF_FUNC_MAPPER(FN) \ | 2127 | #define __BPF_FUNC_MAPPER(FN) \ |
| 2119 | FN(unspec), \ | 2128 | FN(unspec), \ |
| @@ -2197,7 +2206,8 @@ union bpf_attr { | |||
| 2197 | FN(rc_keydown), \ | 2206 | FN(rc_keydown), \ |
| 2198 | FN(skb_cgroup_id), \ | 2207 | FN(skb_cgroup_id), \ |
| 2199 | FN(get_current_cgroup_id), \ | 2208 | FN(get_current_cgroup_id), \ |
| 2200 | FN(get_local_storage), | 2209 | FN(get_local_storage), \ |
| 2210 | FN(sk_select_reuseport), | ||
| 2201 | 2211 | ||
| 2202 | /* integer value in 'imm' field of BPF_CALL instruction selects which helper | 2212 | /* integer value in 'imm' field of BPF_CALL instruction selects which helper |
| 2203 | * function eBPF program intends to call | 2213 | * function eBPF program intends to call |
| @@ -2414,6 +2424,30 @@ struct sk_msg_md { | |||
| 2414 | __u32 local_port; /* stored in host byte order */ | 2424 | __u32 local_port; /* stored in host byte order */ |
| 2415 | }; | 2425 | }; |
| 2416 | 2426 | ||
| 2427 | struct sk_reuseport_md { | ||
| 2428 | /* | ||
| 2429 | * Start of directly accessible data. It begins from | ||
| 2430 | * the tcp/udp header. | ||
| 2431 | */ | ||
| 2432 | void *data; | ||
| 2433 | void *data_end; /* End of directly accessible data */ | ||
| 2434 | /* | ||
| 2435 | * Total length of packet (starting from the tcp/udp header). | ||
| 2436 | * Note that the directly accessible bytes (data_end - data) | ||
| 2437 | * could be less than this "len". Those bytes could be | ||
| 2438 | * indirectly read by a helper "bpf_skb_load_bytes()". | ||
| 2439 | */ | ||
| 2440 | __u32 len; | ||
| 2441 | /* | ||
| 2442 | * Eth protocol in the mac header (network byte order). e.g. | ||
| 2443 | * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) | ||
| 2444 | */ | ||
| 2445 | __u32 eth_protocol; | ||
| 2446 | __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ | ||
| 2447 | __u32 bind_inany; /* Is sock bound to an INANY address? */ | ||
| 2448 | __u32 hash; /* A hash of the packet 4 tuples */ | ||
| 2449 | }; | ||
| 2450 | |||
| 2417 | #define BPF_TAG_SIZE 8 | 2451 | #define BPF_TAG_SIZE 8 |
| 2418 | 2452 | ||
| 2419 | struct bpf_prog_info { | 2453 | struct bpf_prog_info { |
