bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT

This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other non SK_FILTER/CGROUP_SKB programs, it requires CAP_SYS_ADMIN. BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to store the bpf context instead of using the skb->cb[48]. At the SO_REUSEPORT sk lookup time, the skb is in the middle of transitioning from a lower layer (ipv4/ipv6) to an upper layer (udp/tcp). At this point, it is not always clear where the bpf context can be appended in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting aside the difference between ipv4-vs-ipv6 and udp-vs-tcp, it is not clear whether the lower layer will only be ipv4 and ipv6 in the future, or whether it will not touch the cb[] again before transitioning to the upper layer. For example, udp_gro_receive() uses the 48 byte NAPI_GRO_CB instead of IP[6]CB and it may still modify the cb[] after calling the udp[46]_lib_lookup_skb(). For the above reasons, if skb->cb is used for the bpf ctx, saving-and-restoring is needed and likely the whole 48 byte cb[] has to be saved and restored. Instead of saving, setting and restoring the cb[], this patch opts to create a new "struct sk_reuseport_kern" and set the needed values in there. The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)" will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol specific usage at this point and it is also in line with the current sock_reuseport.c implementation (i.e. no protocol specific requirement). In "struct sk_reuseport_md", this patch exposes data/data_end/len with semantics similar to other existing usages. Together with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf prog can peek anywhere in the skb. The "bind_inany" tells the bpf prog that the reuseport group is bound to a local INANY address, which cannot be learned from the skb. 
The new "bind_inany" is added to "struct sock_reuseport" which will be used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order to avoid repeating the "bind INANY" test on "sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can only be properly initialized when a "sk->sk_reuseport" enabled sk is being added to a hashtable (i.e. during "reuseport_alloc()" and "reuseport_add_sock()"). The new "sk_select_reuseport()" is the main helper that the bpf prog will use to select a SO_REUSEPORT sk. It is the only function that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in the earlier patch, the validity of a selected sk is checked at run time in "sk_select_reuseport()". Doing the check at verification time is difficult and inflexible (consider the map-in-map use case). The runtime check is to compare the selected sk's reuseport_id with the reuseport_id that we want. This helper will return -EXXX if the selected sk cannot serve the incoming request (e.g. the reuseport_id does not match). The bpf prog can decide if it wants to do SK_DROP at its discretion. When the bpf prog returns SK_PASS, the kernel will check if a valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL"). If it has, it will use the selected sk. If not, the kernel will select one from "reuse->socks[]" (as before this patch). The SK_DROP and SK_PASS handling logic will be in the next patch. Signed-off-by: Martin KaFai Lau <kafai@fb.com> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
author: Martin KaFai Lau <kafai@fb.com> 2018-08-08 04:01:25 -0400
committer: Daniel Borkmann <daniel@iogearbox.net> 2018-08-10 19:58:46 -0400
commit: 2dbb9b9e6df67d444fbe425c7f6014858d337adf (patch)
tree: bc048a092095423a9d0b5dfac0a154c2046793a2 /include/uapi/linux
parent: 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c (diff)
1 files changed, 35 insertions, 1 deletions
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 40f584bc7da0..3102a2a23c31 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -151,6 +151,7 @@ enum bpf_prog_type {
        BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
        BPF_PROG_TYPE_LWT_SEG6LOCAL,
        BPF_PROG_TYPE_LIRC_MODE2,
+        BPF_PROG_TYPE_SK_REUSEPORT,
 };
 enum bpf_attach_type {
@@ -2114,6 +2115,14 @@ union bpf_attr {
 *              the shared data.
 *      Return
 *              Pointer to the local storage area.
+ *
+ * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
+ *      Description
+ *              Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map
+ *              It checks the selected sk is matching the incoming
+ *              request in the skb.
+ *      Return
+ *              0 on success, or a negative error in case of failure.
 */
 #define __BPF_FUNC_MAPPER(FN)           \
        FN(unspec),                     \
@@ -2197,7 +2206,8 @@ union bpf_attr {
        FN(rc_keydown),                 \
        FN(skb_cgroup_id),              \
        FN(get_current_cgroup_id),      \
-        FN(get_local_storage),
+        FN(get_local_storage),          \
+        FN(sk_select_reuseport),
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
@@ -2414,6 +2424,30 @@ struct sk_msg_md {
        __u32 local_port;       /* stored in host byte order */
 };
+struct sk_reuseport_md {
+        /*
+         * Start of directly accessible data. It begins from
+         * the tcp/udp header.
+         */
+        void *data;
+        void *data_end;         /* End of directly accessible data */
+        /*
+         * Total length of packet (starting from the tcp/udp header).
+         * Note that the directly accessible bytes (data_end - data)
+         * could be less than this "len".  Those bytes could be
+         * indirectly read by a helper "bpf_skb_load_bytes()".
+         */
+        __u32 len;
+        /*
+         * Eth protocol in the mac header (network byte order). e.g.
+         * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
+         */
+        __u32 eth_protocol;
+        __u32 ip_protocol;      /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+        __u32 bind_inany;       /* Is sock bound to an INANY address? */
+        __u32 hash;             /* A hash of the packet 4 tuples */
+};
 #define BPF_TAG_SIZE    8
 struct bpf_prog_info {
author	Martin KaFai Lau <kafai@fb.com>	2018-08-08 04:01:25 -0400
committer	Daniel Borkmann <daniel@iogearbox.net>	2018-08-10 19:58:46 -0400
commit	2dbb9b9e6df67d444fbe425c7f6014858d337adf (patch)
tree	bc048a092095423a9d0b5dfac0a154c2046793a2 /include/uapi/linux
parent	5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c (diff)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40f584bc7da0..3102a2a23c31 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h
@@ -151,6 +151,7 @@ enum bpf_prog_type {
151	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,	151	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
152	BPF_PROG_TYPE_LWT_SEG6LOCAL,	152	BPF_PROG_TYPE_LWT_SEG6LOCAL,
153	BPF_PROG_TYPE_LIRC_MODE2,	153	BPF_PROG_TYPE_LIRC_MODE2,
		154	BPF_PROG_TYPE_SK_REUSEPORT,
154	};	155	};
155		156
156	enum bpf_attach_type {	157	enum bpf_attach_type {
@@ -2114,6 +2115,14 @@ union bpf_attr {
2114	* the shared data.	2115	* the shared data.
2115	* Return	2116	* Return
2116	* Pointer to the local storage area.	2117	* Pointer to the local storage area.
		2118	*
		2119	* int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
		2120	* Description
		2121	* Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map
		2122	* It checks the selected sk is matching the incoming
		2123	* request in the skb.
		2124	* Return
		2125	* 0 on success, or a negative error in case of failure.
2117	*/	2126	*/
2118	#define __BPF_FUNC_MAPPER(FN) \	2127	#define __BPF_FUNC_MAPPER(FN) \
2119	FN(unspec), \	2128	FN(unspec), \
@@ -2197,7 +2206,8 @@ union bpf_attr {
2197	FN(rc_keydown), \	2206	FN(rc_keydown), \
2198	FN(skb_cgroup_id), \	2207	FN(skb_cgroup_id), \
2199	FN(get_current_cgroup_id), \	2208	FN(get_current_cgroup_id), \
2200	FN(get_local_storage),	2209	FN(get_local_storage), \
		2210	FN(sk_select_reuseport),
2201		2211
2202	/* integer value in 'imm' field of BPF_CALL instruction selects which helper	2212	/* integer value in 'imm' field of BPF_CALL instruction selects which helper
2203	* function eBPF program intends to call	2213	* function eBPF program intends to call
@@ -2414,6 +2424,30 @@ struct sk_msg_md {
2414	__u32 local_port; /* stored in host byte order */	2424	__u32 local_port; /* stored in host byte order */
2415	};	2425	};
2416		2426
		2427	struct sk_reuseport_md {
		2428	/*
		2429	* Start of directly accessible data. It begins from
		2430	* the tcp/udp header.
		2431	*/
		2432	void *data;
		2433	void *data_end; /* End of directly accessible data */
		2434	/*
		2435	* Total length of packet (starting from the tcp/udp header).
		2436	* Note that the directly accessible bytes (data_end - data)
		2437	* could be less than this "len". Those bytes could be
		2438	* indirectly read by a helper "bpf_skb_load_bytes()".
		2439	*/
		2440	__u32 len;
		2441	/*
		2442	* Eth protocol in the mac header (network byte order). e.g.
		2443	* ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
		2444	*/
		2445	__u32 eth_protocol;
		2446	__u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
		2447	__u32 bind_inany; /* Is sock bound to an INANY address? */
		2448	__u32 hash; /* A hash of the packet 4 tuples */
		2449	};
		2450
2417	#define BPF_TAG_SIZE 8	2451	#define BPF_TAG_SIZE 8
2418		2452
2419	struct bpf_prog_info {	2453	struct bpf_prog_info {