author	Eric Dumazet <edumazet@google.com>	2019-03-22 11:56:40 -0400
committer	David S. Miller <davem@davemloft.net>	2019-03-23 21:57:38 -0400
commit	8b27dae5a2e89a61c46c6dbc76c040c0e6d0ed4c (patch)
tree	0e6f2cfd66715d2234acda3ae48d1543facc5303
parent	472c2e07eef045145bc1493cc94a01c87140780a (diff)
tcp: add one skb cache for rx
Often, recvmsg() system calls and BH handling for a particular TCP socket are done on different CPUs. This means the incoming skb had to be allocated on one CPU but freed on another, which incurs high spinlock contention in the slab layer for small RPCs, and a high number of cache line ping-pongs for larger packets. A full-size GRO packet might use 45 page fragments, meaning that up to 45 put_page() calls can be involved.

Moreover, performing the __kfree_skb() in the recvmsg() context adds latency for user applications and increases the probability of trapping them in backlog processing, since the BH handler might find the socket owned by the user.

This patch, combined with the prior one, increases RPC performance by about 10% on servers with a large number of cores (a tcp_rr workload with 10,000 flows and 112 threads reaches 9 Mpps instead of 8 Mpps).

This also increases single bulk flow performance on 40Gbit+ links, since in this case there are often two CPUs working in tandem:

- the CPU handling the NIC rx interrupts, feeding the receive queue, and (after this patch) freeing the skbs that were consumed;
- the CPU in the recvmsg() system call, essentially 100% busy copying data out to user space.

Having at most one skb in a per-socket cache carries very little risk of memory exhaustion, and since the cache is protected by the socket lock, its management is essentially free.

Note that if RPS/RFS is used, we do not enable this feature, because there is a high chance that the same CPU is handling both the recvmsg() system call and the TCP rx path, while another CPU did the skb allocation in the device driver right before the RPS/RFS logic ran. To properly handle this case, we would need to record which CPU allocated each skb and use a different channel to give skbs back to that CPU.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--	include/net/sock.h	| 10
-rw-r--r--	net/ipv4/af_inet.c	|  4
-rw-r--r--	net/ipv4/tcp.c		|  4
-rw-r--r--	net/ipv4/tcp_ipv4.c	| 11
-rw-r--r--	net/ipv6/tcp_ipv6.c	| 12
5 files changed, 36 insertions(+), 5 deletions(-)
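To make the mechanism concrete, here is a minimal userspace sketch of the one-slot cache idea described in the commit message above. It is not kernel code: struct conn, conn_get_buf() and conn_put_buf() are invented names standing in for the socket, the rx allocation path and sk_eat_skb(), and malloc()/free() stand in for the skb allocator.

/*
 * Illustrative userspace sketch only; names are made up for the example.
 * Build with: cc -o oneslot oneslot.c
 */
#include <stdio.h>
#include <stdlib.h>

struct buf {
	char data[2048];
};

struct conn {
	struct buf *buf_cache;	/* at most one cached buffer; in the kernel
				 * patch the equivalent slot is protected by
				 * the socket lock */
};

/* rx path: prefer the cached buffer over a fresh allocation */
static struct buf *conn_get_buf(struct conn *c)
{
	struct buf *b = c->buf_cache;

	if (b) {
		c->buf_cache = NULL;
		return b;		/* reuse, no allocator round trip */
	}
	return malloc(sizeof(*b));
}

/* consume path: park the buffer in the slot instead of freeing it */
static void conn_put_buf(struct conn *c, struct buf *b)
{
	if (!c->buf_cache) {
		c->buf_cache = b;	/* keep exactly one buffer around */
		return;
	}
	free(b);
}

int main(void)
{
	struct conn c = { .buf_cache = NULL };
	struct buf *b;

	b = conn_get_buf(&c);		/* first call hits malloc() */
	conn_put_buf(&c, b);		/* parked in the one-slot cache */
	b = conn_get_buf(&c);		/* second call reuses the same buffer */
	printf("reused buffer at %p\n", (void *)b);
	free(b);
	return 0;
}

Keeping at most one buffer per connection bounds the memory held by the cache, which is why the patch can afford to manage sk_rx_skb_cache with no counters or shrinker: the slot is filled and emptied under the socket lock, as the diffs below show.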
diff --git a/include/net/sock.h b/include/net/sock.h
index 314c47a8f5d1..577d91fb5626 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -368,6 +368,7 @@ struct sock {
 	atomic_t		sk_drops;
 	int			sk_rcvlowat;
 	struct sk_buff_head	sk_error_queue;
+	struct sk_buff		*sk_rx_skb_cache;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -2438,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (
+#ifdef CONFIG_RPS
+	    !static_branch_unlikely(&rps_needed) &&
+#endif
+	    !sk->sk_rx_skb_cache) {
+		sk->sk_rx_skb_cache = skb;
+		skb_orphan(skb);
+		return;
+	}
 	__kfree_skb(skb);
 }
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eab3ebde981e..7f3a984ad618 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -136,6 +136,10 @@ void inet_sock_destruct(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	__skb_queue_purge(&sk->sk_error_queue);
 
 	sk_mem_reclaim(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f0b5a5999145..29b94edf05f9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2583,6 +2583,10 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	tcp_clear_xmit_timers(sk);
 	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_rx_skb_cache) {
+		__kfree_skb(sk->sk_rx_skb_cache);
+		sk->sk_rx_skb_cache = NULL;
+	}
 	tp->copied_seq = tp->rcv_nxt;
 	tp->urg_data = 0;
 	tcp_write_queue_purge(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 277d71239d75..3979939804b7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1774,6 +1774,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 int tcp_v4_rcv(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
+	struct sk_buff *skb_to_free;
 	int sdif = inet_sdif(skb);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
@@ -1905,11 +1906,17 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v4_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 
 put_and_return:
 	if (refcounted)
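The hunk above also shows a second pattern worth noting: the previously cached skb is detached from sk_rx_skb_cache while the socket is locked, but the actual __kfree_skb() only runs after bh_unlock_sock(), so the potentially expensive free does not lengthen the locked section. Below is a rough userspace analogy of that "free after unlock" pattern using a pthread mutex; struct conn and conn_process() are illustrative names, not kernel APIs.

/*
 * Illustrative userspace sketch only; build with: cc -pthread defer.c
 */
#include <pthread.h>
#include <stdlib.h>

struct conn {
	pthread_mutex_t lock;
	void *cached_buf;	/* one-slot cache, protected by lock */
};

static void conn_process(struct conn *c, void *new_buf)
{
	void *to_free;

	pthread_mutex_lock(&c->lock);
	/* Detach the parked buffer while holding the lock ... */
	to_free = c->cached_buf;
	c->cached_buf = NULL;
	/* ... the real work on new_buf would happen here ... */
	pthread_mutex_unlock(&c->lock);

	/* ... but pay for the free() only after the lock is released,
	 * so other threads waiting on the lock are not held up by it. */
	free(to_free);		/* free(NULL) is a no-op */
	(void)new_buf;
}

int main(void)
{
	struct conn c;

	pthread_mutex_init(&c.lock, NULL);
	c.cached_buf = malloc(2048);

	conn_process(&c, NULL);

	pthread_mutex_destroy(&c.lock);
	return 0;
}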
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 983ad7a75102..77d723bbe050 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1436,6 +1436,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 
 static int tcp_v6_rcv(struct sk_buff *skb)
 {
+	struct sk_buff *skb_to_free;
 	int sdif = inet6_sdif(skb);
 	const struct tcphdr *th;
 	const struct ipv6hdr *hdr;
@@ -1562,12 +1563,17 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v6_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
-
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 put_and_return:
 	if (refcounted)
 		sock_put(sk);