tcp: reduce out_of_order memory use

With increasing receive window sizes, but speed of light not improved that much, out of order queue can contain a huge number of skbs, waiting to be moved to receive_queue when missing packets can fill the holes.

Some devices happen to use fat skbs (truesize of 4096 + sizeof(struct sk_buff)) to store regular (MTU <= 1500) frames. This makes highly probable sk_rmem_alloc hits sk_rcvbuf limit, which can be 4Mbytes in many cases.

When limit is hit, tcp stack calls tcp_collapse_ofo_queue(), a true latency killer and cpu cache blower.

Doing the coalescing attempt each time we add a frame in ofo queue permits to keep memory use tight and in many cases avoid the tcp_collapse() thing later.

Tested on various wireless setups (b43, ath9k, ...) known to use big skb truesize, this patch removed the "packets collapsed in receive queue due to low socket buffer" I had before.

This also reduced average memory used by tcp sockets.

With help from Neal Cardwell.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: H.K. Jerry Chu <hkchu@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
author: Eric Dumazet <eric.dumazet@gmail.com> 2012-03-18 07:07:47 -0400
committer: David S. Miller <davem@davemloft.net> 2012-03-19 16:53:08 -0400
commit: c8628155ece363487b57d33441ea0359018c0fa7 (patch)
tree: a3a4e89d3f66208f4145bb2ed401e464474a8d9f /net
parent: e86b291962cbf477e35d983d312428cf737bc0f8 (diff)
2 files changed, 19 insertions, 1 deletions
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 02d61079f08b..8af0d44e4e22 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -257,6 +257,7 @@ static const struct snmp_mib snmp4_net_list[] = {
        SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
        SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
        SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
+        SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
        SNMP_MIB_SENTINEL
 };
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fa7de12c4a52..e886e2f7fa8d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4484,7 +4484,24 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
        end_seq = TCP_SKB_CB(skb)->end_seq;
        if (seq == TCP_SKB_CB(skb1)->end_seq) {
-                __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+                /* Packets in ofo can stay in queue a long time.
+                 * Better try to coalesce them right now
+                 * to avoid future tcp_collapse_ofo_queue(),
+                 * probably the most expensive function in tcp stack.
+                 */
+                if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) {
+                        NET_INC_STATS_BH(sock_net(sk),
+                                         LINUX_MIB_TCPRCVCOALESCE);
+                        BUG_ON(skb_copy_bits(skb, 0,
+                                             skb_put(skb1, skb->len),
+                                             skb->len));
+                        TCP_SKB_CB(skb1)->end_seq = end_seq;
+                        TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+                        __kfree_skb(skb);
+                        skb = NULL;
+                } else {
+                        __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+                }
                if (!tp->rx_opt.num_sacks ||
                    tp->selective_acks[0].end_seq != seq)
author	Eric Dumazet <eric.dumazet@gmail.com>	2012-03-18 07:07:47 -0400
committer	David S. Miller <davem@davemloft.net>	2012-03-19 16:53:08 -0400
commit	c8628155ece363487b57d33441ea0359018c0fa7 (patch)
tree	a3a4e89d3f66208f4145bb2ed401e464474a8d9f /net
parent	e86b291962cbf477e35d983d312428cf737bc0f8 (diff)

diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 02d61079f08b..8af0d44e4e22 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c
@@ -257,6 +257,7 @@ static const struct snmp_mib snmp4_net_list[] = {
257	SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),	257	SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
258	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),	258	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
259	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),	259	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
		260	SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
260	SNMP_MIB_SENTINEL	261	SNMP_MIB_SENTINEL
261	};	262	};
262		263


diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fa7de12c4a52..e886e2f7fa8d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -4484,7 +4484,24 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4484	end_seq = TCP_SKB_CB(skb)->end_seq;	4484	end_seq = TCP_SKB_CB(skb)->end_seq;
4485		4485
4486	if (seq == TCP_SKB_CB(skb1)->end_seq) {	4486	if (seq == TCP_SKB_CB(skb1)->end_seq) {
4487	__skb_queue_after(&tp->out_of_order_queue, skb1, skb);	4487	/* Packets in ofo can stay in queue a long time.
		4488	* Better try to coalesce them right now
		4489	* to avoid future tcp_collapse_ofo_queue(),
		4490	* probably the most expensive function in tcp stack.
		4491	*/
		4492	if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) {
		4493	NET_INC_STATS_BH(sock_net(sk),
		4494	LINUX_MIB_TCPRCVCOALESCE);
		4495	BUG_ON(skb_copy_bits(skb, 0,
		4496	skb_put(skb1, skb->len),
		4497	skb->len));
		4498	TCP_SKB_CB(skb1)->end_seq = end_seq;
		4499	TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
		4500	__kfree_skb(skb);
		4501	skb = NULL;
		4502	} else {
		4503	__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
		4504	}
4488		4505
4489	if (!tp->rx_opt.num_sacks ||	4506	if (!tp->rx_opt.num_sacks ||
4490	tp->selective_acks[0].end_seq != seq)	4507	tp->selective_acks[0].end_seq != seq)