author	David S. Miller <davem@davemloft.net>	2018-11-30 16:26:54 -0500
committer	David S. Miller <davem@davemloft.net>	2018-11-30 16:26:54 -0500
commit	2f69555315ad7dc1ac37366b2ac2429e2d24d444 (patch)
tree	3b70704dbf9f7cf27ec16d9321f7c1956b81c3a5 /net/ipv4/tcp_ipv4.c
parent	b0e3f1bdf9e7140fd1151af575f468b5827a61e1 (diff)
parent	4f693b55c3d2d2239b8a0094b518a1e533cf75d5 (diff)
Merge branch 'tcp-take-a-bit-more-care-of-backlog-stress'
Eric Dumazet says:

====================
tcp: take a bit more care of backlog stress

While working on the SACK compression issue Jean-Louis Dupond reported,
we found that his Linux box was suffering badly from tail drops on the
socket backlog queue.

The first patch hints the compiler that SACK flows are the norm.

The second patch changes non-SACK code in preparation for the ACK
compression.

The third patch fixes tcp_space() to take the backlog into account.

The fourth patch attempts coalescing when a new packet must be added to
the backlog queue. Building bigger skbs keeps the backlog list smaller
and speeds up its handling when the user thread finally releases the
socket lock.

v3: Neal/Yuchung feedback addressed:
    Do not aggregate if any skb has the URG bit set.
    Do not aggregate if the skbs have different ECE/CWR bits.

v2: added feedback from Neal: tcp: take care of compressed acks in tcp_add_reno_sack()
    added: tcp: hint compiler about sack flows
    added: tcp: make tcp_space() aware of socket backlog
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
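The tcp_space() change referenced above (third patch) lands in include/net/tcp.h, so it does not appear in this file's diff. As a minimal sketch of the idea, assuming tcp_win_from_space(sk, space) and the sk->sk_backlog.len field as found in kernels of this era (the exact hunk is not shown here), the receive-space calculation simply subtracts bytes parked on the backlog as well:

/* Sketch only: approximate shape of the tcp_space() fix described in the
 * cover letter above, not the verbatim hunk from the series.
 */
static inline int tcp_space(const struct sock *sk)
{
        /* Treat bytes sitting on the socket backlog as consumed receive
         * memory, so the advertised window shrinks while the user thread
         * still holds the socket lock.
         */
        return tcp_win_from_space(sk, sk->sk_rcvbuf - sk->sk_backlog.len -
                                  atomic_read(&sk->sk_rmem_alloc));
}

Shrinking the offered window while data is queued on the backlog relieves the tail drops described above; the coalescing added by the fourth patch (the diff below) then keeps the backlog list itself short.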
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--	net/ipv4/tcp_ipv4.c	92
1 file changed, 86 insertions(+), 6 deletions(-)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 795605a23275..4904250a9aac 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
-        /* Only socket owner can try to collapse/prune rx queues
-         * to reduce memory overhead, so add a little headroom here.
-         * Few sockets backlog are possibly concurrently non empty.
-         */
-        limit += 64*1024;
+        struct skb_shared_info *shinfo;
+        const struct tcphdr *th;
+        struct tcphdr *thtail;
+        struct sk_buff *tail;
+        unsigned int hdrlen;
+        bool fragstolen;
+        u32 gso_segs;
+        int delta;
 
         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
          * we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 
         skb_dst_drop(skb);
 
+        if (unlikely(tcp_checksum_complete(skb))) {
+                bh_unlock_sock(sk);
+                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+                return true;
+        }
+
+        /* Attempt coalescing to last skb in backlog, even if we are
+         * above the limits.
+         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+         */
+        th = (const struct tcphdr *)skb->data;
+        hdrlen = th->doff * 4;
+        shinfo = skb_shinfo(skb);
+
+        if (!shinfo->gso_size)
+                shinfo->gso_size = skb->len - hdrlen;
+
+        if (!shinfo->gso_segs)
+                shinfo->gso_segs = 1;
+
+        tail = sk->sk_backlog.tail;
+        if (!tail)
+                goto no_coalesce;
+        thtail = (struct tcphdr *)tail->data;
+
+        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+            ((TCP_SKB_CB(tail)->tcp_flags |
+              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+            ((TCP_SKB_CB(tail)->tcp_flags ^
+              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+            tail->decrypted != skb->decrypted ||
+#endif
+            thtail->doff != th->doff ||
+            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+                goto no_coalesce;
+
+        __skb_pull(skb, hdrlen);
+        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+                thtail->window = th->window;
+
+                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+                if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+                if (TCP_SKB_CB(skb)->has_rxtstamp) {
+                        TCP_SKB_CB(tail)->has_rxtstamp = true;
+                        tail->tstamp = skb->tstamp;
+                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+                }
+
+                /* Not as strict as GRO. We only need to carry mss max value */
+                skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+                                                 skb_shinfo(tail)->gso_size);
+
+                gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+                skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+                sk->sk_backlog.len += delta;
+                __NET_INC_STATS(sock_net(sk),
+                                LINUX_MIB_TCPBACKLOGCOALESCE);
+                kfree_skb_partial(skb, fragstolen);
+                return false;
+        }
+        __skb_push(skb, hdrlen);
+
+no_coalesce:
+        /* Only socket owner can try to collapse/prune rx queues
+         * to reduce memory overhead, so add a little headroom here.
+         * Few sockets backlog are possibly concurrently non empty.
+         */
+        limit += 64*1024;
+
         if (unlikely(sk_add_backlog(sk, skb, limit))) {
                 bh_unlock_sock(sk);
                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);