author		Eric Dumazet <edumazet@google.com>	2018-11-27 17:42:03 -0500
committer	David S. Miller <davem@davemloft.net>	2018-11-30 16:26:54 -0500
commit		4f693b55c3d2d2239b8a0094b518a1e533cf75d5 (patch)
tree		3b70704dbf9f7cf27ec16d9321f7c1956b81c3a5 /net/ipv4/tcp_ipv4.c
parent		85bdf7db5b53cdcc7a901db12bcb3d0063e3866d (diff)
tcp: implement coalescing on backlog queue
In case GRO is not as efficient as it should be, or is disabled, we might have a user thread trapped in __release_sock() while the softirq handler floods packets up to the point where we have to drop them.

This patch balances the work done from the user thread and the softirq, to give __release_sock() a better chance to complete its work before new packets are added to the backlog.

This also helps when we receive many ACK packets, since GRO does not aggregate them.

This patch brings a ~60% throughput increase on a receiver without GRO, but the spectacular gain is really the ~1000x reduction in release_sock() latency I have measured.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
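To make the coalescing condition easier to follow before reading the diff below, here is a minimal standalone C sketch of the eligibility test. The struct seg type, its fields, and can_coalesce() are simplified stand-ins for the kernel's skb / TCP_SKB_CB() state, not actual kernel APIs; the checks mirror the ones added to tcp_add_backlog() in this patch.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Simplified stand-in for the per-segment state the patch compares.
 * Field names loosely follow TCP_SKB_CB(); this is not kernel code.
 */
struct seg {
        uint32_t seq;
        uint32_t end_seq;
        uint32_t ack_seq;
        uint8_t  tcp_flags;   /* TCPHDR_* bits */
        uint8_t  ip_dsfield;  /* DSCP + ECN bits */
        uint8_t  doff;        /* TCP header length in 32-bit words */
        uint8_t  options[40]; /* raw TCP options after the base header */
};

#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80

/* Model of the eligibility test: the new segment must directly follow
 * the backlog tail, carry the same DS field, have no URG on either side,
 * matching ECE/CWR bits, and byte-identical TCP options.
 */
static bool can_coalesce(const struct seg *tail, const struct seg *skb)
{
        size_t optlen = (size_t)skb->doff * 4 - 20; /* 20 = base TCP header */

        if (tail->end_seq != skb->seq)
                return false;
        if (tail->ip_dsfield != skb->ip_dsfield)
                return false;
        if ((tail->tcp_flags | skb->tcp_flags) & TCPHDR_URG)
                return false;
        if ((tail->tcp_flags ^ skb->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR))
                return false;
        if (tail->doff != skb->doff)
                return false;
        return memcmp(tail->options, skb->options, optlen) == 0;
}

int main(void)
{
        struct seg tail = { .seq = 1000, .end_seq = 2000, .doff = 8 };
        struct seg next = { .seq = 2000, .end_seq = 3000, .doff = 8 };

        printf("coalesce? %s\n", can_coalesce(&tail, &next) ? "yes" : "no");
        return 0;
}

In the actual patch the same decision is made on TCP_SKB_CB() state and the raw TCP headers of the backlog tail and the incoming skb; on success, skb_try_coalesce() merges the payloads and only the tail skb's header state (end_seq, ack_seq, flags, window, gso counters) is updated.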
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--	net/ipv4/tcp_ipv4.c	92
1 file changed, 86 insertions(+), 6 deletions(-)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 795605a23275..4904250a9aac 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
-	/* Only socket owner can try to collapse/prune rx queues
-	 * to reduce memory overhead, so add a little headroom here.
-	 * Few sockets backlog are possibly concurrently non empty.
-	 */
-	limit += 64*1024;
+	struct skb_shared_info *shinfo;
+	const struct tcphdr *th;
+	struct tcphdr *thtail;
+	struct sk_buff *tail;
+	unsigned int hdrlen;
+	bool fragstolen;
+	u32 gso_segs;
+	int delta;
 
 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 
 	skb_dst_drop(skb);
 
+	if (unlikely(tcp_checksum_complete(skb))) {
+		bh_unlock_sock(sk);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+		return true;
+	}
+
+	/* Attempt coalescing to last skb in backlog, even if we are
+	 * above the limits.
+	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+	 */
+	th = (const struct tcphdr *)skb->data;
+	hdrlen = th->doff * 4;
+	shinfo = skb_shinfo(skb);
+
+	if (!shinfo->gso_size)
+		shinfo->gso_size = skb->len - hdrlen;
+
+	if (!shinfo->gso_segs)
+		shinfo->gso_segs = 1;
+
+	tail = sk->sk_backlog.tail;
+	if (!tail)
+		goto no_coalesce;
+	thtail = (struct tcphdr *)tail->data;
+
+	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+	    ((TCP_SKB_CB(tail)->tcp_flags |
+	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+	    ((TCP_SKB_CB(tail)->tcp_flags ^
+	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+	    tail->decrypted != skb->decrypted ||
+#endif
+	    thtail->doff != th->doff ||
+	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+		goto no_coalesce;
+
+	__skb_pull(skb, hdrlen);
+	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+		thtail->window = th->window;
+
+		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			TCP_SKB_CB(tail)->has_rxtstamp = true;
+			tail->tstamp = skb->tstamp;
+			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+		}
+
+		/* Not as strict as GRO. We only need to carry mss max value */
+		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+						 skb_shinfo(tail)->gso_size);
+
+		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+		sk->sk_backlog.len += delta;
+		__NET_INC_STATS(sock_net(sk),
+				LINUX_MIB_TCPBACKLOGCOALESCE);
+		kfree_skb_partial(skb, fragstolen);
+		return false;
+	}
+	__skb_push(skb, hdrlen);
+
+no_coalesce:
+	/* Only socket owner can try to collapse/prune rx queues
+	 * to reduce memory overhead, so add a little headroom here.
+	 * Few sockets backlog are possibly concurrently non empty.
+	 */
+	limit += 64*1024;
+
 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
 		bh_unlock_sock(sk);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);