author		Eric Dumazet <edumazet@google.com>	2018-11-27 17:42:03 -0500
committer	David S. Miller <davem@davemloft.net>	2018-11-30 16:26:54 -0500
commit		4f693b55c3d2d2239b8a0094b518a1e533cf75d5 (patch)
tree		3b70704dbf9f7cf27ec16d9321f7c1956b81c3a5 /net/ipv4/tcp_ipv4.c
parent		85bdf7db5b53cdcc7a901db12bcb3d0063e3866d (diff)
tcp: implement coalescing on backlog queue
In case GRO is not as efficient as it should be, or is disabled, we might have a user thread trapped in __release_sock() while the softirq handler floods packets up to the point where we have to drop them.

This patch balances the work done from the user thread and the softirq, to give __release_sock() a better chance to complete its work before new packets are added to the backlog.

This also helps when we receive many ACK packets, since GRO does not aggregate them.

This patch brings a ~60% throughput increase on a receiver without GRO, but the spectacular gain is really the ~1000x reduction in release_sock() latency I have measured.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
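To make the coalescing condition easier to follow before reading the diff below, here is a minimal standalone C sketch of the eligibility test. The struct seg type, its fields, and can_coalesce() are simplified stand-ins for the kernel's skb / TCP_SKB_CB() state, not actual kernel APIs; the checks mirror the ones added to tcp_add_backlog() in this patch.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Simplified stand-in for the per-segment state the patch compares.
 * Field names loosely follow TCP_SKB_CB(); this is not kernel code.
 */
struct seg {
        uint32_t seq;
        uint32_t end_seq;
        uint32_t ack_seq;
        uint8_t  tcp_flags;   /* TCPHDR_* bits */
        uint8_t  ip_dsfield;  /* DSCP + ECN bits */
        uint8_t  doff;        /* TCP header length in 32-bit words */
        uint8_t  options[40]; /* raw TCP options after the base header */
};

#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80

/* Model of the eligibility test: the new segment must directly follow
 * the backlog tail, carry the same DS field, have no URG on either side,
 * matching ECE/CWR bits, and byte-identical TCP options.
 */
static bool can_coalesce(const struct seg *tail, const struct seg *skb)
{
        size_t optlen = (size_t)skb->doff * 4 - 20; /* 20 = base TCP header */

        if (tail->end_seq != skb->seq)
                return false;
        if (tail->ip_dsfield != skb->ip_dsfield)
                return false;
        if ((tail->tcp_flags | skb->tcp_flags) & TCPHDR_URG)
                return false;
        if ((tail->tcp_flags ^ skb->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR))
                return false;
        if (tail->doff != skb->doff)
                return false;
        return memcmp(tail->options, skb->options, optlen) == 0;
}

int main(void)
{
        struct seg tail = { .seq = 1000, .end_seq = 2000, .doff = 8 };
        struct seg next = { .seq = 2000, .end_seq = 3000, .doff = 8 };

        printf("coalesce? %s\n", can_coalesce(&tail, &next) ? "yes" : "no");
        return 0;
}

In the actual patch the same decision is made on TCP_SKB_CB() state and the raw TCP headers of the backlog tail and the incoming skb; on success, skb_try_coalesce() merges the payloads and only the tail skb's header state (end_seq, ack_seq, flags, window, gso counters) is updated.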
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--	net/ipv4/tcp_ipv4.c	92
1 file changed, 86 insertions(+), 6 deletions(-)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 795605a23275..4904250a9aac 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
-	/* Only socket owner can try to collapse/prune rx queues
-	 * to reduce memory overhead, so add a little headroom here.
-	 * Few sockets backlog are possibly concurrently non empty.
-	 */
-	limit += 64*1024;
+	struct skb_shared_info *shinfo;
+	const struct tcphdr *th;
+	struct tcphdr *thtail;
+	struct sk_buff *tail;
+	unsigned int hdrlen;
+	bool fragstolen;
+	u32 gso_segs;
+	int delta;
 
 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 
 	skb_dst_drop(skb);
 
+	if (unlikely(tcp_checksum_complete(skb))) {
+		bh_unlock_sock(sk);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+		return true;
+	}
+
+	/* Attempt coalescing to last skb in backlog, even if we are
+	 * above the limits.
+	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+	 */
+	th = (const struct tcphdr *)skb->data;
+	hdrlen = th->doff * 4;
+	shinfo = skb_shinfo(skb);
+
+	if (!shinfo->gso_size)
+		shinfo->gso_size = skb->len - hdrlen;
+
+	if (!shinfo->gso_segs)
+		shinfo->gso_segs = 1;
+
+	tail = sk->sk_backlog.tail;
+	if (!tail)
+		goto no_coalesce;
+	thtail = (struct tcphdr *)tail->data;
+
+	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+	    ((TCP_SKB_CB(tail)->tcp_flags |
+	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+	    ((TCP_SKB_CB(tail)->tcp_flags ^
+	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+	    tail->decrypted != skb->decrypted ||
+#endif
+	    thtail->doff != th->doff ||
+	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+		goto no_coalesce;
+
+	__skb_pull(skb, hdrlen);
+	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+		thtail->window = th->window;
+
+		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			TCP_SKB_CB(tail)->has_rxtstamp = true;
+			tail->tstamp = skb->tstamp;
+			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+		}
+
+		/* Not as strict as GRO. We only need to carry mss max value */
+		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+						 skb_shinfo(tail)->gso_size);
+
+		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+		sk->sk_backlog.len += delta;
+		__NET_INC_STATS(sock_net(sk),
+				LINUX_MIB_TCPBACKLOGCOALESCE);
+		kfree_skb_partial(skb, fragstolen);
+		return false;
+	}
+	__skb_push(skb, hdrlen);
+
+no_coalesce:
+	/* Only socket owner can try to collapse/prune rx queues
+	 * to reduce memory overhead, so add a little headroom here.
+	 * Few sockets backlog are possibly concurrently non empty.
+	 */
+	limit += 64*1024;
+
 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
 		bh_unlock_sock(sk);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);