author	David S. Miller <davem@davemloft.net>	2018-11-30 16:26:54 -0500
committer	David S. Miller <davem@davemloft.net>	2018-11-30 16:26:54 -0500
commit	2f69555315ad7dc1ac37366b2ac2429e2d24d444 (patch)
tree	3b70704dbf9f7cf27ec16d9321f7c1956b81c3a5 /net/ipv4/tcp_ipv4.c
parent	b0e3f1bdf9e7140fd1151af575f468b5827a61e1 (diff)
parent	4f693b55c3d2d2239b8a0094b518a1e533cf75d5 (diff)
Merge branch 'tcp-take-a-bit-more-care-of-backlog-stress'
Eric Dumazet says:

====================
tcp: take a bit more care of backlog stress

While working on the SACK compression issue Jean-Louis Dupond reported,
we found that his Linux box was suffering badly from tail drops on the
socket backlog queue.

The first patch hints the compiler that SACK flows are the norm.

The second patch changes non-SACK code in preparation for the ACK
compression.

The third patch fixes tcp_space() to take the backlog into account.

The fourth patch attempts coalescing when a new packet must be added to
the backlog queue. Building bigger skbs keeps the backlog list smaller
and speeds up its handling when the user thread finally releases the
socket lock.

v3: Neal/Yuchung feedback addressed:
    Do not aggregate if any skb has the URG bit set.
    Do not aggregate if the skbs have different ECE/CWR bits.

v2: added feedback from Neal: tcp: take care of compressed acks in tcp_add_reno_sack()
    added: tcp: hint compiler about sack flows
    added: tcp: make tcp_space() aware of socket backlog
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
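The tcp_space() change referenced above (third patch) lands in include/net/tcp.h, so it does not appear in this file's diff. As a minimal sketch of the idea, assuming tcp_win_from_space(sk, space) and the sk->sk_backlog.len field as found in kernels of this era (the exact hunk is not shown here), the receive-space calculation simply subtracts bytes parked on the backlog as well:

/* Sketch only: approximate shape of the tcp_space() fix described in the
 * cover letter above, not the verbatim hunk from the series.
 */
static inline int tcp_space(const struct sock *sk)
{
        /* Treat bytes sitting on the socket backlog as consumed receive
         * memory, so the advertised window shrinks while the user thread
         * still holds the socket lock.
         */
        return tcp_win_from_space(sk, sk->sk_rcvbuf - sk->sk_backlog.len -
                                  atomic_read(&sk->sk_rmem_alloc));
}

Shrinking the offered window while data is queued on the backlog relieves the tail drops described above; the coalescing added by the fourth patch (the diff below) then keeps the backlog list itself short.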
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--	net/ipv4/tcp_ipv4.c	92
1 file changed, 86 insertions(+), 6 deletions(-)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 795605a23275..4904250a9aac 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
-        /* Only socket owner can try to collapse/prune rx queues
-         * to reduce memory overhead, so add a little headroom here.
-         * Few sockets backlog are possibly concurrently non empty.
-         */
-        limit += 64*1024;
+        struct skb_shared_info *shinfo;
+        const struct tcphdr *th;
+        struct tcphdr *thtail;
+        struct sk_buff *tail;
+        unsigned int hdrlen;
+        bool fragstolen;
+        u32 gso_segs;
+        int delta;
 
         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
          * we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 
         skb_dst_drop(skb);
 
+        if (unlikely(tcp_checksum_complete(skb))) {
+                bh_unlock_sock(sk);
+                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+                return true;
+        }
+
+        /* Attempt coalescing to last skb in backlog, even if we are
+         * above the limits.
+         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+         */
+        th = (const struct tcphdr *)skb->data;
+        hdrlen = th->doff * 4;
+        shinfo = skb_shinfo(skb);
+
+        if (!shinfo->gso_size)
+                shinfo->gso_size = skb->len - hdrlen;
+
+        if (!shinfo->gso_segs)
+                shinfo->gso_segs = 1;
+
+        tail = sk->sk_backlog.tail;
+        if (!tail)
+                goto no_coalesce;
+        thtail = (struct tcphdr *)tail->data;
+
+        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+            ((TCP_SKB_CB(tail)->tcp_flags |
+              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+            ((TCP_SKB_CB(tail)->tcp_flags ^
+              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+            tail->decrypted != skb->decrypted ||
+#endif
+            thtail->doff != th->doff ||
+            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+                goto no_coalesce;
+
+        __skb_pull(skb, hdrlen);
+        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+                thtail->window = th->window;
+
+                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+                if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+                if (TCP_SKB_CB(skb)->has_rxtstamp) {
+                        TCP_SKB_CB(tail)->has_rxtstamp = true;
+                        tail->tstamp = skb->tstamp;
+                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+                }
+
+                /* Not as strict as GRO. We only need to carry mss max value */
+                skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+                                                 skb_shinfo(tail)->gso_size);
+
+                gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+                skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+                sk->sk_backlog.len += delta;
+                __NET_INC_STATS(sock_net(sk),
+                                LINUX_MIB_TCPBACKLOGCOALESCE);
+                kfree_skb_partial(skb, fragstolen);
+                return false;
+        }
+        __skb_push(skb, hdrlen);
+
+no_coalesce:
+        /* Only socket owner can try to collapse/prune rx queues
+         * to reduce memory overhead, so add a little headroom here.
+         * Few sockets backlog are possibly concurrently non empty.
+         */
+        limit += 64*1024;
+
         if (unlikely(sk_add_backlog(sk, skb, limit))) {
                 bh_unlock_sock(sk);
                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);