author		Eric Dumazet <edumazet@google.com>	2017-10-06 01:21:27 -0400
committer	David S. Miller <davem@davemloft.net>	2017-10-06 19:28:54 -0400
commit		75c119afe14f74b4dd967d75ed9f57ab6c0ef045 (patch)
tree		a9e03880b4f700a0f45026f06262a916d42f7e5e /include/net/tcp.h
parent		f33198163a0fbb03766444253edf6ea50685d725 (diff)
tcp: implement rb-tree based retransmit queue
Using a linear list to store all skbs in the write queue has been okay
for quite a while: O(N) is not too bad when N < 500.

Things get messy when N is on the order of 100,000: modern TCP stacks
want 10Gbit+ of throughput even on 200 ms RTT flows.

At 40 ns per cache line miss, a full scan can take 4 ms, blowing away
CPU caches.

SACK processing can often use various hints to avoid parsing the whole
retransmit queue. But with high packet loss and/or heavy reordering,
the hints no longer work. The sender then has to process thousands of
unfriendly SACKs, accumulating a huge socket backlog, burning a cpu,
and massively dropping packets.

Using an rb-tree for the retransmit queue has been avoided for years
because it added complexity and overhead, but now is the time to be
more resistant and say no to quadratic behavior.

1) The RTX queue is no longer part of the write queue: already-sent
skbs are stored in one rb-tree.

2) Since reaching the head of the write queue no longer needs
sk->sk_send_head, we added a union of sk_send_head and tcp_rtx_queue.

Tested:

On receiver :
netem on ingress : delay 150ms 200us loss 1
GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT | tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n", sum}'

Before patch :

323.87  351.48  339.59  338.62  306.72  204.07  304.93  291.88  202.47
176.88
   2840

After patch:

1700.83 2207.98 2070.17 1544.26 2114.76 2124.89 1693.14 1080.91 2216.82
1299.94
  18053

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
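The speedup comes from replacing linear list scans with a tree descent
keyed by sequence number. As a rough illustration, a lookup over the new
rb-tree might look like the sketch below; the helper name is hypothetical
(the real SACK walk lives in net/ipv4/tcp_input.c), and it assumes the
tcp.h context of this patch plus <linux/rbtree.h>:

	/* Sketch: find the skb covering @seq in the rtx rb-tree, keyed by
	 * TCP_SKB_CB(skb)->seq. One descent is O(log N), versus the O(N)
	 * list walk this patch removes.
	 */
	static struct sk_buff *tcp_rtx_queue_find(struct sock *sk, u32 seq)
	{
		struct rb_node *node = sk->tcp_rtx_queue.rb_node;

		while (node) {
			struct sk_buff *skb = rb_entry(node, struct sk_buff, rbnode);

			if (before(seq, TCP_SKB_CB(skb)->seq))
				node = node->rb_left;
			else if (!before(seq, TCP_SKB_CB(skb)->end_seq))
				node = node->rb_right;
			else
				return skb;	/* seq falls within [seq, end_seq) */
		}
		return NULL;
	}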
Diffstat (limited to 'include/net/tcp.h')
-rw-r--r--	include/net/tcp.h	89
1 file changed, 47 insertions(+), 42 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 744559b72784..5a95e5886b55 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
 void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
+enum tcp_queue {
+	TCP_FRAG_IN_WRITE_QUEUE,
+	TCP_FRAG_IN_RTX_QUEUE,
+};
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+		 struct sk_buff *skb, u32 len,
+		 unsigned int mss_now, gfp_t gfp);
 
 void tcp_send_probe0(struct sock *);
 void tcp_send_partial(struct sock *);
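Callers of tcp_fragment() now have to say which queue the skb currently
sits on. A sketch of a call site (illustrative only, not a hunk from
this patch):

	/* skb has already been transmitted, so it lives in the rtx rb-tree */
	err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, mss_now, GFP_ATOMIC);
	if (err)
		return err;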
@@ -1608,6 +1614,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
 
 void tcp_write_queue_purge(struct sock *sk);
 
+static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
+{
+	return skb_rb_first(&sk->tcp_rtx_queue);
+}
+
 static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
 {
 	return skb_peek(&sk->sk_write_queue);
@@ -1630,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
 	return skb_queue_prev(&sk->sk_write_queue, skb);
 }
 
-#define tcp_for_write_queue(skb, sk)					\
-	skb_queue_walk(&(sk)->sk_write_queue, skb)
-
-#define tcp_for_write_queue_from(skb, sk)				\
-	skb_queue_walk_from(&(sk)->sk_write_queue, skb)
-
 #define tcp_for_write_queue_from_safe(skb, tmp, sk)			\
 	skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
 
 static inline struct sk_buff *tcp_send_head(const struct sock *sk)
 {
-	return sk->sk_send_head;
+	return skb_peek(&sk->sk_write_queue);
 }
 
 static inline bool tcp_skb_is_last(const struct sock *sk,
@@ -1650,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
 	return skb_queue_is_last(&sk->sk_write_queue, skb);
 }
 
-static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
+static inline bool tcp_write_queue_empty(const struct sock *sk)
 {
-	if (tcp_skb_is_last(sk, skb))
-		sk->sk_send_head = NULL;
-	else
-		sk->sk_send_head = tcp_write_queue_next(sk, skb);
+	return skb_queue_empty(&sk->sk_write_queue);
+}
+
+static inline bool tcp_rtx_queue_empty(const struct sock *sk)
+{
+	return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
+}
+
+static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
+{
+	return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
 }
 
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
 {
-	if (sk->sk_send_head == skb_unlinked) {
-		sk->sk_send_head = NULL;
+	if (tcp_write_queue_empty(sk))
 		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-	}
+
 	if (tcp_sk(sk)->highest_sack == skb_unlinked)
 		tcp_sk(sk)->highest_sack = NULL;
 }
 
-static inline void tcp_init_send_head(struct sock *sk)
-{
-	sk->sk_send_head = NULL;
-}
-
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_queue_tail(&sk->sk_write_queue, skb);
@@ -1683,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 	__tcp_add_write_queue_tail(sk, skb);
 
 	/* Queue it, remembering where we must start sending. */
-	if (sk->sk_send_head == NULL) {
-		sk->sk_send_head = skb;
+	if (sk->sk_write_queue.next == skb) {
 		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
 		if (tcp_sk(sk)->highest_sack == NULL)
@@ -1697,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *skb)
 	__skb_queue_head(&sk->sk_write_queue, skb);
 }
 
-/* Insert buff after skb on the write queue of sk. */
-static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
-						struct sk_buff *buff,
-						struct sock *sk)
-{
-	__skb_queue_after(&sk->sk_write_queue, skb, buff);
-}
-
 /* Insert new before skb on the write queue of sk. */
 static inline void tcp_insert_write_queue_before(struct sk_buff *new,
 						  struct sk_buff *skb,
 						  struct sock *sk)
 {
 	__skb_queue_before(&sk->sk_write_queue, skb, new);
-
-	if (sk->sk_send_head == skb)
-		sk->sk_send_head = new;
 }
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
-	list_del(&skb->tcp_tsorted_anchor);
-	tcp_skb_tsorted_anchor_cleanup(skb);
 	__skb_unlink(skb, &sk->sk_write_queue);
 }
 
-static inline bool tcp_write_queue_empty(struct sock *sk)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
+
+static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
 {
-	return skb_queue_empty(&sk->sk_write_queue);
+	tcp_skb_tsorted_anchor_cleanup(skb);
+	rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
+}
+
+static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
+{
+	list_del(&skb->tcp_tsorted_anchor);
+	tcp_rtx_queue_unlink(skb, sk);
+	sk_wmem_free_skb(sk, skb);
 }
 
 static inline void tcp_push_pending_frames(struct sock *sk)
@@ -1754,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
 
 static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
 {
-	tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
-						tcp_write_queue_next(sk, skb);
+	struct sk_buff *next = skb_rb_next(skb);
+
+	tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
 }
 
 static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
@@ -1765,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
 
 static inline void tcp_highest_sack_reset(struct sock *sk)
 {
-	tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
+	struct sk_buff *skb = tcp_rtx_queue_head(sk);
+
+	tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
 }
 
 /* Called when old skb is about to be deleted (to be combined with new skb) */
@@ -1935,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk);
 /* At how many usecs into the future should the RTO fire? */
 static inline s64 tcp_rto_delta_us(const struct sock *sk)
 {
-	const struct sk_buff *skb = tcp_write_queue_head(sk);
+	const struct sk_buff *skb = tcp_rtx_queue_head(sk);
 	u32 rto = inet_csk(sk)->icsk_rto;
 	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
 
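For reference, the union mentioned in point 2 of the commit message lands
in include/net/sock.h, which this diffstat filter hides. Since
tcp_send_head() now simply peeks at the write queue, TCP no longer needs
sk_send_head, and its slot in struct sock can host the rb-tree root. A
sketch of that overlay, as described by the commit message (not a hunk
from the file shown here):

	union {
		struct sk_buff	*sk_send_head;	/* still used by non-TCP sockets */
		struct rb_root	tcp_rtx_queue;	/* TCP reuses the slot for the rtx rb-tree */
	};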