author	Eric Dumazet <edumazet@google.com>	2017-10-06 01:21:27 -0400
committer	David S. Miller <davem@davemloft.net>	2017-10-06 19:28:54 -0400
commit	75c119afe14f74b4dd967d75ed9f57ab6c0ef045 (patch)
tree	a9e03880b4f700a0f45026f06262a916d42f7e5e	/net/ipv4/tcp_timer.c
parent	f33198163a0fbb03766444253edf6ea50685d725 (diff)
tcp: implement rb-tree based retransmit queue
Using a linear list to store all skbs in the write queue has been okay
for quite a while : O(N) is not too bad when N < 500.

Things get messy when N is on the order of 100,000 : modern TCP stacks
want 10Gbit+ of throughput even with 200 ms RTT flows.

At 40 ns per cache line miss, a full scan can take 4 ms
(100,000 * 40 ns), blowing away CPU caches.

SACK processing often can use various hints to avoid parsing the
whole retransmit queue. But with high packet losses and/or high
reordering, hints no longer work.

The sender has to process thousands of unfriendly SACKs, accumulating
a huge socket backlog, burning a CPU and massively dropping packets.

Using an rb-tree for the retransmit queue has been avoided for years
because it added complexity and overhead, but now is the time
to be more resistant and say no to quadratic behavior.

1) The RTX queue is no longer part of the write queue : already sent
   skbs are stored in one rb-tree.

2) Since reaching the head of the write queue no longer needs
   sk->sk_send_head, we added a union of sk_send_head and tcp_rtx_queue.

Tested:

On receiver :
 netem on ingress : delay 150ms 200us loss 1
 GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch :
323.87 351.48 339.59 338.62 306.72 204.07 304.93 291.88 202.47 176.88
sum = 2840

After patch:
1700.83 2207.98 2070.17 1544.26 2114.76 2124.89 1693.14 1080.91 2216.82 1299.94
sum = 18053

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
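Note (illustration, not part of the patch): the accessors used in the hunks below follow from the split described above. A minimal sketch, assuming the rb_root sits in struct sock as the union partner of sk_send_head and that skbs hang off their existing rbnode field; the authoritative definitions live in include/net/tcp.h and may differ in detail:

#include <linux/rbtree.h>
#include <linux/skbuff.h>
#include <net/sock.h>

/* Sketch only: the head of the retransmit rb-tree is the lowest-sequence
 * sent-but-unacked skb, i.e. the leftmost node, reachable in O(log N)
 * instead of being implicit in a linear list walk.
 */
static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
{
	struct rb_node *node = rb_first(&sk->tcp_rtx_queue);

	return node ? rb_entry(node, struct sk_buff, rbnode) : NULL;
}

/* Sketch only: after the split, sk_write_queue holds not-yet-sent skbs,
 * so its plain list head is the next skb to transmit.
 */
static inline struct sk_buff *tcp_send_head(const struct sock *sk)
{
	return skb_peek(&sk->sk_write_queue);
}

This is why retransmits_timed_out() and tcp_retransmit_timer() below switch from tcp_write_queue_head() to tcp_rtx_queue_head(): the skb whose timestamp or retransmission they care about has already been sent, so it now lives in the rb-tree rather than on the write queue.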
Diffstat (limited to 'net/ipv4/tcp_timer.c')
-rw-r--r--	net/ipv4/tcp_timer.c	24
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 655dd8d7f064..7014cc00c74c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk,
 		return false;
 
 	start_ts = tcp_sk(sk)->retrans_stamp;
-	if (unlikely(!start_ts))
-		start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+	if (unlikely(!start_ts)) {
+		struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+		if (!head)
+			return false;
+		start_ts = tcp_skb_timestamp(head);
+	}
 
 	if (likely(timeout == 0)) {
 		linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data)
 static void tcp_probe_timer(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb = tcp_send_head(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_probes;
 	u32 start_ts;
 
-	if (tp->packets_out || !tcp_send_head(sk)) {
+	if (tp->packets_out || !skb) {
 		icsk->icsk_probes_out = 0;
 		return;
 	}
@@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk)
 	 * corresponding system limit. We also implement similar policy when
 	 * we use RTO to probe window in tcp_retransmit_timer().
 	 */
-	start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+	start_ts = tcp_skb_timestamp(skb);
 	if (!start_ts)
-		tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp;
+		skb->skb_mstamp = tp->tcp_mstamp;
 	else if (icsk->icsk_user_timeout &&
 		 (s32)(tcp_time_stamp(tp) - start_ts) >
 		 jiffies_to_msecs(icsk->icsk_user_timeout))
@@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk)
 	if (!tp->packets_out)
 		goto out;
 
-	WARN_ON(tcp_write_queue_empty(sk));
+	WARN_ON(tcp_rtx_queue_empty(sk));
 
 	tp->tlp_high_seq = 0;
 
@@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk)
 			goto out;
 		}
 		tcp_enter_loss(sk);
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
+		tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
 		__sk_dst_reset(sk);
 		goto out_reset_timer;
 	}
@@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
 	tcp_enter_loss(sk);
 
-	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
+	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
 		 */
@@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	elapsed = keepalive_time_when(tp);
 
 	/* It is alive without keepalive 8) */
-	if (tp->packets_out || tcp_send_head(sk))
+	if (tp->packets_out || !tcp_write_queue_empty(sk))
 		goto resched;
 
 	elapsed = keepalive_time_elapsed(tp);
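A note on the last hunk: before the split, a non-NULL tcp_send_head(sk) doubled as "there is data queued but not yet sent"; with the retransmit queue moved out, that condition is expressed directly as !tcp_write_queue_empty(sk). A rough sketch of the emptiness checks, under the same assumptions as the sketch above (illustrative shapes mirroring the commit description, not copied from include/net/tcp.h):

static inline bool tcp_write_queue_empty(const struct sock *sk)
{
	/* Only not-yet-sent skbs remain on the linear write queue. */
	return skb_queue_empty(&sk->sk_write_queue);
}

static inline bool tcp_rtx_queue_empty(const struct sock *sk)
{
	/* Sent-but-unacked skbs are tracked in the rb-tree instead. */
	return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
}

So tcp_keepalive_timer() still asks the same question ("is the connection idle?"); it just phrases it in terms of the new layout: nothing in flight (tp->packets_out == 0) and nothing waiting to be sent (write queue empty).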