author		Eric Dumazet <edumazet@google.com>	2017-10-06 01:21:27 -0400
committer	David S. Miller <davem@davemloft.net>	2017-10-06 19:28:54 -0400
commit		75c119afe14f74b4dd967d75ed9f57ab6c0ef045 (patch)
tree		a9e03880b4f700a0f45026f06262a916d42f7e5e /net/ipv4/tcp.c
parent		f33198163a0fbb03766444253edf6ea50685d725 (diff)
tcp: implement rb-tree based retransmit queue
Using a linear list to store all skbs in write queue has been okay
for quite a while : O(N) is not too bad when N < 500.

Things get messy when N is the order of 100,000 : Modern TCP stacks
want 10Gbit+ of throughput even with 200 ms RTT flows.

40 ns per cache line miss means a full scan can use 4 ms,
blowing away CPU caches.

SACK processing often can use various hints to avoid parsing
whole retransmit queue. But with high packet losses and/or high
reordering, hints no longer work.

Sender has to process thousands of unfriendly SACK, accumulating
a huge socket backlog, burning a cpu and massively dropping packets.

Using an rb-tree for retransmit queue has been avoided for years
because it added complexity and overhead, but now is the time
to be more resistant and say no to quadratic behavior.

1) RTX queue is no longer part of the write queue : already sent skbs
are stored in one rb-tree.

2) Since reaching the head of write queue no longer needs
sk->sk_send_head, we added an union of sk_send_head and tcp_rtx_queue

Tested:

 On receiver :
 netem on ingress : delay 150ms 200us loss 1
 GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch :

323.87
351.48
339.59
338.62
306.72
204.07
304.93
291.88
202.47
176.88
   2840

After patch:

1700.83
2207.98
2070.17
1544.26
2114.76
2124.89
1693.14
1080.91
2216.82
1299.94
  18053

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
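Note: the core of the rb-tree conversion lives outside this file, in the companion edits to include/net/tcp.h and net/ipv4/tcp_output.c, which key the new tree on each skb's starting sequence number so SACK processing can locate any sent skb in O(log N) instead of walking a list. As a rough sketch of that idea (following the generic rb_link_node()/rb_insert_color() pattern; the helper name and details below are illustrative, not quoted from the parts of the patch shown here):

/* Illustrative sketch only: insert an already-sent skb into an rb-tree
 * keyed on its start sequence number. Names here are assumptions; the
 * patch's actual insert helper is outside the net/ipv4/tcp.c diff.
 */
static void tcp_rtx_queue_insert_sketch(struct sock *sk, struct sk_buff *skb)
{
	struct rb_node **p = &sk->tcp_rtx_queue.rb_node;
	struct rb_node *parent = NULL;
	struct sk_buff *skb1;

	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		/* before() compares 32-bit sequence numbers with wraparound */
		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &sk->tcp_rtx_queue);
}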
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--	net/ipv4/tcp.c	41
1 file changed, 32 insertions, 9 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8d379c80936..3b34850d361f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->out_of_order_queue = RB_ROOT;
+	sk->tcp_rtx_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
 	INIT_LIST_HEAD(&tp->tsq_node);
 	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
@@ -701,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 
-	if (!tcp_send_head(sk))
-		return;
-
 	skb = tcp_write_queue_tail(sk);
+	if (!skb)
+		return;
 	if (!(flags & MSG_MORE) || forced_push(tp))
 		tcp_mark_push(tp, skb);
 
@@ -964,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		int copy, i;
 		bool can_coalesce;
 
-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+		if (!skb || (copy = size_goal - skb->len) <= 0 ||
 		    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-					skb_queue_empty(&sk->sk_write_queue));
+					tcp_rtx_and_write_queues_empty(sk));
 			if (!skb)
 				goto wait_for_memory;
 
@@ -1199,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 		goto out_err;
 	}
 
-	skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+	skb = tcp_write_queue_tail(sk);
 	uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
 	if (!uarg) {
 		err = -ENOBUFS;
@@ -1275,7 +1275,7 @@ restart:
 		int max = size_goal;
 
 		skb = tcp_write_queue_tail(sk);
-		if (tcp_send_head(sk)) {
+		if (skb) {
 			if (skb->ip_summed == CHECKSUM_NONE)
 				max = mss_now;
 			copy = max - skb->len;
@@ -1295,7 +1295,7 @@ new_segment:
 				process_backlog = false;
 				goto restart;
 			}
-			first_skb = skb_queue_empty(&sk->sk_write_queue);
+			first_skb = tcp_rtx_and_write_queues_empty(sk);
 			skb = sk_stream_alloc_skb(sk,
 						  select_size(sk, sg, first_skb),
 						  sk->sk_allocation,
@@ -1521,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
 
 	/* XXX -- need to support SO_PEEK_OFF */
 
+	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+		if (err)
+			return err;
+		copied += skb->len;
+	}
+
 	skb_queue_walk(&sk->sk_write_queue, skb) {
 		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
 		if (err)
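The rb-tree walk added above depends on helpers such as rb_to_skb() and skb_rbtree_walk() that are defined in shared headers rather than in this diff; a plausible minimal form, assuming the rb_node member of struct sk_buff is named rbnode, would be:

/* Assumed minimal iteration helpers for an skb rb-tree; the real
 * definitions live outside the diff limited to net/ipv4/tcp.c.
 */
#define rb_to_skb(rb)	rb_entry_safe(rb, struct sk_buff, rbnode)

#define skb_rbtree_walk(skb, root)				\
	for (skb = rb_to_skb(rb_first(root)); skb;		\
	     skb = rb_to_skb(rb_next(&(skb)->rbnode)))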
@@ -2320,6 +2327,22 @@ static inline bool tcp_need_reset(int state)
 		       TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
 }
 
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+		/* Since we are deleting whole queue, no need to
+		 * list_del(&skb->tcp_tsorted_anchor)
+		 */
+		tcp_rtx_queue_unlink(skb, sk);
+		sk_wmem_free_skb(sk, skb);
+	}
+}
+
 void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
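tcp_rtx_queue_purge() advances p with rb_next() before unlinking, so the walk stays valid while nodes are erased. The tcp_rtx_queue_unlink() it calls is added by this patch outside this file (in include/net/tcp.h); conceptually it is just an rb_erase() of the skb from the socket's tree, roughly:

/* Sketch of the unlink helper used above; the actual definition is in
 * include/net/tcp.h, outside this file's diff.
 */
static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
{
	rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
}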
@@ -2329,6 +2352,7 @@ void tcp_write_queue_purge(struct sock *sk)
 		tcp_skb_tsorted_anchor_cleanup(skb);
 		sk_wmem_free_skb(sk, skb);
 	}
+	tcp_rtx_queue_purge(sk);
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2392,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags)
 	 * issue in __tcp_select_window()
 	 */
 	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
-	tcp_init_send_head(sk);
 	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
 	__sk_dst_reset(sk);
 	dst_release(sk->sk_rx_dst);