author     Eric Dumazet <edumazet@google.com>      2016-04-21 13:55:23 -0400
committer  David S. Miller <davem@davemloft.net>   2016-04-24 14:43:59 -0400
commit     10d3be569243def8d92ac3722395ef5a59c504e6
tree       ac01b70cff99ad2e59c54e80ddcd524eaa9691a8
parent     8cee83dd29dea4e7d27fda3b170381059f628868
tcp-tso: do not split TSO packets at retransmit time
The Linux TCP stack painfully segments all TSO/GSO packets before
retransmitting them. This was fine back in the days when TSO/GSO were
emerging, with their bugs, but we believe that dark age is over.
Keeping packets big, both in the write queues and through the stack
traversal, has a lot of benefits:
- Less memory overhead, because write queues hold fewer skbs
- Less CPU overhead at ACK processing
- Better SACK processing, as a lot of studies mentioned how
  awful Linux was at this ;)
- Less CPU overhead to send the retransmitted packets
  (IP stack traversal, netfilter traversal, drivers...)
- Better latencies in the presence of losses
- Smaller spikes in fq-like packet schedulers, as retransmits
  are not constrained by TCP Small Queues
1% packet loss rates are common today, and at 100Gbit speeds this
translates to ~80,000 losses per second.
Losses are often correlated, and we see many retransmit events
leading to trains of 1-MSS packets, at a time when hosts are already
under stress.
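As a rough sanity check on that ~80,000 figure (a back-of-the-envelope
estimate assuming roughly MTU-sized, ~1500-byte packets; the commit
message does not state the packet size it assumes):

\[
\frac{100\ \text{Gbit/s}}{1500\ \text{B} \times 8\ \text{bit/B}} \approx 8.3 \times 10^{6}\ \text{packets/s},
\qquad
8.3 \times 10^{6}\ \text{packets/s} \times 1\% \approx 83{,}000\ \text{losses/s}.
\]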
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--  include/net/tcp.h     |  4
-rw-r--r--  net/ipv4/tcp_input.c  |  2
-rw-r--r--  net/ipv4/tcp_output.c | 64
-rw-r--r--  net/ipv4/tcp_timer.c  |  4
4 files changed, 34 insertions(+), 40 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c0ef0544dfcf..7f2553da10d1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -538,8 +538,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
                                int nonagle);
 bool tcp_may_send_now(struct sock *sk);
-int __tcp_retransmit_skb(struct sock *, struct sk_buff *);
-int tcp_retransmit_skb(struct sock *, struct sk_buff *);
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
 void tcp_retransmit_timer(struct sock *sk);
 void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 75e8336f6ecd..dcad8f9f96eb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5545,7 +5545,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
         if (data) { /* Retransmit unacked data in SYN */
                 tcp_for_write_queue_from(data, sk) {
                         if (data == tcp_send_head(sk) ||
-                            __tcp_retransmit_skb(sk, data))
+                            __tcp_retransmit_skb(sk, data, 1))
                                 break;
                 }
                 tcp_rearm_rto(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a6e4a8353b02..9d3b4b364652 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2268,7 +2268,7 @@ void tcp_send_loss_probe(struct sock *sk)
         if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
                 goto rearm_timer;

-        if (__tcp_retransmit_skb(sk, skb))
+        if (__tcp_retransmit_skb(sk, skb, 1))
                 goto rearm_timer;

         /* Record snd_nxt for loss detection. */
@@ -2571,17 +2571,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  * state updates are done by the caller. Returns non-zero if an
  * error occurred which prevented the send.
  */
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
-        struct tcp_sock *tp = tcp_sk(sk);
         struct inet_connection_sock *icsk = inet_csk(sk);
+        struct tcp_sock *tp = tcp_sk(sk);
         unsigned int cur_mss;
-        int err;
+        int diff, len, err;
+

-        /* Inconslusive MTU probe */
-        if (icsk->icsk_mtup.probe_size) {
+        /* Inconclusive MTU probe */
+        if (icsk->icsk_mtup.probe_size)
                 icsk->icsk_mtup.probe_size = 0;
-        }

         /* Do not sent more than we queued. 1/4 is reserved for possible
          * copying overhead: fragmentation, tunneling, mangling etc.
@@ -2614,30 +2614,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
             TCP_SKB_CB(skb)->seq != tp->snd_una)
                 return -EAGAIN;

-        if (skb->len > cur_mss) {
-                if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+        len = cur_mss * segs;
+        if (skb->len > len) {
+                if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
                         return -ENOMEM; /* We'll try again later. */
         } else {
-                int oldpcount = tcp_skb_pcount(skb);
+                if (skb_unclone(skb, GFP_ATOMIC))
+                        return -ENOMEM;

-                if (unlikely(oldpcount > 1)) {
-                        if (skb_unclone(skb, GFP_ATOMIC))
-                                return -ENOMEM;
-                        tcp_init_tso_segs(skb, cur_mss);
-                        tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
-                }
+                diff = tcp_skb_pcount(skb);
+                tcp_set_skb_tso_segs(skb, cur_mss);
+                diff -= tcp_skb_pcount(skb);
+                if (diff)
+                        tcp_adjust_pcount(sk, skb, diff);
+                if (skb->len < cur_mss)
+                        tcp_retrans_try_collapse(sk, skb, cur_mss);
         }

         /* RFC3168, section 6.1.1.1. ECN fallback */
         if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
                 tcp_ecn_clear_syn(sk, skb);

-        tcp_retrans_try_collapse(sk, skb, cur_mss);
-
-        /* Make a copy, if the first transmission SKB clone we made
-         * is still in somebody's hands, else make a clone.
-         */
-
         /* make sure skb->data is aligned on arches that require it
          * and check if ack-trimming & collapsing extended the headroom
          * beyond what csum_start can cover.
@@ -2653,20 +2650,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
         }

         if (likely(!err)) {
+                segs = tcp_skb_pcount(skb);
+
                 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
                 /* Update global TCP statistics. */
-                TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+                TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
                 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
                         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-                tp->total_retrans++;
+                tp->total_retrans += segs;
         }
         return err;
 }

-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
         struct tcp_sock *tp = tcp_sk(sk);
-        int err = __tcp_retransmit_skb(sk, skb);
+        int err = __tcp_retransmit_skb(sk, skb, segs);

         if (err == 0) {
 #if FASTRETRANS_DEBUG > 0
@@ -2757,6 +2756,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)

         tcp_for_write_queue_from(skb, sk) {
                 __u8 sacked = TCP_SKB_CB(skb)->sacked;
+                int segs;

                 if (skb == tcp_send_head(sk))
                         break;
@@ -2764,14 +2764,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                 if (!hole)
                         tp->retransmit_skb_hint = skb;

-                /* Assume this retransmit will generate
-                 * only one packet for congestion window
-                 * calculation purposes. This works because
-                 * tcp_retransmit_skb() will chop up the
-                 * packet to be MSS sized and all the
-                 * packet counting works out.
-                 */
-                if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+                segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+                if (segs <= 0)
                         return;

                 if (fwd_rexmitting) {
@@ -2808,7 +2802,7 @@ begin_fwd:
                 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
                         continue;

-                if (tcp_retransmit_skb(sk, skb))
+                if (tcp_retransmit_skb(sk, skb, segs))
                         return;

                 NET_INC_STATS_BH(sock_net(sk), mib_idx);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 49bc474f8e35..373b03e78aaa 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -404,7 +404,7 @@ void tcp_retransmit_timer(struct sock *sk)
                         goto out;
                 }
                 tcp_enter_loss(sk);
-                tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+                tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
                 __sk_dst_reset(sk);
                 goto out_reset_timer;
         }
@@ -436,7 +436,7 @@ void tcp_retransmit_timer(struct sock *sk)

         tcp_enter_loss(sk);

-        if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+        if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
                 /* Retransmission failed because of local congestion,
                  * do not backoff.
                  */
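To make the sizing change concrete: with this patch, tcp_xmit_retransmit_queue()
computes a per-skb budget segs = tp->snd_cwnd - tcp_packets_in_flight(tp) and
passes it down, and __tcp_retransmit_skb() now only calls tcp_fragment() when
the skb exceeds cur_mss * segs; otherwise the skb is re-sent whole as a TSO
packet (the RTO and loss-probe paths still pass a budget of 1). Below is a
minimal standalone sketch of that decision in plain userspace C. It is not
kernel code: the helper names, simplified parameters, and the example numbers
(cur_mss = 1448, cwnd of 60 with 10 packets in flight) are illustrative
stand-ins only.

#include <stdio.h>

/* Budget of MSS-sized segments one retransmitted skb may cover:
 * mirrors "segs = tp->snd_cwnd - tcp_packets_in_flight(tp)" above. */
static int rtx_segs_budget(unsigned int snd_cwnd, unsigned int packets_in_flight)
{
        return (int)snd_cwnd - (int)packets_in_flight;
}

/* Mirrors the new test in __tcp_retransmit_skb(): fragment only when the
 * skb exceeds cur_mss * segs, otherwise retransmit it whole as TSO. */
static void plan_retransmit(unsigned int skb_len, unsigned int cur_mss, int segs)
{
        unsigned int len;

        if (segs <= 0) {
                printf("cwnd is full: stop retransmitting for now\n");
                return;
        }
        len = cur_mss * (unsigned int)segs;
        if (skb_len > len)
                printf("fragment %u-byte skb at %u bytes (%d segs x %u-byte MSS)\n",
                       skb_len, len, segs, cur_mss);
        else
                printf("retransmit %u-byte skb whole as %u TSO segments\n",
                       skb_len, (skb_len + cur_mss - 1) / cur_mss);
}

int main(void)
{
        unsigned int cur_mss = 1448;            /* illustrative MSS */
        unsigned int skb_len = 64 * 1024;       /* one big TSO skb */

        /* Old behaviour was equivalent to a fixed budget of one segment. */
        plan_retransmit(skb_len, cur_mss, 1);

        /* New behaviour: the budget follows the cwnd headroom. */
        plan_retransmit(skb_len, cur_mss, rtx_segs_budget(60, 10));
        return 0;
}

With a budget of 1 the 64 KB skb is chopped down to a single MSS, as before the
patch; with 50 segments of cwnd headroom it is retransmitted whole, avoiding the
per-MSS stack, netfilter and driver traversal costs the commit message lists.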