author	Eric Dumazet <edumazet@google.com>	2016-04-21 13:55:23 -0400
committer	David S. Miller <davem@davemloft.net>	2016-04-24 14:43:59 -0400
commit	10d3be569243def8d92ac3722395ef5a59c504e6 (patch)
tree	ac01b70cff99ad2e59c54e80ddcd524eaa9691a8
parent	8cee83dd29dea4e7d27fda3b170381059f628868 (diff)
tcp-tso: do not split TSO packets at retransmit time
The Linux TCP stack painfully segments all TSO/GSO packets before retransmits.

This was fine back in the days when TSO/GSO were emerging, with their bugs, but we believe the dark age is over.

Keeping big packets in write queues, but also in stack traversal, has a lot of benefits:
- Less memory overhead, because write queues hold fewer skbs.
- Less cpu overhead at ACK processing.
- Better SACK processing, as many studies have noted how awful Linux was at this ;)
- Less cpu overhead to send the rtx packets (IP stack traversal, netfilter traversal, drivers...)
- Better latencies in presence of losses.
- Smaller spikes in fq-like packet schedulers, as retransmits are not constrained by TCP Small Queues.

1% packet losses are common today, and at 100Gbit speeds this translates to ~80,000 losses per second. Losses are often correlated, and we see many retransmit events leading to 1-MSS trains of packets, at the time hosts are already under stress.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
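[Editor's note] To make the numbers and the new retransmit budget concrete, here is a small userspace sketch (illustrative only, not kernel code; all variable names are hypothetical). It reproduces the ~80,000 losses/second arithmetic for 1% loss at 100Gbit with ~1500-byte packets, and mimics the budget the patch applies at retransmit time: segs = snd_cwnd - packets_in_flight, so an skb is fragmented only down to cur_mss * segs instead of being chopped into single-MSS chunks.

/* Illustrative userspace sketch -- not kernel code.
 * The real logic lives in tcp_xmit_retransmit_queue() and
 * __tcp_retransmit_skb(); names below are made up for the example.
 */
#include <stdio.h>

int main(void)
{
	/* ~80,000 losses/sec: 1% loss at 100 Gbit/s with ~1500-byte packets */
	double link_bps = 100e9;          /* 100 Gbit/s */
	double pkt_bits = 1500.0 * 8.0;   /* bits per ~MSS-sized packet */
	double loss_rate = 0.01;          /* 1% */
	double losses_per_sec = link_bps / pkt_bits * loss_rate;

	printf("~%.0f losses per second\n", losses_per_sec); /* ~83,000 */

	/* Retransmit budget after this patch: do not split a TSO skb
	 * below what the congestion window allows right now.
	 */
	unsigned int snd_cwnd = 40;        /* cwnd, in segments */
	unsigned int in_flight = 28;       /* segments currently in flight */
	unsigned int cur_mss = 1448;       /* bytes */
	unsigned int skb_len = 20 * 1448;  /* a 20-segment TSO skb to rexmit */

	int segs = (int)snd_cwnd - (int)in_flight;  /* cwnd headroom */
	if (segs <= 0)
		return 0;                           /* nothing can be sent now */

	unsigned int len = cur_mss * (unsigned int)segs;
	if (skb_len > len)
		printf("fragment to %u bytes (%d segs), not to one MSS\n",
		       len, segs);
	else
		printf("retransmit the whole %u-byte skb in one go\n", skb_len);

	return 0;
}

Before the patch, the same skb would have been chopped into 1-MSS retransmits regardless of the available cwnd headroom (see the comment removed from tcp_xmit_retransmit_queue() below).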
-rw-r--r--	include/net/tcp.h	4
-rw-r--r--	net/ipv4/tcp_input.c	2
-rw-r--r--	net/ipv4/tcp_output.c	64
-rw-r--r--	net/ipv4/tcp_timer.c	4
4 files changed, 34 insertions, 40 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c0ef0544dfcf..7f2553da10d1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -538,8 +538,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle);
 bool tcp_may_send_now(struct sock *sk);
-int __tcp_retransmit_skb(struct sock *, struct sk_buff *);
-int tcp_retransmit_skb(struct sock *, struct sk_buff *);
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
 void tcp_retransmit_timer(struct sock *sk);
 void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 75e8336f6ecd..dcad8f9f96eb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5545,7 +5545,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	if (data) { /* Retransmit unacked data in SYN */
 		tcp_for_write_queue_from(data, sk) {
 			if (data == tcp_send_head(sk) ||
-			    __tcp_retransmit_skb(sk, data))
+			    __tcp_retransmit_skb(sk, data, 1))
 				break;
 		}
 		tcp_rearm_rto(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a6e4a8353b02..9d3b4b364652 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2268,7 +2268,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	if (__tcp_retransmit_skb(sk, skb))
+	if (__tcp_retransmit_skb(sk, skb, 1))
 		goto rearm_timer;
 
 	/* Record snd_nxt for loss detection. */
@@ -2571,17 +2571,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  * state updates are done by the caller. Returns non-zero if an
  * error occurred which prevented the send.
  */
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cur_mss;
-	int err;
+	int diff, len, err;
+
 
-	/* Inconslusive MTU probe */
-	if (icsk->icsk_mtup.probe_size) {
+	/* Inconclusive MTU probe */
+	if (icsk->icsk_mtup.probe_size)
 		icsk->icsk_mtup.probe_size = 0;
-	}
 
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
@@ -2614,30 +2614,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    TCP_SKB_CB(skb)->seq != tp->snd_una)
 		return -EAGAIN;
 
-	if (skb->len > cur_mss) {
-		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+	len = cur_mss * segs;
+	if (skb->len > len) {
+		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
-		int oldpcount = tcp_skb_pcount(skb);
+		if (skb_unclone(skb, GFP_ATOMIC))
+			return -ENOMEM;
 
-		if (unlikely(oldpcount > 1)) {
-			if (skb_unclone(skb, GFP_ATOMIC))
-				return -ENOMEM;
-			tcp_init_tso_segs(skb, cur_mss);
-			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
-		}
+		diff = tcp_skb_pcount(skb);
+		tcp_set_skb_tso_segs(skb, cur_mss);
+		diff -= tcp_skb_pcount(skb);
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
+		if (skb->len < cur_mss)
+			tcp_retrans_try_collapse(sk, skb, cur_mss);
 	}
 
 	/* RFC3168, section 6.1.1.1. ECN fallback */
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
 		tcp_ecn_clear_syn(sk, skb);
 
-	tcp_retrans_try_collapse(sk, skb, cur_mss);
-
-	/* Make a copy, if the first transmission SKB clone we made
-	 * is still in somebody's hands, else make a clone.
-	 */
-
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
 	 * beyond what csum_start can cover.
@@ -2653,20 +2650,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (likely(!err)) {
+		segs = tcp_skb_pcount(skb);
+
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		/* Update global TCP statistics. */
-		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		tp->total_retrans++;
+		tp->total_retrans += segs;
 	}
 	return err;
 }
 
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int err = __tcp_retransmit_skb(sk, skb);
+	int err = __tcp_retransmit_skb(sk, skb, segs);
 
 	if (err == 0) {
 #if FASTRETRANS_DEBUG > 0
@@ -2757,6 +2756,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 	tcp_for_write_queue_from(skb, sk) {
 		__u8 sacked = TCP_SKB_CB(skb)->sacked;
+		int segs;
 
 		if (skb == tcp_send_head(sk))
 			break;
@@ -2764,14 +2764,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (!hole)
 			tp->retransmit_skb_hint = skb;
 
-		/* Assume this retransmit will generate
-		 * only one packet for congestion window
-		 * calculation purposes. This works because
-		 * tcp_retransmit_skb() will chop up the
-		 * packet to be MSS sized and all the
-		 * packet counting works out.
-		 */
-		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+		segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+		if (segs <= 0)
 			return;
 
 		if (fwd_rexmitting) {
@@ -2808,7 +2802,7 @@ begin_fwd:
 		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_retransmit_skb(sk, skb))
+		if (tcp_retransmit_skb(sk, skb, segs))
 			return;
 
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 49bc474f8e35..373b03e78aaa 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -404,7 +404,7 @@ void tcp_retransmit_timer(struct sock *sk)
 			goto out;
 		}
 		tcp_enter_loss(sk);
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+		tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
 		__sk_dst_reset(sk);
 		goto out_reset_timer;
 	}
@@ -436,7 +436,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
 	tcp_enter_loss(sk);
 
-	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
 		 */