author		Eric Dumazet <edumazet@google.com>	2013-08-27 08:46:32 -0400
committer	David S. Miller <davem@davemloft.net>	2013-08-29 15:50:06 -0400
commit		95bd09eb27507691520d39ee1044d6ad831c1168 (patch)
tree		e05045cc6418ce08aa87d5f8c17366a7fa672f3c /net/ipv4/tcp_input.c
parent		b800c3b966bcf004bd8592293a49ed5cb7ea67a9 (diff)
tcp: TSO packets automatic sizing
After hearing many people over past years complaining about TSO being bursty or even buggy, we are proud to present automatic sizing of TSO packets.

One part of the problem is that tcp_tso_should_defer() uses a heuristic relying on upcoming ACKs instead of a timer, but more generally, having big TSO packets makes little sense for low rates, as it tends to create micro bursts on the network, and general consensus is to reduce the buffering amount.

This patch introduces a per-socket sk_pacing_rate, that approximates the current sending rate, and allows us to size the TSO packets so that we try to send one packet every ms. This field could be set by other transports.

This patch has no impact on high speed flows, where having large TSO packets makes sense to reach line rate. For other flows, this helps better packet scheduling and ACK clocking.

This patch increases performance of TCP flows in lossy environments.

A new sysctl (tcp_min_tso_segs) is added, to specify the minimal size of a TSO packet, in segments (default being 2).

A follow-up patch will provide a new packet scheduler (FQ), using sk_pacing_rate as an input to perform optional per flow pacing. This explains why we chose to set sk_pacing_rate to twice the current rate, allowing 'slow start' ramp up:

sk_pacing_rate = 2 * cwnd * mss / srtt

v2: Neal Cardwell reported suspect deferring of the last two segments on an initial write of 10 MSS; I had to change tcp_tso_should_defer() to take tp->xmit_size_goal_segs into account.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Cc: Tom Herbert <therbert@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
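To make the formula concrete, the sketch below replays the same arithmetic in userspace. It is illustrative only, not part of the patch: HZ is assumed to be 1000, and the sample mss/cwnd/srtt values are made up. The kernel stores srtt in fixed point (smoothed RTT in jiffies, left-shifted by 3), which is why the numerator carries an HZ << 3 factor that cancels against it.

#include <stdio.h>
#include <stdint.h>

#define HZ 1000	/* assumed 1000 Hz tick for the example */

/* Userspace replay of the tcp_update_pacing_rate() arithmetic.
 * srtt8 is the kernel representation: smoothed RTT in jiffies << 3.
 */
static uint32_t pacing_rate(uint32_t mss, uint32_t cwnd, uint32_t srtt8)
{
	/* 200% of current rate: 2 * cwnd * mss / srtt */
	uint64_t rate = (uint64_t)mss * 2 * (HZ << 3);

	rate *= cwnd;
	if (srtt8 > 8 + 2)	/* same small-srtt guard as the patch */
		rate /= srtt8;

	return rate > UINT32_MAX ? UINT32_MAX : (uint32_t)rate;
}

int main(void)
{
	/* mss 1448, cwnd 10, srtt 100 ms: 2 * 10 * 1448 / 0.1 s = 289600 B/s */
	printf("%u bytes/sec\n", pacing_rate(1448, 10, 100 << 3));
	return 0;
}

The u64 intermediate matters: mss * 2 * (HZ << 3) * cwnd easily overflows 32 bits, and the final value is clamped back to the 32-bit sk_pacing_rate field.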
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c | 32 ++++++++++++++++++++++++++++++++++++-
1 file changed, 31 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ec492eae0cd7..1a84fffe6993 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 	}
 }
 
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u64 rate;
+
+	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+	rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+	rate *= max(tp->snd_cwnd, tp->packets_out);
+
+	/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+	 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+	 * We probably need usec resolution in the future.
+	 * Note: This also takes care of possible srtt=0 case,
+	 * when tcp_rtt_estimator() was not yet called.
+	 */
+	if (tp->srtt > 8 + 2)
+		do_div(rate, tp->srtt);
+
+	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
 /* Calculate rto without backoff. This is the second half of Van Jacobson's
  * routine referred to above.
  */
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
-	u32 prior_in_flight;
+	u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
+	if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+		tcp_update_pacing_rate(sk);
 	return 1;
 
 no_queue:
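The changelog's goal of "one packet every ms", floored by the new tcp_min_tso_segs sysctl, is applied on the output side of this series (the diff above only produces the rate). A rough sketch of that sizing idea, with a hypothetical helper name:

#include <stdint.h>

/* Hypothetical tso_size_goal(): derive a TSO packet size from the pacing
 * rate, aiming for roughly 1 ms of data per packet but never less than
 * min_tso_segs full segments (the sysctl's default is 2). The real logic
 * lives near tp->xmit_size_goal_segs in the output path, not here.
 */
static uint32_t tso_size_goal(uint32_t pacing_rate, uint32_t mss,
			      uint32_t min_tso_segs)
{
	uint32_t segs = (pacing_rate / 1000) / mss;	/* ~1 ms of segments */

	if (segs < min_tso_segs)
		segs = min_tso_segs;
	return segs * mss;
}

For the ~290 KB/s flow in the earlier sketch the floor wins (2 * 1448 bytes per TSO packet), while a gigabit-class flow would ask for dozens of segments per packet (in practice bounded by the 64 KB GSO limit), matching the claim that high speed flows are unaffected.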