about | summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2017-04-25 13:15:41 -0400
committerDavid S. Miller <davem@davemloft.net>2017-04-26 14:44:39 -0400
commit645f4c6f2ebd040688cc2a5f626ffc909e66ccf2 (patch)
tree7ceebdc356adf410a9d78bd51d7e200697d19658 /net/ipv4/tcp_input.c
parenta6db50b81e3f20b2b692bbddd35d9484057eae9d (diff)
tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps
Some devices or distributions use HZ=100 or HZ=250. TCP receive buffer autotuning has poor behavior caused by this choice. Since autotuning happens after 4 ms or 10 ms, short distance flows get their receive buffer tuned to a very high value, but after an initial period where it was frozen to a (too small) initial value. With tp->tcp_mstamp introduction, we can switch to high resolution timestamps almost for free (at the expense of 8 additional bytes per TCP structure). Note that some TCP stacks use usec TCP timestamps where this patch makes even more sense: many TCP flows have < 500 usec RTT. Hopefully this finer TS option can be standardized soon. Tested: HZ=100 kernel ./netperf -H lpaa24 -t TCP_RR -l 1000 -- -r 10000,10000 & Peer without patch : lpaa24:~# ss -tmi dst lpaa23 ... skmem:(r0,rb8388608,...) rcv_rtt:10 rcv_space:3210000 minrtt:0.017 Peer with the patch : lpaa23:~# ss -tmi dst lpaa24 ... skmem:(r0,rb428800,...) rcv_rtt:0.069 rcv_space:30000 minrtt:0.017 We can see saner RCVBUF, and more precise rcv_rtt information. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Soheil Hassas Yeganeh <soheil@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c28
1 file changed, 17 insertions, 11 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f475f0b53bfe..9739962bfb3f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -442,7 +442,8 @@ void tcp_init_buffer_space(struct sock *sk)
442 tcp_sndbuf_expand(sk); 442 tcp_sndbuf_expand(sk);
443 443
444 tp->rcvq_space.space = tp->rcv_wnd; 444 tp->rcvq_space.space = tp->rcv_wnd;
445 tp->rcvq_space.time = tcp_time_stamp; 445 skb_mstamp_get(&tp->tcp_mstamp);
446 tp->rcvq_space.time = tp->tcp_mstamp;
446 tp->rcvq_space.seq = tp->copied_seq; 447 tp->rcvq_space.seq = tp->copied_seq;
447 448
448 maxwin = tcp_full_space(sk); 449 maxwin = tcp_full_space(sk);
@@ -518,7 +519,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
518 */ 519 */
519static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) 520static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
520{ 521{
521 u32 new_sample = tp->rcv_rtt_est.rtt; 522 u32 new_sample = tp->rcv_rtt_est.rtt_us;
522 long m = sample; 523 long m = sample;
523 524
524 if (m == 0) 525 if (m == 0)
@@ -548,21 +549,23 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
548 new_sample = m << 3; 549 new_sample = m << 3;
549 } 550 }
550 551
551 if (tp->rcv_rtt_est.rtt != new_sample) 552 tp->rcv_rtt_est.rtt_us = new_sample;
552 tp->rcv_rtt_est.rtt = new_sample;
553} 553}
554 554
555static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) 555static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
556{ 556{
557 if (tp->rcv_rtt_est.time == 0) 557 u32 delta_us;
558
559 if (tp->rcv_rtt_est.time.v64 == 0)
558 goto new_measure; 560 goto new_measure;
559 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) 561 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
560 return; 562 return;
561 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); 563 delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time);
564 tcp_rcv_rtt_update(tp, delta_us, 1);
562 565
563new_measure: 566new_measure:
564 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; 567 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
565 tp->rcv_rtt_est.time = tcp_time_stamp; 568 tp->rcv_rtt_est.time = tp->tcp_mstamp;
566} 569}
567 570
568static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, 571static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
@@ -572,7 +575,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
572 if (tp->rx_opt.rcv_tsecr && 575 if (tp->rx_opt.rcv_tsecr &&
573 (TCP_SKB_CB(skb)->end_seq - 576 (TCP_SKB_CB(skb)->end_seq -
574 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) 577 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
575 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); 578 tcp_rcv_rtt_update(tp,
579 jiffies_to_usecs(tcp_time_stamp -
580 tp->rx_opt.rcv_tsecr),
581 0);
576} 582}
577 583
578/* 584/*
@@ -585,8 +591,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
585 int time; 591 int time;
586 int copied; 592 int copied;
587 593
588 time = tcp_time_stamp - tp->rcvq_space.time; 594 time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time);
589 if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) 595 if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
590 return; 596 return;
591 597
592 /* Number of bytes copied to user in last RTT */ 598 /* Number of bytes copied to user in last RTT */
@@ -642,7 +648,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
642 648
643new_measure: 649new_measure:
644 tp->rcvq_space.seq = tp->copied_seq; 650 tp->rcvq_space.seq = tp->copied_seq;
645 tp->rcvq_space.time = tcp_time_stamp; 651 tp->rcvq_space.time = tp->tcp_mstamp;
646} 652}
647 653
648/* There is something which you must keep in mind when you analyze the 654/* There is something which you must keep in mind when you analyze the