diff options
author | Eric Dumazet <edumazet@google.com> | 2017-04-25 13:15:41 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-04-26 14:44:39 -0400 |
commit | 645f4c6f2ebd040688cc2a5f626ffc909e66ccf2 (patch) | |
tree | 7ceebdc356adf410a9d78bd51d7e200697d19658 /net/ipv4/tcp_input.c | |
parent | a6db50b81e3f20b2b692bbddd35d9484057eae9d (diff) |
tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps
Some devices or distributions use HZ=100 or HZ=250.
TCP receive buffer autotuning behaves poorly with this choice.
Since autotuning can only happen after 4 ms or 10 ms (one jiffy), short distance flows
get their receive buffer tuned to a very high value, but only after an initial
period during which it was frozen at a (too small) initial value.
With the introduction of tp->tcp_mstamp, we can switch to high resolution
timestamps almost for free (at the expense of 8 additional bytes per
TCP structure).
Note that some TCP stacks use usec-resolution TCP timestamps, where this
patch makes even more sense: many TCP flows have an RTT below 500 usec.
Hopefully this finer TS option can be standardized soon.
Tested:
HZ=100 kernel
./netperf -H lpaa24 -t TCP_RR -l 1000 -- -r 10000,10000 &
Peer without patch :
lpaa24:~# ss -tmi dst lpaa23
...
skmem:(r0,rb8388608,...)
rcv_rtt:10 rcv_space:3210000 minrtt:0.017
Peer with the patch :
lpaa23:~# ss -tmi dst lpaa24
...
skmem:(r0,rb428800,...)
rcv_rtt:0.069 rcv_space:30000 minrtt:0.017
We can see saner RCVBUF, and more precise rcv_rtt information.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 28 |
1 files changed, 17 insertions, 11 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f475f0b53bfe..9739962bfb3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -442,7 +442,8 @@ void tcp_init_buffer_space(struct sock *sk) | |||
442 | tcp_sndbuf_expand(sk); | 442 | tcp_sndbuf_expand(sk); |
443 | 443 | ||
444 | tp->rcvq_space.space = tp->rcv_wnd; | 444 | tp->rcvq_space.space = tp->rcv_wnd; |
445 | tp->rcvq_space.time = tcp_time_stamp; | 445 | skb_mstamp_get(&tp->tcp_mstamp); |
446 | tp->rcvq_space.time = tp->tcp_mstamp; | ||
446 | tp->rcvq_space.seq = tp->copied_seq; | 447 | tp->rcvq_space.seq = tp->copied_seq; |
447 | 448 | ||
448 | maxwin = tcp_full_space(sk); | 449 | maxwin = tcp_full_space(sk); |
@@ -518,7 +519,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); | |||
518 | */ | 519 | */ |
519 | static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) | 520 | static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) |
520 | { | 521 | { |
521 | u32 new_sample = tp->rcv_rtt_est.rtt; | 522 | u32 new_sample = tp->rcv_rtt_est.rtt_us; |
522 | long m = sample; | 523 | long m = sample; |
523 | 524 | ||
524 | if (m == 0) | 525 | if (m == 0) |
@@ -548,21 +549,23 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) | |||
548 | new_sample = m << 3; | 549 | new_sample = m << 3; |
549 | } | 550 | } |
550 | 551 | ||
551 | if (tp->rcv_rtt_est.rtt != new_sample) | 552 | tp->rcv_rtt_est.rtt_us = new_sample; |
552 | tp->rcv_rtt_est.rtt = new_sample; | ||
553 | } | 553 | } |
554 | 554 | ||
555 | static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) | 555 | static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) |
556 | { | 556 | { |
557 | if (tp->rcv_rtt_est.time == 0) | 557 | u32 delta_us; |
558 | |||
559 | if (tp->rcv_rtt_est.time.v64 == 0) | ||
558 | goto new_measure; | 560 | goto new_measure; |
559 | if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) | 561 | if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) |
560 | return; | 562 | return; |
561 | tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); | 563 | delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); |
564 | tcp_rcv_rtt_update(tp, delta_us, 1); | ||
562 | 565 | ||
563 | new_measure: | 566 | new_measure: |
564 | tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; | 567 | tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; |
565 | tp->rcv_rtt_est.time = tcp_time_stamp; | 568 | tp->rcv_rtt_est.time = tp->tcp_mstamp; |
566 | } | 569 | } |
567 | 570 | ||
568 | static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, | 571 | static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, |
@@ -572,7 +575,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, | |||
572 | if (tp->rx_opt.rcv_tsecr && | 575 | if (tp->rx_opt.rcv_tsecr && |
573 | (TCP_SKB_CB(skb)->end_seq - | 576 | (TCP_SKB_CB(skb)->end_seq - |
574 | TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) | 577 | TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) |
575 | tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); | 578 | tcp_rcv_rtt_update(tp, |
579 | jiffies_to_usecs(tcp_time_stamp - | ||
580 | tp->rx_opt.rcv_tsecr), | ||
581 | 0); | ||
576 | } | 582 | } |
577 | 583 | ||
578 | /* | 584 | /* |
@@ -585,8 +591,8 @@ void tcp_rcv_space_adjust(struct sock *sk) | |||
585 | int time; | 591 | int time; |
586 | int copied; | 592 | int copied; |
587 | 593 | ||
588 | time = tcp_time_stamp - tp->rcvq_space.time; | 594 | time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); |
589 | if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) | 595 | if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) |
590 | return; | 596 | return; |
591 | 597 | ||
592 | /* Number of bytes copied to user in last RTT */ | 598 | /* Number of bytes copied to user in last RTT */ |
@@ -642,7 +648,7 @@ void tcp_rcv_space_adjust(struct sock *sk) | |||
642 | 648 | ||
643 | new_measure: | 649 | new_measure: |
644 | tp->rcvq_space.seq = tp->copied_seq; | 650 | tp->rcvq_space.seq = tp->copied_seq; |
645 | tp->rcvq_space.time = tcp_time_stamp; | 651 | tp->rcvq_space.time = tp->tcp_mstamp; |
646 | } | 652 | } |
647 | 653 | ||
648 | /* There is something which you must keep in mind when you analyze the | 654 | /* There is something which you must keep in mind when you analyze the |