author     Yuchung Cheng <ycheng@google.com>        2013-07-22 19:20:46 -0400
committer  David S. Miller <davem@davemloft.net>    2013-07-22 20:53:42 -0400
commit     5b08e47caf1f2034a3a5b566bbccc8b0be3961ca
tree       5cdd3ad01a4b854c80bad612051ac7ce7140e948 /net
parent     375fe02c91792917aa26d68a87ab110d1937f44e
tcp: prefer packet timing to TS-ECR for RTT
Prefer packet timing to TS-ECR for RTT measurements when both
sources are available, because broken middle-boxes or remote peers
can return packets with corrupted TS-ECR fields. Most congestion
controls that need RTT signals likewise favor timing-based sources.
Also check for bad TS-ECR values to avoid RTT blow-ups; this has
happened on production web servers.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
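To make the "RTT blow-ups" above concrete: with TCP timestamps, the RTT sample is the difference between the current timestamp clock and the echoed TSecr (seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr in the diff below), so a rewritten or stale echo turns into an absurdly large sample that then inflates SRTT and the RTO. The following standalone sketch only illustrates that arithmetic; it is not kernel code, the variable names and values are hypothetical, and it assumes a millisecond-resolution timestamp clock (HZ=1000).

/* rtt_blowup.c - hypothetical illustration of a TS-ECR RTT sample going bad */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t now        = 4000000; /* stand-in for tcp_time_stamp, in ms ticks */
        uint32_t good_tsecr = 3999950; /* echo of a timestamp sent 50 ms ago       */
        uint32_t bad_tsecr  = 12345;   /* corrupted/stale echo from a middle-box   */

        /* seq_rtt = tcp_time_stamp - rcv_tsecr, as in tcp_ack_update_rtt() below */
        printf("RTT from valid echo:     %u ms\n", (unsigned)(now - good_tsecr)); /* 50 ms   */
        printf("RTT from corrupted echo: %u ms\n", (unsigned)(now - bad_tsecr));  /* ~3988 s */
        return 0;
}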
Diffstat (limited to 'net')
-rw-r--r--   net/ipv4/tcp_input.c | 67
1 file changed, 18 insertions(+), 49 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b531710596ec..c7398f05d12b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2792,65 +2792,36 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
         tcp_xmit_retransmit_queue(sk);
 }

-void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
+                                      s32 seq_rtt)
 {
-        tcp_rtt_estimator(sk, seq_rtt);
-        tcp_set_rto(sk);
-        inet_csk(sk)->icsk_backoff = 0;
-}
-EXPORT_SYMBOL(tcp_valid_rtt_meas);
+        const struct tcp_sock *tp = tcp_sk(sk);
+
+        /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+         * broken middle-boxes or peers may corrupt TS-ECR fields. But
+         * Karn's algorithm forbids taking RTT if some retransmitted data
+         * is acked (RFC6298).
+         */
+        if (flag & FLAG_RETRANS_DATA_ACKED)
+                seq_rtt = -1;

-/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Supersedes RFC1323)
- */
-static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
-{
         /* RTTM Rule: A TSecr value received in a segment is used to
          * update the averaged RTT measurement only if the segment
          * acknowledges some new data, i.e., only if it advances the
          * left edge of the send window.
-         *
          * See draft-ietf-tcplw-high-performance-00, section 3.3.
-         * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
-         *
-         * Changed: reset backoff as soon as we see the first valid sample.
-         * If we do not, we get strongly overestimated rto. With timestamps
-         * samples are accepted even from very old segments: f.e., when rtt=1
-         * increases to 8, we retransmit 5 times and after 8 seconds delayed
-         * answer arrives rto becomes 120 seconds! If at least one of segments
-         * in window is lost... Voila. --ANK (010210)
          */
-        struct tcp_sock *tp = tcp_sk(sk);
+        if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+                seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;

-        tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-}
-
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
-{
-        /* We don't have a timestamp. Can only use
-         * packets that are not retransmitted to determine
-         * rtt estimates. Also, we must not reset the
-         * backoff for rto until we get a non-retransmitted
-         * packet. This allows us to deal with a situation
-         * where the network delay has increased suddenly.
-         * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
-         */
-
-        if (flag & FLAG_RETRANS_DATA_ACKED)
+        if (seq_rtt < 0)
                 return;

-        tcp_valid_rtt_meas(sk, seq_rtt);
-}
+        tcp_rtt_estimator(sk, seq_rtt);
+        tcp_set_rto(sk);

-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
-                                      const s32 seq_rtt)
-{
-        const struct tcp_sock *tp = tcp_sk(sk);
-        /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
-        if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-                tcp_ack_saw_tstamp(sk, flag);
-        else if (seq_rtt >= 0)
-                tcp_ack_no_tstamp(sk, seq_rtt, flag);
+        /* RFC6298: only reset backoff on valid RTT measurement. */
+        inet_csk(sk)->icsk_backoff = 0;
 }

 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
@@ -2989,8 +2960,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                         if (sacked & TCPCB_SACKED_RETRANS)
                                 tp->retrans_out -= acked_pcount;
                         flag |= FLAG_RETRANS_DATA_ACKED;
-                        ca_seq_rtt = -1;
-                        seq_rtt = -1;
                 } else {
                         ca_seq_rtt = now - scb->when;
                         last_ackt = skb->tstamp;
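Taken together, the two hunks consolidate RTT sampling in one place: tcp_clean_rtx_queue() no longer clears seq_rtt itself when retransmitted data is ACKed, because tcp_ack_update_rtt() now applies Karn's rule to the timing-based sample and only then falls back to TS-ECR. Below is a minimal standalone sketch of that selection order; the function and parameter names are illustrative, not the kernel's actual types or callers.

#include <stdio.h>

#define FLAG_RETRANS_DATA_ACKED 0x01   /* illustrative stand-in for the kernel flag */

/* Pick one RTT sample from the two possible sources; a negative return
 * means "no usable sample, skip the RTT/RTO update".
 */
static long pick_rtt_sample(long timing_rtt, long tsecr_rtt, int flag)
{
        long seq_rtt = timing_rtt;     /* preferred source: ACK packet timing */

        /* Karn's rule (RFC 6298): a timing sample taken across a
         * retransmission is ambiguous, so discard it.
         */
        if (flag & FLAG_RETRANS_DATA_ACKED)
                seq_rtt = -1;

        /* Fall back to the TS-ECR sample only when no timing sample
         * survived, matching the order in tcp_ack_update_rtt() above.
         */
        if (seq_rtt < 0 && tsecr_rtt >= 0)
                seq_rtt = tsecr_rtt;

        return seq_rtt;
}

int main(void)
{
        /* Retransmitted data ACKed: timing sample dropped, TS-ECR still used. */
        printf("%ld\n", pick_rtt_sample(50, 70, FLAG_RETRANS_DATA_ACKED)); /* 70 */
        /* Clean ACK: the timing sample wins even though TS-ECR is available. */
        printf("%ld\n", pick_rtt_sample(50, 70, 0));                       /* 50 */
        return 0;
}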