aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Yuchung Cheng <ycheng@google.com> 2013-07-22 19:20:46 -0400
committer: David S. Miller <davem@davemloft.net> 2013-07-22 20:53:42 -0400
commit: 5b08e47caf1f2034a3a5b566bbccc8b0be3961ca (patch)
tree: 5cdd3ad01a4b854c80bad612051ac7ce7140e948
parent: 375fe02c91792917aa26d68a87ab110d1937f44e (diff)
tcp: prefer packet timing to TS-ECR for RTT
Prefer packet timings to TS-ECR for RTT measurements when both sources are available. That's because broken middle-boxes and remote peers can return packets with corrupted TS-ECR fields. Similarly, most congestion controls that require RTT signals favor timing-based sources as well. Also check for bad TS-ECR values to avoid RTT blow-ups; such blow-ups have happened on production Web servers.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/tcp.h 1
-rw-r--r-- net/ipv4/tcp_input.c 67
2 files changed, 18 insertions, 50 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f9777dbede75..c5868471abae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -591,7 +591,6 @@ extern void tcp_initialize_rcv_mss(struct sock *sk);
591extern int tcp_mtu_to_mss(struct sock *sk, int pmtu); 591extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
592extern int tcp_mss_to_mtu(struct sock *sk, int mss); 592extern int tcp_mss_to_mtu(struct sock *sk, int mss);
593extern void tcp_mtup_init(struct sock *sk); 593extern void tcp_mtup_init(struct sock *sk);
594extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
595extern void tcp_init_buffer_space(struct sock *sk); 594extern void tcp_init_buffer_space(struct sock *sk);
596 595
597static inline void tcp_bound_rto(const struct sock *sk) 596static inline void tcp_bound_rto(const struct sock *sk)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b531710596ec..c7398f05d12b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2792,65 +2792,36 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2792 tcp_xmit_retransmit_queue(sk); 2792 tcp_xmit_retransmit_queue(sk);
2793} 2793}
2794 2794
2795void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) 2795static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
2796 s32 seq_rtt)
2796{ 2797{
2797 tcp_rtt_estimator(sk, seq_rtt); 2798 const struct tcp_sock *tp = tcp_sk(sk);
2798 tcp_set_rto(sk); 2799
2799 inet_csk(sk)->icsk_backoff = 0; 2800 /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
2800} 2801 * broken middle-boxes or peers may corrupt TS-ECR fields. But
2801EXPORT_SYMBOL(tcp_valid_rtt_meas); 2802 * Karn's algorithm forbids taking RTT if some retransmitted data
2803 * is acked (RFC6298).
2804 */
2805 if (flag & FLAG_RETRANS_DATA_ACKED)
2806 seq_rtt = -1;
2802 2807
2803/* Read draft-ietf-tcplw-high-performance before mucking
2804 * with this code. (Supersedes RFC1323)
2805 */
2806static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
2807{
2808 /* RTTM Rule: A TSecr value received in a segment is used to 2808 /* RTTM Rule: A TSecr value received in a segment is used to
2809 * update the averaged RTT measurement only if the segment 2809 * update the averaged RTT measurement only if the segment
2810 * acknowledges some new data, i.e., only if it advances the 2810 * acknowledges some new data, i.e., only if it advances the
2811 * left edge of the send window. 2811 * left edge of the send window.
2812 *
2813 * See draft-ietf-tcplw-high-performance-00, section 3.3. 2812 * See draft-ietf-tcplw-high-performance-00, section 3.3.
2814 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
2815 *
2816 * Changed: reset backoff as soon as we see the first valid sample.
2817 * If we do not, we get strongly overestimated rto. With timestamps
2818 * samples are accepted even from very old segments: f.e., when rtt=1
2819 * increases to 8, we retransmit 5 times and after 8 seconds delayed
2820 * answer arrives rto becomes 120 seconds! If at least one of segments
2821 * in window is lost... Voila. --ANK (010210)
2822 */ 2813 */
2823 struct tcp_sock *tp = tcp_sk(sk); 2814 if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2815 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
2824 2816
2825 tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr); 2817 if (seq_rtt < 0)
2826}
2827
2828static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
2829{
2830 /* We don't have a timestamp. Can only use
2831 * packets that are not retransmitted to determine
2832 * rtt estimates. Also, we must not reset the
2833 * backoff for rto until we get a non-retransmitted
2834 * packet. This allows us to deal with a situation
2835 * where the network delay has increased suddenly.
2836 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
2837 */
2838
2839 if (flag & FLAG_RETRANS_DATA_ACKED)
2840 return; 2818 return;
2841 2819
2842 tcp_valid_rtt_meas(sk, seq_rtt); 2820 tcp_rtt_estimator(sk, seq_rtt);
2843} 2821 tcp_set_rto(sk);
2844 2822
2845static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, 2823 /* RFC6298: only reset backoff on valid RTT measurement. */
2846 const s32 seq_rtt) 2824 inet_csk(sk)->icsk_backoff = 0;
2847{
2848 const struct tcp_sock *tp = tcp_sk(sk);
2849 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2850 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2851 tcp_ack_saw_tstamp(sk, flag);
2852 else if (seq_rtt >= 0)
2853 tcp_ack_no_tstamp(sk, seq_rtt, flag);
2854} 2825}
2855 2826
2856/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ 2827/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
@@ -2989,8 +2960,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
2989 if (sacked & TCPCB_SACKED_RETRANS) 2960 if (sacked & TCPCB_SACKED_RETRANS)
2990 tp->retrans_out -= acked_pcount; 2961 tp->retrans_out -= acked_pcount;
2991 flag |= FLAG_RETRANS_DATA_ACKED; 2962 flag |= FLAG_RETRANS_DATA_ACKED;
2992 ca_seq_rtt = -1;
2993 seq_rtt = -1;
2994 } else { 2963 } else {
2995 ca_seq_rtt = now - scb->when; 2964 ca_seq_rtt = now - scb->when;
2996 last_ackt = skb->tstamp; 2965 last_ackt = skb->tstamp;