aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2019-05-01 11:47:54 -0400
committerDavid S. Miller <davem@davemloft.net>2019-05-01 11:47:54 -0400
commitcd86972a9fd076aea43165394b05bbca26254cd7 (patch)
tree0d8c0ce941e0e6441fb61631d1680b29b0c9eaf7 /net/ipv4/tcp_input.c
parent6d1474a94ea2641f56c7893eb1e30558fd92f55d (diff)
parent98fa6271cfcb1de873b3fe0caf48d9daa1bcc0ac (diff)
Merge branch 'tcp-undo-congestion'
Yuchung Cheng says: ==================== undo congestion window on spurious SYN or SYNACK timeout Linux TCP currently uses an initial congestion window of 1 packet after multiple SYN or SYNACK timeouts, per RFC6298. However, such timeouts are often spurious on wireless or cellular networks that experience high delay variances (e.g. ramping up dormant radios or local link retransmission). Another case is when the underlying path is longer than the default SYN timeout (e.g. 1 second). In these cases, starting the transfer with a minimal congestion window is detrimental to the performance of short flows. One naive approach is to simply ignore SYN or SYNACK timeouts and always use a larger or default initial window. This approach, however, risks pouring gas on the fire when the network is already highly congested. This is particularly true in data centers, where applications could start thousands to millions of connections over a single host or multiple hosts, resulting in high SYN drops (e.g. incast). This patch-set detects spurious SYN and SYNACK timeouts upon completing the handshake via the widely-supported TCP timestamp options. Upon such events the sender reverts to the default initial window to start the data transfer, so it gets the best of both worlds. This patch-set supports this feature for both active and passive connections, for Fast Open as well as regular connections. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c | 99
1 files changed, 72 insertions, 27 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 97671bff597a..077d9abdfcf5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2252,7 +2252,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
2252 */ 2252 */
2253static inline bool tcp_packet_delayed(const struct tcp_sock *tp) 2253static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
2254{ 2254{
2255 return !tp->retrans_stamp || 2255 return tp->retrans_stamp &&
2256 tcp_tsopt_ecr_before(tp, tp->retrans_stamp); 2256 tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
2257} 2257}
2258 2258
@@ -3521,7 +3521,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
3521{ 3521{
3522 struct tcp_sock *tp = tcp_sk(sk); 3522 struct tcp_sock *tp = tcp_sk(sk);
3523 3523
3524 if (rexmit == REXMIT_NONE) 3524 if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
3525 return; 3525 return;
3526 3526
3527 if (unlikely(rexmit == 2)) { 3527 if (unlikely(rexmit == 2)) {
@@ -5647,6 +5647,32 @@ discard:
5647} 5647}
5648EXPORT_SYMBOL(tcp_rcv_established); 5648EXPORT_SYMBOL(tcp_rcv_established);
5649 5649
/* Complete handshake-time setup and pick the initial congestion window
 * before data transfer begins: MTU probing init, route/header rebuild,
 * cached-metrics import, cwnd selection, BPF sock_ops notification, and
 * congestion-control/buffer initialization.  bpf_op selects the sock_ops
 * callback (e.g. BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB on the passive path).
 */
5650void tcp_init_transfer(struct sock *sk, int bpf_op)
5651{
5652	struct inet_connection_sock *icsk = inet_csk(sk);
5653	struct tcp_sock *tp = tcp_sk(sk);
5654
5655	tcp_mtup_init(sk);
5656	icsk->icsk_af_ops->rebuild_header(sk);
5657	tcp_init_metrics(sk);
5658
5659	/* Initialize the congestion window to start the transfer.
5660	 * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
5661	 * retransmitted. In light of RFC6298 more aggressive 1sec
5662	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
5663	 * retransmission has occurred.
5664	 */
	/* undo_marker still set means the SYN/SYNACK retransmission was NOT
	 * proven spurious by tcp_try_undo_spurious_syn(), so honor RFC5681
	 * and start from cwnd = 1; otherwise use the normal initial cwnd.
	 */
5665	if (tp->total_retrans > 1 && tp->undo_marker)
5666		tp->snd_cwnd = 1;
5667	else
5668		tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
5669	tp->snd_cwnd_stamp = tcp_jiffies32;
5670
5671	tcp_call_bpf(sk, bpf_op, 0, NULL);
5672	tcp_init_congestion_control(sk);
5673	tcp_init_buffer_space(sk);
5674}
5675
5650void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) 5676void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5651{ 5677{
5652 struct tcp_sock *tp = tcp_sk(sk); 5678 struct tcp_sock *tp = tcp_sk(sk);
@@ -5748,6 +5774,21 @@ static void smc_check_reset_syn(struct tcp_sock *tp)
5748#endif 5774#endif
5749} 5775}
5750 5776
/* Detect a spurious SYN or SYNACK retransmission using the TCP timestamp
 * option echo: if the peer echoes the timestamp of the ORIGINAL SYN, the
 * timeout was spurious, so clear undo_marker and let tcp_init_transfer()
 * keep the default initial cwnd instead of cutting it to 1.
 */
5777static void tcp_try_undo_spurious_syn(struct sock *sk)
5778{
5779	struct tcp_sock *tp = tcp_sk(sk);
5780	u32 syn_stamp;
5781
5782	/* undo_marker is set when SYN or SYNACK times out. The timeout is
5783	 * spurious if the ACK's timestamp option echo value matches the
5784	 * original SYN timestamp.
5785	 */
5786	syn_stamp = tp->retrans_stamp;
5787	if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
5788	    syn_stamp == tp->rx_opt.rcv_tsecr)
5789		tp->undo_marker = 0;
5790}
5791
5751static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5792static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5752 const struct tcphdr *th) 5793 const struct tcphdr *th)
5753{ 5794{
@@ -5815,6 +5856,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5815 tcp_ecn_rcv_synack(tp, th); 5856 tcp_ecn_rcv_synack(tp, th);
5816 5857
5817 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5858 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5859 tcp_try_undo_spurious_syn(sk);
5818 tcp_ack(sk, skb, FLAG_SLOWPATH); 5860 tcp_ack(sk, skb, FLAG_SLOWPATH);
5819 5861
5820 /* Ok.. it's good. Set up sequence numbers and 5862 /* Ok.. it's good. Set up sequence numbers and
@@ -5973,6 +6015,27 @@ reset_and_undo:
5973 return 1; 6015 return 1;
5974} 6016}
5975 6017
/* Common handling when the first acceptable ACK moves a Fast Open socket
 * out of TCP_SYN_RECV (or TCP_FIN_WAIT_1): try to undo the loss state
 * entered on SYNACK timeout, reset the retransmit counter, release the
 * fastopen request sock, and re-arm the RTO for any data already sent.
 */
6018static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
6019{
6020	tcp_try_undo_loss(sk, false);
6021	inet_csk(sk)->icsk_retransmits = 0;
6022
6023	/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
6024	 * we no longer need req so release it.
6025	 */
6026	reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false);
6027
6028	/* Re-arm the timer because data may have been sent out.
6029	 * This is similar to the regular data transmission case
6030	 * when new data has just been ack'ed.
6031	 *
6032	 * (TFO) - we could try to be more aggressive and
6033	 * retransmitting any data sooner based on when they
6034	 * are sent out.
6035	 */
6036	tcp_rearm_rto(sk);
6037}
6038
5976/* 6039/*
5977 * This function implements the receiving procedure of RFC 793 for 6040 * This function implements the receiving procedure of RFC 793 for
5978 * all states except ESTABLISHED and TIME_WAIT. 6041 * all states except ESTABLISHED and TIME_WAIT.
@@ -6069,22 +6132,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
6069 if (!tp->srtt_us) 6132 if (!tp->srtt_us)
6070 tcp_synack_rtt_meas(sk, req); 6133 tcp_synack_rtt_meas(sk, req);
6071 6134
6072 /* Once we leave TCP_SYN_RECV, we no longer need req
6073 * so release it.
6074 */
6075 if (req) { 6135 if (req) {
6076 inet_csk(sk)->icsk_retransmits = 0; 6136 tcp_rcv_synrecv_state_fastopen(sk);
6077 reqsk_fastopen_remove(sk, req, false);
6078 /* Re-arm the timer because data may have been sent out.
6079 * This is similar to the regular data transmission case
6080 * when new data has just been ack'ed.
6081 *
6082 * (TFO) - we could try to be more aggressive and
6083 * retransmitting any data sooner based on when they
6084 * are sent out.
6085 */
6086 tcp_rearm_rto(sk);
6087 } else { 6137 } else {
6138 tcp_try_undo_spurious_syn(sk);
6139 tp->retrans_stamp = 0;
6088 tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); 6140 tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
6089 tp->copied_seq = tp->rcv_nxt; 6141 tp->copied_seq = tp->rcv_nxt;
6090 } 6142 }
@@ -6119,16 +6171,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
6119 case TCP_FIN_WAIT1: { 6171 case TCP_FIN_WAIT1: {
6120 int tmo; 6172 int tmo;
6121 6173
6122 /* If we enter the TCP_FIN_WAIT1 state and we are a 6174 if (req)
6123 * Fast Open socket and this is the first acceptable 6175 tcp_rcv_synrecv_state_fastopen(sk);
6124 * ACK we have received, this would have acknowledged 6176
6125 * our SYNACK so stop the SYNACK timer.
6126 */
6127 if (req) {
6128 /* We no longer need the request sock. */
6129 reqsk_fastopen_remove(sk, req, false);
6130 tcp_rearm_rto(sk);
6131 }
6132 if (tp->snd_una != tp->write_seq) 6177 if (tp->snd_una != tp->write_seq)
6133 break; 6178 break;
6134 6179
@@ -6303,7 +6348,7 @@ static void tcp_openreq_init(struct request_sock *req,
6303 req->cookie_ts = 0; 6348 req->cookie_ts = 0;
6304 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; 6349 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
6305 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 6350 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
6306 tcp_rsk(req)->snt_synack = tcp_clock_us(); 6351 tcp_rsk(req)->snt_synack = 0;
6307 tcp_rsk(req)->last_oow_ack_time = 0; 6352 tcp_rsk(req)->last_oow_ack_time = 0;
6308 req->mss = rx_opt->mss_clamp; 6353 req->mss = rx_opt->mss_clamp;
6309 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; 6354 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;