Diffstat (limited to 'net/ipv4/tcp_input.c')
 -rw-r--r--  net/ipv4/tcp_input.c | 281
 1 file changed, 148 insertions(+), 133 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d377f4854cb8..432c36649db3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,7 +237,11 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s
 			tcp_enter_quickack_mode((struct sock *)tp);
 		break;
 	case INET_ECN_CE:
-		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+			/* Better not delay acks, sender can have a very low cwnd */
+			tcp_enter_quickack_mode((struct sock *)tp);
+			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		}
 		/* fallinto */
 	default:
 		tp->ecn_flags |= TCP_ECN_SEEN;
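The hunk above makes the receiver force an immediate ACK only on the first CE mark of each demand-CWR episode, so a sender sitting at a very small cwnd is not held up by a delayed ACK. A minimal user-space replay of that branch, with invented flag bits and an invented input sequence (none of it is kernel API; see include/net/tcp.h for the real TCP_ECN_* flags):

#include <stdio.h>

#define DEMAND_CWR 0x4	/* stand-in for TCP_ECN_DEMAND_CWR, value invented here */
#define ECN_SEEN   0x8	/* stand-in for TCP_ECN_SEEN, value invented here */

int main(void)
{
	unsigned int ecn_flags = 0;
	const int ce[] = { 1, 1, 1 };	/* three CE-marked segments in a row */

	for (unsigned int i = 0; i < sizeof(ce) / sizeof(ce[0]); i++) {
		if (ce[i] && !(ecn_flags & DEMAND_CWR)) {
			/* first CE of this episode: ACK right away */
			printf("segment %u: CE, force quick ACK\n", i);
			ecn_flags |= DEMAND_CWR;
		} else if (ce[i]) {
			printf("segment %u: CE, CWR already demanded\n", i);
		}
		ecn_flags |= ECN_SEEN;
	}
	return 0;
}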
@@ -374,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 /* 4. Try to fixup all. It is made immediately after connection enters
  * established state.
  */
-static void tcp_init_buffer_space(struct sock *sk)
+void tcp_init_buffer_space(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int maxwin;
@@ -739,29 +743,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
-/* Set slow start threshold and cwnd not falling to slow start */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-
-	tp->prior_ssthresh = 0;
-	tp->bytes_acked = 0;
-	if (icsk->icsk_ca_state < TCP_CA_CWR) {
-		tp->undo_marker = 0;
-		if (set_ssthresh)
-			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
-		tp->snd_cwnd = min(tp->snd_cwnd,
-				   tcp_packets_in_flight(tp) + 1U);
-		tp->snd_cwnd_cnt = 0;
-		tp->high_seq = tp->snd_nxt;
-		tp->snd_cwnd_stamp = tcp_time_stamp;
-		TCP_ECN_queue_cwr(tp);
-
-		tcp_set_ca_state(sk, TCP_CA_CWR);
-	}
-}
-
 /*
  * Packet counting of FACK is based on in-order assumptions, therefore TCP
  * disables it when reordering is detected
@@ -2489,35 +2470,6 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-/* Lower bound on congestion window is slow start threshold
- * unless congestion avoidance choice decides to overide it.
- */
-static inline u32 tcp_cwnd_min(const struct sock *sk)
-{
-	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-
-	return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
-}
-
-/* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct sock *sk, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int decr = tp->snd_cwnd_cnt + 1;
-
-	if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
-	    (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
-		tp->snd_cwnd_cnt = decr & 1;
-		decr >>= 1;
-
-		if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
-			tp->snd_cwnd -= decr;
-
-		tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
-		tp->snd_cwnd_stamp = tcp_time_stamp;
-	}
-}
-
 /* Nothing was retransmitted or returned timestamp is less
  * than timestamp of the first retransmission.
  */
@@ -2719,24 +2671,80 @@ static bool tcp_try_undo_loss(struct sock *sk)
 	return false;
 }
 
-static inline void tcp_complete_cwr(struct sock *sk)
+/* The cwnd reduction in CWR and Recovery use the PRR algorithm
+ * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ *   1) If the packets in flight is larger than ssthresh, PRR spreads the
+ *      cwnd reductions across a full RTT.
+ *   2) If packets in flight is lower than ssthresh (such as due to excess
+ *      losses and/or application stalls), do not perform any further cwnd
+ *      reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* Do not moderate cwnd if it's already undone in cwr or recovery. */
-	if (tp->undo_marker) {
-		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
-			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-			tp->snd_cwnd_stamp = tcp_time_stamp;
-		} else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
-			/* PRR algorithm. */
-			tp->snd_cwnd = tp->snd_ssthresh;
-			tp->snd_cwnd_stamp = tcp_time_stamp;
-		}
+	tp->high_seq = tp->snd_nxt;
+	tp->bytes_acked = 0;
+	tp->snd_cwnd_cnt = 0;
+	tp->prior_cwnd = tp->snd_cwnd;
+	tp->prr_delivered = 0;
+	tp->prr_out = 0;
+	if (set_ssthresh)
+		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+	TCP_ECN_queue_cwr(tp);
+}
+
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+			       int fast_rexmit)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int sndcnt = 0;
+	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+
+	tp->prr_delivered += newly_acked_sacked;
+	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+			       tp->prior_cwnd - 1;
+		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+	} else {
+		sndcnt = min_t(int, delta,
+			       max_t(int, tp->prr_delivered - tp->prr_out,
+				     newly_acked_sacked) + 1);
+	}
+
+	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
+
+static inline void tcp_end_cwnd_reduction(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
+	    (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+		tp->snd_cwnd = tp->snd_ssthresh;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
+/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
+void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->prior_ssthresh = 0;
+	tp->bytes_acked = 0;
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+		tp->undo_marker = 0;
+		tcp_init_cwnd_reduction(sk, set_ssthresh);
+		tcp_set_ca_state(sk, TCP_CA_CWR);
+	}
+}
+
 static void tcp_try_keep_open(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
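To make the arithmetic in tcp_cwnd_reduction() above concrete, the sketch below replays one recovery episode in user space with invented numbers (prior_cwnd = 10, ssthresh = 5, one segment newly delivered per ACK, fast_rexmit ignored). It only mirrors the formula; nothing in it is kernel code:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t prior_cwnd = 10, ssthresh = 5;	/* invented episode parameters */
	uint32_t prr_delivered = 0, prr_out = 0, in_flight = 10;

	for (int ack = 1; ack <= 6; ack++) {
		uint32_t newly_acked_sacked = 1;	/* one segment per ACK */
		int sndcnt;

		in_flight--;			/* the ACKed segment left the network */
		prr_delivered += newly_acked_sacked;

		if (in_flight > ssthresh) {
			/* Proportional phase: pace the reduction over roughly one RTT */
			uint64_t dividend = (uint64_t)ssthresh * prr_delivered +
					    prior_cwnd - 1;
			sndcnt = (int)(dividend / prior_cwnd) - (int)prr_out;
		} else {
			/* Slow-start reduction bound: grow back toward ssthresh */
			int delta = (int)ssthresh - (int)in_flight;
			int grow = (int)(prr_delivered - prr_out);
			if (grow < (int)newly_acked_sacked)
				grow = (int)newly_acked_sacked;
			sndcnt = delta < grow + 1 ? delta : grow + 1;
		}
		if (sndcnt < 0)		/* max(sndcnt, 0) since fast_rexmit is ignored */
			sndcnt = 0;

		printf("ack %d: in_flight=%u sndcnt=%d -> cwnd=%u\n",
		       ack, in_flight, sndcnt, in_flight + (uint32_t)sndcnt);

		prr_out += (uint32_t)sndcnt;
		in_flight += (uint32_t)sndcnt;	/* assume everything allowed is sent */
	}
	return 0;
}

With these inputs the printed cwnd steps down from 10 toward ssthresh at roughly half the delivery rate, which is the "spread the reduction across a full RTT" behaviour the comment block describes.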
@@ -2751,7 +2759,7 @@ static void tcp_try_keep_open(struct sock *sk)
 	}
 }
 
-static void tcp_try_to_open(struct sock *sk, int flag)
+static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2768,7 +2776,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
 		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
 			tcp_moderate_cwnd(tp);
 	} else {
-		tcp_cwnd_down(sk, flag);
+		tcp_cwnd_reduction(sk, newly_acked_sacked, 0);
 	}
 }
 
@@ -2850,38 +2858,6 @@ void tcp_simple_retransmit(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_simple_retransmit);
 
-/* This function implements the PRR algorithm, specifcally the PRR-SSRB
- * (proportional rate reduction with slow start reduction bound) as described in
- * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
- * It computes the number of packets to send (sndcnt) based on packets newly
- * delivered:
- *   1) If the packets in flight is larger than ssthresh, PRR spreads the
- *      cwnd reductions across a full RTT.
- *   2) If packets in flight is lower than ssthresh (such as due to excess
- *      losses and/or application stalls), do not perform any further cwnd
- *      reductions, but instead slow start up to ssthresh.
- */
-static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
-					int fast_rexmit, int flag)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int sndcnt = 0;
-	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
-
-	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
-		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
-			       tp->prior_cwnd - 1;
-		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
-	} else {
-		sndcnt = min_t(int, delta,
-			       max_t(int, tp->prr_delivered - tp->prr_out,
-				     newly_acked_sacked) + 1);
-	}
-
-	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
-	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
-}
-
 static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2894,7 +2870,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 
 	NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
-	tp->high_seq = tp->snd_nxt;
 	tp->prior_ssthresh = 0;
 	tp->undo_marker = tp->snd_una;
 	tp->undo_retrans = tp->retrans_out;
@@ -2902,15 +2877,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		if (!ece_ack)
 			tp->prior_ssthresh = tcp_current_ssthresh(sk);
-		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
-		TCP_ECN_queue_cwr(tp);
+		tcp_init_cwnd_reduction(sk, true);
 	}
-
-	tp->bytes_acked = 0;
-	tp->snd_cwnd_cnt = 0;
-	tp->prior_cwnd = tp->snd_cwnd;
-	tp->prr_delivered = 0;
-	tp->prr_out = 0;
 	tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
 
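Taken together, the hunks above make the ECN-triggered CWR path and loss recovery share one initializer for the PRR reduction state instead of duplicating it. A toy user-space model of that call structure, with an invented cwnd and a simplified state enum (none of this is kernel API):

#include <stdio.h>
#include <stdbool.h>

enum ca_state { CA_OPEN, CA_CWR, CA_RECOVERY };	/* simplified ordering */

struct model {
	enum ca_state state;
	unsigned int ssthresh, prior_cwnd, prr_delivered, prr_out;
};

/* Shared setup, standing in for tcp_init_cwnd_reduction() */
static void init_cwnd_reduction(struct model *m, bool set_ssthresh)
{
	m->prior_cwnd = 10;			/* pretend snd_cwnd was 10 */
	m->prr_delivered = m->prr_out = 0;
	if (set_ssthresh)
		m->ssthresh = m->prior_cwnd / 2;	/* Reno-style halving for the sketch */
}

static void enter_cwr(struct model *m)		/* ECN echo seen */
{
	if (m->state < CA_CWR) {
		init_cwnd_reduction(m, true);
		m->state = CA_CWR;
	}
}

static void enter_recovery(struct model *m)	/* loss detected */
{
	if (m->state < CA_CWR)
		init_cwnd_reduction(m, true);
	m->state = CA_RECOVERY;
}

int main(void)
{
	struct model m = { .state = CA_OPEN };

	enter_cwr(&m);
	printf("after ECN:  state=%d ssthresh=%u\n", m.state, m.ssthresh);

	enter_recovery(&m);	/* init skipped: a reduction is already in progress */
	printf("after loss: state=%d ssthresh=%u\n", m.state, m.ssthresh);
	return 0;
}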
@@ -2970,7 +2938,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 			/* CWR is to be held something *above* high_seq
 			 * is ACKed for CWR bit to reach receiver. */
 			if (tp->snd_una != tp->high_seq) {
-				tcp_complete_cwr(sk);
+				tcp_end_cwnd_reduction(sk);
 				tcp_set_ca_state(sk, TCP_CA_Open);
 			}
 			break;
@@ -2980,7 +2948,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 				tcp_reset_reno_sack(tp);
 			if (tcp_try_undo_recovery(sk))
 				return;
-			tcp_complete_cwr(sk);
+			tcp_end_cwnd_reduction(sk);
 			break;
 		}
 	}
@@ -3021,7 +2989,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 		tcp_try_undo_dsack(sk);
 
 	if (!tcp_time_to_recover(sk, flag)) {
-		tcp_try_to_open(sk, flag);
+		tcp_try_to_open(sk, flag, newly_acked_sacked);
 		return;
 	}
 
@@ -3043,8 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 
 	if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
 		tcp_update_scoreboard(sk, fast_rexmit);
-	tp->prr_delivered += newly_acked_sacked;
-	tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
+	tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
 	tcp_xmit_retransmit_queue(sk);
 }
 
@@ -3123,6 +3090,12 @@ void tcp_rearm_rto(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	/* If the retrans timer is currently being used by Fast Open
+	 * for SYN-ACK retrans purpose, stay put.
+	 */
+	if (tp->fastopen_rsk)
+		return;
+
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
@@ -3384,7 +3357,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
-		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
+		!tcp_in_cwnd_reduction(sk);
 }
 
 /* Check that window update is acceptable.
@@ -3452,9 +3425,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
 }
 
 /* A conservative spurious RTO response algorithm: reduce cwnd using
- * rate halving and continue in congestion avoidance.
+ * PRR and continue in congestion avoidance.
  */
-static void tcp_ratehalving_spur_to_response(struct sock *sk)
+static void tcp_cwr_spur_to_response(struct sock *sk)
 {
 	tcp_enter_cwr(sk, 0);
 }
@@ -3462,7 +3435,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk)
 static void tcp_undo_spur_to_response(struct sock *sk, int flag)
 {
 	if (flag & FLAG_ECE)
-		tcp_ratehalving_spur_to_response(sk);
+		tcp_cwr_spur_to_response(sk);
 	else
 		tcp_undo_cwr(sk, true);
 }
@@ -3569,7 +3542,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
 			tcp_conservative_spur_to_response(tp);
 			break;
 		default:
-			tcp_ratehalving_spur_to_response(sk);
+			tcp_cwr_spur_to_response(sk);
 			break;
 		}
 		tp->frto_counter = 0;
@@ -4034,7 +4007,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
 }
 
 /* When we get a reset we do this. */
-static void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk)
 {
 	/* We want the right error as BSD sees it (and indeed as we do). */
 	switch (sk->sk_state) {
@@ -5740,7 +5713,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		TCP_ECN_rcv_synack(tp, th);
 
-		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
 
 		/* Ok.. it's good. Set up sequence numbers and
@@ -5753,7 +5726,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 * never scaled.
 		 */
 		tp->snd_wnd = ntohs(th->window);
-		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 
 		if (!tp->rx_opt.wscale_ok) {
 			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5891,7 +5863,9 @@ discard:
 		tcp_send_synack(sk);
 #if 0
 		/* Note, we could accept data and URG from this segment.
-		 * There are no obstacles to make this.
+		 * There are no obstacles to make this (except that we must
+		 * either change tcp_recvmsg() to prevent it from returning data
+		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
 		 *
 		 * However, if we ignore data in ACKless segments sometimes,
 		 * we have no reasons to accept it sometimes.
@@ -5931,6 +5905,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *req;
 	int queued = 0;
 
 	tp->rx_opt.saw_tstamp = 0;
@@ -5986,6 +5961,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	}
 
+	req = tp->fastopen_rsk;
+	if (req != NULL) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		    sk->sk_state != TCP_FIN_WAIT1);
+
+		if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+			goto discard;
+	}
 	if (!tcp_validate_incoming(sk, skb, th, 0))
 		return 0;
 
@@ -5996,7 +5979,25 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	switch (sk->sk_state) {
 	case TCP_SYN_RECV:
 		if (acceptable) {
-			tp->copied_seq = tp->rcv_nxt;
+			/* Once we leave TCP_SYN_RECV, we no longer
+			 * need req so release it.
+			 */
+			if (req) {
+				tcp_synack_rtt_meas(sk, req);
+				tp->total_retrans = req->retrans;
+
+				reqsk_fastopen_remove(sk, req, false);
+			} else {
+				/* Make sure socket is routed, for
+				 * correct metrics.
+				 */
+				icsk->icsk_af_ops->rebuild_header(sk);
+				tcp_init_congestion_control(sk);
+
+				tcp_mtup_init(sk);
+				tcp_init_buffer_space(sk);
+				tp->copied_seq = tp->rcv_nxt;
+			}
 			smp_mb();
 			tcp_set_state(sk, TCP_ESTABLISHED);
 			sk->sk_state_change(sk);
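The point of the TCP_SYN_RECV hunk above is that a Fast Open child socket was already fully set up when the data-carrying SYN arrived, so the final ACK only has to release the request sock (and take a SYN-ACK RTT sample), whereas a regular passive open still performs the late initialization. A compressed user-space sketch of that branch; the strings merely name the kernel steps, nothing here calls kernel APIs:

#include <stdio.h>
#include <stdbool.h>

/* Model of the two ways the passive open completes in tcp_rcv_state_process(). */
static void complete_passive_open(bool fastopen_req_pending)
{
	if (fastopen_req_pending) {
		puts("  take SYN-ACK RTT sample from the request sock");
		puts("  release the request sock");
	} else {
		puts("  rebuild header, init congestion control");
		puts("  init MTU probing and receive buffer space");
		puts("  copied_seq = rcv_nxt");
	}
	puts("  -> TCP_ESTABLISHED");
}

int main(void)
{
	puts("regular 3WHS:");
	complete_passive_open(false);
	puts("TCP Fast Open:");
	complete_passive_open(true);
	return 0;
}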
@@ -6018,23 +6019,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (tp->rx_opt.tstamp_ok)
 				tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
-			/* Make sure socket is routed, for
-			 * correct metrics.
-			 */
-			icsk->icsk_af_ops->rebuild_header(sk);
-
-			tcp_init_metrics(sk);
-
-			tcp_init_congestion_control(sk);
+			if (req) {
+				/* Re-arm the timer because data may
+				 * have been sent out. This is similar
+				 * to the regular data transmission case
+				 * when new data has just been ack'ed.
+				 *
+				 * (TFO) - we could try to be more
+				 * aggressive and retranmitting any data
+				 * sooner based on when they were sent
+				 * out.
+				 */
+				tcp_rearm_rto(sk);
+			} else
+				tcp_init_metrics(sk);
 
 			/* Prevent spurious tcp_cwnd_restart() on
 			 * first data packet.
 			 */
 			tp->lsndtime = tcp_time_stamp;
 
-			tcp_mtup_init(sk);
 			tcp_initialize_rcv_mss(sk);
-			tcp_init_buffer_space(sk);
 			tcp_fast_path_on(tp);
 		} else {
 			return 1;
@@ -6042,6 +6047,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		break;
 
 	case TCP_FIN_WAIT1:
+		/* If we enter the TCP_FIN_WAIT1 state and we are a
+		 * Fast Open socket and this is the first acceptable
+		 * ACK we have received, this would have acknowledged
+		 * our SYNACK so stop the SYNACK timer.
+		 */
+		if (acceptable && req != NULL) {
+			/* We no longer need the request sock. */
+			reqsk_fastopen_remove(sk, req, false);
+			tcp_rearm_rto(sk);
+		}
 		if (tp->snd_una == tp->write_seq) {
 			struct dst_entry *dst;
 