author		Nandita Dukkipati <nanditad@google.com>		2011-08-21 16:21:57 -0400
committer	David S. Miller <davem@davemloft.net>		2011-08-24 22:40:40 -0400
commit		a262f0cdf1f2916ea918dc329492abb5323d9a6c (patch)
tree		976cd31c3ea365f5810a154a1c77c75fb299c5fe
parent		f6fb8f100b807378fda19e83e5ac6828b638603a (diff)
Proportional Rate Reduction for TCP.
This patch implements Proportional Rate Reduction (PRR) for TCP. PRR is an algorithm that determines TCP's sending rate in fast recovery. PRR avoids excessive window reductions and aims for the actual congestion window size at the end of recovery to be as close as possible to the window determined by the congestion control algorithm. PRR also improves the accuracy of the amount of data sent during loss recovery.

The patch implements the recommended flavor of PRR called PRR-SSRB (proportional rate reduction with slow start reduction bound) and replaces the existing rate halving algorithm. PRR improves upon the existing Linux fast recovery under a number of conditions, including:
  1) burst losses, where the losses implicitly reduce the amount of outstanding data (pipe) below the ssthresh value selected by the congestion control algorithm, and
  2) losses near the end of short flows, where the application runs out of data to send.

As an example, with the existing rate halving implementation a single loss event can cause a connection carrying short Web transactions to go into slow start after the recovery. This is because during recovery Linux pulls the congestion window down to packets_in_flight+1 on every ACK. A short Web response often runs out of new data to send, and its pipe reduces to zero by the end of recovery when all of its packets are drained from the network. Subsequent HTTP responses using the same connection then have to slow start to raise cwnd back to ssthresh. PRR, on the other hand, aims for the cwnd to be as close as possible to ssthresh by the end of recovery.

A description of PRR and a discussion of its performance can be found at the following links:
- IETF Draft:
  http://tools.ietf.org/html/draft-mathis-tcpm-proportional-rate-reduction-01
- IETF Slides:
  http://www.ietf.org/proceedings/80/slides/tcpm-6.pdf
  http://tools.ietf.org/agenda/81/slides/tcpm-2.pdf
- Paper to appear in Internet Measurements Conference (IMC) 2011:
  Improving TCP Loss Recovery
  Nandita Dukkipati, Matt Mathis, Yuchung Cheng

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
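As a rough illustration of the PRR-SSRB arithmetic described above, the following standalone C sketch models the per-ACK send quota (sndcnt). The helper prr_sndcnt() and the example values are illustrative only; they mirror, rather than reproduce, the kernel's tcp_update_cwnd_in_recovery() added below.

#include <stdio.h>
#include <stdint.h>

/* Illustrative-only model of the PRR-SSRB send quota computed on each ACK
 * during recovery, with all state passed in explicitly.
 */
static int prr_sndcnt(uint32_t ssthresh, uint32_t prior_cwnd,
		      uint32_t in_flight, uint32_t prr_delivered,
		      uint32_t prr_out, int newly_acked_sacked,
		      int fast_rexmit)
{
	int sndcnt;

	if (in_flight > ssthresh) {
		/* Proportional reduction: send about ssthresh/prior_cwnd
		 * packets per packet delivered (rounded up), minus what has
		 * already been sent during recovery.
		 */
		uint64_t dividend = (uint64_t)ssthresh * prr_delivered +
				    prior_cwnd - 1;
		sndcnt = (int)(dividend / prior_cwnd) - (int)prr_out;
	} else {
		/* Slow start reduction bound: grow back toward ssthresh,
		 * but never beyond delta = ssthresh - in_flight.
		 */
		int delta = (int)ssthresh - (int)in_flight;
		int growth = (int)prr_delivered - (int)prr_out;

		if (growth < newly_acked_sacked)
			growth = newly_acked_sacked;
		sndcnt = growth + 1;
		if (sndcnt > delta)
			sndcnt = delta;
	}

	/* Always allow the fast retransmit itself to go out. */
	if (sndcnt < (fast_rexmit ? 1 : 0))
		sndcnt = fast_rexmit ? 1 : 0;
	return sndcnt;
}

int main(void)
{
	/* Example: cwnd was 20 when recovery started and ssthresh is 10.
	 * The first ACK delivers 2 packets while 18 are still in flight,
	 * so PRR allows ceil(10 * 2 / 20) - 0 = 1 new transmission on
	 * this ACK and cwnd becomes in_flight + sndcnt = 19.
	 */
	printf("sndcnt = %d\n", prr_sndcnt(10, 20, 18, 2, 0, 2, 1));
	return 0;
}

Over a full recovery episode, prr_delivered and prr_out keep the amount of data sent proportional to the amount delivered, so the window converges toward ssthresh instead of collapsing along with the pipe.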
-rw-r--r--	include/linux/tcp.h	4
-rw-r--r--	net/ipv4/tcp_input.c	58
-rw-r--r--	net/ipv4/tcp_output.c	7
3 files changed, 62 insertions, 7 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 531ede8006d9..6b63b310af36 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -379,6 +379,10 @@ struct tcp_sock {
 	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
 	u32	snd_cwnd_used;
 	u32	snd_cwnd_stamp;
+	u32	prior_cwnd;	/* Congestion window at start of Recovery. */
+	u32	prr_delivered;	/* Number of newly delivered packets to
+				 * receiver in Recovery. */
+	u32	prr_out;	/* Total number of pkts sent during Recovery. */
 
 	u32	rcv_wnd;	/* Current receiver window */
 	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ea0d2183df4b..385c470195eb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2830,9 +2830,13 @@ static int tcp_try_undo_loss(struct sock *sk)
 static inline void tcp_complete_cwr(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	/* Do not moderate cwnd if it's already undone in cwr or recovery */
-	if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
-		tp->snd_cwnd = tp->snd_ssthresh;
+
+	/* Do not moderate cwnd if it's already undone in cwr or recovery. */
+	if (tp->undo_marker) {
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR)
+			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+		else /* PRR */
+			tp->snd_cwnd = tp->snd_ssthresh;
 		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
@@ -2950,6 +2954,38 @@ void tcp_simple_retransmit(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_simple_retransmit);
 
+/* This function implements the PRR algorithm, specifically the PRR-SSRB
+ * (proportional rate reduction with slow start reduction bound) as described in
+ * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ *   1) If the packets in flight is larger than ssthresh, PRR spreads the
+ *	cwnd reductions across a full RTT.
+ *   2) If packets in flight is lower than ssthresh (such as due to excess
+ *	losses and/or application stalls), do not perform any further cwnd
+ *	reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
+					int fast_rexmit, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int sndcnt = 0;
+	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+
+	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+			       tp->prior_cwnd - 1;
+		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+	} else {
+		sndcnt = min_t(int, delta,
+			       max_t(int, tp->prr_delivered - tp->prr_out,
+				     newly_acked_sacked) + 1);
+	}
+
+	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2961,7 +2997,8 @@ EXPORT_SYMBOL(tcp_simple_retransmit);
  * It does _not_ decide what to send, it is made in function
  * tcp_xmit_retransmit_queue().
  */
-static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
+static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
+				  int newly_acked_sacked, int flag)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -3111,13 +3148,17 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 
 		tp->bytes_acked = 0;
 		tp->snd_cwnd_cnt = 0;
+		tp->prior_cwnd = tp->snd_cwnd;
+		tp->prr_delivered = 0;
+		tp->prr_out = 0;
 		tcp_set_ca_state(sk, TCP_CA_Recovery);
 		fast_rexmit = 1;
 	}
 
 	if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
 		tcp_update_scoreboard(sk, fast_rexmit);
-	tcp_cwnd_down(sk, flag);
+	tp->prr_delivered += newly_acked_sacked;
+	tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
 	tcp_xmit_retransmit_queue(sk);
 }
 
@@ -3632,6 +3673,8 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	u32 prior_in_flight;
 	u32 prior_fackets;
 	int prior_packets;
+	int prior_sacked = tp->sacked_out;
+	int newly_acked_sacked = 0;
 	int frto_cwnd = 0;
 
 	/* If the ack is older than previous acks
@@ -3703,6 +3746,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
 
+	newly_acked_sacked = (prior_packets - prior_sacked) -
+			     (tp->packets_out - tp->sacked_out);
+
 	if (tp->frto_counter)
 		frto_cwnd = tcp_process_frto(sk, flag);
 	/* Guarantee sacktag reordering detection against wrap-arounds */
@@ -3715,7 +3761,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		    tcp_may_raise_cwnd(sk, flag))
 			tcp_cong_avoid(sk, ack, prior_in_flight);
 		tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
-				      flag);
+				      newly_acked_sacked, flag);
 	} else {
 		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
 			tcp_cong_avoid(sk, ack, prior_in_flight);
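The newly_acked_sacked value fed into the PRR code above is simply the per-ACK drop in the number of outstanding, not-yet-SACKed packets. A minimal userspace sketch, illustrative only (the helper newly_delivered() is not part of the commit):

#include <stdio.h>

/* Model of the newly_acked_sacked computation in tcp_ack(): the number of
 * packets newly delivered by this ACK, either cumulatively acknowledged or
 * newly SACKed, obtained from before/after snapshots of packets_out and
 * sacked_out.
 */
static int newly_delivered(int prior_packets, int prior_sacked,
			   int packets_out, int sacked_out)
{
	return (prior_packets - prior_sacked) - (packets_out - sacked_out);
}

int main(void)
{
	/* Example: 20 packets were outstanding with 3 already SACKed; the
	 * ACK cumulatively acknowledges 2 un-SACKed packets and SACKs 1
	 * more, leaving 18 outstanding with 4 SACKed:
	 * (20 - 3) - (18 - 4) = 3 newly delivered packets, which
	 * tcp_fastretrans_alert() adds to prr_delivered.
	 */
	printf("newly_acked_sacked = %d\n", newly_delivered(20, 3, 18, 4));
	return 0;
}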
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0377c061f22f..081dcd6fd0c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1796,11 +1796,13 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		tcp_event_new_data_sent(sk, skb);
 
 		tcp_minshall_update(tp, mss_now, skb);
-		sent_pkts++;
+		sent_pkts += tcp_skb_pcount(skb);
 
 		if (push_one)
 			break;
 	}
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
+		tp->prr_out += sent_pkts;
 
 	if (likely(sent_pkts)) {
 		tcp_cwnd_validate(sk);
@@ -2294,6 +2296,9 @@ begin_fwd:
 			return;
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
+			tp->prr_out += tcp_skb_pcount(skb);
+
 		if (skb == tcp_write_queue_head(sk))
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 						  inet_csk(sk)->icsk_rto,