author		Nandita Dukkipati <nanditad@google.com>		2011-08-21 16:21:57 -0400
committer	David S. Miller <davem@davemloft.net>		2011-08-24 22:40:40 -0400
commit		a262f0cdf1f2916ea918dc329492abb5323d9a6c (patch)
tree		976cd31c3ea365f5810a154a1c77c75fb299c5fe
parent		f6fb8f100b807378fda19e83e5ac6828b638603a (diff)
Proportional Rate Reduction for TCP.
This patch implements Proportional Rate Reduction (PRR) for TCP. PRR is an algorithm that determines TCP's sending rate in fast recovery. PRR avoids excessive window reductions and aims for the actual congestion window size at the end of recovery to be as close as possible to the window determined by the congestion control algorithm. PRR also improves the accuracy of the amount of data sent during loss recovery.

The patch implements the recommended flavor of PRR called PRR-SSRB (proportional rate reduction with slow start reduction bound) and replaces the existing rate halving algorithm. PRR improves upon the existing Linux fast recovery under a number of conditions, including:
  1) burst losses, where the losses implicitly reduce the amount of outstanding data (pipe) below the ssthresh value selected by the congestion control algorithm, and
  2) losses near the end of short flows, where the application runs out of data to send.

As an example, with the existing rate halving implementation a single loss event can cause a connection carrying short Web transactions to go into slow start after the recovery. This is because during recovery Linux pulls the congestion window down to packets_in_flight+1 on every ACK. A short Web response often runs out of new data to send, and its pipe reduces to zero by the end of recovery when all of its packets are drained from the network. Subsequent HTTP responses using the same connection then have to slow start to raise cwnd back to ssthresh. PRR, on the other hand, aims for the cwnd to be as close as possible to ssthresh by the end of recovery.

A description of PRR and a discussion of its performance can be found at the following links:
- IETF Draft:
  http://tools.ietf.org/html/draft-mathis-tcpm-proportional-rate-reduction-01
- IETF Slides:
  http://www.ietf.org/proceedings/80/slides/tcpm-6.pdf
  http://tools.ietf.org/agenda/81/slides/tcpm-2.pdf
- Paper to appear in Internet Measurements Conference (IMC) 2011:
  Improving TCP Loss Recovery
  Nandita Dukkipati, Matt Mathis, Yuchung Cheng

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
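As a rough illustration of the PRR-SSRB arithmetic described above, the following standalone C sketch models the per-ACK send quota (sndcnt). The helper prr_sndcnt() and the example values are illustrative only; they mirror, rather than reproduce, the kernel's tcp_update_cwnd_in_recovery() added below.

#include <stdio.h>
#include <stdint.h>

/* Illustrative-only model of the PRR-SSRB send quota computed on each ACK
 * during recovery, with all state passed in explicitly.
 */
static int prr_sndcnt(uint32_t ssthresh, uint32_t prior_cwnd,
		      uint32_t in_flight, uint32_t prr_delivered,
		      uint32_t prr_out, int newly_acked_sacked,
		      int fast_rexmit)
{
	int sndcnt;

	if (in_flight > ssthresh) {
		/* Proportional reduction: send about ssthresh/prior_cwnd
		 * packets per packet delivered (rounded up), minus what has
		 * already been sent during recovery.
		 */
		uint64_t dividend = (uint64_t)ssthresh * prr_delivered +
				    prior_cwnd - 1;
		sndcnt = (int)(dividend / prior_cwnd) - (int)prr_out;
	} else {
		/* Slow start reduction bound: grow back toward ssthresh,
		 * but never beyond delta = ssthresh - in_flight.
		 */
		int delta = (int)ssthresh - (int)in_flight;
		int growth = (int)prr_delivered - (int)prr_out;

		if (growth < newly_acked_sacked)
			growth = newly_acked_sacked;
		sndcnt = growth + 1;
		if (sndcnt > delta)
			sndcnt = delta;
	}

	/* Always allow the fast retransmit itself to go out. */
	if (sndcnt < (fast_rexmit ? 1 : 0))
		sndcnt = fast_rexmit ? 1 : 0;
	return sndcnt;
}

int main(void)
{
	/* Example: cwnd was 20 when recovery started and ssthresh is 10.
	 * The first ACK delivers 2 packets while 18 are still in flight,
	 * so PRR allows ceil(10 * 2 / 20) - 0 = 1 new transmission on
	 * this ACK and cwnd becomes in_flight + sndcnt = 19.
	 */
	printf("sndcnt = %d\n", prr_sndcnt(10, 20, 18, 2, 0, 2, 1));
	return 0;
}

Over a full recovery episode, prr_delivered and prr_out keep the amount of data sent proportional to the amount delivered, so the window converges toward ssthresh instead of collapsing along with the pipe.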
-rw-r--r--	include/linux/tcp.h	4
-rw-r--r--	net/ipv4/tcp_input.c	58
-rw-r--r--	net/ipv4/tcp_output.c	7
3 files changed, 62 insertions, 7 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 531ede8006d9..6b63b310af36 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -379,6 +379,10 @@ struct tcp_sock {
 	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
 	u32	snd_cwnd_used;
 	u32	snd_cwnd_stamp;
+	u32	prior_cwnd;	/* Congestion window at start of Recovery. */
+	u32	prr_delivered;	/* Number of newly delivered packets to
+				 * receiver in Recovery. */
+	u32	prr_out;	/* Total number of pkts sent during Recovery. */
 
 	u32	rcv_wnd;	/* Current receiver window */
 	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ea0d2183df4b..385c470195eb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2830,9 +2830,13 @@ static int tcp_try_undo_loss(struct sock *sk)
 static inline void tcp_complete_cwr(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	/* Do not moderate cwnd if it's already undone in cwr or recovery */
-	if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
-		tp->snd_cwnd = tp->snd_ssthresh;
+
+	/* Do not moderate cwnd if it's already undone in cwr or recovery. */
+	if (tp->undo_marker) {
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR)
+			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+		else /* PRR */
+			tp->snd_cwnd = tp->snd_ssthresh;
 		tp->snd_cwnd_stamp = tcp_time_stamp;
 	}
 	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
@@ -2950,6 +2954,38 @@ void tcp_simple_retransmit(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_simple_retransmit);
 
+/* This function implements the PRR algorithm, specifically the PRR-SSRB
+ * (proportional rate reduction with slow start reduction bound) as described in
+ * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ *   1) If the packets in flight is larger than ssthresh, PRR spreads the
+ *	cwnd reductions across a full RTT.
+ *   2) If packets in flight is lower than ssthresh (such as due to excess
+ *	losses and/or application stalls), do not perform any further cwnd
+ *	reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
+					int fast_rexmit, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int sndcnt = 0;
+	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+
+	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+			       tp->prior_cwnd - 1;
+		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+	} else {
+		sndcnt = min_t(int, delta,
+			       max_t(int, tp->prr_delivered - tp->prr_out,
+				     newly_acked_sacked) + 1);
+	}
+
+	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2961,7 +2997,8 @@ EXPORT_SYMBOL(tcp_simple_retransmit);
  * It does _not_ decide what to send, it is made in function
  * tcp_xmit_retransmit_queue().
  */
-static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
+static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
+				  int newly_acked_sacked, int flag)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -3111,13 +3148,17 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 
 		tp->bytes_acked = 0;
 		tp->snd_cwnd_cnt = 0;
+		tp->prior_cwnd = tp->snd_cwnd;
+		tp->prr_delivered = 0;
+		tp->prr_out = 0;
 		tcp_set_ca_state(sk, TCP_CA_Recovery);
 		fast_rexmit = 1;
 	}
 
 	if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
 		tcp_update_scoreboard(sk, fast_rexmit);
-	tcp_cwnd_down(sk, flag);
+	tp->prr_delivered += newly_acked_sacked;
+	tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
 	tcp_xmit_retransmit_queue(sk);
 }
 
@@ -3632,6 +3673,8 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	u32 prior_in_flight;
 	u32 prior_fackets;
 	int prior_packets;
+	int prior_sacked = tp->sacked_out;
+	int newly_acked_sacked = 0;
 	int frto_cwnd = 0;
 
 	/* If the ack is older than previous acks
@@ -3703,6 +3746,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
 
+	newly_acked_sacked = (prior_packets - prior_sacked) -
+			     (tp->packets_out - tp->sacked_out);
+
 	if (tp->frto_counter)
 		frto_cwnd = tcp_process_frto(sk, flag);
 	/* Guarantee sacktag reordering detection against wrap-arounds */
@@ -3715,7 +3761,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		    tcp_may_raise_cwnd(sk, flag))
 			tcp_cong_avoid(sk, ack, prior_in_flight);
 		tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
-				      flag);
+				      newly_acked_sacked, flag);
 	} else {
 		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
 			tcp_cong_avoid(sk, ack, prior_in_flight);
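The newly_acked_sacked value fed into the PRR code above is simply the per-ACK drop in the number of outstanding, not-yet-SACKed packets. A minimal userspace sketch, illustrative only (the helper newly_delivered() is not part of the commit):

#include <stdio.h>

/* Model of the newly_acked_sacked computation in tcp_ack(): the number of
 * packets newly delivered by this ACK, either cumulatively acknowledged or
 * newly SACKed, obtained from before/after snapshots of packets_out and
 * sacked_out.
 */
static int newly_delivered(int prior_packets, int prior_sacked,
			   int packets_out, int sacked_out)
{
	return (prior_packets - prior_sacked) - (packets_out - sacked_out);
}

int main(void)
{
	/* Example: 20 packets were outstanding with 3 already SACKed; the
	 * ACK cumulatively acknowledges 2 un-SACKed packets and SACKs 1
	 * more, leaving 18 outstanding with 4 SACKed:
	 * (20 - 3) - (18 - 4) = 3 newly delivered packets, which
	 * tcp_fastretrans_alert() adds to prr_delivered.
	 */
	printf("newly_acked_sacked = %d\n", newly_delivered(20, 3, 18, 4));
	return 0;
}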
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0377c061f22f..081dcd6fd0c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1796,11 +1796,13 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		tcp_event_new_data_sent(sk, skb);
 
 		tcp_minshall_update(tp, mss_now, skb);
-		sent_pkts++;
+		sent_pkts += tcp_skb_pcount(skb);
 
 		if (push_one)
 			break;
 	}
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
+		tp->prr_out += sent_pkts;
 
 	if (likely(sent_pkts)) {
 		tcp_cwnd_validate(sk);
@@ -2294,6 +2296,9 @@ begin_fwd:
 			return;
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
 
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
+			tp->prr_out += tcp_skb_pcount(skb);
+
 		if (skb == tcp_write_queue_head(sk))
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 						  inet_csk(sk)->icsk_rto,