aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
authorStephen Hemminger <shemminger@osdl.org>2005-06-23 15:19:55 -0400
committerDavid S. Miller <davem@davemloft.net>2005-06-23 15:19:55 -0400
commit317a76f9a44b437d6301718f4e5d08bd93f98da7 (patch)
treecaeba9839dee264f59b035b81c3d13d6c61b638e /net/ipv4/tcp_input.c
parenta8ad86f2dc46356f87be1327dabc18bdbda32f50 (diff)
[TCP]: Add pluggable congestion control algorithm infrastructure.
Allow TCP to have multiple pluggable congestion control algorithms. Algorithms are defined by a set of operations and can be built in or modules. The legacy "new RENO" algorithm is used as a starting point and fallback. Signed-off-by: Stephen Hemminger <shemminger@osdl.org> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c737
1 files changed, 53 insertions, 684 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630a3..7bbbbc33eb4b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission 61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission
62 * engine. Lots of bugs are found. 62 * engine. Lots of bugs are found.
63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
64 * Angelo Dell'Aera: TCP Westwood+ support
65 */ 64 */
66 65
67#include <linux/config.h> 66#include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
88int sysctl_tcp_max_orphans = NR_FILE; 87int sysctl_tcp_max_orphans = NR_FILE;
89int sysctl_tcp_frto; 88int sysctl_tcp_frto;
90int sysctl_tcp_nometrics_save; 89int sysctl_tcp_nometrics_save;
91int sysctl_tcp_westwood;
92int sysctl_tcp_vegas_cong_avoid;
93 90
94int sysctl_tcp_moderate_rcvbuf = 1; 91int sysctl_tcp_moderate_rcvbuf = 1;
95 92
96/* Default values of the Vegas variables, in fixed-point representation
97 * with V_PARAM_SHIFT bits to the right of the binary point.
98 */
99#define V_PARAM_SHIFT 1
100int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
101int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
102int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
103int sysctl_tcp_bic = 1;
104int sysctl_tcp_bic_fast_convergence = 1;
105int sysctl_tcp_bic_low_window = 14;
106int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
107
108#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 93#define FLAG_DATA 0x01 /* Incoming frame contained data. */
109#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 94#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
110#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ 95#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
333 tp->snd_cwnd_stamp = tcp_time_stamp; 318 tp->snd_cwnd_stamp = tcp_time_stamp;
334} 319}
335 320
336static void init_bictcp(struct tcp_sock *tp)
337{
338 tp->bictcp.cnt = 0;
339
340 tp->bictcp.last_max_cwnd = 0;
341 tp->bictcp.last_cwnd = 0;
342 tp->bictcp.last_stamp = 0;
343}
344
345/* 5. Recalculate window clamp after socket hit its memory bounds. */ 321/* 5. Recalculate window clamp after socket hit its memory bounds. */
346static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 322static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
347{ 323{
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
558 tcp_grow_window(sk, tp, skb); 534 tcp_grow_window(sk, tp, skb);
559} 535}
560 536
561/* When starting a new connection, pin down the current choice of
562 * congestion algorithm.
563 */
564void tcp_ca_init(struct tcp_sock *tp)
565{
566 if (sysctl_tcp_westwood)
567 tp->adv_cong = TCP_WESTWOOD;
568 else if (sysctl_tcp_bic)
569 tp->adv_cong = TCP_BIC;
570 else if (sysctl_tcp_vegas_cong_avoid) {
571 tp->adv_cong = TCP_VEGAS;
572 tp->vegas.baseRTT = 0x7fffffff;
573 tcp_vegas_enable(tp);
574 }
575}
576
577/* Do RTT sampling needed for Vegas.
578 * Basically we:
579 * o min-filter RTT samples from within an RTT to get the current
580 * propagation delay + queuing delay (we are min-filtering to try to
581 * avoid the effects of delayed ACKs)
582 * o min-filter RTT samples from a much longer window (forever for now)
583 * to find the propagation delay (baseRTT)
584 */
585static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
586{
587 __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
588
589 /* Filter to find propagation delay: */
590 if (vrtt < tp->vegas.baseRTT)
591 tp->vegas.baseRTT = vrtt;
592
593 /* Find the min RTT during the last RTT to find
594 * the current prop. delay + queuing delay:
595 */
596 tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
597 tp->vegas.cntRTT++;
598}
599
600/* Called to compute a smoothed rtt estimate. The data fed to this 537/* Called to compute a smoothed rtt estimate. The data fed to this
601 * routine either comes from timestamps, or from segments that were 538 * routine either comes from timestamps, or from segments that were
602 * known _not_ to have been retransmitted [see Karn/Partridge 539 * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
606 * To save cycles in the RFC 1323 implementation it was better to break 543 * To save cycles in the RFC 1323 implementation it was better to break
607 * it up into three procedures. -- erics 544 * it up into three procedures. -- erics
608 */ 545 */
609static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) 546static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
610{ 547{
611 long m = mrtt; /* RTT */ 548 long m = mrtt; /* RTT */
612 549
613 if (tcp_vegas_enabled(tp))
614 vegas_rtt_calc(tp, mrtt);
615
616 /* The following amusing code comes from Jacobson's 550 /* The following amusing code comes from Jacobson's
617 * article in SIGCOMM '88. Note that rtt and mdev 551 * article in SIGCOMM '88. Note that rtt and mdev
618 * are scaled versions of rtt and mean deviation. 552 * are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
670 tp->rtt_seq = tp->snd_nxt; 604 tp->rtt_seq = tp->snd_nxt;
671 } 605 }
672 606
673 tcp_westwood_update_rtt(tp, tp->srtt >> 3); 607 if (tp->ca_ops->rtt_sample)
608 tp->ca_ops->rtt_sample(tp, *usrtt);
674} 609}
675 610
676/* Calculate rto without backoff. This is the second half of Van Jacobson's 611/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
1185 tp->snd_una == tp->high_seq || 1120 tp->snd_una == tp->high_seq ||
1186 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1121 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1187 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1122 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1188 if (!tcp_westwood_ssthresh(tp)) 1123 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1189 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1124 tcp_ca_event(tp, CA_EVENT_FRTO);
1190 } 1125 }
1191 1126
1192 /* Have to clear retransmission markers here to keep the bookkeeping 1127 /* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
1252 tcp_set_ca_state(tp, TCP_CA_Loss); 1187 tcp_set_ca_state(tp, TCP_CA_Loss);
1253 tp->high_seq = tp->frto_highmark; 1188 tp->high_seq = tp->frto_highmark;
1254 TCP_ECN_queue_cwr(tp); 1189 TCP_ECN_queue_cwr(tp);
1255
1256 init_bictcp(tp);
1257} 1190}
1258 1191
1259void tcp_clear_retrans(struct tcp_sock *tp) 1192void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
1283 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1216 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1284 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1217 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1285 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1218 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1286 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1219 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1220 tcp_ca_event(tp, CA_EVENT_LOSS);
1287 } 1221 }
1288 tp->snd_cwnd = 1; 1222 tp->snd_cwnd = 1;
1289 tp->snd_cwnd_cnt = 0; 1223 tp->snd_cwnd_cnt = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1596} 1530}
1597 1531
1598/* Decrease cwnd each second ack. */ 1532/* Decrease cwnd each second ack. */
1599
1600static void tcp_cwnd_down(struct tcp_sock *tp) 1533static void tcp_cwnd_down(struct tcp_sock *tp)
1601{ 1534{
1602 int decr = tp->snd_cwnd_cnt + 1; 1535 int decr = tp->snd_cwnd_cnt + 1;
1603 __u32 limit;
1604
1605 /*
1606 * TCP Westwood
1607 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
1608 * in packets we use mss_cache). If sysctl_tcp_westwood is off
1609 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
1610 * still used as usual. It prevents other strange cases in which
1611 * BWE*RTTmin could assume value 0. It should not happen but...
1612 */
1613
1614 if (!(limit = tcp_westwood_bw_rttmin(tp)))
1615 limit = tp->snd_ssthresh/2;
1616 1536
1617 tp->snd_cwnd_cnt = decr&1; 1537 tp->snd_cwnd_cnt = decr&1;
1618 decr >>= 1; 1538 decr >>= 1;
1619 1539
1620 if (decr && tp->snd_cwnd > limit) 1540 if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
1621 tp->snd_cwnd -= decr; 1541 tp->snd_cwnd -= decr;
1622 1542
1623 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1543 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1654static void tcp_undo_cwr(struct tcp_sock *tp, int undo) 1574static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
1655{ 1575{
1656 if (tp->prior_ssthresh) { 1576 if (tp->prior_ssthresh) {
1657 if (tcp_is_bic(tp)) 1577 if (tp->ca_ops->undo_cwnd)
1658 tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); 1578 tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
1659 else 1579 else
1660 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); 1580 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1661 1581
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1767 1687
1768static inline void tcp_complete_cwr(struct tcp_sock *tp) 1688static inline void tcp_complete_cwr(struct tcp_sock *tp)
1769{ 1689{
1770 if (tcp_westwood_cwnd(tp)) 1690 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1771 tp->snd_ssthresh = tp->snd_cwnd;
1772 else
1773 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1774 tp->snd_cwnd_stamp = tcp_time_stamp; 1691 tp->snd_cwnd_stamp = tcp_time_stamp;
1692 tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
1775} 1693}
1776 1694
1777static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) 1695static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1946 if (tp->ca_state < TCP_CA_CWR) { 1864 if (tp->ca_state < TCP_CA_CWR) {
1947 if (!(flag&FLAG_ECE)) 1865 if (!(flag&FLAG_ECE))
1948 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1866 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1949 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1867 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1950 TCP_ECN_queue_cwr(tp); 1868 TCP_ECN_queue_cwr(tp);
1951 } 1869 }
1952 1870
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1963/* Read draft-ietf-tcplw-high-performance before mucking 1881/* Read draft-ietf-tcplw-high-performance before mucking
1964 * with this code. (Superceeds RFC1323) 1882 * with this code. (Superceeds RFC1323)
1965 */ 1883 */
1966static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) 1884static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
1967{ 1885{
1968 __u32 seq_rtt; 1886 __u32 seq_rtt;
1969 1887
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
1983 * in window is lost... Voila. --ANK (010210) 1901 * in window is lost... Voila. --ANK (010210)
1984 */ 1902 */
1985 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 1903 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1986 tcp_rtt_estimator(tp, seq_rtt); 1904 tcp_rtt_estimator(tp, seq_rtt, usrtt);
1987 tcp_set_rto(tp); 1905 tcp_set_rto(tp);
1988 tp->backoff = 0; 1906 tp->backoff = 0;
1989 tcp_bound_rto(tp); 1907 tcp_bound_rto(tp);
1990} 1908}
1991 1909
1992static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) 1910static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
1993{ 1911{
1994 /* We don't have a timestamp. Can only use 1912 /* We don't have a timestamp. Can only use
1995 * packets that are not retransmitted to determine 1913 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
2003 if (flag & FLAG_RETRANS_DATA_ACKED) 1921 if (flag & FLAG_RETRANS_DATA_ACKED)
2004 return; 1922 return;
2005 1923
2006 tcp_rtt_estimator(tp, seq_rtt); 1924 tcp_rtt_estimator(tp, seq_rtt, usrtt);
2007 tcp_set_rto(tp); 1925 tcp_set_rto(tp);
2008 tp->backoff = 0; 1926 tp->backoff = 0;
2009 tcp_bound_rto(tp); 1927 tcp_bound_rto(tp);
2010} 1928}
2011 1929
2012static inline void tcp_ack_update_rtt(struct tcp_sock *tp, 1930static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
2013 int flag, s32 seq_rtt) 1931 int flag, s32 seq_rtt, u32 *usrtt)
2014{ 1932{
2015 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 1933 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2016 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 1934 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2017 tcp_ack_saw_tstamp(tp, flag); 1935 tcp_ack_saw_tstamp(tp, usrtt, flag);
2018 else if (seq_rtt >= 0) 1936 else if (seq_rtt >= 0)
2019 tcp_ack_no_tstamp(tp, seq_rtt, flag); 1937 tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
2020} 1938}
2021 1939
2022/* 1940static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
2023 * Compute congestion window to use. 1941 u32 in_flight, int good)
2024 *
2025 * This is from the implementation of BICTCP in
2026 * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
2027 * "Binary Increase Congestion Control for Fast, Long Distance
2028 * Networks" in InfoComm 2004
2029 * Available from:
2030 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
2031 *
2032 * Unless BIC is enabled and congestion window is large
2033 * this behaves the same as the original Reno.
2034 */
2035static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
2036{
2037 /* orignal Reno behaviour */
2038 if (!tcp_is_bic(tp))
2039 return tp->snd_cwnd;
2040
2041 if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
2042 (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
2043 return tp->bictcp.cnt;
2044
2045 tp->bictcp.last_cwnd = tp->snd_cwnd;
2046 tp->bictcp.last_stamp = tcp_time_stamp;
2047
2048 /* start off normal */
2049 if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
2050 tp->bictcp.cnt = tp->snd_cwnd;
2051
2052 /* binary increase */
2053 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
2054 __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
2055 / BICTCP_B;
2056
2057 if (dist > BICTCP_MAX_INCREMENT)
2058 /* linear increase */
2059 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2060 else if (dist <= 1U)
2061 /* binary search increase */
2062 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2063 / BICTCP_B;
2064 else
2065 /* binary search increase */
2066 tp->bictcp.cnt = tp->snd_cwnd / dist;
2067 } else {
2068 /* slow start amd linear increase */
2069 if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
2070 /* slow start */
2071 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2072 / BICTCP_B;
2073 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
2074 + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
2075 /* slow start */
2076 tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
2077 / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
2078 else
2079 /* linear increase */
2080 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2081 }
2082 return tp->bictcp.cnt;
2083}
2084
2085/* This is Jacobson's slow start and congestion avoidance.
2086 * SIGCOMM '88, p. 328.
2087 */
2088static inline void reno_cong_avoid(struct tcp_sock *tp)
2089{ 1942{
2090 if (tp->snd_cwnd <= tp->snd_ssthresh) { 1943 tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
2091 /* In "safe" area, increase. */
2092 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2093 tp->snd_cwnd++;
2094 } else {
2095 /* In dangerous area, increase slowly.
2096 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
2097 */
2098 if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp; 1944 tp->snd_cwnd_stamp = tcp_time_stamp;
2106} 1945}
2107 1946
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart timer after forward progress on connection. 1947/* Restart timer after forward progress on connection.
2339 * RFC2988 recommends to restart timer to now+rto. 1948 * RFC2988 recommends to restart timer to now+rto.
2340 */ 1949 */
@@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2415 2024
2416 2025
2417/* Remove acknowledged frames from the retransmission queue. */ 2026/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) 2027static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
2419{ 2028{
2420 struct tcp_sock *tp = tcp_sk(sk); 2029 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb; 2030 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp; 2031 __u32 now = tcp_time_stamp;
2423 int acked = 0; 2032 int acked = 0;
2424 __s32 seq_rtt = -1; 2033 __s32 seq_rtt = -1;
2034 struct timeval usnow;
2035 u32 pkts_acked = 0;
2036
2037 if (seq_usrtt)
2038 do_gettimeofday(&usnow);
2425 2039
2426 while ((skb = skb_peek(&sk->sk_write_queue)) && 2040 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) { 2041 skb != sk->sk_send_head) {
@@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2448 */ 2062 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) { 2063 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED; 2064 acked |= FLAG_DATA_ACKED;
2065 ++pkts_acked;
2451 } else { 2066 } else {
2452 acked |= FLAG_SYN_ACKED; 2067 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0; 2068 tp->retrans_stamp = 0;
@@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2461 seq_rtt = -1; 2076 seq_rtt = -1;
2462 } else if (seq_rtt < 0) 2077 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when; 2078 seq_rtt = now - scb->when;
2079 if (seq_usrtt)
2080 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
2081 + (usnow.tv_usec - skb->stamp.tv_usec);
2082
2464 if (sacked & TCPCB_SACKED_ACKED) 2083 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb); 2084 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST) 2085 if (sacked & TCPCB_LOST)
@@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2479 } 2098 }
2480 2099
2481 if (acked&FLAG_ACKED) { 2100 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt); 2101 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
2483 tcp_ack_packets_out(sk, tp); 2102 tcp_ack_packets_out(sk, tp);
2103
2104 if (tp->ca_ops->pkts_acked)
2105 tp->ca_ops->pkts_acked(tp, pkts_acked);
2484 } 2106 }
2485 2107
2486#if FASTRETRANS_DEBUG > 0 2108#if FASTRETRANS_DEBUG > 0
@@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2624 tp->frto_counter = (tp->frto_counter + 1) % 3; 2246 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625} 2247}
2626 2248
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We can't
2634 * get no information about RTTmin at this time so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
2636 * since in this way we're sure it will be updated in a consistent
2637 * way as soon as possible. It will reasonably happen within the first
2638 * RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coeffients.
2658 */
2659
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * A RTT-window has passed. Be careful since if RTT is less than
2728 * 50ms we don't filter but we continue 'building the sample'.
2729 * This minimum limit was choosen since an estimation on small
2730 * time intervals is better to avoid...
2731 * Obvioulsy on a LAN we reasonably will always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
2740
2741/*
2742 * @westwood_update_window
2743 * It updates RTT evaluation window if it is the right moment to do
2744 * it. If so it calls filter for evaluating bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in fast path. In particular it is called when
2771 * header prediction is successfull. In such case infact update is
2772 * straight forward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something is going wrong..even if there could
2858 * be no problems! Infact a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in such case.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */ 2249/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2250static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{ 2251{
@@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2255 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight; 2256 u32 prior_in_flight;
2886 s32 seq_rtt; 2257 s32 seq_rtt;
2258 s32 seq_usrtt = 0;
2887 int prior_packets; 2259 int prior_packets;
2888 2260
2889 /* If the ack is newer than sent or older than previous acks 2261 /* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2902 */ 2274 */
2903 tcp_update_wl(tp, ack, ack_seq); 2275 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack; 2276 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE; 2277 flag |= FLAG_WIN_UPDATE;
2907 2278
2279 tcp_ca_event(tp, CA_EVENT_FAST_ACK);
2280
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2281 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else { 2282 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 2283 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2293 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE; 2294 flag |= FLAG_ECE;
2922 2295
2923 tcp_westwood_slow_bw(sk,skb); 2296 tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
2924 } 2297 }
2925 2298
2926 /* We passed data and got it acked, remove any soft error 2299 /* We passed data and got it acked, remove any soft error
@@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2935 prior_in_flight = tcp_packets_in_flight(tp); 2308 prior_in_flight = tcp_packets_in_flight(tp);
2936 2309
2937 /* See if we can take anything off of the retransmit queue. */ 2310 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt); 2311 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2312 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
2939 2313
2940 if (tp->frto_counter) 2314 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una); 2315 tcp_process_frto(sk, prior_snd_una);
2942 2316
2943 if (tcp_ack_is_dubious(tp, flag)) { 2317 if (tcp_ack_is_dubious(tp, flag)) {
2944 /* Advanve CWND, if state allows this. */ 2318 /* Advanve CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) && 2319 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && 2320 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2321 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else { 2322 } else {
2951 if ((flag & FLAG_DATA_ACKED) && 2323 if ((flag & FLAG_DATA_ACKED))
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) 2324 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 } 2325 }
2955 2326
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2327 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4552 3923
4553 tcp_init_metrics(sk); 3924 tcp_init_metrics(sk);
4554 3925
3926 tcp_init_congestion_control(tp);
3927
4555 /* Prevent spurious tcp_cwnd_restart() on first data 3928 /* Prevent spurious tcp_cwnd_restart() on first data
4556 * packet. 3929 * packet.
4557 */ 3930 */
@@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4708 if(tp->af_specific->conn_request(sk, skb) < 0) 4081 if(tp->af_specific->conn_request(sk, skb) < 0)
4709 return 1; 4082 return 1;
4710 4083
4711 init_westwood(sk);
4712 init_bictcp(tp);
4713
4714 /* Now we have several options: In theory there is 4084 /* Now we have several options: In theory there is
4715 * nothing else in the frame. KA9Q has an option to 4085 * nothing else in the frame. KA9Q has an option to
4716 * send data with the syn, BSD accepts data with the 4086 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4732 goto discard; 4102 goto discard;
4733 4103
4734 case TCP_SYN_SENT: 4104 case TCP_SYN_SENT:
4735 init_westwood(sk);
4736 init_bictcp(tp);
4737
4738 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 4105 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4739 if (queued >= 0) 4106 if (queued >= 0)
4740 return queued; 4107 return queued;
@@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4816 */ 4183 */
4817 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4184 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4818 !tp->srtt) 4185 !tp->srtt)
4819 tcp_ack_saw_tstamp(tp, 0); 4186 tcp_ack_saw_tstamp(tp, 0, 0);
4820 4187
4821 if (tp->rx_opt.tstamp_ok) 4188 if (tp->rx_opt.tstamp_ok)
4822 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4189 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4828 4195
4829 tcp_init_metrics(sk); 4196 tcp_init_metrics(sk);
4830 4197
4198 tcp_init_congestion_control(tp);
4199
4831 /* Prevent spurious tcp_cwnd_restart() on 4200 /* Prevent spurious tcp_cwnd_restart() on
4832 * first data packet. 4201 * first data packet.
4833 */ 4202 */