diff options
author | Stephen Hemminger <shemminger@osdl.org> | 2005-06-23 15:19:55 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2005-06-23 15:19:55 -0400 |
commit | 317a76f9a44b437d6301718f4e5d08bd93f98da7 (patch) | |
tree | caeba9839dee264f59b035b81c3d13d6c61b638e /net/ipv4/tcp_input.c | |
parent | a8ad86f2dc46356f87be1327dabc18bdbda32f50 (diff) |
[TCP]: Add pluggable congestion control algorithm infrastructure.
Allow TCP to have multiple pluggable congestion control algorithms.
Algorithms are defined by a set of operations and can be either built in
or loaded as modules. The legacy "new RENO" algorithm is used as a starting
point and fallback.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 737 |
1 files changed, 53 insertions, 684 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a3..7bbbbc33eb4b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -61,7 +61,6 @@ | |||
61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission | 61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission |
62 | * engine. Lots of bugs are found. | 62 | * engine. Lots of bugs are found. |
63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs | 63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs |
64 | * Angelo Dell'Aera: TCP Westwood+ support | ||
65 | */ | 64 | */ |
66 | 65 | ||
67 | #include <linux/config.h> | 66 | #include <linux/config.h> |
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; | |||
88 | int sysctl_tcp_max_orphans = NR_FILE; | 87 | int sysctl_tcp_max_orphans = NR_FILE; |
89 | int sysctl_tcp_frto; | 88 | int sysctl_tcp_frto; |
90 | int sysctl_tcp_nometrics_save; | 89 | int sysctl_tcp_nometrics_save; |
91 | int sysctl_tcp_westwood; | ||
92 | int sysctl_tcp_vegas_cong_avoid; | ||
93 | 90 | ||
94 | int sysctl_tcp_moderate_rcvbuf = 1; | 91 | int sysctl_tcp_moderate_rcvbuf = 1; |
95 | 92 | ||
96 | /* Default values of the Vegas variables, in fixed-point representation | ||
97 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
98 | */ | ||
99 | #define V_PARAM_SHIFT 1 | ||
100 | int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT; | ||
101 | int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT; | ||
102 | int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT; | ||
103 | int sysctl_tcp_bic = 1; | ||
104 | int sysctl_tcp_bic_fast_convergence = 1; | ||
105 | int sysctl_tcp_bic_low_window = 14; | ||
106 | int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | ||
107 | |||
108 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 93 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
109 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 94 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
110 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ | 95 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ |
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk) | |||
333 | tp->snd_cwnd_stamp = tcp_time_stamp; | 318 | tp->snd_cwnd_stamp = tcp_time_stamp; |
334 | } | 319 | } |
335 | 320 | ||
336 | static void init_bictcp(struct tcp_sock *tp) | ||
337 | { | ||
338 | tp->bictcp.cnt = 0; | ||
339 | |||
340 | tp->bictcp.last_max_cwnd = 0; | ||
341 | tp->bictcp.last_cwnd = 0; | ||
342 | tp->bictcp.last_stamp = 0; | ||
343 | } | ||
344 | |||
345 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ | 321 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ |
346 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | 322 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) |
347 | { | 323 | { |
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
558 | tcp_grow_window(sk, tp, skb); | 534 | tcp_grow_window(sk, tp, skb); |
559 | } | 535 | } |
560 | 536 | ||
561 | /* When starting a new connection, pin down the current choice of | ||
562 | * congestion algorithm. | ||
563 | */ | ||
564 | void tcp_ca_init(struct tcp_sock *tp) | ||
565 | { | ||
566 | if (sysctl_tcp_westwood) | ||
567 | tp->adv_cong = TCP_WESTWOOD; | ||
568 | else if (sysctl_tcp_bic) | ||
569 | tp->adv_cong = TCP_BIC; | ||
570 | else if (sysctl_tcp_vegas_cong_avoid) { | ||
571 | tp->adv_cong = TCP_VEGAS; | ||
572 | tp->vegas.baseRTT = 0x7fffffff; | ||
573 | tcp_vegas_enable(tp); | ||
574 | } | ||
575 | } | ||
576 | |||
577 | /* Do RTT sampling needed for Vegas. | ||
578 | * Basically we: | ||
579 | * o min-filter RTT samples from within an RTT to get the current | ||
580 | * propagation delay + queuing delay (we are min-filtering to try to | ||
581 | * avoid the effects of delayed ACKs) | ||
582 | * o min-filter RTT samples from a much longer window (forever for now) | ||
583 | * to find the propagation delay (baseRTT) | ||
584 | */ | ||
585 | static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | ||
586 | { | ||
587 | __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ | ||
588 | |||
589 | /* Filter to find propagation delay: */ | ||
590 | if (vrtt < tp->vegas.baseRTT) | ||
591 | tp->vegas.baseRTT = vrtt; | ||
592 | |||
593 | /* Find the min RTT during the last RTT to find | ||
594 | * the current prop. delay + queuing delay: | ||
595 | */ | ||
596 | tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); | ||
597 | tp->vegas.cntRTT++; | ||
598 | } | ||
599 | |||
600 | /* Called to compute a smoothed rtt estimate. The data fed to this | 537 | /* Called to compute a smoothed rtt estimate. The data fed to this |
601 | * routine either comes from timestamps, or from segments that were | 538 | * routine either comes from timestamps, or from segments that were |
602 | * known _not_ to have been retransmitted [see Karn/Partridge | 539 | * known _not_ to have been retransmitted [see Karn/Partridge |
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | |||
606 | * To save cycles in the RFC 1323 implementation it was better to break | 543 | * To save cycles in the RFC 1323 implementation it was better to break |
607 | * it up into three procedures. -- erics | 544 | * it up into three procedures. -- erics |
608 | */ | 545 | */ |
609 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | 546 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) |
610 | { | 547 | { |
611 | long m = mrtt; /* RTT */ | 548 | long m = mrtt; /* RTT */ |
612 | 549 | ||
613 | if (tcp_vegas_enabled(tp)) | ||
614 | vegas_rtt_calc(tp, mrtt); | ||
615 | |||
616 | /* The following amusing code comes from Jacobson's | 550 | /* The following amusing code comes from Jacobson's |
617 | * article in SIGCOMM '88. Note that rtt and mdev | 551 | * article in SIGCOMM '88. Note that rtt and mdev |
618 | * are scaled versions of rtt and mean deviation. | 552 | * are scaled versions of rtt and mean deviation. |
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | |||
670 | tp->rtt_seq = tp->snd_nxt; | 604 | tp->rtt_seq = tp->snd_nxt; |
671 | } | 605 | } |
672 | 606 | ||
673 | tcp_westwood_update_rtt(tp, tp->srtt >> 3); | 607 | if (tp->ca_ops->rtt_sample) |
608 | tp->ca_ops->rtt_sample(tp, *usrtt); | ||
674 | } | 609 | } |
675 | 610 | ||
676 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 611 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) | |||
1185 | tp->snd_una == tp->high_seq || | 1120 | tp->snd_una == tp->high_seq || |
1186 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1121 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
1187 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1122 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
1188 | if (!tcp_westwood_ssthresh(tp)) | 1123 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
1189 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1124 | tcp_ca_event(tp, CA_EVENT_FRTO); |
1190 | } | 1125 | } |
1191 | 1126 | ||
1192 | /* Have to clear retransmission markers here to keep the bookkeeping | 1127 | /* Have to clear retransmission markers here to keep the bookkeeping |
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) | |||
1252 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1187 | tcp_set_ca_state(tp, TCP_CA_Loss); |
1253 | tp->high_seq = tp->frto_highmark; | 1188 | tp->high_seq = tp->frto_highmark; |
1254 | TCP_ECN_queue_cwr(tp); | 1189 | TCP_ECN_queue_cwr(tp); |
1255 | |||
1256 | init_bictcp(tp); | ||
1257 | } | 1190 | } |
1258 | 1191 | ||
1259 | void tcp_clear_retrans(struct tcp_sock *tp) | 1192 | void tcp_clear_retrans(struct tcp_sock *tp) |
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1283 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || | 1216 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || |
1284 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1217 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
1285 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1218 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
1286 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1219 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
1220 | tcp_ca_event(tp, CA_EVENT_LOSS); | ||
1287 | } | 1221 | } |
1288 | tp->snd_cwnd = 1; | 1222 | tp->snd_cwnd = 1; |
1289 | tp->snd_cwnd_cnt = 0; | 1223 | tp->snd_cwnd_cnt = 0; |
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
1596 | } | 1530 | } |
1597 | 1531 | ||
1598 | /* Decrease cwnd each second ack. */ | 1532 | /* Decrease cwnd each second ack. */ |
1599 | |||
1600 | static void tcp_cwnd_down(struct tcp_sock *tp) | 1533 | static void tcp_cwnd_down(struct tcp_sock *tp) |
1601 | { | 1534 | { |
1602 | int decr = tp->snd_cwnd_cnt + 1; | 1535 | int decr = tp->snd_cwnd_cnt + 1; |
1603 | __u32 limit; | ||
1604 | |||
1605 | /* | ||
1606 | * TCP Westwood | ||
1607 | * Here limit is evaluated as BWestimation*RTTmin (for obtaining it | ||
1608 | * in packets we use mss_cache). If sysctl_tcp_westwood is off | ||
1609 | * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is | ||
1610 | * still used as usual. It prevents other strange cases in which | ||
1611 | * BWE*RTTmin could assume value 0. It should not happen but... | ||
1612 | */ | ||
1613 | |||
1614 | if (!(limit = tcp_westwood_bw_rttmin(tp))) | ||
1615 | limit = tp->snd_ssthresh/2; | ||
1616 | 1536 | ||
1617 | tp->snd_cwnd_cnt = decr&1; | 1537 | tp->snd_cwnd_cnt = decr&1; |
1618 | decr >>= 1; | 1538 | decr >>= 1; |
1619 | 1539 | ||
1620 | if (decr && tp->snd_cwnd > limit) | 1540 | if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) |
1621 | tp->snd_cwnd -= decr; | 1541 | tp->snd_cwnd -= decr; |
1622 | 1542 | ||
1623 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); | 1543 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); |
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) | |||
1654 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) | 1574 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) |
1655 | { | 1575 | { |
1656 | if (tp->prior_ssthresh) { | 1576 | if (tp->prior_ssthresh) { |
1657 | if (tcp_is_bic(tp)) | 1577 | if (tp->ca_ops->undo_cwnd) |
1658 | tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); | 1578 | tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); |
1659 | else | 1579 | else |
1660 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); | 1580 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); |
1661 | 1581 | ||
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) | |||
1767 | 1687 | ||
1768 | static inline void tcp_complete_cwr(struct tcp_sock *tp) | 1688 | static inline void tcp_complete_cwr(struct tcp_sock *tp) |
1769 | { | 1689 | { |
1770 | if (tcp_westwood_cwnd(tp)) | 1690 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); |
1771 | tp->snd_ssthresh = tp->snd_cwnd; | ||
1772 | else | ||
1773 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | ||
1774 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1691 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1692 | tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); | ||
1775 | } | 1693 | } |
1776 | 1694 | ||
1777 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | 1695 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) |
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1946 | if (tp->ca_state < TCP_CA_CWR) { | 1864 | if (tp->ca_state < TCP_CA_CWR) { |
1947 | if (!(flag&FLAG_ECE)) | 1865 | if (!(flag&FLAG_ECE)) |
1948 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1866 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
1949 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1867 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
1950 | TCP_ECN_queue_cwr(tp); | 1868 | TCP_ECN_queue_cwr(tp); |
1951 | } | 1869 | } |
1952 | 1870 | ||
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
1963 | /* Read draft-ietf-tcplw-high-performance before mucking | 1881 | /* Read draft-ietf-tcplw-high-performance before mucking |
1964 | * with this code. (Supersedes RFC1323) | 1882 | * with this code. (Supersedes RFC1323) |
1965 | */ | 1883 | */ |
1966 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | 1884 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) |
1967 | { | 1885 | { |
1968 | __u32 seq_rtt; | 1886 | __u32 seq_rtt; |
1969 | 1887 | ||
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | |||
1983 | * in window is lost... Voila. --ANK (010210) | 1901 | * in window is lost... Voila. --ANK (010210) |
1984 | */ | 1902 | */ |
1985 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 1903 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; |
1986 | tcp_rtt_estimator(tp, seq_rtt); | 1904 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
1987 | tcp_set_rto(tp); | 1905 | tcp_set_rto(tp); |
1988 | tp->backoff = 0; | 1906 | tp->backoff = 0; |
1989 | tcp_bound_rto(tp); | 1907 | tcp_bound_rto(tp); |
1990 | } | 1908 | } |
1991 | 1909 | ||
1992 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | 1910 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) |
1993 | { | 1911 | { |
1994 | /* We don't have a timestamp. Can only use | 1912 | /* We don't have a timestamp. Can only use |
1995 | * packets that are not retransmitted to determine | 1913 | * packets that are not retransmitted to determine |
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | |||
2003 | if (flag & FLAG_RETRANS_DATA_ACKED) | 1921 | if (flag & FLAG_RETRANS_DATA_ACKED) |
2004 | return; | 1922 | return; |
2005 | 1923 | ||
2006 | tcp_rtt_estimator(tp, seq_rtt); | 1924 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
2007 | tcp_set_rto(tp); | 1925 | tcp_set_rto(tp); |
2008 | tp->backoff = 0; | 1926 | tp->backoff = 0; |
2009 | tcp_bound_rto(tp); | 1927 | tcp_bound_rto(tp); |
2010 | } | 1928 | } |
2011 | 1929 | ||
2012 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, | 1930 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, |
2013 | int flag, s32 seq_rtt) | 1931 | int flag, s32 seq_rtt, u32 *usrtt) |
2014 | { | 1932 | { |
2015 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 1933 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ |
2016 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 1934 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
2017 | tcp_ack_saw_tstamp(tp, flag); | 1935 | tcp_ack_saw_tstamp(tp, usrtt, flag); |
2018 | else if (seq_rtt >= 0) | 1936 | else if (seq_rtt >= 0) |
2019 | tcp_ack_no_tstamp(tp, seq_rtt, flag); | 1937 | tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); |
2020 | } | 1938 | } |
2021 | 1939 | ||
2022 | /* | 1940 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, |
2023 | * Compute congestion window to use. | 1941 | u32 in_flight, int good) |
2024 | * | ||
2025 | * This is from the implementation of BICTCP in | ||
2026 | * Lisong Xu, Khaled Harfoush, and Injong Rhee. | ||
2027 | * "Binary Increase Congestion Control for Fast, Long Distance | ||
2028 | * Networks" in InfoComm 2004 | ||
2029 | * Available from: | ||
2030 | * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf | ||
2031 | * | ||
2032 | * Unless BIC is enabled and congestion window is large | ||
2033 | * this behaves the same as the original Reno. | ||
2034 | */ | ||
2035 | static inline __u32 bictcp_cwnd(struct tcp_sock *tp) | ||
2036 | { | ||
2037 | /* original Reno behaviour */ | ||
2038 | if (!tcp_is_bic(tp)) | ||
2039 | return tp->snd_cwnd; | ||
2040 | |||
2041 | if (tp->bictcp.last_cwnd == tp->snd_cwnd && | ||
2042 | (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) | ||
2043 | return tp->bictcp.cnt; | ||
2044 | |||
2045 | tp->bictcp.last_cwnd = tp->snd_cwnd; | ||
2046 | tp->bictcp.last_stamp = tcp_time_stamp; | ||
2047 | |||
2048 | /* start off normal */ | ||
2049 | if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) | ||
2050 | tp->bictcp.cnt = tp->snd_cwnd; | ||
2051 | |||
2052 | /* binary increase */ | ||
2053 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { | ||
2054 | __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) | ||
2055 | / BICTCP_B; | ||
2056 | |||
2057 | if (dist > BICTCP_MAX_INCREMENT) | ||
2058 | /* linear increase */ | ||
2059 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
2060 | else if (dist <= 1U) | ||
2061 | /* binary search increase */ | ||
2062 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
2063 | / BICTCP_B; | ||
2064 | else | ||
2065 | /* binary search increase */ | ||
2066 | tp->bictcp.cnt = tp->snd_cwnd / dist; | ||
2067 | } else { | ||
2068 | /* slow start and linear increase */ | ||
2069 | if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) | ||
2070 | /* slow start */ | ||
2071 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
2072 | / BICTCP_B; | ||
2073 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd | ||
2074 | + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) | ||
2075 | /* slow start */ | ||
2076 | tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) | ||
2077 | / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); | ||
2078 | else | ||
2079 | /* linear increase */ | ||
2080 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
2081 | } | ||
2082 | return tp->bictcp.cnt; | ||
2083 | } | ||
2084 | |||
2085 | /* This is Jacobson's slow start and congestion avoidance. | ||
2086 | * SIGCOMM '88, p. 328. | ||
2087 | */ | ||
2088 | static inline void reno_cong_avoid(struct tcp_sock *tp) | ||
2089 | { | 1942 | { |
2090 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 1943 | tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); |
2091 | /* In "safe" area, increase. */ | ||
2092 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
2093 | tp->snd_cwnd++; | ||
2094 | } else { | ||
2095 | /* In dangerous area, increase slowly. | ||
2096 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
2097 | */ | ||
2098 | if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { | ||
2099 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
2100 | tp->snd_cwnd++; | ||
2101 | tp->snd_cwnd_cnt=0; | ||
2102 | } else | ||
2103 | tp->snd_cwnd_cnt++; | ||
2104 | } | ||
2105 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1944 | tp->snd_cwnd_stamp = tcp_time_stamp; |
2106 | } | 1945 | } |
2107 | 1946 | ||
2108 | /* This is based on the congestion detection/avoidance scheme described in | ||
2109 | * Lawrence S. Brakmo and Larry L. Peterson. | ||
2110 | * "TCP Vegas: End to end congestion avoidance on a global internet." | ||
2111 | * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, | ||
2112 | * October 1995. Available from: | ||
2113 | * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps | ||
2114 | * | ||
2115 | * See http://www.cs.arizona.edu/xkernel/ for their implementation. | ||
2116 | * The main aspects that distinguish this implementation from the | ||
2117 | * Arizona Vegas implementation are: | ||
2118 | * o We do not change the loss detection or recovery mechanisms of | ||
2119 | * Linux in any way. Linux already recovers from losses quite well, | ||
2120 | * using fine-grained timers, NewReno, and FACK. | ||
2121 | * o To avoid the performance penalty imposed by increasing cwnd | ||
2122 | * only every-other RTT during slow start, we increase during | ||
2123 | * every RTT during slow start, just like Reno. | ||
2124 | * o Largely to allow continuous cwnd growth during slow start, | ||
2125 | * we use the rate at which ACKs come back as the "actual" | ||
2126 | * rate, rather than the rate at which data is sent. | ||
2127 | * o To speed convergence to the right rate, we set the cwnd | ||
2128 | * to achieve the right ("actual") rate when we exit slow start. | ||
2129 | * o To filter out the noise caused by delayed ACKs, we use the | ||
2130 | * minimum RTT sample observed during the last RTT to calculate | ||
2131 | * the actual rate. | ||
2132 | * o When the sender re-starts from idle, it waits until it has | ||
2133 | * received ACKs for an entire flight of new data before making | ||
2134 | * a cwnd adjustment decision. The original Vegas implementation | ||
2135 | * assumed senders never went idle. | ||
2136 | */ | ||
2137 | static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
2138 | { | ||
2139 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | ||
2140 | * | ||
2141 | * These are so named because they represent the approximate values | ||
2142 | * of snd_una and snd_nxt at the beginning of the current RTT. More | ||
2143 | * precisely, they represent the amount of data sent during the RTT. | ||
2144 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | ||
2145 | * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding | ||
2146 | * bytes of data have been ACKed during the course of the RTT, giving | ||
2147 | * an "actual" rate of: | ||
2148 | * | ||
2149 | * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) | ||
2150 | * | ||
2151 | * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, | ||
2152 | * because delayed ACKs can cover more than one segment, so they | ||
2153 | * don't line up nicely with the boundaries of RTTs. | ||
2154 | * | ||
2155 | * Another unfortunate fact of life is that delayed ACKs delay the | ||
2156 | * advance of the left edge of our send window, so that the number | ||
2157 | * of bytes we send in an RTT is often less than our cwnd will allow. | ||
2158 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | ||
2159 | */ | ||
2160 | |||
2161 | if (after(ack, tp->vegas.beg_snd_nxt)) { | ||
2162 | /* Do the Vegas once-per-RTT cwnd adjustment. */ | ||
2163 | u32 old_wnd, old_snd_cwnd; | ||
2164 | |||
2165 | |||
2166 | /* Here old_wnd is essentially the window of data that was | ||
2167 | * sent during the previous RTT, and has all | ||
2168 | * been acknowledged in the course of the RTT that ended | ||
2169 | * with the ACK we just received. Likewise, old_snd_cwnd | ||
2170 | * is the cwnd during the previous RTT. | ||
2171 | */ | ||
2172 | old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / | ||
2173 | tp->mss_cache_std; | ||
2174 | old_snd_cwnd = tp->vegas.beg_snd_cwnd; | ||
2175 | |||
2176 | /* Save the extent of the current window so we can use this | ||
2177 | * at the end of the next RTT. | ||
2178 | */ | ||
2179 | tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; | ||
2180 | tp->vegas.beg_snd_nxt = tp->snd_nxt; | ||
2181 | tp->vegas.beg_snd_cwnd = tp->snd_cwnd; | ||
2182 | |||
2183 | /* Take into account the current RTT sample too, to | ||
2184 | * decrease the impact of delayed acks. This double counts | ||
2185 | * this sample since we count it for the next window as well, | ||
2186 | * but that's not too awful, since we're taking the min, | ||
2187 | * rather than averaging. | ||
2188 | */ | ||
2189 | vegas_rtt_calc(tp, seq_rtt); | ||
2190 | |||
2191 | /* We do the Vegas calculations only if we got enough RTT | ||
2192 | * samples that we can be reasonably sure that we got | ||
2193 | * at least one RTT sample that wasn't from a delayed ACK. | ||
2194 | * If we only had 2 samples total, | ||
2195 | * then that means we're getting only 1 ACK per RTT, which | ||
2196 | * means they're almost certainly delayed ACKs. | ||
2197 | * If we have 3 samples, we should be OK. | ||
2198 | */ | ||
2199 | |||
2200 | if (tp->vegas.cntRTT <= 2) { | ||
2201 | /* We don't have enough RTT samples to do the Vegas | ||
2202 | * calculation, so we'll behave like Reno. | ||
2203 | */ | ||
2204 | if (tp->snd_cwnd > tp->snd_ssthresh) | ||
2205 | tp->snd_cwnd++; | ||
2206 | } else { | ||
2207 | u32 rtt, target_cwnd, diff; | ||
2208 | |||
2209 | /* We have enough RTT samples, so, using the Vegas | ||
2210 | * algorithm, we determine if we should increase or | ||
2211 | * decrease cwnd, and by how much. | ||
2212 | */ | ||
2213 | |||
2214 | /* Pluck out the RTT we are using for the Vegas | ||
2215 | * calculations. This is the min RTT seen during the | ||
2216 | * last RTT. Taking the min filters out the effects | ||
2217 | * of delayed ACKs, at the cost of noticing congestion | ||
2218 | * a bit later. | ||
2219 | */ | ||
2220 | rtt = tp->vegas.minRTT; | ||
2221 | |||
2222 | /* Calculate the cwnd we should have, if we weren't | ||
2223 | * going too fast. | ||
2224 | * | ||
2225 | * This is: | ||
2226 | * (actual rate in segments) * baseRTT | ||
2227 | * We keep it as a fixed point number with | ||
2228 | * V_PARAM_SHIFT bits to the right of the binary point. | ||
2229 | */ | ||
2230 | target_cwnd = ((old_wnd * tp->vegas.baseRTT) | ||
2231 | << V_PARAM_SHIFT) / rtt; | ||
2232 | |||
2233 | /* Calculate the difference between the window we had, | ||
2234 | * and the window we would like to have. This quantity | ||
2235 | * is the "Diff" from the Arizona Vegas papers. | ||
2236 | * | ||
2237 | * Again, this is a fixed point number with | ||
2238 | * V_PARAM_SHIFT bits to the right of the binary | ||
2239 | * point. | ||
2240 | */ | ||
2241 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | ||
2242 | |||
2243 | if (tp->snd_cwnd < tp->snd_ssthresh) { | ||
2244 | /* Slow start. */ | ||
2245 | if (diff > sysctl_tcp_vegas_gamma) { | ||
2246 | /* Going too fast. Time to slow down | ||
2247 | * and switch to congestion avoidance. | ||
2248 | */ | ||
2249 | tp->snd_ssthresh = 2; | ||
2250 | |||
2251 | /* Set cwnd to match the actual rate | ||
2252 | * exactly: | ||
2253 | * cwnd = (actual rate) * baseRTT | ||
2254 | * Then we add 1 because the integer | ||
2255 | * truncation robs us of full link | ||
2256 | * utilization. | ||
2257 | */ | ||
2258 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
2259 | (target_cwnd >> | ||
2260 | V_PARAM_SHIFT)+1); | ||
2261 | |||
2262 | } | ||
2263 | } else { | ||
2264 | /* Congestion avoidance. */ | ||
2265 | u32 next_snd_cwnd; | ||
2266 | |||
2267 | /* Figure out where we would like cwnd | ||
2268 | * to be. | ||
2269 | */ | ||
2270 | if (diff > sysctl_tcp_vegas_beta) { | ||
2271 | /* The old window was too fast, so | ||
2272 | * we slow down. | ||
2273 | */ | ||
2274 | next_snd_cwnd = old_snd_cwnd - 1; | ||
2275 | } else if (diff < sysctl_tcp_vegas_alpha) { | ||
2276 | /* We don't have enough extra packets | ||
2277 | * in the network, so speed up. | ||
2278 | */ | ||
2279 | next_snd_cwnd = old_snd_cwnd + 1; | ||
2280 | } else { | ||
2281 | /* Sending just as fast as we | ||
2282 | * should be. | ||
2283 | */ | ||
2284 | next_snd_cwnd = old_snd_cwnd; | ||
2285 | } | ||
2286 | |||
2287 | /* Adjust cwnd upward or downward, toward the | ||
2288 | * desired value. | ||
2289 | */ | ||
2290 | if (next_snd_cwnd > tp->snd_cwnd) | ||
2291 | tp->snd_cwnd++; | ||
2292 | else if (next_snd_cwnd < tp->snd_cwnd) | ||
2293 | tp->snd_cwnd--; | ||
2294 | } | ||
2295 | } | ||
2296 | |||
2297 | /* Wipe the slate clean for the next RTT. */ | ||
2298 | tp->vegas.cntRTT = 0; | ||
2299 | tp->vegas.minRTT = 0x7fffffff; | ||
2300 | } | ||
2301 | |||
2302 | /* The following code is executed for every ack we receive, | ||
2303 | * except for conditions checked in should_advance_cwnd() | ||
2304 | * before the call to tcp_cong_avoid(). Mainly this means that | ||
2305 | * we only execute this code if the ack actually acked some | ||
2306 | * data. | ||
2307 | */ | ||
2308 | |||
2309 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
2310 | * (If we are not in slow start then we are in congestion avoidance, | ||
2311 | * and adjust our congestion window only once per RTT. See the code | ||
2312 | * above.) | ||
2313 | */ | ||
2314 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
2315 | tp->snd_cwnd++; | ||
2316 | |||
2317 | /* to keep cwnd from growing without bound */ | ||
2318 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
2319 | |||
2320 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
2321 | * 2 MSS. | ||
2322 | * | ||
2323 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
2324 | */ | ||
2325 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
2326 | |||
2327 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
2328 | } | ||
2329 | |||
2330 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
2331 | { | ||
2332 | if (tcp_vegas_enabled(tp)) | ||
2333 | vegas_cong_avoid(tp, ack, seq_rtt); | ||
2334 | else | ||
2335 | reno_cong_avoid(tp); | ||
2336 | } | ||
2337 | |||
2338 | /* Restart timer after forward progress on connection. | 1947 | /* Restart timer after forward progress on connection. |
2339 | * RFC2988 recommends to restart timer to now+rto. | 1948 | * RFC2988 recommends to restart timer to now+rto. |
2340 | */ | 1949 | */ |
@@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, | |||
2415 | 2024 | ||
2416 | 2025 | ||
2417 | /* Remove acknowledged frames from the retransmission queue. */ | 2026 | /* Remove acknowledged frames from the retransmission queue. */ |
2418 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | 2027 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) |
2419 | { | 2028 | { |
2420 | struct tcp_sock *tp = tcp_sk(sk); | 2029 | struct tcp_sock *tp = tcp_sk(sk); |
2421 | struct sk_buff *skb; | 2030 | struct sk_buff *skb; |
2422 | __u32 now = tcp_time_stamp; | 2031 | __u32 now = tcp_time_stamp; |
2423 | int acked = 0; | 2032 | int acked = 0; |
2424 | __s32 seq_rtt = -1; | 2033 | __s32 seq_rtt = -1; |
2034 | struct timeval usnow; | ||
2035 | u32 pkts_acked = 0; | ||
2036 | |||
2037 | if (seq_usrtt) | ||
2038 | do_gettimeofday(&usnow); | ||
2425 | 2039 | ||
2426 | while ((skb = skb_peek(&sk->sk_write_queue)) && | 2040 | while ((skb = skb_peek(&sk->sk_write_queue)) && |
2427 | skb != sk->sk_send_head) { | 2041 | skb != sk->sk_send_head) { |
@@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
2448 | */ | 2062 | */ |
2449 | if (!(scb->flags & TCPCB_FLAG_SYN)) { | 2063 | if (!(scb->flags & TCPCB_FLAG_SYN)) { |
2450 | acked |= FLAG_DATA_ACKED; | 2064 | acked |= FLAG_DATA_ACKED; |
2065 | ++pkts_acked; | ||
2451 | } else { | 2066 | } else { |
2452 | acked |= FLAG_SYN_ACKED; | 2067 | acked |= FLAG_SYN_ACKED; |
2453 | tp->retrans_stamp = 0; | 2068 | tp->retrans_stamp = 0; |
@@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
2461 | seq_rtt = -1; | 2076 | seq_rtt = -1; |
2462 | } else if (seq_rtt < 0) | 2077 | } else if (seq_rtt < 0) |
2463 | seq_rtt = now - scb->when; | 2078 | seq_rtt = now - scb->when; |
2079 | if (seq_usrtt) | ||
2080 | *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 | ||
2081 | + (usnow.tv_usec - skb->stamp.tv_usec); | ||
2082 | |||
2464 | if (sacked & TCPCB_SACKED_ACKED) | 2083 | if (sacked & TCPCB_SACKED_ACKED) |
2465 | tp->sacked_out -= tcp_skb_pcount(skb); | 2084 | tp->sacked_out -= tcp_skb_pcount(skb); |
2466 | if (sacked & TCPCB_LOST) | 2085 | if (sacked & TCPCB_LOST) |
@@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
2479 | } | 2098 | } |
2480 | 2099 | ||
2481 | if (acked&FLAG_ACKED) { | 2100 | if (acked&FLAG_ACKED) { |
2482 | tcp_ack_update_rtt(tp, acked, seq_rtt); | 2101 | tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); |
2483 | tcp_ack_packets_out(sk, tp); | 2102 | tcp_ack_packets_out(sk, tp); |
2103 | |||
2104 | if (tp->ca_ops->pkts_acked) | ||
2105 | tp->ca_ops->pkts_acked(tp, pkts_acked); | ||
2484 | } | 2106 | } |
2485 | 2107 | ||
2486 | #if FASTRETRANS_DEBUG > 0 | 2108 | #if FASTRETRANS_DEBUG > 0 |
@@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
2624 | tp->frto_counter = (tp->frto_counter + 1) % 3; | 2246 | tp->frto_counter = (tp->frto_counter + 1) % 3; |
2625 | } | 2247 | } |
2626 | 2248 | ||
2627 | /* | ||
2628 | * TCP Westwood+ | ||
2629 | */ | ||
2630 | |||
2631 | /* | ||
2632 | * @init_westwood | ||
2633 | * This function initializes fields used in TCP Westwood+. We can't | ||
2634 | * get any information about RTTmin at this time so we simply set it to | ||
2635 | * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative | ||
2636 | * since in this way we're sure it will be updated in a consistent | ||
2637 | * way as soon as possible. It will reasonably happen within the first | ||
2638 | * RTT period of the connection lifetime. | ||
2639 | */ | ||
2640 | |||
2641 | static void init_westwood(struct sock *sk) | ||
2642 | { | ||
2643 | struct tcp_sock *tp = tcp_sk(sk); | ||
2644 | |||
2645 | tp->westwood.bw_ns_est = 0; | ||
2646 | tp->westwood.bw_est = 0; | ||
2647 | tp->westwood.accounted = 0; | ||
2648 | tp->westwood.cumul_ack = 0; | ||
2649 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
2650 | tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; | ||
2651 | tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; | ||
2652 | tp->westwood.snd_una = tp->snd_una; | ||
2653 | } | ||
2654 | |||
2655 | /* | ||
2656 | * @westwood_do_filter | ||
2657 | * Low-pass filter. Implemented using constant coefficients. | ||
2658 | */ | ||
2659 | |||
2660 | static inline __u32 westwood_do_filter(__u32 a, __u32 b) | ||
2661 | { | ||
2662 | return (((7 * a) + b) >> 3); | ||
2663 | } | ||
2664 | |||
2665 | static void westwood_filter(struct sock *sk, __u32 delta) | ||
2666 | { | ||
2667 | struct tcp_sock *tp = tcp_sk(sk); | ||
2668 | |||
2669 | tp->westwood.bw_ns_est = | ||
2670 | westwood_do_filter(tp->westwood.bw_ns_est, | ||
2671 | tp->westwood.bk / delta); | ||
2672 | tp->westwood.bw_est = | ||
2673 | westwood_do_filter(tp->westwood.bw_est, | ||
2674 | tp->westwood.bw_ns_est); | ||
2675 | } | ||
2676 | |||
2677 | /* | ||
2678 | * @westwood_update_rttmin | ||
2679 | * It is used to update RTTmin. In this case we MUST NOT use | ||
2680 | * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! | ||
2681 | */ | ||
2682 | |||
2683 | static inline __u32 westwood_update_rttmin(const struct sock *sk) | ||
2684 | { | ||
2685 | const struct tcp_sock *tp = tcp_sk(sk); | ||
2686 | __u32 rttmin = tp->westwood.rtt_min; | ||
2687 | |||
2688 | if (tp->westwood.rtt != 0 && | ||
2689 | (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) | ||
2690 | rttmin = tp->westwood.rtt; | ||
2691 | |||
2692 | return rttmin; | ||
2693 | } | ||
2694 | |||
2695 | /* | ||
2696 | * @westwood_acked | ||
2697 | * Evaluate increases for dk. | ||
2698 | */ | ||
2699 | |||
2700 | static inline __u32 westwood_acked(const struct sock *sk) | ||
2701 | { | ||
2702 | const struct tcp_sock *tp = tcp_sk(sk); | ||
2703 | |||
2704 | return tp->snd_una - tp->westwood.snd_una; | ||
2705 | } | ||
2706 | |||
2707 | /* | ||
2708 | * @westwood_new_window | ||
2709 | * It evaluates if we are receiving data inside the same RTT window as | ||
2710 | * when we started. | ||
2711 | * Return value: | ||
2712 | * It returns 0 if we are still evaluating samples in the same RTT | ||
2713 | * window, 1 if the sample has to be considered in the next window. | ||
2714 | */ | ||
2715 | |||
2716 | static int westwood_new_window(const struct sock *sk) | ||
2717 | { | ||
2718 | const struct tcp_sock *tp = tcp_sk(sk); | ||
2719 | __u32 left_bound; | ||
2720 | __u32 rtt; | ||
2721 | int ret = 0; | ||
2722 | |||
2723 | left_bound = tp->westwood.rtt_win_sx; | ||
2724 | rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); | ||
2725 | |||
2726 | /* | ||
2727 | * A RTT-window has passed. Be careful since if RTT is less than | ||
2728 | * 50ms we don't filter but we continue 'building the sample'. | ||
2729 | * This minimum limit was chosen since an estimation on small | ||
2730 | * time intervals is better to avoid... | ||
2731 | * Obviously on a LAN we reasonably will always have | ||
2732 | * right_bound = left_bound + WESTWOOD_RTT_MIN | ||
2733 | */ | ||
2734 | |||
2735 | if ((left_bound + rtt) < tcp_time_stamp) | ||
2736 | ret = 1; | ||
2737 | |||
2738 | return ret; | ||
2739 | } | ||
2740 | |||
2741 | /* | ||
2742 | * @westwood_update_window | ||
2743 | * It updates RTT evaluation window if it is the right moment to do | ||
2744 | * it. If so it calls filter for evaluating bandwidth. | ||
2745 | */ | ||
2746 | |||
2747 | static void __westwood_update_window(struct sock *sk, __u32 now) | ||
2748 | { | ||
2749 | struct tcp_sock *tp = tcp_sk(sk); | ||
2750 | __u32 delta = now - tp->westwood.rtt_win_sx; | ||
2751 | |||
2752 | if (delta) { | ||
2753 | if (tp->westwood.rtt) | ||
2754 | westwood_filter(sk, delta); | ||
2755 | |||
2756 | tp->westwood.bk = 0; | ||
2757 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
2758 | } | ||
2759 | } | ||
2760 | |||
2761 | |||
2762 | static void westwood_update_window(struct sock *sk, __u32 now) | ||
2763 | { | ||
2764 | if (westwood_new_window(sk)) | ||
2765 | __westwood_update_window(sk, now); | ||
2766 | } | ||
2767 | |||
2768 | /* | ||
2769 | * @__tcp_westwood_fast_bw | ||
2770 | * It is called when we are in fast path. In particular it is called when | ||
2771 | * header prediction is successful. In such a case the update is in fact | ||
2772 | * straightforward and doesn't need any particular care. | ||
2773 | */ | ||
2774 | |||
2775 | static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
2776 | { | ||
2777 | struct tcp_sock *tp = tcp_sk(sk); | ||
2778 | |||
2779 | westwood_update_window(sk, tcp_time_stamp); | ||
2780 | |||
2781 | tp->westwood.bk += westwood_acked(sk); | ||
2782 | tp->westwood.snd_una = tp->snd_una; | ||
2783 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
2784 | } | ||
2785 | |||
2786 | static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
2787 | { | ||
2788 | if (tcp_is_westwood(tcp_sk(sk))) | ||
2789 | __tcp_westwood_fast_bw(sk, skb); | ||
2790 | } | ||
2791 | |||
2792 | |||
2793 | /* | ||
2794 | * @westwood_dupack_update | ||
2795 | * It updates accounted and cumul_ack when receiving a dupack. | ||
2796 | */ | ||
2797 | |||
2798 | static void westwood_dupack_update(struct sock *sk) | ||
2799 | { | ||
2800 | struct tcp_sock *tp = tcp_sk(sk); | ||
2801 | |||
2802 | tp->westwood.accounted += tp->mss_cache_std; | ||
2803 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
2804 | } | ||
2805 | |||
2806 | static inline int westwood_may_change_cumul(struct tcp_sock *tp) | ||
2807 | { | ||
2808 | return (tp->westwood.cumul_ack > tp->mss_cache_std); | ||
2809 | } | ||
2810 | |||
2811 | static inline void westwood_partial_update(struct tcp_sock *tp) | ||
2812 | { | ||
2813 | tp->westwood.accounted -= tp->westwood.cumul_ack; | ||
2814 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
2815 | } | ||
2816 | |||
2817 | static inline void westwood_complete_update(struct tcp_sock *tp) | ||
2818 | { | ||
2819 | tp->westwood.cumul_ack -= tp->westwood.accounted; | ||
2820 | tp->westwood.accounted = 0; | ||
2821 | } | ||
2822 | |||
2823 | /* | ||
2824 | * @westwood_acked_count | ||
2825 | * This function evaluates cumul_ack for evaluating dk in case of | ||
2826 | * delayed or partial acks. | ||
2827 | */ | ||
2828 | |||
2829 | static inline __u32 westwood_acked_count(struct sock *sk) | ||
2830 | { | ||
2831 | struct tcp_sock *tp = tcp_sk(sk); | ||
2832 | |||
2833 | tp->westwood.cumul_ack = westwood_acked(sk); | ||
2834 | |||
2835 | /* If cumul_ack is 0 this is a dupack since it's not moving | ||
2836 | * tp->snd_una. | ||
2837 | */ | ||
2838 | if (!(tp->westwood.cumul_ack)) | ||
2839 | westwood_dupack_update(sk); | ||
2840 | |||
2841 | if (westwood_may_change_cumul(tp)) { | ||
2842 | /* Partial or delayed ack */ | ||
2843 | if (tp->westwood.accounted >= tp->westwood.cumul_ack) | ||
2844 | westwood_partial_update(tp); | ||
2845 | else | ||
2846 | westwood_complete_update(tp); | ||
2847 | } | ||
2848 | |||
2849 | tp->westwood.snd_una = tp->snd_una; | ||
2850 | |||
2851 | return tp->westwood.cumul_ack; | ||
2852 | } | ||
2853 | |||
2854 | |||
2855 | /* | ||
2856 | * @__tcp_westwood_slow_bw | ||
2857 | * It is called when something is going wrong..even if there could | ||
2858 | * be no problems! In fact a simple delayed packet may trigger a | ||
2859 | * dupack. But we need to be careful in such case. | ||
2860 | */ | ||
2861 | |||
2862 | static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
2863 | { | ||
2864 | struct tcp_sock *tp = tcp_sk(sk); | ||
2865 | |||
2866 | westwood_update_window(sk, tcp_time_stamp); | ||
2867 | |||
2868 | tp->westwood.bk += westwood_acked_count(sk); | ||
2869 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
2870 | } | ||
2871 | |||
2872 | static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
2873 | { | ||
2874 | if (tcp_is_westwood(tcp_sk(sk))) | ||
2875 | __tcp_westwood_slow_bw(sk, skb); | ||
2876 | } | ||
2877 | |||
2878 | /* This routine deals with incoming acks, but not outgoing ones. */ | 2249 | /* This routine deals with incoming acks, but not outgoing ones. */ |
2879 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | 2250 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) |
2880 | { | 2251 | { |
@@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2884 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2255 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
2885 | u32 prior_in_flight; | 2256 | u32 prior_in_flight; |
2886 | s32 seq_rtt; | 2257 | s32 seq_rtt; |
2258 | s32 seq_usrtt = 0; | ||
2887 | int prior_packets; | 2259 | int prior_packets; |
2888 | 2260 | ||
2889 | /* If the ack is newer than sent or older than previous acks | 2261 | /* If the ack is newer than sent or older than previous acks |
@@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2902 | */ | 2274 | */ |
2903 | tcp_update_wl(tp, ack, ack_seq); | 2275 | tcp_update_wl(tp, ack, ack_seq); |
2904 | tp->snd_una = ack; | 2276 | tp->snd_una = ack; |
2905 | tcp_westwood_fast_bw(sk, skb); | ||
2906 | flag |= FLAG_WIN_UPDATE; | 2277 | flag |= FLAG_WIN_UPDATE; |
2907 | 2278 | ||
2279 | tcp_ca_event(tp, CA_EVENT_FAST_ACK); | ||
2280 | |||
2908 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); | 2281 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); |
2909 | } else { | 2282 | } else { |
2910 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) | 2283 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
@@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2920 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) | 2293 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) |
2921 | flag |= FLAG_ECE; | 2294 | flag |= FLAG_ECE; |
2922 | 2295 | ||
2923 | tcp_westwood_slow_bw(sk,skb); | 2296 | tcp_ca_event(tp, CA_EVENT_SLOW_ACK); |
2924 | } | 2297 | } |
2925 | 2298 | ||
2926 | /* We passed data and got it acked, remove any soft error | 2299 | /* We passed data and got it acked, remove any soft error |
@@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2935 | prior_in_flight = tcp_packets_in_flight(tp); | 2308 | prior_in_flight = tcp_packets_in_flight(tp); |
2936 | 2309 | ||
2937 | /* See if we can take anything off of the retransmit queue. */ | 2310 | /* See if we can take anything off of the retransmit queue. */ |
2938 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt); | 2311 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, |
2312 | tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); | ||
2939 | 2313 | ||
2940 | if (tp->frto_counter) | 2314 | if (tp->frto_counter) |
2941 | tcp_process_frto(sk, prior_snd_una); | 2315 | tcp_process_frto(sk, prior_snd_una); |
2942 | 2316 | ||
2943 | if (tcp_ack_is_dubious(tp, flag)) { | 2317 | if (tcp_ack_is_dubious(tp, flag)) { |
2944 | /* Advance CWND, if state allows this. */ | 2318 | /* Advance CWND, if state allows this. */ |
2945 | if ((flag & FLAG_DATA_ACKED) && | 2319 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) |
2946 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && | 2320 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); |
2947 | tcp_may_raise_cwnd(tp, flag)) | ||
2948 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
2949 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2321 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
2950 | } else { | 2322 | } else { |
2951 | if ((flag & FLAG_DATA_ACKED) && | 2323 | if ((flag & FLAG_DATA_ACKED)) |
2952 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) | 2324 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); |
2953 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
2954 | } | 2325 | } |
2955 | 2326 | ||
2956 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) | 2327 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) |
@@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
4552 | 3923 | ||
4553 | tcp_init_metrics(sk); | 3924 | tcp_init_metrics(sk); |
4554 | 3925 | ||
3926 | tcp_init_congestion_control(tp); | ||
3927 | |||
4555 | /* Prevent spurious tcp_cwnd_restart() on first data | 3928 | /* Prevent spurious tcp_cwnd_restart() on first data |
4556 | * packet. | 3929 | * packet. |
4557 | */ | 3930 | */ |
@@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4708 | if(tp->af_specific->conn_request(sk, skb) < 0) | 4081 | if(tp->af_specific->conn_request(sk, skb) < 0) |
4709 | return 1; | 4082 | return 1; |
4710 | 4083 | ||
4711 | init_westwood(sk); | ||
4712 | init_bictcp(tp); | ||
4713 | |||
4714 | /* Now we have several options: In theory there is | 4084 | /* Now we have several options: In theory there is |
4715 | * nothing else in the frame. KA9Q has an option to | 4085 | * nothing else in the frame. KA9Q has an option to |
4716 | * send data with the syn, BSD accepts data with the | 4086 | * send data with the syn, BSD accepts data with the |
@@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4732 | goto discard; | 4102 | goto discard; |
4733 | 4103 | ||
4734 | case TCP_SYN_SENT: | 4104 | case TCP_SYN_SENT: |
4735 | init_westwood(sk); | ||
4736 | init_bictcp(tp); | ||
4737 | |||
4738 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); | 4105 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); |
4739 | if (queued >= 0) | 4106 | if (queued >= 0) |
4740 | return queued; | 4107 | return queued; |
@@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4816 | */ | 4183 | */ |
4817 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4184 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
4818 | !tp->srtt) | 4185 | !tp->srtt) |
4819 | tcp_ack_saw_tstamp(tp, 0); | 4186 | tcp_ack_saw_tstamp(tp, 0, 0); |
4820 | 4187 | ||
4821 | if (tp->rx_opt.tstamp_ok) | 4188 | if (tp->rx_opt.tstamp_ok) |
4822 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4189 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
@@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4828 | 4195 | ||
4829 | tcp_init_metrics(sk); | 4196 | tcp_init_metrics(sk); |
4830 | 4197 | ||
4198 | tcp_init_congestion_control(tp); | ||
4199 | |||
4831 | /* Prevent spurious tcp_cwnd_restart() on | 4200 | /* Prevent spurious tcp_cwnd_restart() on |
4832 | * first data packet. | 4201 | * first data packet. |
4833 | */ | 4202 | */ |