Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--   net/ipv4/tcp_input.c   737
1 file changed, 53 insertions(+), 684 deletions(-)
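The patch below strips the built-in Westwood+, Vegas and BIC logic out of tcp_input.c and routes every congestion-control decision through a per-socket operations table, tp->ca_ops. The hooks invoked in the hunks that follow (ssthresh, min_cwnd, cong_avoid, undo_cwnd, rtt_sample, pkts_acked, plus the tcp_ca_event() notifications) imply an interface roughly like the sketch below; the real structure is defined elsewhere in the patch series, and the init and cwnd_event members here are assumptions added for completeness.

    /* Sketch of the pluggable congestion-control interface implied by the
     * call sites in this diff.  Not the definition shipped with the patch;
     * init and cwnd_event are assumed, the other hooks and their rough
     * signatures follow directly from how tp->ca_ops is used below.
     */
    typedef unsigned int u32;          /* stand-in for the kernel type */
    struct tcp_sock;                   /* opaque in this sketch */

    enum tcp_ca_event {
        CA_EVENT_FAST_ACK,             /* in-sequence ACK, fast path */
        CA_EVENT_SLOW_ACK,             /* other ACKs */
        CA_EVENT_COMPLETE_CWR,         /* end of cwnd reduction */
        CA_EVENT_FRTO,                 /* F-RTO (spurious RTO) entered */
        CA_EVENT_LOSS,                 /* loss timeout */
    };

    struct tcp_congestion_ops {
        void (*init)(struct tcp_sock *tp);                    /* assumed */
        /* slow-start threshold after a congestion event */
        u32  (*ssthresh)(struct tcp_sock *tp);
        /* lower bound on cwnd while it is being reduced */
        u32  (*min_cwnd)(struct tcp_sock *tp);
        /* grow cwnd in response to ACKs of new data */
        void (*cong_avoid)(struct tcp_sock *tp, u32 ack, u32 rtt,
                           u32 in_flight, int good_ack);
        /* optional hooks, checked for NULL before use */
        u32  (*undo_cwnd)(struct tcp_sock *tp);
        void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt);
        void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked);
        void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); /* assumed */
    };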
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630a3..7bbbbc33eb4b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
| @@ -61,7 +61,6 @@ | |||
| 61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission | 61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission |
| 62 | * engine. Lots of bugs are found. | 62 | * engine. Lots of bugs are found. |
| 63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs | 63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs |
| 64 | * Angelo Dell'Aera: TCP Westwood+ support | ||
| 65 | */ | 64 | */ |
| 66 | 65 | ||
| 67 | #include <linux/config.h> | 66 | #include <linux/config.h> |
| @@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; | |||
| 88 | int sysctl_tcp_max_orphans = NR_FILE; | 87 | int sysctl_tcp_max_orphans = NR_FILE; |
| 89 | int sysctl_tcp_frto; | 88 | int sysctl_tcp_frto; |
| 90 | int sysctl_tcp_nometrics_save; | 89 | int sysctl_tcp_nometrics_save; |
| 91 | int sysctl_tcp_westwood; | ||
| 92 | int sysctl_tcp_vegas_cong_avoid; | ||
| 93 | 90 | ||
| 94 | int sysctl_tcp_moderate_rcvbuf = 1; | 91 | int sysctl_tcp_moderate_rcvbuf = 1; |
| 95 | 92 | ||
| 96 | /* Default values of the Vegas variables, in fixed-point representation | ||
| 97 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
| 98 | */ | ||
| 99 | #define V_PARAM_SHIFT 1 | ||
| 100 | int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT; | ||
| 101 | int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT; | ||
| 102 | int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT; | ||
| 103 | int sysctl_tcp_bic = 1; | ||
| 104 | int sysctl_tcp_bic_fast_convergence = 1; | ||
| 105 | int sysctl_tcp_bic_low_window = 14; | ||
| 106 | int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | ||
| 107 | |||
| 108 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 93 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
| 109 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 94 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
| 110 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ | 95 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ |
| @@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk) | |||
| 333 | tp->snd_cwnd_stamp = tcp_time_stamp; | 318 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 334 | } | 319 | } |
| 335 | 320 | ||
| 336 | static void init_bictcp(struct tcp_sock *tp) | ||
| 337 | { | ||
| 338 | tp->bictcp.cnt = 0; | ||
| 339 | |||
| 340 | tp->bictcp.last_max_cwnd = 0; | ||
| 341 | tp->bictcp.last_cwnd = 0; | ||
| 342 | tp->bictcp.last_stamp = 0; | ||
| 343 | } | ||
| 344 | |||
| 345 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ | 321 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ |
| 346 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | 322 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) |
| 347 | { | 323 | { |
| @@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
| 558 | tcp_grow_window(sk, tp, skb); | 534 | tcp_grow_window(sk, tp, skb); |
| 559 | } | 535 | } |
| 560 | 536 | ||
| 561 | /* When starting a new connection, pin down the current choice of | ||
| 562 | * congestion algorithm. | ||
| 563 | */ | ||
| 564 | void tcp_ca_init(struct tcp_sock *tp) | ||
| 565 | { | ||
| 566 | if (sysctl_tcp_westwood) | ||
| 567 | tp->adv_cong = TCP_WESTWOOD; | ||
| 568 | else if (sysctl_tcp_bic) | ||
| 569 | tp->adv_cong = TCP_BIC; | ||
| 570 | else if (sysctl_tcp_vegas_cong_avoid) { | ||
| 571 | tp->adv_cong = TCP_VEGAS; | ||
| 572 | tp->vegas.baseRTT = 0x7fffffff; | ||
| 573 | tcp_vegas_enable(tp); | ||
| 574 | } | ||
| 575 | } | ||
| 576 | |||
| 577 | /* Do RTT sampling needed for Vegas. | ||
| 578 | * Basically we: | ||
| 579 | * o min-filter RTT samples from within an RTT to get the current | ||
| 580 | * propagation delay + queuing delay (we are min-filtering to try to | ||
| 581 | * avoid the effects of delayed ACKs) | ||
| 582 | * o min-filter RTT samples from a much longer window (forever for now) | ||
| 583 | * to find the propagation delay (baseRTT) | ||
| 584 | */ | ||
| 585 | static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | ||
| 586 | { | ||
| 587 | __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ | ||
| 588 | |||
| 589 | /* Filter to find propagation delay: */ | ||
| 590 | if (vrtt < tp->vegas.baseRTT) | ||
| 591 | tp->vegas.baseRTT = vrtt; | ||
| 592 | |||
| 593 | /* Find the min RTT during the last RTT to find | ||
| 594 | * the current prop. delay + queuing delay: | ||
| 595 | */ | ||
| 596 | tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); | ||
| 597 | tp->vegas.cntRTT++; | ||
| 598 | } | ||
| 599 | |||
| 600 | /* Called to compute a smoothed rtt estimate. The data fed to this | 537 | /* Called to compute a smoothed rtt estimate. The data fed to this |
| 601 | * routine either comes from timestamps, or from segments that were | 538 | * routine either comes from timestamps, or from segments that were |
| 602 | * known _not_ to have been retransmitted [see Karn/Partridge | 539 | * known _not_ to have been retransmitted [see Karn/Partridge |
| @@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | |||
| 606 | * To save cycles in the RFC 1323 implementation it was better to break | 543 | * To save cycles in the RFC 1323 implementation it was better to break |
| 607 | * it up into three procedures. -- erics | 544 | * it up into three procedures. -- erics |
| 608 | */ | 545 | */ |
| 609 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | 546 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) |
| 610 | { | 547 | { |
| 611 | long m = mrtt; /* RTT */ | 548 | long m = mrtt; /* RTT */ |
| 612 | 549 | ||
| 613 | if (tcp_vegas_enabled(tp)) | ||
| 614 | vegas_rtt_calc(tp, mrtt); | ||
| 615 | |||
| 616 | /* The following amusing code comes from Jacobson's | 550 | /* The following amusing code comes from Jacobson's |
| 617 | * article in SIGCOMM '88. Note that rtt and mdev | 551 | * article in SIGCOMM '88. Note that rtt and mdev |
| 618 | * are scaled versions of rtt and mean deviation. | 552 | * are scaled versions of rtt and mean deviation. |
| @@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | |||
| 670 | tp->rtt_seq = tp->snd_nxt; | 604 | tp->rtt_seq = tp->snd_nxt; |
| 671 | } | 605 | } |
| 672 | 606 | ||
| 673 | tcp_westwood_update_rtt(tp, tp->srtt >> 3); | 607 | if (tp->ca_ops->rtt_sample) |
| 608 | tp->ca_ops->rtt_sample(tp, *usrtt); | ||
| 674 | } | 609 | } |
| 675 | 610 | ||
| 676 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 611 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
| @@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) | |||
| 1185 | tp->snd_una == tp->high_seq || | 1120 | tp->snd_una == tp->high_seq || |
| 1186 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1121 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
| 1187 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1122 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
| 1188 | if (!tcp_westwood_ssthresh(tp)) | 1123 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1189 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1124 | tcp_ca_event(tp, CA_EVENT_FRTO); |
| 1190 | } | 1125 | } |
| 1191 | 1126 | ||
| 1192 | /* Have to clear retransmission markers here to keep the bookkeeping | 1127 | /* Have to clear retransmission markers here to keep the bookkeeping |
| @@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) | |||
| 1252 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1187 | tcp_set_ca_state(tp, TCP_CA_Loss); |
| 1253 | tp->high_seq = tp->frto_highmark; | 1188 | tp->high_seq = tp->frto_highmark; |
| 1254 | TCP_ECN_queue_cwr(tp); | 1189 | TCP_ECN_queue_cwr(tp); |
| 1255 | |||
| 1256 | init_bictcp(tp); | ||
| 1257 | } | 1190 | } |
| 1258 | 1191 | ||
| 1259 | void tcp_clear_retrans(struct tcp_sock *tp) | 1192 | void tcp_clear_retrans(struct tcp_sock *tp) |
| @@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
| 1283 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || | 1216 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || |
| 1284 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1217 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
| 1285 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1218 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
| 1286 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1219 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1220 | tcp_ca_event(tp, CA_EVENT_LOSS); | ||
| 1287 | } | 1221 | } |
| 1288 | tp->snd_cwnd = 1; | 1222 | tp->snd_cwnd = 1; |
| 1289 | tp->snd_cwnd_cnt = 0; | 1223 | tp->snd_cwnd_cnt = 0; |
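Both tcp_enter_frto() and tcp_enter_loss() now ask the module for the new slow-start threshold via tp->ca_ops->ssthresh() instead of calling tcp_recalc_ssthresh(), and raise CA_EVENT_FRTO / CA_EVENT_LOSS so the module can react. A NewReno-style callback would simply halve the window; a sketch, not the module shipped with this series:

    /* Reno/NewReno ssthresh: halve the congestion window on a congestion
     * event, never going below two segments.  Sketch of what a module
     * would plug into tp->ca_ops->ssthresh; tcp_sock fields are the
     * kernel's.
     */
    static u32 reno_ssthresh(struct tcp_sock *tp)
    {
        u32 half = tp->snd_cwnd >> 1;

        return half > 2 ? half : 2;    /* never below two segments */
    }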
| @@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
| 1596 | } | 1530 | } |
| 1597 | 1531 | ||
| 1598 | /* Decrease cwnd each second ack. */ | 1532 | /* Decrease cwnd each second ack. */ |
| 1599 | |||
| 1600 | static void tcp_cwnd_down(struct tcp_sock *tp) | 1533 | static void tcp_cwnd_down(struct tcp_sock *tp) |
| 1601 | { | 1534 | { |
| 1602 | int decr = tp->snd_cwnd_cnt + 1; | 1535 | int decr = tp->snd_cwnd_cnt + 1; |
| 1603 | __u32 limit; | ||
| 1604 | |||
| 1605 | /* | ||
| 1606 | * TCP Westwood | ||
| 1607 | * Here limit is evaluated as BWestimation*RTTmin (for obtaining it | ||
| 1608 | * in packets we use mss_cache). If sysctl_tcp_westwood is off | ||
| 1609 | * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is | ||
| 1610 | * still used as usual. It prevents other strange cases in which | ||
| 1611 | * BWE*RTTmin could assume value 0. It should not happen but... | ||
| 1612 | */ | ||
| 1613 | |||
| 1614 | if (!(limit = tcp_westwood_bw_rttmin(tp))) | ||
| 1615 | limit = tp->snd_ssthresh/2; | ||
| 1616 | 1536 | ||
| 1617 | tp->snd_cwnd_cnt = decr&1; | 1537 | tp->snd_cwnd_cnt = decr&1; |
| 1618 | decr >>= 1; | 1538 | decr >>= 1; |
| 1619 | 1539 | ||
| 1620 | if (decr && tp->snd_cwnd > limit) | 1540 | if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) |
| 1621 | tp->snd_cwnd -= decr; | 1541 | tp->snd_cwnd -= decr; |
| 1622 | 1542 | ||
| 1623 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); | 1543 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); |
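tcp_cwnd_down() previously special-cased Westwood: the floor for the shrinking window was BWE*RTTmin when Westwood was active and snd_ssthresh/2 otherwise. That fallback now becomes the module's min_cwnd hook; the Reno-style default would be (sketch):

    /* Default cwnd floor during rate reduction, matching the old
     * non-Westwood fallback of snd_ssthresh / 2.  Sketch of a min_cwnd
     * callback; a Westwood module would return its BWE * RTTmin estimate
     * here instead.
     */
    static u32 reno_min_cwnd(struct tcp_sock *tp)
    {
        return tp->snd_ssthresh / 2;
    }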
| @@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) | |||
| 1654 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) | 1574 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) |
| 1655 | { | 1575 | { |
| 1656 | if (tp->prior_ssthresh) { | 1576 | if (tp->prior_ssthresh) { |
| 1657 | if (tcp_is_bic(tp)) | 1577 | if (tp->ca_ops->undo_cwnd) |
| 1658 | tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); | 1578 | tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); |
| 1659 | else | 1579 | else |
| 1660 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); | 1580 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); |
| 1661 | 1581 | ||
| @@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) | |||
| 1767 | 1687 | ||
| 1768 | static inline void tcp_complete_cwr(struct tcp_sock *tp) | 1688 | static inline void tcp_complete_cwr(struct tcp_sock *tp) |
| 1769 | { | 1689 | { |
| 1770 | if (tcp_westwood_cwnd(tp)) | 1690 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); |
| 1771 | tp->snd_ssthresh = tp->snd_cwnd; | ||
| 1772 | else | ||
| 1773 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | ||
| 1774 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1691 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 1692 | tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); | ||
| 1775 | } | 1693 | } |
| 1776 | 1694 | ||
| 1777 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | 1695 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) |
| @@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1946 | if (tp->ca_state < TCP_CA_CWR) { | 1864 | if (tp->ca_state < TCP_CA_CWR) { |
| 1947 | if (!(flag&FLAG_ECE)) | 1865 | if (!(flag&FLAG_ECE)) |
| 1948 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1866 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
| 1949 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1867 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1950 | TCP_ECN_queue_cwr(tp); | 1868 | TCP_ECN_queue_cwr(tp); |
| 1951 | } | 1869 | } |
| 1952 | 1870 | ||
| @@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1963 | /* Read draft-ietf-tcplw-high-performance before mucking | 1881 | /* Read draft-ietf-tcplw-high-performance before mucking |
| 1964 | * with this code. (Superceeds RFC1323) | 1882 | * with this code. (Superceeds RFC1323) |
| 1965 | */ | 1883 | */ |
| 1966 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | 1884 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) |
| 1967 | { | 1885 | { |
| 1968 | __u32 seq_rtt; | 1886 | __u32 seq_rtt; |
| 1969 | 1887 | ||
| @@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | |||
| 1983 | * in window is lost... Voila. --ANK (010210) | 1901 | * in window is lost... Voila. --ANK (010210) |
| 1984 | */ | 1902 | */ |
| 1985 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 1903 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; |
| 1986 | tcp_rtt_estimator(tp, seq_rtt); | 1904 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
| 1987 | tcp_set_rto(tp); | 1905 | tcp_set_rto(tp); |
| 1988 | tp->backoff = 0; | 1906 | tp->backoff = 0; |
| 1989 | tcp_bound_rto(tp); | 1907 | tcp_bound_rto(tp); |
| 1990 | } | 1908 | } |
| 1991 | 1909 | ||
| 1992 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | 1910 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) |
| 1993 | { | 1911 | { |
| 1994 | /* We don't have a timestamp. Can only use | 1912 | /* We don't have a timestamp. Can only use |
| 1995 | * packets that are not retransmitted to determine | 1913 | * packets that are not retransmitted to determine |
| @@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | |||
| 2003 | if (flag & FLAG_RETRANS_DATA_ACKED) | 1921 | if (flag & FLAG_RETRANS_DATA_ACKED) |
| 2004 | return; | 1922 | return; |
| 2005 | 1923 | ||
| 2006 | tcp_rtt_estimator(tp, seq_rtt); | 1924 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
| 2007 | tcp_set_rto(tp); | 1925 | tcp_set_rto(tp); |
| 2008 | tp->backoff = 0; | 1926 | tp->backoff = 0; |
| 2009 | tcp_bound_rto(tp); | 1927 | tcp_bound_rto(tp); |
| 2010 | } | 1928 | } |
| 2011 | 1929 | ||
| 2012 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, | 1930 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, |
| 2013 | int flag, s32 seq_rtt) | 1931 | int flag, s32 seq_rtt, u32 *usrtt) |
| 2014 | { | 1932 | { |
| 2015 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 1933 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ |
| 2016 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 1934 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
| 2017 | tcp_ack_saw_tstamp(tp, flag); | 1935 | tcp_ack_saw_tstamp(tp, usrtt, flag); |
| 2018 | else if (seq_rtt >= 0) | 1936 | else if (seq_rtt >= 0) |
| 2019 | tcp_ack_no_tstamp(tp, seq_rtt, flag); | 1937 | tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); |
| 2020 | } | 1938 | } |
| 2021 | 1939 | ||
| 2022 | /* | 1940 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, |
| 2023 | * Compute congestion window to use. | 1941 | u32 in_flight, int good) |
| 2024 | * | ||
| 2025 | * This is from the implementation of BICTCP in | ||
| 2026 | * Lison-Xu, Kahaled Harfoush, and Injog Rhee. | ||
| 2027 | * "Binary Increase Congestion Control for Fast, Long Distance | ||
| 2028 | * Networks" in InfoComm 2004 | ||
| 2029 | * Available from: | ||
| 2030 | * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf | ||
| 2031 | * | ||
| 2032 | * Unless BIC is enabled and congestion window is large | ||
| 2033 | * this behaves the same as the original Reno. | ||
| 2034 | */ | ||
| 2035 | static inline __u32 bictcp_cwnd(struct tcp_sock *tp) | ||
| 2036 | { | ||
| 2037 | /* orignal Reno behaviour */ | ||
| 2038 | if (!tcp_is_bic(tp)) | ||
| 2039 | return tp->snd_cwnd; | ||
| 2040 | |||
| 2041 | if (tp->bictcp.last_cwnd == tp->snd_cwnd && | ||
| 2042 | (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) | ||
| 2043 | return tp->bictcp.cnt; | ||
| 2044 | |||
| 2045 | tp->bictcp.last_cwnd = tp->snd_cwnd; | ||
| 2046 | tp->bictcp.last_stamp = tcp_time_stamp; | ||
| 2047 | |||
| 2048 | /* start off normal */ | ||
| 2049 | if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) | ||
| 2050 | tp->bictcp.cnt = tp->snd_cwnd; | ||
| 2051 | |||
| 2052 | /* binary increase */ | ||
| 2053 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { | ||
| 2054 | __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) | ||
| 2055 | / BICTCP_B; | ||
| 2056 | |||
| 2057 | if (dist > BICTCP_MAX_INCREMENT) | ||
| 2058 | /* linear increase */ | ||
| 2059 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
| 2060 | else if (dist <= 1U) | ||
| 2061 | /* binary search increase */ | ||
| 2062 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
| 2063 | / BICTCP_B; | ||
| 2064 | else | ||
| 2065 | /* binary search increase */ | ||
| 2066 | tp->bictcp.cnt = tp->snd_cwnd / dist; | ||
| 2067 | } else { | ||
| 2068 | /* slow start amd linear increase */ | ||
| 2069 | if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) | ||
| 2070 | /* slow start */ | ||
| 2071 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
| 2072 | / BICTCP_B; | ||
| 2073 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd | ||
| 2074 | + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) | ||
| 2075 | /* slow start */ | ||
| 2076 | tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) | ||
| 2077 | / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); | ||
| 2078 | else | ||
| 2079 | /* linear increase */ | ||
| 2080 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
| 2081 | } | ||
| 2082 | return tp->bictcp.cnt; | ||
| 2083 | } | ||
| 2084 | |||
| 2085 | /* This is Jacobson's slow start and congestion avoidance. | ||
| 2086 | * SIGCOMM '88, p. 328. | ||
| 2087 | */ | ||
| 2088 | static inline void reno_cong_avoid(struct tcp_sock *tp) | ||
| 2089 | { | 1942 | { |
| 2090 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 1943 | tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); |
| 2091 | /* In "safe" area, increase. */ | ||
| 2092 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 2093 | tp->snd_cwnd++; | ||
| 2094 | } else { | ||
| 2095 | /* In dangerous area, increase slowly. | ||
| 2096 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
| 2097 | */ | ||
| 2098 | if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { | ||
| 2099 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 2100 | tp->snd_cwnd++; | ||
| 2101 | tp->snd_cwnd_cnt=0; | ||
| 2102 | } else | ||
| 2103 | tp->snd_cwnd_cnt++; | ||
| 2104 | } | ||
| 2105 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1944 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 2106 | } | 1945 | } |
| 2107 | 1946 | ||
| 2108 | /* This is based on the congestion detection/avoidance scheme described in | ||
| 2109 | * Lawrence S. Brakmo and Larry L. Peterson. | ||
| 2110 | * "TCP Vegas: End to end congestion avoidance on a global internet." | ||
| 2111 | * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, | ||
| 2112 | * October 1995. Available from: | ||
| 2113 | * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps | ||
| 2114 | * | ||
| 2115 | * See http://www.cs.arizona.edu/xkernel/ for their implementation. | ||
| 2116 | * The main aspects that distinguish this implementation from the | ||
| 2117 | * Arizona Vegas implementation are: | ||
| 2118 | * o We do not change the loss detection or recovery mechanisms of | ||
| 2119 | * Linux in any way. Linux already recovers from losses quite well, | ||
| 2120 | * using fine-grained timers, NewReno, and FACK. | ||
| 2121 | * o To avoid the performance penalty imposed by increasing cwnd | ||
| 2122 | * only every-other RTT during slow start, we increase during | ||
| 2123 | * every RTT during slow start, just like Reno. | ||
| 2124 | * o Largely to allow continuous cwnd growth during slow start, | ||
| 2125 | * we use the rate at which ACKs come back as the "actual" | ||
| 2126 | * rate, rather than the rate at which data is sent. | ||
| 2127 | * o To speed convergence to the right rate, we set the cwnd | ||
| 2128 | * to achieve the right ("actual") rate when we exit slow start. | ||
| 2129 | * o To filter out the noise caused by delayed ACKs, we use the | ||
| 2130 | * minimum RTT sample observed during the last RTT to calculate | ||
| 2131 | * the actual rate. | ||
| 2132 | * o When the sender re-starts from idle, it waits until it has | ||
| 2133 | * received ACKs for an entire flight of new data before making | ||
| 2134 | * a cwnd adjustment decision. The original Vegas implementation | ||
| 2135 | * assumed senders never went idle. | ||
| 2136 | */ | ||
| 2137 | static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
| 2138 | { | ||
| 2139 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | ||
| 2140 | * | ||
| 2141 | * These are so named because they represent the approximate values | ||
| 2142 | * of snd_una and snd_nxt at the beginning of the current RTT. More | ||
| 2143 | * precisely, they represent the amount of data sent during the RTT. | ||
| 2144 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | ||
| 2145 | * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding | ||
| 2146 | * bytes of data have been ACKed during the course of the RTT, giving | ||
| 2147 | * an "actual" rate of: | ||
| 2148 | * | ||
| 2149 | * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) | ||
| 2150 | * | ||
| 2151 | * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, | ||
| 2152 | * because delayed ACKs can cover more than one segment, so they | ||
| 2153 | * don't line up nicely with the boundaries of RTTs. | ||
| 2154 | * | ||
| 2155 | * Another unfortunate fact of life is that delayed ACKs delay the | ||
| 2156 | * advance of the left edge of our send window, so that the number | ||
| 2157 | * of bytes we send in an RTT is often less than our cwnd will allow. | ||
| 2158 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | ||
| 2159 | */ | ||
| 2160 | |||
| 2161 | if (after(ack, tp->vegas.beg_snd_nxt)) { | ||
| 2162 | /* Do the Vegas once-per-RTT cwnd adjustment. */ | ||
| 2163 | u32 old_wnd, old_snd_cwnd; | ||
| 2164 | |||
| 2165 | |||
| 2166 | /* Here old_wnd is essentially the window of data that was | ||
| 2167 | * sent during the previous RTT, and has all | ||
| 2168 | * been acknowledged in the course of the RTT that ended | ||
| 2169 | * with the ACK we just received. Likewise, old_snd_cwnd | ||
| 2170 | * is the cwnd during the previous RTT. | ||
| 2171 | */ | ||
| 2172 | old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / | ||
| 2173 | tp->mss_cache_std; | ||
| 2174 | old_snd_cwnd = tp->vegas.beg_snd_cwnd; | ||
| 2175 | |||
| 2176 | /* Save the extent of the current window so we can use this | ||
| 2177 | * at the end of the next RTT. | ||
| 2178 | */ | ||
| 2179 | tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; | ||
| 2180 | tp->vegas.beg_snd_nxt = tp->snd_nxt; | ||
| 2181 | tp->vegas.beg_snd_cwnd = tp->snd_cwnd; | ||
| 2182 | |||
| 2183 | /* Take into account the current RTT sample too, to | ||
| 2184 | * decrease the impact of delayed acks. This double counts | ||
| 2185 | * this sample since we count it for the next window as well, | ||
| 2186 | * but that's not too awful, since we're taking the min, | ||
| 2187 | * rather than averaging. | ||
| 2188 | */ | ||
| 2189 | vegas_rtt_calc(tp, seq_rtt); | ||
| 2190 | |||
| 2191 | /* We do the Vegas calculations only if we got enough RTT | ||
| 2192 | * samples that we can be reasonably sure that we got | ||
| 2193 | * at least one RTT sample that wasn't from a delayed ACK. | ||
| 2194 | * If we only had 2 samples total, | ||
| 2195 | * then that means we're getting only 1 ACK per RTT, which | ||
| 2196 | * means they're almost certainly delayed ACKs. | ||
| 2197 | * If we have 3 samples, we should be OK. | ||
| 2198 | */ | ||
| 2199 | |||
| 2200 | if (tp->vegas.cntRTT <= 2) { | ||
| 2201 | /* We don't have enough RTT samples to do the Vegas | ||
| 2202 | * calculation, so we'll behave like Reno. | ||
| 2203 | */ | ||
| 2204 | if (tp->snd_cwnd > tp->snd_ssthresh) | ||
| 2205 | tp->snd_cwnd++; | ||
| 2206 | } else { | ||
| 2207 | u32 rtt, target_cwnd, diff; | ||
| 2208 | |||
| 2209 | /* We have enough RTT samples, so, using the Vegas | ||
| 2210 | * algorithm, we determine if we should increase or | ||
| 2211 | * decrease cwnd, and by how much. | ||
| 2212 | */ | ||
| 2213 | |||
| 2214 | /* Pluck out the RTT we are using for the Vegas | ||
| 2215 | * calculations. This is the min RTT seen during the | ||
| 2216 | * last RTT. Taking the min filters out the effects | ||
| 2217 | * of delayed ACKs, at the cost of noticing congestion | ||
| 2218 | * a bit later. | ||
| 2219 | */ | ||
| 2220 | rtt = tp->vegas.minRTT; | ||
| 2221 | |||
| 2222 | /* Calculate the cwnd we should have, if we weren't | ||
| 2223 | * going too fast. | ||
| 2224 | * | ||
| 2225 | * This is: | ||
| 2226 | * (actual rate in segments) * baseRTT | ||
| 2227 | * We keep it as a fixed point number with | ||
| 2228 | * V_PARAM_SHIFT bits to the right of the binary point. | ||
| 2229 | */ | ||
| 2230 | target_cwnd = ((old_wnd * tp->vegas.baseRTT) | ||
| 2231 | << V_PARAM_SHIFT) / rtt; | ||
| 2232 | |||
| 2233 | /* Calculate the difference between the window we had, | ||
| 2234 | * and the window we would like to have. This quantity | ||
| 2235 | * is the "Diff" from the Arizona Vegas papers. | ||
| 2236 | * | ||
| 2237 | * Again, this is a fixed point number with | ||
| 2238 | * V_PARAM_SHIFT bits to the right of the binary | ||
| 2239 | * point. | ||
| 2240 | */ | ||
| 2241 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | ||
| 2242 | |||
| 2243 | if (tp->snd_cwnd < tp->snd_ssthresh) { | ||
| 2244 | /* Slow start. */ | ||
| 2245 | if (diff > sysctl_tcp_vegas_gamma) { | ||
| 2246 | /* Going too fast. Time to slow down | ||
| 2247 | * and switch to congestion avoidance. | ||
| 2248 | */ | ||
| 2249 | tp->snd_ssthresh = 2; | ||
| 2250 | |||
| 2251 | /* Set cwnd to match the actual rate | ||
| 2252 | * exactly: | ||
| 2253 | * cwnd = (actual rate) * baseRTT | ||
| 2254 | * Then we add 1 because the integer | ||
| 2255 | * truncation robs us of full link | ||
| 2256 | * utilization. | ||
| 2257 | */ | ||
| 2258 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
| 2259 | (target_cwnd >> | ||
| 2260 | V_PARAM_SHIFT)+1); | ||
| 2261 | |||
| 2262 | } | ||
| 2263 | } else { | ||
| 2264 | /* Congestion avoidance. */ | ||
| 2265 | u32 next_snd_cwnd; | ||
| 2266 | |||
| 2267 | /* Figure out where we would like cwnd | ||
| 2268 | * to be. | ||
| 2269 | */ | ||
| 2270 | if (diff > sysctl_tcp_vegas_beta) { | ||
| 2271 | /* The old window was too fast, so | ||
| 2272 | * we slow down. | ||
| 2273 | */ | ||
| 2274 | next_snd_cwnd = old_snd_cwnd - 1; | ||
| 2275 | } else if (diff < sysctl_tcp_vegas_alpha) { | ||
| 2276 | /* We don't have enough extra packets | ||
| 2277 | * in the network, so speed up. | ||
| 2278 | */ | ||
| 2279 | next_snd_cwnd = old_snd_cwnd + 1; | ||
| 2280 | } else { | ||
| 2281 | /* Sending just as fast as we | ||
| 2282 | * should be. | ||
| 2283 | */ | ||
| 2284 | next_snd_cwnd = old_snd_cwnd; | ||
| 2285 | } | ||
| 2286 | |||
| 2287 | /* Adjust cwnd upward or downward, toward the | ||
| 2288 | * desired value. | ||
| 2289 | */ | ||
| 2290 | if (next_snd_cwnd > tp->snd_cwnd) | ||
| 2291 | tp->snd_cwnd++; | ||
| 2292 | else if (next_snd_cwnd < tp->snd_cwnd) | ||
| 2293 | tp->snd_cwnd--; | ||
| 2294 | } | ||
| 2295 | } | ||
| 2296 | |||
| 2297 | /* Wipe the slate clean for the next RTT. */ | ||
| 2298 | tp->vegas.cntRTT = 0; | ||
| 2299 | tp->vegas.minRTT = 0x7fffffff; | ||
| 2300 | } | ||
| 2301 | |||
| 2302 | /* The following code is executed for every ack we receive, | ||
| 2303 | * except for conditions checked in should_advance_cwnd() | ||
| 2304 | * before the call to tcp_cong_avoid(). Mainly this means that | ||
| 2305 | * we only execute this code if the ack actually acked some | ||
| 2306 | * data. | ||
| 2307 | */ | ||
| 2308 | |||
| 2309 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
| 2310 | * (If we are not in slow start then we are in congestion avoidance, | ||
| 2311 | * and adjust our congestion window only once per RTT. See the code | ||
| 2312 | * above.) | ||
| 2313 | */ | ||
| 2314 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
| 2315 | tp->snd_cwnd++; | ||
| 2316 | |||
| 2317 | /* to keep cwnd from growing without bound */ | ||
| 2318 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
| 2319 | |||
| 2320 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
| 2321 | * 2 MSS. | ||
| 2322 | * | ||
| 2323 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
| 2324 | */ | ||
| 2325 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
| 2326 | |||
| 2327 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
| 2328 | } | ||
| 2329 | |||
| 2330 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
| 2331 | { | ||
| 2332 | if (tcp_vegas_enabled(tp)) | ||
| 2333 | vegas_cong_avoid(tp, ack, seq_rtt); | ||
| 2334 | else | ||
| 2335 | reno_cong_avoid(tp); | ||
| 2336 | } | ||
| 2337 | |||
| 2338 | /* Restart timer after forward progress on connection. | 1947 | /* Restart timer after forward progress on connection. |
| 2339 | * RFC2988 recommends to restart timer to now+rto. | 1948 | * RFC2988 recommends to restart timer to now+rto. |
| 2340 | */ | 1949 | */ |
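This hunk deletes the in-line BIC window computation, the Reno slow start / congestion avoidance, and the whole Vegas estimator, reducing tcp_cong_avoid() to a thin wrapper around tp->ca_ops->cong_avoid(). The removed Reno logic, recast against the wrapper's new parameter list, would look roughly like this (ack, rtt, in_flight and good are unused by plain Reno):

    /* The removed reno_cong_avoid() expressed against the new hook
     * signature.  A sketch for orientation, not the module that ships
     * with this series.
     */
    static void reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
                                u32 in_flight, int good)
    {
        if (tp->snd_cwnd <= tp->snd_ssthresh) {
            /* Slow start: open the window by one segment per ACK. */
            if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                tp->snd_cwnd++;
        } else {
            /* Congestion avoidance: roughly cwnd += 1/cwnd per ACK,
             * done with a counter to stay in integer arithmetic.
             */
            if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
                if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                    tp->snd_cwnd++;
                tp->snd_cwnd_cnt = 0;
            } else
                tp->snd_cwnd_cnt++;
        }
    }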
| @@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, | |||
| 2415 | 2024 | ||
| 2416 | 2025 | ||
| 2417 | /* Remove acknowledged frames from the retransmission queue. */ | 2026 | /* Remove acknowledged frames from the retransmission queue. */ |
| 2418 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | 2027 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) |
| 2419 | { | 2028 | { |
| 2420 | struct tcp_sock *tp = tcp_sk(sk); | 2029 | struct tcp_sock *tp = tcp_sk(sk); |
| 2421 | struct sk_buff *skb; | 2030 | struct sk_buff *skb; |
| 2422 | __u32 now = tcp_time_stamp; | 2031 | __u32 now = tcp_time_stamp; |
| 2423 | int acked = 0; | 2032 | int acked = 0; |
| 2424 | __s32 seq_rtt = -1; | 2033 | __s32 seq_rtt = -1; |
| 2034 | struct timeval usnow; | ||
| 2035 | u32 pkts_acked = 0; | ||
| 2036 | |||
| 2037 | if (seq_usrtt) | ||
| 2038 | do_gettimeofday(&usnow); | ||
| 2425 | 2039 | ||
| 2426 | while ((skb = skb_peek(&sk->sk_write_queue)) && | 2040 | while ((skb = skb_peek(&sk->sk_write_queue)) && |
| 2427 | skb != sk->sk_send_head) { | 2041 | skb != sk->sk_send_head) { |
| @@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
| 2448 | */ | 2062 | */ |
| 2449 | if (!(scb->flags & TCPCB_FLAG_SYN)) { | 2063 | if (!(scb->flags & TCPCB_FLAG_SYN)) { |
| 2450 | acked |= FLAG_DATA_ACKED; | 2064 | acked |= FLAG_DATA_ACKED; |
| 2065 | ++pkts_acked; | ||
| 2451 | } else { | 2066 | } else { |
| 2452 | acked |= FLAG_SYN_ACKED; | 2067 | acked |= FLAG_SYN_ACKED; |
| 2453 | tp->retrans_stamp = 0; | 2068 | tp->retrans_stamp = 0; |
| @@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
| 2461 | seq_rtt = -1; | 2076 | seq_rtt = -1; |
| 2462 | } else if (seq_rtt < 0) | 2077 | } else if (seq_rtt < 0) |
| 2463 | seq_rtt = now - scb->when; | 2078 | seq_rtt = now - scb->when; |
| 2079 | if (seq_usrtt) | ||
| 2080 | *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 | ||
| 2081 | + (usnow.tv_usec - skb->stamp.tv_usec); | ||
| 2082 | |||
| 2464 | if (sacked & TCPCB_SACKED_ACKED) | 2083 | if (sacked & TCPCB_SACKED_ACKED) |
| 2465 | tp->sacked_out -= tcp_skb_pcount(skb); | 2084 | tp->sacked_out -= tcp_skb_pcount(skb); |
| 2466 | if (sacked & TCPCB_LOST) | 2085 | if (sacked & TCPCB_LOST) |
| @@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
| 2479 | } | 2098 | } |
| 2480 | 2099 | ||
| 2481 | if (acked&FLAG_ACKED) { | 2100 | if (acked&FLAG_ACKED) { |
| 2482 | tcp_ack_update_rtt(tp, acked, seq_rtt); | 2101 | tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); |
| 2483 | tcp_ack_packets_out(sk, tp); | 2102 | tcp_ack_packets_out(sk, tp); |
| 2103 | |||
| 2104 | if (tp->ca_ops->pkts_acked) | ||
| 2105 | tp->ca_ops->pkts_acked(tp, pkts_acked); | ||
| 2484 | } | 2106 | } |
| 2485 | 2107 | ||
| 2486 | #if FASTRETRANS_DEBUG > 0 | 2108 | #if FASTRETRANS_DEBUG > 0 |
| @@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
| 2624 | tp->frto_counter = (tp->frto_counter + 1) % 3; | 2246 | tp->frto_counter = (tp->frto_counter + 1) % 3; |
| 2625 | } | 2247 | } |
| 2626 | 2248 | ||
| 2627 | /* | ||
| 2628 | * TCP Westwood+ | ||
| 2629 | */ | ||
| 2630 | |||
| 2631 | /* | ||
| 2632 | * @init_westwood | ||
| 2633 | * This function initializes fields used in TCP Westwood+. We can't | ||
| 2634 | * get no information about RTTmin at this time so we simply set it to | ||
| 2635 | * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative | ||
| 2636 | * since in this way we're sure it will be updated in a consistent | ||
| 2637 | * way as soon as possible. It will reasonably happen within the first | ||
| 2638 | * RTT period of the connection lifetime. | ||
| 2639 | */ | ||
| 2640 | |||
| 2641 | static void init_westwood(struct sock *sk) | ||
| 2642 | { | ||
| 2643 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2644 | |||
| 2645 | tp->westwood.bw_ns_est = 0; | ||
| 2646 | tp->westwood.bw_est = 0; | ||
| 2647 | tp->westwood.accounted = 0; | ||
| 2648 | tp->westwood.cumul_ack = 0; | ||
| 2649 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
| 2650 | tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; | ||
| 2651 | tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; | ||
| 2652 | tp->westwood.snd_una = tp->snd_una; | ||
| 2653 | } | ||
| 2654 | |||
| 2655 | /* | ||
| 2656 | * @westwood_do_filter | ||
| 2657 | * Low-pass filter. Implemented using constant coeffients. | ||
| 2658 | */ | ||
| 2659 | |||
| 2660 | static inline __u32 westwood_do_filter(__u32 a, __u32 b) | ||
| 2661 | { | ||
| 2662 | return (((7 * a) + b) >> 3); | ||
| 2663 | } | ||
| 2664 | |||
| 2665 | static void westwood_filter(struct sock *sk, __u32 delta) | ||
| 2666 | { | ||
| 2667 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2668 | |||
| 2669 | tp->westwood.bw_ns_est = | ||
| 2670 | westwood_do_filter(tp->westwood.bw_ns_est, | ||
| 2671 | tp->westwood.bk / delta); | ||
| 2672 | tp->westwood.bw_est = | ||
| 2673 | westwood_do_filter(tp->westwood.bw_est, | ||
| 2674 | tp->westwood.bw_ns_est); | ||
| 2675 | } | ||
| 2676 | |||
| 2677 | /* | ||
| 2678 | * @westwood_update_rttmin | ||
| 2679 | * It is used to update RTTmin. In this case we MUST NOT use | ||
| 2680 | * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! | ||
| 2681 | */ | ||
| 2682 | |||
| 2683 | static inline __u32 westwood_update_rttmin(const struct sock *sk) | ||
| 2684 | { | ||
| 2685 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2686 | __u32 rttmin = tp->westwood.rtt_min; | ||
| 2687 | |||
| 2688 | if (tp->westwood.rtt != 0 && | ||
| 2689 | (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) | ||
| 2690 | rttmin = tp->westwood.rtt; | ||
| 2691 | |||
| 2692 | return rttmin; | ||
| 2693 | } | ||
| 2694 | |||
| 2695 | /* | ||
| 2696 | * @westwood_acked | ||
| 2697 | * Evaluate increases for dk. | ||
| 2698 | */ | ||
| 2699 | |||
| 2700 | static inline __u32 westwood_acked(const struct sock *sk) | ||
| 2701 | { | ||
| 2702 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2703 | |||
| 2704 | return tp->snd_una - tp->westwood.snd_una; | ||
| 2705 | } | ||
| 2706 | |||
| 2707 | /* | ||
| 2708 | * @westwood_new_window | ||
| 2709 | * It evaluates if we are receiving data inside the same RTT window as | ||
| 2710 | * when we started. | ||
| 2711 | * Return value: | ||
| 2712 | * It returns 0 if we are still evaluating samples in the same RTT | ||
| 2713 | * window, 1 if the sample has to be considered in the next window. | ||
| 2714 | */ | ||
| 2715 | |||
| 2716 | static int westwood_new_window(const struct sock *sk) | ||
| 2717 | { | ||
| 2718 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2719 | __u32 left_bound; | ||
| 2720 | __u32 rtt; | ||
| 2721 | int ret = 0; | ||
| 2722 | |||
| 2723 | left_bound = tp->westwood.rtt_win_sx; | ||
| 2724 | rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); | ||
| 2725 | |||
| 2726 | /* | ||
| 2727 | * A RTT-window has passed. Be careful since if RTT is less than | ||
| 2728 | * 50ms we don't filter but we continue 'building the sample'. | ||
| 2729 | * This minimum limit was choosen since an estimation on small | ||
| 2730 | * time intervals is better to avoid... | ||
| 2731 | * Obvioulsy on a LAN we reasonably will always have | ||
| 2732 | * right_bound = left_bound + WESTWOOD_RTT_MIN | ||
| 2733 | */ | ||
| 2734 | |||
| 2735 | if ((left_bound + rtt) < tcp_time_stamp) | ||
| 2736 | ret = 1; | ||
| 2737 | |||
| 2738 | return ret; | ||
| 2739 | } | ||
| 2740 | |||
| 2741 | /* | ||
| 2742 | * @westwood_update_window | ||
| 2743 | * It updates RTT evaluation window if it is the right moment to do | ||
| 2744 | * it. If so it calls filter for evaluating bandwidth. | ||
| 2745 | */ | ||
| 2746 | |||
| 2747 | static void __westwood_update_window(struct sock *sk, __u32 now) | ||
| 2748 | { | ||
| 2749 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2750 | __u32 delta = now - tp->westwood.rtt_win_sx; | ||
| 2751 | |||
| 2752 | if (delta) { | ||
| 2753 | if (tp->westwood.rtt) | ||
| 2754 | westwood_filter(sk, delta); | ||
| 2755 | |||
| 2756 | tp->westwood.bk = 0; | ||
| 2757 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
| 2758 | } | ||
| 2759 | } | ||
| 2760 | |||
| 2761 | |||
| 2762 | static void westwood_update_window(struct sock *sk, __u32 now) | ||
| 2763 | { | ||
| 2764 | if (westwood_new_window(sk)) | ||
| 2765 | __westwood_update_window(sk, now); | ||
| 2766 | } | ||
| 2767 | |||
| 2768 | /* | ||
| 2769 | * @__tcp_westwood_fast_bw | ||
| 2770 | * It is called when we are in fast path. In particular it is called when | ||
| 2771 | * header prediction is successfull. In such case infact update is | ||
| 2772 | * straight forward and doesn't need any particular care. | ||
| 2773 | */ | ||
| 2774 | |||
| 2775 | static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2776 | { | ||
| 2777 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2778 | |||
| 2779 | westwood_update_window(sk, tcp_time_stamp); | ||
| 2780 | |||
| 2781 | tp->westwood.bk += westwood_acked(sk); | ||
| 2782 | tp->westwood.snd_una = tp->snd_una; | ||
| 2783 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
| 2784 | } | ||
| 2785 | |||
| 2786 | static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2787 | { | ||
| 2788 | if (tcp_is_westwood(tcp_sk(sk))) | ||
| 2789 | __tcp_westwood_fast_bw(sk, skb); | ||
| 2790 | } | ||
| 2791 | |||
| 2792 | |||
| 2793 | /* | ||
| 2794 | * @westwood_dupack_update | ||
| 2795 | * It updates accounted and cumul_ack when receiving a dupack. | ||
| 2796 | */ | ||
| 2797 | |||
| 2798 | static void westwood_dupack_update(struct sock *sk) | ||
| 2799 | { | ||
| 2800 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2801 | |||
| 2802 | tp->westwood.accounted += tp->mss_cache_std; | ||
| 2803 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
| 2804 | } | ||
| 2805 | |||
| 2806 | static inline int westwood_may_change_cumul(struct tcp_sock *tp) | ||
| 2807 | { | ||
| 2808 | return (tp->westwood.cumul_ack > tp->mss_cache_std); | ||
| 2809 | } | ||
| 2810 | |||
| 2811 | static inline void westwood_partial_update(struct tcp_sock *tp) | ||
| 2812 | { | ||
| 2813 | tp->westwood.accounted -= tp->westwood.cumul_ack; | ||
| 2814 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
| 2815 | } | ||
| 2816 | |||
| 2817 | static inline void westwood_complete_update(struct tcp_sock *tp) | ||
| 2818 | { | ||
| 2819 | tp->westwood.cumul_ack -= tp->westwood.accounted; | ||
| 2820 | tp->westwood.accounted = 0; | ||
| 2821 | } | ||
| 2822 | |||
| 2823 | /* | ||
| 2824 | * @westwood_acked_count | ||
| 2825 | * This function evaluates cumul_ack for evaluating dk in case of | ||
| 2826 | * delayed or partial acks. | ||
| 2827 | */ | ||
| 2828 | |||
| 2829 | static inline __u32 westwood_acked_count(struct sock *sk) | ||
| 2830 | { | ||
| 2831 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2832 | |||
| 2833 | tp->westwood.cumul_ack = westwood_acked(sk); | ||
| 2834 | |||
| 2835 | /* If cumul_ack is 0 this is a dupack since it's not moving | ||
| 2836 | * tp->snd_una. | ||
| 2837 | */ | ||
| 2838 | if (!(tp->westwood.cumul_ack)) | ||
| 2839 | westwood_dupack_update(sk); | ||
| 2840 | |||
| 2841 | if (westwood_may_change_cumul(tp)) { | ||
| 2842 | /* Partial or delayed ack */ | ||
| 2843 | if (tp->westwood.accounted >= tp->westwood.cumul_ack) | ||
| 2844 | westwood_partial_update(tp); | ||
| 2845 | else | ||
| 2846 | westwood_complete_update(tp); | ||
| 2847 | } | ||
| 2848 | |||
| 2849 | tp->westwood.snd_una = tp->snd_una; | ||
| 2850 | |||
| 2851 | return tp->westwood.cumul_ack; | ||
| 2852 | } | ||
| 2853 | |||
| 2854 | |||
| 2855 | /* | ||
| 2856 | * @__tcp_westwood_slow_bw | ||
| 2857 | * It is called when something is going wrong..even if there could | ||
| 2858 | * be no problems! Infact a simple delayed packet may trigger a | ||
| 2859 | * dupack. But we need to be careful in such case. | ||
| 2860 | */ | ||
| 2861 | |||
| 2862 | static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2863 | { | ||
| 2864 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2865 | |||
| 2866 | westwood_update_window(sk, tcp_time_stamp); | ||
| 2867 | |||
| 2868 | tp->westwood.bk += westwood_acked_count(sk); | ||
| 2869 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
| 2870 | } | ||
| 2871 | |||
| 2872 | static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2873 | { | ||
| 2874 | if (tcp_is_westwood(tcp_sk(sk))) | ||
| 2875 | __tcp_westwood_slow_bw(sk, skb); | ||
| 2876 | } | ||
| 2877 | |||
| 2878 | /* This routine deals with incoming acks, but not outgoing ones. */ | 2249 | /* This routine deals with incoming acks, but not outgoing ones. */ |
| 2879 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | 2250 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) |
| 2880 | { | 2251 | { |
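The hunk above removes the entire in-line Westwood+ machinery: the per-connection init, the RTT-window bookkeeping, the acked-bytes accounting for delayed and partial ACKs, and the bandwidth estimator. The core of that estimator was a constant-coefficient low-pass filter over the per-RTT delivery rate; in the modular scheme it lives in the Westwood module's own file. For reference, directly from the removed westwood_do_filter():

    /* Westwood+ bandwidth low-pass filter from the removed code:
     * new_estimate = (7 * old_estimate + sample) / 8, applied first to a
     * "non-smoothed" estimate and then again to obtain bw_est.
     */
    static inline u32 westwood_do_filter(u32 a, u32 b)
    {
        return (7 * a + b) >> 3;
    }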
| @@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2884 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2255 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
| 2885 | u32 prior_in_flight; | 2256 | u32 prior_in_flight; |
| 2886 | s32 seq_rtt; | 2257 | s32 seq_rtt; |
| 2258 | s32 seq_usrtt = 0; | ||
| 2887 | int prior_packets; | 2259 | int prior_packets; |
| 2888 | 2260 | ||
| 2889 | /* If the ack is newer than sent or older than previous acks | 2261 | /* If the ack is newer than sent or older than previous acks |
| @@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2902 | */ | 2274 | */ |
| 2903 | tcp_update_wl(tp, ack, ack_seq); | 2275 | tcp_update_wl(tp, ack, ack_seq); |
| 2904 | tp->snd_una = ack; | 2276 | tp->snd_una = ack; |
| 2905 | tcp_westwood_fast_bw(sk, skb); | ||
| 2906 | flag |= FLAG_WIN_UPDATE; | 2277 | flag |= FLAG_WIN_UPDATE; |
| 2907 | 2278 | ||
| 2279 | tcp_ca_event(tp, CA_EVENT_FAST_ACK); | ||
| 2280 | |||
| 2908 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); | 2281 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); |
| 2909 | } else { | 2282 | } else { |
| 2910 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) | 2283 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
| @@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2920 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) | 2293 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) |
| 2921 | flag |= FLAG_ECE; | 2294 | flag |= FLAG_ECE; |
| 2922 | 2295 | ||
| 2923 | tcp_westwood_slow_bw(sk,skb); | 2296 | tcp_ca_event(tp, CA_EVENT_SLOW_ACK); |
| 2924 | } | 2297 | } |
| 2925 | 2298 | ||
| 2926 | /* We passed data and got it acked, remove any soft error | 2299 | /* We passed data and got it acked, remove any soft error |
| @@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2935 | prior_in_flight = tcp_packets_in_flight(tp); | 2308 | prior_in_flight = tcp_packets_in_flight(tp); |
| 2936 | 2309 | ||
| 2937 | /* See if we can take anything off of the retransmit queue. */ | 2310 | /* See if we can take anything off of the retransmit queue. */ |
| 2938 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt); | 2311 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, |
| 2312 | tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); | ||
| 2939 | 2313 | ||
| 2940 | if (tp->frto_counter) | 2314 | if (tp->frto_counter) |
| 2941 | tcp_process_frto(sk, prior_snd_una); | 2315 | tcp_process_frto(sk, prior_snd_una); |
| 2942 | 2316 | ||
| 2943 | if (tcp_ack_is_dubious(tp, flag)) { | 2317 | if (tcp_ack_is_dubious(tp, flag)) { |
| 2944 | /* Advanve CWND, if state allows this. */ | 2318 | /* Advanve CWND, if state allows this. */ |
| 2945 | if ((flag & FLAG_DATA_ACKED) && | 2319 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) |
| 2946 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && | 2320 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); |
| 2947 | tcp_may_raise_cwnd(tp, flag)) | ||
| 2948 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
| 2949 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2321 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
| 2950 | } else { | 2322 | } else { |
| 2951 | if ((flag & FLAG_DATA_ACKED) && | 2323 | if ((flag & FLAG_DATA_ACKED)) |
| 2952 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) | 2324 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); |
| 2953 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
| 2954 | } | 2325 | } |
| 2955 | 2326 | ||
| 2956 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) | 2327 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) |
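tcp_ack() now notifies the module with CA_EVENT_FAST_ACK / CA_EVENT_SLOW_ACK where it used to call the Westwood fast/slow bandwidth samplers, and gates tcp_cong_avoid() purely on tcp_may_raise_cwnd() since the Vegas special case is gone. tcp_ca_event() itself is defined outside this file; presumably it just forwards the event to an optional per-module handler, along the lines of (assumption):

    /* Plausible shape of the tcp_ca_event() helper used above: hand the
     * event to the module if it registered a handler (the cwnd_event
     * member assumed in the interface sketch near the top of this page).
     */
    static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event)
    {
        if (tp->ca_ops->cwnd_event)
            tp->ca_ops->cwnd_event(tp, event);
    }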
| @@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4552 | 3923 | ||
| 4553 | tcp_init_metrics(sk); | 3924 | tcp_init_metrics(sk); |
| 4554 | 3925 | ||
| 3926 | tcp_init_congestion_control(tp); | ||
| 3927 | |||
| 4555 | /* Prevent spurious tcp_cwnd_restart() on first data | 3928 | /* Prevent spurious tcp_cwnd_restart() on first data |
| 4556 | * packet. | 3929 | * packet. |
| 4557 | */ | 3930 | */ |
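With per-connection setup moving out of tcp_input.c (the init_westwood()/init_bictcp() calls are deleted further down), the connection-establishment paths now call tcp_init_congestion_control() once the handshake completes. Its definition is not part of this file; a minimal version would just invoke the module's init hook, e.g. (assumption):

    /* Minimal sketch of tcp_init_congestion_control(): let the attached
     * module initialise its private per-connection state.  The helper is
     * defined elsewhere in the patch series.
     */
    static void tcp_init_congestion_control(struct tcp_sock *tp)
    {
        if (tp->ca_ops->init)
            tp->ca_ops->init(tp);
    }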
| @@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4708 | if(tp->af_specific->conn_request(sk, skb) < 0) | 4081 | if(tp->af_specific->conn_request(sk, skb) < 0) |
| 4709 | return 1; | 4082 | return 1; |
| 4710 | 4083 | ||
| 4711 | init_westwood(sk); | ||
| 4712 | init_bictcp(tp); | ||
| 4713 | |||
| 4714 | /* Now we have several options: In theory there is | 4084 | /* Now we have several options: In theory there is |
| 4715 | * nothing else in the frame. KA9Q has an option to | 4085 | * nothing else in the frame. KA9Q has an option to |
| 4716 | * send data with the syn, BSD accepts data with the | 4086 | * send data with the syn, BSD accepts data with the |
| @@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4732 | goto discard; | 4102 | goto discard; |
| 4733 | 4103 | ||
| 4734 | case TCP_SYN_SENT: | 4104 | case TCP_SYN_SENT: |
| 4735 | init_westwood(sk); | ||
| 4736 | init_bictcp(tp); | ||
| 4737 | |||
| 4738 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); | 4105 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); |
| 4739 | if (queued >= 0) | 4106 | if (queued >= 0) |
| 4740 | return queued; | 4107 | return queued; |
| @@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4816 | */ | 4183 | */ |
| 4817 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4184 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| 4818 | !tp->srtt) | 4185 | !tp->srtt) |
| 4819 | tcp_ack_saw_tstamp(tp, 0); | 4186 | tcp_ack_saw_tstamp(tp, 0, 0); |
| 4820 | 4187 | ||
| 4821 | if (tp->rx_opt.tstamp_ok) | 4188 | if (tp->rx_opt.tstamp_ok) |
| 4822 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4189 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
| @@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4828 | 4195 | ||
| 4829 | tcp_init_metrics(sk); | 4196 | tcp_init_metrics(sk); |
| 4830 | 4197 | ||
| 4198 | tcp_init_congestion_control(tp); | ||
| 4199 | |||
| 4831 | /* Prevent spurious tcp_cwnd_restart() on | 4200 | /* Prevent spurious tcp_cwnd_restart() on |
| 4832 | * first data packet. | 4201 | * first data packet. |
| 4833 | */ | 4202 | */ |
